//===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI -----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "NVPTXTargetTransformInfo.h"
#include "NVPTXUtilities.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/NVPTXAddrSpace.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <optional>
using namespace llvm;

#define DEBUG_TYPE "NVPTXtti"

// Whether the given intrinsic reads threadIdx.x/y/z.
static bool readsThreadIndex(const IntrinsicInst *II) {
  switch (II->getIntrinsicID()) {
  default: return false;
  case Intrinsic::nvvm_read_ptx_sreg_tid_x:
  case Intrinsic::nvvm_read_ptx_sreg_tid_y:
  case Intrinsic::nvvm_read_ptx_sreg_tid_z:
    return true;
  }
}

static bool readsLaneId(const IntrinsicInst *II) {
  return II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_laneid;
}

// Whether the given intrinsic is an atomic instruction in PTX.
static bool isNVVMAtomic(const IntrinsicInst *II) {
  switch (II->getIntrinsicID()) {
  default:
    return false;
  case Intrinsic::nvvm_atomic_add_gen_f_cta:
  case Intrinsic::nvvm_atomic_add_gen_f_sys:
  case Intrinsic::nvvm_atomic_add_gen_i_cta:
  case Intrinsic::nvvm_atomic_add_gen_i_sys:
  case Intrinsic::nvvm_atomic_and_gen_i_cta:
  case Intrinsic::nvvm_atomic_and_gen_i_sys:
  case Intrinsic::nvvm_atomic_cas_gen_i_cta:
  case Intrinsic::nvvm_atomic_cas_gen_i_sys:
  case Intrinsic::nvvm_atomic_dec_gen_i_cta:
  case Intrinsic::nvvm_atomic_dec_gen_i_sys:
  case Intrinsic::nvvm_atomic_inc_gen_i_cta:
  case Intrinsic::nvvm_atomic_inc_gen_i_sys:
  case Intrinsic::nvvm_atomic_max_gen_i_cta:
  case Intrinsic::nvvm_atomic_max_gen_i_sys:
  case Intrinsic::nvvm_atomic_min_gen_i_cta:
  case Intrinsic::nvvm_atomic_min_gen_i_sys:
  case Intrinsic::nvvm_atomic_or_gen_i_cta:
  case Intrinsic::nvvm_atomic_or_gen_i_sys:
  case Intrinsic::nvvm_atomic_exch_gen_i_cta:
  case Intrinsic::nvvm_atomic_exch_gen_i_sys:
  case Intrinsic::nvvm_atomic_xor_gen_i_cta:
  case Intrinsic::nvvm_atomic_xor_gen_i_sys:
    return true;
  }
}

bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) const {
  // Without inter-procedural analysis, we conservatively assume that arguments
  // to __device__ functions are divergent.
  if (const Argument *Arg = dyn_cast<Argument>(V))
    return !isKernelFunction(*Arg->getParent());

  if (const Instruction *I = dyn_cast<Instruction>(V)) {
    // Without pointer analysis, we conservatively assume values loaded from
    // generic or local address space are divergent.
    if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
      unsigned AS = LI->getPointerAddressSpace();
      return AS == ADDRESS_SPACE_GENERIC || AS == ADDRESS_SPACE_LOCAL;
    }
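    // For example, a load from the shared or global address space is not, by
    // itself, treated as a source of divergence here, whereas the same load
    // through a generic pointer is conservatively treated as divergent.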
    // Atomic instructions may cause divergence. Atomic instructions are
    // executed sequentially across all threads in a warp. Therefore, an earlier
    // executed thread may see different memory inputs than a later executed
    // thread. For example, suppose *a = 0 initially.
    //
    //   atom.global.add.s32 d, [a], 1
    //
    // returns 0 for the first thread that enters the critical region, and 1 for
    // the second thread.
    if (I->isAtomic())
      return true;
    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
      // Instructions that read threadIdx are obviously divergent.
      if (readsThreadIndex(II) || readsLaneId(II))
        return true;
      // Handle the NVPTX atomic intrinsics that cannot be represented as an
      // atomic IR instruction.
      if (isNVVMAtomic(II))
        return true;
    }
    // Conservatively consider the return value of function calls as divergent.
    // We could analyze callees with bodies more precisely using
    // inter-procedural analysis.
    if (isa<CallInst>(I))
      return true;
  }

  return false;
}

// Convert NVVM intrinsics to target-generic LLVM code where possible.
static Instruction *convertNvvmIntrinsicToLlvm(InstCombiner &IC,
                                               IntrinsicInst *II) {
  // Each NVVM intrinsic we can simplify can be replaced with one of:
  //
  //  * an LLVM intrinsic,
  //  * an LLVM cast operation,
  //  * an LLVM binary operation, or
  //  * ad-hoc LLVM IR for the particular operation.
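  //
  // For example, @llvm.nvvm.ceil.d maps to the target-generic @llvm.ceil.f64
  // intrinsic, and @llvm.nvvm.div.rn.d maps to a plain fdiv instruction.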

  // Some transformations are only valid when the module's
  // flush-denormals-to-zero (ftz) setting is true/false, whereas other
  // transformations are valid regardless of the module's ftz setting.
  enum FtzRequirementTy {
    FTZ_Any,       // Any ftz setting is ok.
    FTZ_MustBeOn,  // Transformation is valid only if ftz is on.
    FTZ_MustBeOff, // Transformation is valid only if ftz is off.
  };
  // Classes of NVVM intrinsics that can't be replaced one-to-one with a
  // target-generic intrinsic, cast op, or binary op but that we can nonetheless
  // simplify.
  enum SpecialCase {
    SPC_Reciprocal,
    SCP_FunnelShiftClamp,
  };

  // SimplifyAction is a poor-man's variant (plus an additional flag) that
  // represents how to replace an NVVM intrinsic with target-generic LLVM IR.
  struct SimplifyAction {
    // Invariant: At most one of these Optionals has a value.
    std::optional<Intrinsic::ID> IID;
    std::optional<Instruction::CastOps> CastOp;
    std::optional<Instruction::BinaryOps> BinaryOp;
    std::optional<SpecialCase> Special;

    FtzRequirementTy FtzRequirement = FTZ_Any;
    // Denormal handling is guarded by different attributes depending on the
    // type (denormal-fp-math vs denormal-fp-math-f32), take note of halfs.
    bool IsHalfTy = false;

    SimplifyAction() = default;

    SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq,
                   bool IsHalfTy = false)
        : IID(IID), FtzRequirement(FtzReq), IsHalfTy(IsHalfTy) {}

    // Cast operations don't have anything to do with FTZ, so we skip that
    // argument.
    SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {}

    SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq)
        : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {}

    SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq)
        : Special(Special), FtzRequirement(FtzReq) {}
  };

  // Try to generate a SimplifyAction describing how to replace our
  // IntrinsicInstr with target-generic LLVM IR.
  const SimplifyAction Action = [II]() -> SimplifyAction {
    switch (II->getIntrinsicID()) {
    // NVVM intrinsics that map directly to LLVM intrinsics.
    case Intrinsic::nvvm_ceil_d:
      return {Intrinsic::ceil, FTZ_Any};
    case Intrinsic::nvvm_ceil_f:
      return {Intrinsic::ceil, FTZ_MustBeOff};
    case Intrinsic::nvvm_ceil_ftz_f:
      return {Intrinsic::ceil, FTZ_MustBeOn};
    case Intrinsic::nvvm_floor_d:
      return {Intrinsic::floor, FTZ_Any};
    case Intrinsic::nvvm_floor_f:
      return {Intrinsic::floor, FTZ_MustBeOff};
    case Intrinsic::nvvm_floor_ftz_f:
      return {Intrinsic::floor, FTZ_MustBeOn};
    case Intrinsic::nvvm_fma_rn_d:
      return {Intrinsic::fma, FTZ_Any};
    case Intrinsic::nvvm_fma_rn_f:
      return {Intrinsic::fma, FTZ_MustBeOff};
    case Intrinsic::nvvm_fma_rn_ftz_f:
      return {Intrinsic::fma, FTZ_MustBeOn};
    case Intrinsic::nvvm_fma_rn_f16:
      return {Intrinsic::fma, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fma_rn_ftz_f16:
      return {Intrinsic::fma, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fma_rn_f16x2:
      return {Intrinsic::fma, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fma_rn_ftz_f16x2:
      return {Intrinsic::fma, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fma_rn_bf16:
      return {Intrinsic::fma, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fma_rn_ftz_bf16:
      return {Intrinsic::fma, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fma_rn_bf16x2:
      return {Intrinsic::fma, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fma_rn_ftz_bf16x2:
      return {Intrinsic::fma, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmax_d:
      return {Intrinsic::maxnum, FTZ_Any};
    case Intrinsic::nvvm_fmax_f:
      return {Intrinsic::maxnum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmax_ftz_f:
      return {Intrinsic::maxnum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmax_nan_f:
      return {Intrinsic::maximum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmax_ftz_nan_f:
      return {Intrinsic::maximum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmax_f16:
      return {Intrinsic::maxnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_f16:
      return {Intrinsic::maxnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmax_f16x2:
      return {Intrinsic::maxnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_f16x2:
      return {Intrinsic::maxnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmax_nan_f16:
      return {Intrinsic::maximum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_nan_f16:
      return {Intrinsic::maximum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmax_nan_f16x2:
      return {Intrinsic::maximum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_nan_f16x2:
      return {Intrinsic::maximum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_d:
      return {Intrinsic::minnum, FTZ_Any};
    case Intrinsic::nvvm_fmin_f:
      return {Intrinsic::minnum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmin_ftz_f:
      return {Intrinsic::minnum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmin_nan_f:
      return {Intrinsic::minimum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmin_ftz_nan_f:
      return {Intrinsic::minimum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmin_f16:
      return {Intrinsic::minnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_f16:
      return {Intrinsic::minnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_f16x2:
      return {Intrinsic::minnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_f16x2:
      return {Intrinsic::minnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_nan_f16:
      return {Intrinsic::minimum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_nan_f16:
      return {Intrinsic::minimum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_nan_f16x2:
      return {Intrinsic::minimum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_nan_f16x2:
      return {Intrinsic::minimum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_sqrt_rn_d:
      return {Intrinsic::sqrt, FTZ_Any};
    case Intrinsic::nvvm_sqrt_f:
      // nvvm_sqrt_f is a special case. For most intrinsics, foo_ftz_f is the
      // ftz version, and foo_f is the non-ftz version. But nvvm_sqrt_f adopts
      // the ftz-ness of the surrounding code. sqrt_rn_f and sqrt_rn_ftz_f are
      // the versions with explicit ftz-ness.
      return {Intrinsic::sqrt, FTZ_Any};
    case Intrinsic::nvvm_trunc_d:
      return {Intrinsic::trunc, FTZ_Any};
    case Intrinsic::nvvm_trunc_f:
      return {Intrinsic::trunc, FTZ_MustBeOff};
    case Intrinsic::nvvm_trunc_ftz_f:
      return {Intrinsic::trunc, FTZ_MustBeOn};

    // NVVM intrinsics that map to LLVM cast operations.
    //
    // Note that llvm's target-generic conversion operators correspond to the rz
    // (round to zero) versions of the nvvm conversion intrinsics, even though
    // most everything else here uses the rn (round to nearest even) nvvm ops.
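    // For example, @llvm.nvvm.d2i.rz(double %x) is equivalent to the cast
    // 'fptosi double %x to i32'.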
    case Intrinsic::nvvm_d2i_rz:
    case Intrinsic::nvvm_f2i_rz:
    case Intrinsic::nvvm_d2ll_rz:
    case Intrinsic::nvvm_f2ll_rz:
      return {Instruction::FPToSI};
    case Intrinsic::nvvm_d2ui_rz:
    case Intrinsic::nvvm_f2ui_rz:
    case Intrinsic::nvvm_d2ull_rz:
    case Intrinsic::nvvm_f2ull_rz:
      return {Instruction::FPToUI};
    // Integer to floating-point uses RN rounding, not RZ.
    case Intrinsic::nvvm_i2d_rn:
    case Intrinsic::nvvm_i2f_rn:
    case Intrinsic::nvvm_ll2d_rn:
    case Intrinsic::nvvm_ll2f_rn:
      return {Instruction::SIToFP};
    case Intrinsic::nvvm_ui2d_rn:
    case Intrinsic::nvvm_ui2f_rn:
    case Intrinsic::nvvm_ull2d_rn:
    case Intrinsic::nvvm_ull2f_rn:
      return {Instruction::UIToFP};

    // NVVM intrinsics that map to LLVM binary ops.
    case Intrinsic::nvvm_div_rn_d:
      return {Instruction::FDiv, FTZ_Any};

    // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but
    // need special handling.
    //
    // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just
    // as well.
    case Intrinsic::nvvm_rcp_rn_d:
      return {SPC_Reciprocal, FTZ_Any};

    case Intrinsic::nvvm_fshl_clamp:
    case Intrinsic::nvvm_fshr_clamp:
      return {SCP_FunnelShiftClamp, FTZ_Any};

    // We do not currently simplify intrinsics that give an approximate
    // answer. These include:
    //
    //   - nvvm_cos_approx_{f,ftz_f}
    //   - nvvm_ex2_approx_{d,f,ftz_f}
    //   - nvvm_lg2_approx_{d,f,ftz_f}
    //   - nvvm_sin_approx_{f,ftz_f}
    //   - nvvm_sqrt_approx_{f,ftz_f}
    //   - nvvm_rsqrt_approx_{d,f,ftz_f}
    //   - nvvm_div_approx_{ftz_d,ftz_f,f}
    //   - nvvm_rcp_approx_ftz_d
    //
    // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast"
    // means that fastmath is enabled in the intrinsic. Unfortunately only
    // binary operators (currently) have a fastmath bit in SelectionDAG, so
    // this information gets lost and we can't select on it.
    //
    // TODO: div and rcp are lowered to a binary op, so we could in theory
    // lower them to "fast fdiv".

    default:
      return {};
    }
  }();

  // If Action.FtzRequirementTy is not satisfied by the module's ftz state, we
  // can bail out now. (Notice that in the case that IID is not an NVVM
  // intrinsic, we don't have to look up any module metadata, as
  // FtzRequirementTy will be FTZ_Any.)
  if (Action.FtzRequirement != FTZ_Any) {
    // FIXME: Broken for f64
    DenormalMode Mode = II->getFunction()->getDenormalMode(
        Action.IsHalfTy ? APFloat::IEEEhalf() : APFloat::IEEEsingle());
    bool FtzEnabled = Mode.Output == DenormalMode::PreserveSign;

    if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn))
      return nullptr;
  }
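  // For example, with "denormal-fp-math-f32"="preserve-sign,preserve-sign"
  // (ftz on for f32), FTZ_MustBeOn entries such as nvvm_floor_ftz_f survive
  // this check while FTZ_MustBeOff entries such as nvvm_floor_f bail out here,
  // and vice versa with IEEE denormal handling.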

  // Simplify to target-generic intrinsic.
  if (Action.IID) {
    SmallVector<Value *, 4> Args(II->args());
    // All the target-generic intrinsics currently of interest to us have one
    // type argument, equal to that of the nvvm intrinsic's argument.
    Type *Tys[] = {II->getArgOperand(0)->getType()};
    return CallInst::Create(
        Intrinsic::getOrInsertDeclaration(II->getModule(), *Action.IID, Tys),
        Args);
  }

  // Simplify to target-generic binary op.
  if (Action.BinaryOp)
    return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0),
                                  II->getArgOperand(1), II->getName());

  // Simplify to target-generic cast op.
  if (Action.CastOp)
    return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(),
                            II->getName());

  // All that's left are the special cases.
  if (!Action.Special)
    return nullptr;

  switch (*Action.Special) {
  case SPC_Reciprocal:
    // Simplify reciprocal.
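    // For example, @llvm.nvvm.rcp.rn.d(double %x) becomes 'fdiv double 1.0, %x'.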
    return BinaryOperator::Create(
        Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1),
        II->getArgOperand(0), II->getName());

  case SCP_FunnelShiftClamp: {
    // Canonicalize a clamping funnel shift to the generic llvm funnel shift
    // when possible, as this is easier for llvm to optimize further.
    if (const auto *ShiftConst = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
      const bool IsLeft = II->getIntrinsicID() == Intrinsic::nvvm_fshl_clamp;
      if (ShiftConst->getZExtValue() >= II->getType()->getIntegerBitWidth())
        return IC.replaceInstUsesWith(*II, II->getArgOperand(IsLeft ? 1 : 0));

      const unsigned FshIID = IsLeft ? Intrinsic::fshl : Intrinsic::fshr;
      return CallInst::Create(Intrinsic::getOrInsertDeclaration(
                                  II->getModule(), FshIID, II->getType()),
                              SmallVector<Value *, 3>(II->args()));
    }
    return nullptr;
  }
  }
  llvm_unreachable("All SpecialCase enumerators should be handled in switch.");
}

// Returns true/false when we know the answer, nullopt otherwise.
static std::optional<bool> evaluateIsSpace(Intrinsic::ID IID, unsigned AS) {
  if (AS == NVPTXAS::ADDRESS_SPACE_GENERIC ||
      AS == NVPTXAS::ADDRESS_SPACE_PARAM)
    return std::nullopt; // Got to check at run-time.
  switch (IID) {
  case Intrinsic::nvvm_isspacep_global:
    return AS == NVPTXAS::ADDRESS_SPACE_GLOBAL;
  case Intrinsic::nvvm_isspacep_local:
    return AS == NVPTXAS::ADDRESS_SPACE_LOCAL;
  case Intrinsic::nvvm_isspacep_shared:
    // If shared cluster this can't be evaluated at compile time.
    if (AS == NVPTXAS::ADDRESS_SPACE_SHARED_CLUSTER)
      return std::nullopt;
    return AS == NVPTXAS::ADDRESS_SPACE_SHARED;
  case Intrinsic::nvvm_isspacep_shared_cluster:
    return AS == NVPTXAS::ADDRESS_SPACE_SHARED_CLUSTER ||
           AS == NVPTXAS::ADDRESS_SPACE_SHARED;
  case Intrinsic::nvvm_isspacep_const:
    return AS == NVPTXAS::ADDRESS_SPACE_CONST;
  default:
    llvm_unreachable("Unexpected intrinsic");
  }
}

// Returns an instruction pointer (may be nullptr if we do not know the answer).
// Returns nullopt if `II` is not one of the `isspacep` intrinsics.
//
// TODO: If InferAddressSpaces were run early enough in the pipeline this could
// be removed in favor of the constant folding that occurs there through
// rewriteIntrinsicWithAddressSpace
static std::optional<Instruction *>
handleSpaceCheckIntrinsics(InstCombiner &IC, IntrinsicInst &II) {
  switch (auto IID = II.getIntrinsicID()) {
  case Intrinsic::nvvm_isspacep_global:
  case Intrinsic::nvvm_isspacep_local:
  case Intrinsic::nvvm_isspacep_shared:
  case Intrinsic::nvvm_isspacep_shared_cluster:
  case Intrinsic::nvvm_isspacep_const: {
    Value *Op0 = II.getArgOperand(0);
    unsigned AS = Op0->getType()->getPointerAddressSpace();
    // Peek through ASC to generic AS.
    // TODO: we could dig deeper through both ASCs and GEPs.
    if (AS == NVPTXAS::ADDRESS_SPACE_GENERIC)
      if (auto *ASCO = dyn_cast<AddrSpaceCastOperator>(Op0))
        AS = ASCO->getOperand(0)->getType()->getPointerAddressSpace();

    if (std::optional<bool> Answer = evaluateIsSpace(IID, AS))
      return IC.replaceInstUsesWith(II,
                                    ConstantInt::get(II.getType(), *Answer));
    return nullptr; // Don't know the answer, got to check at run time.
  }
  default:
    return std::nullopt;
  }
}

std::optional<Instruction *>
NVPTXTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  if (std::optional<Instruction *> I = handleSpaceCheckIntrinsics(IC, II))
    return *I;
  if (Instruction *I = convertNvvmIntrinsicToLlvm(IC, &II))
    return I;

  return std::nullopt;
}

InstructionCost
NVPTXTTIImpl::getInstructionCost(const User *U,
                                 ArrayRef<const Value *> Operands,
                                 TTI::TargetCostKind CostKind) const {
  if (const auto *CI = dyn_cast<CallInst>(U))
    if (const auto *IA = dyn_cast<InlineAsm>(CI->getCalledOperand())) {
      // Without this implementation getCallCost() would return the number of
      // arguments+1 as the cost, because the cost model classifies inline asm
      // as a call. A better cost model would be to return the number of asm
      // instructions embedded in the asm string.
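      // For example, the asm string "{ .reg .u32 t; mov.u32 t, %1; add.u32 %0, t, t; }"
      // is counted as two instructions below: the ".reg" declaration and the
      // closing brace are skipped, while the mov and add are counted.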
      StringRef AsmStr = IA->getAsmString();
      const unsigned InstCount =
          count_if(split(AsmStr, ';'), [](StringRef AsmInst) {
            // Trim off scopes denoted by '{' and '}' as these can be ignored.
            AsmInst = AsmInst.trim().ltrim("{} \t\n\v\f\r");
            // This is pretty coarse but does a reasonably good job of
            // identifying things that look like instructions, possibly with a
            // predicate ("@").
            return !AsmInst.empty() &&
                   (AsmInst[0] == '@' || isAlpha(AsmInst[0]) ||
                    AsmInst.find(".pragma") != StringRef::npos);
          });
      return InstCount * TargetTransformInfo::TCC_Basic;
    }

  return BaseT::getInstructionCost(U, Operands, CostKind);
}

InstructionCost NVPTXTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) const {
  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  switch (ISD) {
  default:
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info);
  case ISD::ADD:
  case ISD::MUL:
  case ISD::XOR:
  case ISD::OR:
  case ISD::AND:
    // The machine code (SASS) simulates an i64 with two i32. Therefore, we
    // estimate that arithmetic operations on i64 are twice as expensive as
    // those on types that can fit into one machine register.
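    // For example, an ADD on i64 is costed at 2 * LT.first, i.e. twice the
    // cost of the same operation on i32.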
    if (LT.second.SimpleTy == MVT::i64)
      return 2 * LT.first;
    // Delegate other cases to the basic TTI.
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info);
  }
}

void NVPTXTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP,
                                           OptimizationRemarkEmitter *ORE) const {
  BaseT::getUnrollingPreferences(L, SE, UP, ORE);

  // Enable partial unrolling and runtime unrolling, but reduce the
  // threshold. This partially unrolls small loops which are often
  // unrolled by the PTX to SASS compiler and unrolling earlier can be
  // beneficial.
  UP.Partial = UP.Runtime = true;
  UP.PartialThreshold = UP.Threshold / 4;
}

void NVPTXTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) const {
  BaseT::getPeelingPreferences(L, SE, PP);
}

bool NVPTXTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                              Intrinsic::ID IID) const {
  switch (IID) {
  case Intrinsic::nvvm_isspacep_const:
  case Intrinsic::nvvm_isspacep_global:
  case Intrinsic::nvvm_isspacep_local:
  case Intrinsic::nvvm_isspacep_shared:
  case Intrinsic::nvvm_isspacep_shared_cluster:
  case Intrinsic::nvvm_prefetch_tensormap: {
    OpIndexes.push_back(0);
    return true;
  }
  }
  return false;
}

Value *NVPTXTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
                                                      Value *OldV,
                                                      Value *NewV) const {
  const Intrinsic::ID IID = II->getIntrinsicID();
  switch (IID) {
  case Intrinsic::nvvm_isspacep_const:
  case Intrinsic::nvvm_isspacep_global:
  case Intrinsic::nvvm_isspacep_local:
  case Intrinsic::nvvm_isspacep_shared:
  case Intrinsic::nvvm_isspacep_shared_cluster: {
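    // For example, once InferAddressSpaces has shown that the operand is in
    // addrspace(1) (global), nvvm.isspacep.global folds to true here and
    // nvvm.isspacep.shared folds to false.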
    const unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    if (const auto R = evaluateIsSpace(IID, NewAS))
      return ConstantInt::get(II->getType(), *R);
    return nullptr;
  }
  case Intrinsic::nvvm_prefetch_tensormap: {
    IRBuilder<> Builder(II);
    return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_prefetch_tensormap,
                                        NewV);
  }
  }
  return nullptr;
}

unsigned NVPTXTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  // 256-bit loads/stores are currently only supported for the global address
  // space.
  if (ST->has256BitVectorLoadStore(AddrSpace))
    return 256;
  return 128;
}

unsigned NVPTXTTIImpl::getAssumedAddrSpace(const Value *V) const {
  if (isa<AllocaInst>(V))
    return ADDRESS_SPACE_LOCAL;

  if (const Argument *Arg = dyn_cast<Argument>(V)) {
    if (isKernelFunction(*Arg->getParent())) {
      const NVPTXTargetMachine &TM =
          static_cast<const NVPTXTargetMachine &>(getTLI()->getTargetMachine());
      if (TM.getDrvInterface() == NVPTX::CUDA && !Arg->hasByValAttr())
        return ADDRESS_SPACE_GLOBAL;
    } else {
      // We assume that all device parameters that are passed byval will be
      // placed in the local AS. Very simple cases will be updated after ISel to
      // use the device param space where possible.
      if (Arg->hasByValAttr())
        return ADDRESS_SPACE_LOCAL;
    }
  }

  return -1;
}
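
// For example, for a kernel whose maxclusterrank is 4 and whose maxntid is
// (128, 2, 1), this records {"maxclusterrank", 4}, {"maxntidx", 128},
// {"maxntidy", 2}, and {"maxntidz", 1} in LB.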
void NVPTXTTIImpl::collectKernelLaunchBounds(
    const Function &F,
    SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
  if (const auto Val = getMaxClusterRank(F))
    LB.push_back({"maxclusterrank", *Val});

  const auto MaxNTID = getMaxNTID(F);
  if (MaxNTID.size() > 0)
    LB.push_back({"maxntidx", MaxNTID[0]});
  if (MaxNTID.size() > 1)
    LB.push_back({"maxntidy", MaxNTID[1]});
  if (MaxNTID.size() > 2)
    LB.push_back({"maxntidz", MaxNTID[2]});
}