1//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that NVPTX uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "NVPTXISelLowering.h"
16#include "NVPTX.h"
17#include "NVPTXISelDAGToDAG.h"
18#include "NVPTXSubtarget.h"
19#include "NVPTXTargetMachine.h"
21#include "NVPTXUtilities.h"
22#include "llvm/ADT/APFloat.h"
23#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/STLExtras.h"
26#include "llvm/ADT/StringRef.h"
38#include "llvm/IR/Argument.h"
39#include "llvm/IR/Attributes.h"
40#include "llvm/IR/Constants.h"
41#include "llvm/IR/DataLayout.h"
44#include "llvm/IR/FPEnv.h"
45#include "llvm/IR/Function.h"
46#include "llvm/IR/GlobalValue.h"
47#include "llvm/IR/IRBuilder.h"
48#include "llvm/IR/Instruction.h"
50#include "llvm/IR/IntrinsicsNVPTX.h"
51#include "llvm/IR/Module.h"
52#include "llvm/IR/Type.h"
53#include "llvm/IR/Value.h"
65#include <algorithm>
66#include <cassert>
67#include <cmath>
68#include <cstdint>
69#include <iterator>
70#include <optional>
71#include <string>
72#include <tuple>
73#include <utility>
74#include <vector>
75
76#define DEBUG_TYPE "nvptx-lower"
77
78using namespace llvm;
79
81 "nvptx-sched4reg",
82 cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
83
85 "nvptx-fma-level", cl::Hidden,
86 cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
87 " 1: do it 2: do it aggressively"),
88 cl::init(2));
89
91 "nvptx-prec-divf32", cl::Hidden,
93 "NVPTX Specific: Override the precision of the lowering for f32 fdiv"),
95 clEnumValN(NVPTX::DivPrecisionLevel::Approx, "0", "Use div.approx"),
96 clEnumValN(NVPTX::DivPrecisionLevel::Full, "1", "Use div.full"),
98 "Use IEEE Compliant F32 div.rnd if available (default)"),
100 "Use IEEE Compliant F32 div.rnd if available, no FTZ")),
102
104 "nvptx-prec-sqrtf32", cl::Hidden,
105 cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
106 cl::init(true));
107
108/// Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it
109/// does NOT use lg2.approx for log2, so this is disabled by default.
111 "nvptx-approx-log2f32",
112 cl::desc("NVPTX Specific: whether to use lg2.approx for log2"),
113 cl::init(false));
114
116 "nvptx-force-min-byval-param-align", cl::Hidden,
117 cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
118 " params of device functions."),
119 cl::init(false));
120
121NVPTX::DivPrecisionLevel
122NVPTXTargetLowering::getDivF32Level(const MachineFunction &MF,
123 const SDNode &N) const {
124 // If nvptx-prec-divf32=N is used on the command-line, always honor it
125 if (UsePrecDivF32.getNumOccurrences() > 0)
126 return UsePrecDivF32;
127
128 const SDNodeFlags Flags = N.getFlags();
129 if (Flags.hasApproximateFuncs())
130 return NVPTX::DivPrecisionLevel::Approx;
131
132 return NVPTX::DivPrecisionLevel::IEEE754;
133}
134
135bool NVPTXTargetLowering::usePrecSqrtF32(const SDNode *N) const {
136 // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
137 if (UsePrecSqrtF32.getNumOccurrences() > 0)
138 return UsePrecSqrtF32;
139
140 if (N) {
141 const SDNodeFlags Flags = N->getFlags();
142 if (Flags.hasApproximateFuncs())
143 return false;
144 }
145
146 return true;
147}
148
149bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
150 return MF.getDenormalMode(APFloat::IEEEsingle()).Output ==
151 DenormalMode::PreserveSign;
152}
153
154static bool IsPTXVectorType(MVT VT) {
155 switch (VT.SimpleTy) {
156 default:
157 return false;
158 case MVT::v2i1:
159 case MVT::v4i1:
160 case MVT::v2i8:
161 case MVT::v4i8:
162 case MVT::v8i8: // <2 x i8x4>
163 case MVT::v16i8: // <4 x i8x4>
164 case MVT::v2i16:
165 case MVT::v4i16:
166 case MVT::v8i16: // <4 x i16x2>
167 case MVT::v2i32:
168 case MVT::v4i32:
169 case MVT::v2i64:
170 case MVT::v2f16:
171 case MVT::v4f16:
172 case MVT::v8f16: // <4 x f16x2>
173 case MVT::v2bf16:
174 case MVT::v4bf16:
175 case MVT::v8bf16: // <4 x bf16x2>
176 case MVT::v2f32:
177 case MVT::v4f32:
178 case MVT::v2f64:
179 case MVT::v4i64:
180 case MVT::v4f64:
181 case MVT::v8i32:
182 case MVT::v8f32:
183 case MVT::v16f16: // <8 x f16x2>
184 case MVT::v16bf16: // <8 x bf16x2>
185 case MVT::v16i16: // <8 x i16x2>
186 case MVT::v32i8: // <8 x i8x4>
187 return true;
188 }
189}
190
191// When legalizing vector loads/stores, this function is called, which does two
192// things:
193 // 1. Determines whether the vector is something we want to custom lower;
194// std::nullopt is returned if we do not want to custom lower it.
195// 2. If we do want to handle it, returns two parameters:
196// - unsigned int NumElts - The number of elements in the final vector
197// - EVT EltVT - The type of the elements in the final vector
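// Editorial example (not part of the original source): for a generic
// address-space load of v8f16, the elements are packed two per 32-bit
// register below, so this helper would return {4, MVT::v2f16}, i.e. the
// access is treated as four f16x2 words.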
198static std::optional<std::pair<unsigned int, MVT>>
199getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI,
200 unsigned AddressSpace) {
201 const bool CanLowerTo256Bit = STI.has256BitVectorLoadStore(AddressSpace);
202
203 if (CanLowerTo256Bit && VectorEVT.isScalarInteger() &&
204 VectorEVT.getSizeInBits() == 256)
205 return {{4, MVT::i64}};
206
207 if (!VectorEVT.isSimple())
208 return std::nullopt;
209 const MVT VectorVT = VectorEVT.getSimpleVT();
210
211 if (!VectorVT.isVector()) {
212 if (VectorVT == MVT::i128 || VectorVT == MVT::f128)
213 return {{2, MVT::i64}};
214 return std::nullopt;
215 }
216
217 const MVT EltVT = VectorVT.getVectorElementType();
218 const unsigned NumElts = VectorVT.getVectorNumElements();
219
220 // The size of the PTX virtual register that holds a packed type.
221 unsigned PackRegSize;
222
223 // We only handle "native" vector sizes for now, e.g. <4 x double> is not
224 // legal. We can (and should) split that into 2 stores of <2 x double> here
225 // but I'm leaving that as a TODO for now.
226 switch (VectorVT.SimpleTy) {
227 default:
228 return std::nullopt;
229
230 case MVT::v4i64:
231 case MVT::v4f64:
232 // This is a "native" vector type iff the address space is global and the
233 // target supports 256-bit loads/stores
234 if (!CanLowerTo256Bit)
235 return std::nullopt;
236 [[fallthrough]];
237 case MVT::v2i8:
238 case MVT::v2i64:
239 case MVT::v2f64:
240 // This is a "native" vector type
241 return std::pair(NumElts, EltVT);
242
243 case MVT::v16f16: // <8 x f16x2>
244 case MVT::v16bf16: // <8 x bf16x2>
245 case MVT::v16i16: // <8 x i16x2>
246 case MVT::v32i8: // <8 x i8x4>
247 // This can be upsized into a "native" vector type iff the address space is
248 // global and the target supports 256-bit loads/stores.
249 if (!CanLowerTo256Bit)
250 return std::nullopt;
251 [[fallthrough]];
252 case MVT::v2i16: // <1 x i16x2>
253 case MVT::v2f16: // <1 x f16x2>
254 case MVT::v2bf16: // <1 x bf16x2>
255 case MVT::v4i8: // <1 x i8x4>
256 case MVT::v4i16: // <2 x i16x2>
257 case MVT::v4f16: // <2 x f16x2>
258 case MVT::v4bf16: // <2 x bf16x2>
259 case MVT::v8i8: // <2 x i8x4>
260 case MVT::v8f16: // <4 x f16x2>
261 case MVT::v8bf16: // <4 x bf16x2>
262 case MVT::v8i16: // <4 x i16x2>
263 case MVT::v16i8: // <4 x i8x4>
264 PackRegSize = 32;
265 break;
266
267 case MVT::v8f32: // <4 x f32x2>
268 case MVT::v8i32: // <4 x i32x2>
269 // This is a "native" vector type iff the address space is global and the
270 // target supports 256-bit loads/stores
271 if (!CanLowerTo256Bit)
272 return std::nullopt;
273 [[fallthrough]];
274 case MVT::v2f32: // <1 x f32x2>
275 case MVT::v4f32: // <2 x f32x2>
276 case MVT::v2i32: // <1 x i32x2>
277 case MVT::v4i32: // <2 x i32x2>
278 if (!STI.hasF32x2Instructions())
279 return std::pair(NumElts, EltVT);
280 PackRegSize = 64;
281 break;
282 }
283
284 // If we reach here, then we can pack 2 or more elements into a single 32-bit
285 // or 64-bit PTX register and treat the vector as a new vector containing
286 // packed elements.
287
288 // Number of elements to pack in one word.
289 const unsigned NPerReg = PackRegSize / EltVT.getSizeInBits();
290
291 return std::pair(NumElts / NPerReg, MVT::getVectorVT(EltVT, NPerReg));
292}
293
294/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
295/// legal-ish MVTs that compose it. Unlike ComputeValueVTs, this will legalize
296/// the types as required by the calling convention (with special handling for
297/// i8s).
298/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
299/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
300/// LowerCall, and LowerReturn.
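/// Editorial example (not in the original source): for a struct { i32, i8 }
/// this produces ValueVTs = { i32, i8 } and Offsets = { 0, 4 }; the i8 piece
/// keeps its original width instead of the promoted i16 register type.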
301static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
302 LLVMContext &Ctx, CallingConv::ID CallConv,
303 Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
304 SmallVectorImpl<uint64_t> &Offsets,
305 uint64_t StartingOffset = 0) {
306 SmallVector<EVT, 16> TempVTs;
307 SmallVector<uint64_t, 16> TempOffsets;
308 ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
309
310 for (const auto [VT, Off] : zip(TempVTs, TempOffsets)) {
311 MVT RegisterVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
312 unsigned NumRegs = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
313
314 // Since we actually can load/store b8, we need to ensure that we'll use
315 // the original sized type for any i8s or i8 vectors.
316 if (VT.getScalarType() == MVT::i8) {
317 if (RegisterVT == MVT::i16)
318 RegisterVT = MVT::i8;
319 else if (RegisterVT == MVT::v2i16)
320 RegisterVT = MVT::v2i8;
321 else
322 assert(RegisterVT == MVT::v4i8 &&
323 "Expected v4i8, v2i16, or i16 for i8 RegisterVT");
324 }
325
326 // TODO: This is horribly incorrect for cases where the vector elements are
327 // not a multiple of bytes (e.g. i1) and legal, or i8. However, this problem
328 // has existed for as long as NVPTX has and no one has complained, so we'll
329 // leave it for now.
330 for (unsigned I : seq(NumRegs)) {
331 ValueVTs.push_back(RegisterVT);
332 Offsets.push_back(Off + I * RegisterVT.getStoreSize());
333 }
334 }
335}
336
337// We return an EVT that can hold N VTs
338// If the VT is a vector, the resulting EVT is a flat vector with the same
339// element type as VT's element type.
340static EVT getVectorizedVT(EVT VT, unsigned N, LLVMContext &C) {
341 if (N == 1)
342 return VT;
343
344 return VT.isVector() ? EVT::getVectorVT(C, VT.getScalarType(),
345 VT.getVectorNumElements() * N)
346 : EVT::getVectorVT(C, VT, N);
347}
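// Editorial example (not in the original source): getVectorizedVT(MVT::f32, 4,
// Ctx) yields v4f32, while getVectorizedVT(MVT::v2f16, 2, Ctx) flattens to
// v4f16 rather than producing a vector of vectors.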
348
349static SDValue getExtractVectorizedValue(SDValue V, unsigned I, EVT VT,
350 const SDLoc &dl, SelectionDAG &DAG) {
351 if (V.getValueType() == VT) {
352 assert(I == 0 && "Index must be 0 for scalar value");
353 return V;
354 }
355
356 if (!VT.isVector())
357 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, V,
358 DAG.getVectorIdxConstant(I, dl));
359
360 return DAG.getNode(
361 ISD::EXTRACT_SUBVECTOR, dl, VT, V,
362 DAG.getVectorIdxConstant(I * VT.getVectorNumElements(), dl));
363}
364
365template <typename T>
366static inline SDValue getBuildVectorizedValue(unsigned N, const SDLoc &dl,
367 SelectionDAG &DAG, T GetElement) {
368 if (N == 1)
369 return GetElement(0);
370
372 for (const unsigned I : llvm::seq(N)) {
373 SDValue Val = GetElement(I);
374 if (Val.getValueType().isVector())
375 DAG.ExtractVectorElements(Val, Values);
376 else
377 Values.push_back(Val);
378 }
379
380 EVT VT = EVT::getVectorVT(*DAG.getContext(), Values[0].getValueType(),
381 Values.size());
382 return DAG.getBuildVector(VT, dl, Values);
383}
384
385/// PromoteScalarIntegerPTX
386/// Used to make sure the arguments/returns are suitable for passing
387/// and promote them to a larger size if they're not.
388///
389/// Returns the promoted type, or the original type if no promotion is needed.
390static EVT promoteScalarIntegerPTX(const EVT VT) {
391 if (VT.isScalarInteger()) {
392 switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
393 default:
394 llvm_unreachable(
395 "Promotion is not suitable for scalars of size larger than 64-bits");
396 case 1:
397 return MVT::i1;
398 case 2:
399 case 4:
400 case 8:
401 return MVT::i8;
402 case 16:
403 return MVT::i16;
404 case 32:
405 return MVT::i32;
406 case 64:
407 return MVT::i64;
408 }
409 }
410 return VT;
411}
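// Editorial example (not in the original source): an i24 argument is promoted
// to MVT::i32 (PowerOf2Ceil(24) == 32), while i1, i8, i16, i32 and i64 are
// already suitable and map to themselves.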
412
413// Check whether we can merge loads/stores of some of the pieces of a
414// flattened function parameter or return value into a single vector
415// load/store.
416//
417// The flattened parameter is represented as a list of EVTs and
418// offsets, and the whole structure is aligned to ParamAlignment. This
419// function determines whether we can load/store pieces of the
420// parameter starting at index Idx using a single vectorized op of
421// size AccessSize. If so, it returns the number of param pieces
422// covered by the vector op. Otherwise, it returns 1.
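// Editorial example (not in the original source): four f32 pieces at offsets
// 0, 4, 8 and 12 of a 16-byte-aligned parameter can be covered by one 16-byte
// access starting at Idx 0, so the function returns 4 in that case.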
423template <typename T>
424static unsigned canMergeParamLoadStoresStartingAt(
425 unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
426 const SmallVectorImpl<T> &Offsets, Align ParamAlignment) {
427
428 // Can't vectorize if param alignment is not sufficient.
429 if (ParamAlignment < AccessSize)
430 return 1;
431 // Can't vectorize if offset is not aligned.
432 if (Offsets[Idx] & (AccessSize - 1))
433 return 1;
434
435 EVT EltVT = ValueVTs[Idx];
436 unsigned EltSize = EltVT.getStoreSize();
437
438 // Element is too large to vectorize.
439 if (EltSize >= AccessSize)
440 return 1;
441
442 unsigned NumElts = AccessSize / EltSize;
443 // Can't vectorize if AccessSize is not a multiple of EltSize.
444 if (AccessSize != EltSize * NumElts)
445 return 1;
446
447 // We don't have enough elements to vectorize.
448 if (Idx + NumElts > ValueVTs.size())
449 return 1;
450
451 // PTX ISA can only deal with 2- and 4-element vector ops.
452 if (NumElts != 4 && NumElts != 2)
453 return 1;
454
455 for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
456 // Types do not match.
457 if (ValueVTs[j] != EltVT)
458 return 1;
459
460 // Elements are not contiguous.
461 if (Offsets[j] - Offsets[j - 1] != EltSize)
462 return 1;
463 }
464 // OK. We can vectorize ValueVTs[Idx..Idx+NumElts).
465 return NumElts;
466}
467
468// Computes whether and how we can vectorize the loads/stores of a
469// flattened function parameter or return value.
470//
471// The flattened parameter is represented as the list of ValueVTs and
472// Offsets, and is aligned to ParamAlignment bytes. We return a vector
473// of the same size as ValueVTs indicating how each piece should be
474// loaded/stored (i.e. as a scalar, or as part of a vector
475// load/store).
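// Editorial example (not in the original source): six f32 pieces at offsets
// 0..20 with 16-byte alignment vectorize as {4, 2} -- one 128-bit access
// followed by one 64-bit access -- and the entries sum to ValueVTs.size().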
476template <typename T>
477static SmallVector<unsigned, 16>
478VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
479 const SmallVectorImpl<T> &Offsets, Align ParamAlignment,
480 bool IsVAArg = false) {
481 // Set vector size to match ValueVTs and mark all elements as
482 // scalars by default.
483
484 if (IsVAArg)
485 return SmallVector<unsigned>(ValueVTs.size(), 1);
486
487 SmallVector<unsigned, 16> VectorInfo;
488
489 const auto GetNumElts = [&](unsigned I) -> unsigned {
490 for (const unsigned AccessSize : {16, 8, 4, 2}) {
491 const unsigned NumElts = canMergeParamLoadStoresStartingAt(
492 I, AccessSize, ValueVTs, Offsets, ParamAlignment);
493 assert((NumElts == 1 || NumElts == 2 || NumElts == 4) &&
494 "Unexpected vectorization size");
495 if (NumElts != 1)
496 return NumElts;
497 }
498 return 1;
499 };
500
501 // Check what we can vectorize using 128/64/32-bit accesses.
502 for (unsigned I = 0, E = ValueVTs.size(); I != E;) {
503 const unsigned NumElts = GetNumElts(I);
504 VectorInfo.push_back(NumElts);
505 I += NumElts;
506 }
507 assert(std::accumulate(VectorInfo.begin(), VectorInfo.end(), 0u) ==
508 ValueVTs.size());
509 return VectorInfo;
510}
511
512// NVPTXTargetLowering Constructor.
513NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
514 const NVPTXSubtarget &STI)
515 : TargetLowering(TM), nvTM(&TM), STI(STI), GlobalUniqueCallSite(0) {
516 // Always lower memset, memcpy, and memmove intrinsics to load/store
517 // instructions, rather than generating calls to memset, memcpy, or
518 // memmove.
519 MaxStoresPerMemset = MaxStoresPerMemsetOptSize = (unsigned)0xFFFFFFFF;
520 MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = (unsigned)0xFFFFFFFF;
521 MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = (unsigned)0xFFFFFFFF;
522
525
526 // Jump is Expensive. Don't create extra control flow for 'and', 'or'
527 // condition branches.
528 setJumpIsExpensive(true);
529
530 // Wide divides are _very_ slow. Try to reduce the width of the divide if
531 // possible.
532 addBypassSlowDiv(64, 32);
533
534 // By default, use the Source scheduling
535 if (sched4reg)
537 else
539
540 auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
541 LegalizeAction NoF16Action) {
542 bool IsOpSupported = STI.allowFP16Math();
543 switch (Op) {
544 // Several FP16 instructions are available on sm_80 only.
545 case ISD::FMINNUM:
546 case ISD::FMAXNUM:
547 case ISD::FMAXNUM_IEEE:
548 case ISD::FMINNUM_IEEE:
549 case ISD::FMAXIMUM:
550 case ISD::FMINIMUM:
551 case ISD::FMAXIMUMNUM:
552 case ISD::FMINIMUMNUM:
553 IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
554 break;
555 case ISD::FEXP2:
556 IsOpSupported &= STI.getSmVersion() >= 75 && STI.getPTXVersion() >= 70;
557 break;
558 }
559 setOperationAction(Op, VT, IsOpSupported ? Action : NoF16Action);
560 };
561
562 auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
563 LegalizeAction NoBF16Action) {
564 bool IsOpSupported = STI.hasNativeBF16Support(Op);
566 Op, VT, IsOpSupported ? Action : NoBF16Action);
567 };
568
569 auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
570 LegalizeAction NoI16x2Action) {
571 bool IsOpSupported = false;
572 // These i16x2 instructions are available on sm_90 only.
573 switch (Op) {
574 case ISD::ADD:
575 case ISD::SMAX:
576 case ISD::SMIN:
577 case ISD::UMIN:
578 case ISD::UMAX:
579 IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;
580 break;
581 }
582 setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action);
583 };
584
585 addRegisterClass(MVT::i1, &NVPTX::B1RegClass);
586 addRegisterClass(MVT::i16, &NVPTX::B16RegClass);
587 addRegisterClass(MVT::v2i16, &NVPTX::B32RegClass);
588 addRegisterClass(MVT::v4i8, &NVPTX::B32RegClass);
589 addRegisterClass(MVT::i32, &NVPTX::B32RegClass);
590 addRegisterClass(MVT::i64, &NVPTX::B64RegClass);
591 addRegisterClass(MVT::f32, &NVPTX::B32RegClass);
592 addRegisterClass(MVT::f64, &NVPTX::B64RegClass);
593 addRegisterClass(MVT::f16, &NVPTX::B16RegClass);
594 addRegisterClass(MVT::v2f16, &NVPTX::B32RegClass);
595 addRegisterClass(MVT::bf16, &NVPTX::B16RegClass);
596 addRegisterClass(MVT::v2bf16, &NVPTX::B32RegClass);
597
598 if (STI.hasF32x2Instructions()) {
599 addRegisterClass(MVT::v2f32, &NVPTX::B64RegClass);
600 addRegisterClass(MVT::v2i32, &NVPTX::B64RegClass);
601 }
602
603 // Conversion to/from FP16/FP16x2 is always legal.
608
609 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
610 if (STI.getSmVersion() >= 30 && STI.getPTXVersion() > 31)
611 setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
612
613 setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
614 setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
615
616 // Conversion to/from BF16/BF16x2 is always legal.
621
622 setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);
623 setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);
624 if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote)
625 AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32);
626
627 // Conversion to/from i16/i16x2 is always legal.
632
637
638 // No support for these operations with v2f32/v2i32
639 setOperationAction(ISD::INSERT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32}, Expand);
640 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2f32, MVT::v2i32}, Expand);
641
644 MVT::v2i32, Expand);
645
646 // Need custom lowering in case the index is dynamic.
647 if (STI.hasF32x2Instructions())
648 setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32},
649 Custom);
650
651 // Custom conversions to/from v2i8.
652 setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);
653
654 // Only logical ops can be done on v4i8 directly, others must be done
655 // elementwise.
672 MVT::v4i8, Expand);
673
674 // Operations not directly supported by NVPTX.
675 for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
676 MVT::v2f32, MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16,
677 MVT::v4i8, MVT::i32, MVT::v2i32, MVT::i64}) {
679 setOperationAction(ISD::BR_CC, VT, Expand);
680 }
681
682 // We don't want ops like FMINIMUM or UMAX to be lowered to SETCC+VSELECT.
683 setOperationAction(ISD::VSELECT, {MVT::v2f32, MVT::v2i32}, Expand);
684
685 // Some SIGN_EXTEND_INREG can be done using cvt instruction.
686 // For others we will expand to a SHL/SRA pair.
693
700
703
705 {MVT::i8, MVT::i16, MVT::v2i16, MVT::i32, MVT::i64},
706 Expand);
707
708 if (STI.hasHWROT32()) {
711 Custom);
712 }
713
715
716 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
717 setOperationAction(ISD::BRIND, MVT::Other, Expand);
718
719 // We want to legalize constant-related memmove and memcpy
720 // intrinsics.
722
723 // FP extload/truncstore is not legal in PTX. We need to expand all these.
724 for (auto FloatVTs :
726 for (MVT ValVT : FloatVTs) {
727 for (MVT MemVT : FloatVTs) {
728 setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Expand);
729 setTruncStoreAction(ValVT, MemVT, Expand);
730 }
731 }
732 }
733
734 // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
735 // how they'll be lowered in ISel anyway, and by doing this a little earlier
736 // we allow for more DAG combine opportunities.
737 for (auto IntVTs :
739 for (MVT ValVT : IntVTs)
740 for (MVT MemVT : IntVTs)
741 if (isTypeLegal(ValVT))
742 setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Custom);
743
744 // PTX does not support load / store predicate registers
745 setOperationAction({ISD::LOAD, ISD::STORE}, MVT::i1, Custom);
746 for (MVT VT : MVT::integer_valuetypes()) {
748 Promote);
749 setTruncStoreAction(VT, MVT::i1, Expand);
750 }
751
752 // Disable generation of extload/truncstore for v2i16/v2i8. The generic
753 // expansion for these nodes when they are unaligned is incorrect if the
754 // type is a vector.
755 //
756 // TODO: Fix the generic expansion for these nodes found in
757 // TargetLowering::expandUnalignedLoad/Store.
759 MVT::v2i8, Expand);
760 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
761
762 // Register custom handling for illegal type loads/stores. We'll try to custom
763 // lower almost all illegal types and logic in the lowering will discard cases
764 // we can't handle.
765 setOperationAction({ISD::LOAD, ISD::STORE}, {MVT::i128, MVT::f128}, Custom);
767 if (!isTypeLegal(VT) && VT.getStoreSizeInBits() <= 256)
768 setOperationAction({ISD::STORE, ISD::LOAD}, VT, Custom);
769
770 // Custom legalization for LDU intrinsics.
771 // TODO: The logic to lower these is not very robust and we should rewrite it.
772 // Perhaps LDU should not be represented as an intrinsic at all.
775 if (IsPTXVectorType(VT))
777
781 MVT::i1, Expand);
782
783 // This is legal in NVPTX
788
789 setOperationAction(ISD::DYNAMIC_STACKALLOC, {MVT::i32, MVT::i64}, Custom);
790 setOperationAction({ISD::STACKRESTORE, ISD::STACKSAVE}, MVT::Other, Custom);
791
792 // TRAP can be lowered to PTX trap
793 setOperationAction(ISD::TRAP, MVT::Other, Legal);
794 // DEBUGTRAP can be lowered to PTX brkpt
795 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
796
797 // Support varargs.
798 setOperationAction(ISD::VASTART, MVT::Other, Custom);
799 setOperationAction(ISD::VAARG, MVT::Other, Custom);
800 setOperationAction(ISD::VACOPY, MVT::Other, Expand);
801 setOperationAction(ISD::VAEND, MVT::Other, Expand);
802
804 {MVT::i16, MVT::i32, MVT::i64}, Legal);
805
807 Promote);
810
811 setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
812 setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
813 setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);
814 setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);
815 setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);
816 setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);
817 setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);
818
819 setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);
820 setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);
821 setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);
822 setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);
823 setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);
824 setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);
825
826 // Other arithmetic and logic ops are unsupported.
830 {MVT::v2i16, MVT::v2i32}, Expand);
831
832 // v2i32 is not supported for any arithmetic operations
837 MVT::v2i32, Expand);
838
843 if (STI.getPTXVersion() >= 43) {
848 }
849
851 setOperationAction(ISD::CTTZ, {MVT::v2i16, MVT::v2i32}, Expand);
854
855 // PTX does not directly support SELP of i1, so promote to i32 first
857
858 // PTX cannot multiply two i64s in a single instruction.
861
862 // We have some custom DAG combine patterns for these nodes
865 ISD::FADD, ISD::FMAXNUM, ISD::FMINNUM,
866 ISD::FMAXIMUM, ISD::FMINIMUM, ISD::FMAXIMUMNUM,
867 ISD::FMINIMUMNUM, ISD::MUL, ISD::SHL,
869 ISD::BUILD_VECTOR, ISD::ADDRSPACECAST, ISD::LOAD,
870 ISD::STORE, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND});
871
872 // setcc for f16x2 and bf16x2 needs special handling to prevent
873 // legalizer's attempt to scalarize it due to v2i1 not being legal.
874 if (STI.allowFP16Math() || STI.hasBF16Math())
876
877 // Vector reduction operations. These may be turned into shuffle or tree
878 // reductions depending on what instructions are available for each type.
880 MVT EltVT = VT.getVectorElementType();
881 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
882 setOperationAction({ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMIN,
883 ISD::VECREDUCE_FMAXIMUM, ISD::VECREDUCE_FMINIMUM},
884 VT, Custom);
885 }
886 }
887
888 // Promote fp16 arithmetic if fp16 hardware isn't available or the
889 // user passed --nvptx-no-fp16-math. The flag is useful because,
890 // although sm_53+ GPUs have some sort of FP16 support in
891 // hardware, only sm_53 and sm_60 have full implementation. Others
892 // only have a token amount of hardware and are likely to run faster
893 // by using fp32 units instead.
894 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
895 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
896 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
897 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
898 // bf16 must be promoted to f32.
899 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
900 if (getOperationAction(Op, MVT::bf16) == Promote)
901 AddPromotedToType(Op, MVT::bf16, MVT::f32);
902 setOperationAction(Op, MVT::v2f32,
903 STI.hasF32x2Instructions() ? Legal : Expand);
904 }
905
906 // On SM80, we select add/mul/sub as fma to avoid promotion to float
907 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB}) {
908 for (const auto &VT : {MVT::bf16, MVT::v2bf16}) {
909 if (!STI.hasNativeBF16Support(Op) && STI.hasNativeBF16Support(ISD::FMA)) {
911 }
912 }
913 }
914
915 // f16/f16x2 neg was introduced in PTX 60, SM_53.
916 const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
917 STI.getPTXVersion() >= 60 &&
918 STI.allowFP16Math();
919 for (const auto &VT : {MVT::f16, MVT::v2f16})
920 setOperationAction(ISD::FNEG, VT,
921 IsFP16FP16x2NegAvailable ? Legal : Expand);
922
923 setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);
924 setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);
925 setOperationAction(ISD::FNEG, MVT::v2f32, Expand);
926 // (would be) Library functions.
927
928 // These map to conversion instructions for scalar FP types.
929 for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
930 ISD::FROUNDEVEN, ISD::FTRUNC}) {
931 setOperationAction(Op, MVT::f16, Legal);
932 setOperationAction(Op, MVT::f32, Legal);
933 setOperationAction(Op, MVT::f64, Legal);
934 setOperationAction(Op, MVT::v2f16, Expand);
935 setOperationAction(Op, MVT::v2bf16, Expand);
936 setOperationAction(Op, MVT::v2f32, Expand);
937 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
938 if (getOperationAction(Op, MVT::bf16) == Promote)
939 AddPromotedToType(Op, MVT::bf16, MVT::f32);
940 }
941
942 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71) {
943 setOperationAction(ISD::BF16_TO_FP, MVT::f32, Expand);
944 }
945 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
946 for (MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) {
947 setOperationAction(ISD::FP_EXTEND, VT, Custom);
949 }
950 }
951
952 // Expand v2f32 = fp_extend
953 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
954 // Expand v2[b]f16 = fp_round v2f32
955 setOperationAction(ISD::FP_ROUND, {MVT::v2bf16, MVT::v2f16}, Expand);
956
957 // sm_80 only has conversions between f32 and bf16. Custom lower all other
958 // bf16 conversions.
959 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
960 for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {
963 VT, Custom);
964 }
967 MVT::bf16, Custom);
968 }
969
970 setOperationAction(ISD::FROUND, MVT::f16, Promote);
971 setOperationAction(ISD::FROUND, MVT::v2f16, Expand);
972 setOperationAction(ISD::FROUND, MVT::v2bf16, Expand);
973 setOperationAction(ISD::FROUND, MVT::f32, Custom);
974 setOperationAction(ISD::FROUND, MVT::f64, Custom);
975 setOperationAction(ISD::FROUND, MVT::bf16, Promote);
976 AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32);
977
978 // 'Expand' implements FCOPYSIGN without calling an external library.
985
986 // These map to corresponding instructions for f32/f64. f16 must be
987 // promoted to f32. v2f16 is expanded to f16, which is then promoted
988 // to f32.
989 for (const auto &Op :
990 {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FTANH}) {
991 setOperationAction(Op, MVT::f16, Promote);
992 setOperationAction(Op, MVT::f32, Legal);
993 // only div/rem/sqrt are legal for f64
994 if (Op == ISD::FDIV || Op == ISD::FREM || Op == ISD::FSQRT) {
995 setOperationAction(Op, MVT::f64, Legal);
996 }
997 setOperationAction(Op, {MVT::v2f16, MVT::v2bf16, MVT::v2f32}, Expand);
998 setOperationAction(Op, MVT::bf16, Promote);
999 AddPromotedToType(Op, MVT::bf16, MVT::f32);
1000 }
1001 setOperationAction(ISD::FREM, {MVT::f32, MVT::f64}, Custom);
1002
1003 setOperationAction(ISD::FABS, {MVT::f32, MVT::f64}, Legal);
1004 setOperationAction(ISD::FABS, MVT::v2f32, Expand);
1005 if (STI.getPTXVersion() >= 65) {
1006 setFP16OperationAction(ISD::FABS, MVT::f16, Legal, Promote);
1007 setFP16OperationAction(ISD::FABS, MVT::v2f16, Legal, Expand);
1008 } else {
1009 setOperationAction(ISD::FABS, MVT::f16, Promote);
1010 setOperationAction(ISD::FABS, MVT::v2f16, Expand);
1011 }
1012 setBF16OperationAction(ISD::FABS, MVT::v2bf16, Legal, Expand);
1013 setBF16OperationAction(ISD::FABS, MVT::bf16, Legal, Promote);
1014 if (getOperationAction(ISD::FABS, MVT::bf16) == Promote)
1015 AddPromotedToType(ISD::FABS, MVT::bf16, MVT::f32);
1016
1017 for (const auto &Op :
1018 {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM}) {
1019 setOperationAction(Op, MVT::f32, Legal);
1020 setOperationAction(Op, MVT::f64, Legal);
1021 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
1022 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
1023 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
1024 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
1025 if (getOperationAction(Op, MVT::bf16) == Promote)
1026 AddPromotedToType(Op, MVT::bf16, MVT::f32);
1027 setOperationAction(Op, MVT::v2f32, Expand);
1028 }
1029 bool SupportsF32MinMaxNaN =
1030 STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
1031 for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
1032 setOperationAction(Op, MVT::f32, SupportsF32MinMaxNaN ? Legal : Expand);
1033 setFP16OperationAction(Op, MVT::f16, Legal, Expand);
1034 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
1035 setBF16OperationAction(Op, MVT::bf16, Legal, Expand);
1036 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
1037 setOperationAction(Op, MVT::v2f32, Expand);
1038 }
1039
1040 // Custom lowering for inline asm with 128-bit operands
1043
1044 // FEXP2 support:
1045 // - f32
1046 // - f16/f16x2 (sm_70+, PTX 7.0+)
1047 // - bf16/bf16x2 (sm_90+, PTX 7.8+)
1048 // When f16/bf16 types aren't supported, they are promoted/expanded to f32.
1049 setOperationAction(ISD::FEXP2, MVT::f32, Legal);
1050 setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
1051 setFP16OperationAction(ISD::FEXP2, MVT::f16, Legal, Promote);
1052 setFP16OperationAction(ISD::FEXP2, MVT::v2f16, Legal, Expand);
1053 setBF16OperationAction(ISD::FEXP2, MVT::bf16, Legal, Promote);
1054 setBF16OperationAction(ISD::FEXP2, MVT::v2bf16, Legal, Expand);
1055
1056 // FLOG2 supports f32 only
1057 // f16/bf16 types aren't supported, but they are promoted/expanded to f32.
1058 if (UseApproxLog2F32) {
1059 setOperationAction(ISD::FLOG2, MVT::f32, Legal);
1060 setOperationPromotedToType(ISD::FLOG2, MVT::f16, MVT::f32);
1061 setOperationPromotedToType(ISD::FLOG2, MVT::bf16, MVT::f32);
1062 setOperationAction(ISD::FLOG2, {MVT::v2f16, MVT::v2bf16, MVT::v2f32},
1063 Expand);
1064 }
1065
1066 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
1067
1068 setOperationAction(ISD::ATOMIC_LOAD_SUB, {MVT::i32, MVT::i64}, Expand);
1069
1070 // atom.b128 is legal in PTX but since we don't represent i128 as a legal
1071 // type, we need to custom lower it.
1072 setOperationAction({ISD::ATOMIC_CMP_SWAP, ISD::ATOMIC_SWAP}, MVT::i128,
1073 Custom);
1074
1075 // Now deduce the information based on the above mentioned
1076 // actions
1077 computeRegisterProperties(STI.getRegisterInfo());
1078
1079 // PTX support for 16-bit CAS is emulated; only use cmpxchg of 32 bits and wider.
1080 setMinCmpXchgSizeInBits(STI.getMinCmpXchgSizeInBits());
1081 setMaxAtomicSizeInBitsSupported(STI.hasAtomSwap128() ? 128 : 64);
1083
1084 // Custom lowering for tcgen05.ld vector operands
1086 {MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32,
1087 MVT::v32i32, MVT::v64i32, MVT::v128i32},
1088 Custom);
1089
1090 // Custom lowering for tcgen05.st vector operands
1092 {MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32,
1093 MVT::v32i32, MVT::v64i32, MVT::v128i32, MVT::Other},
1094 Custom);
1095
1096 // Enable custom lowering for the following:
1097 // * MVT::i128 - clusterlaunchcontrol
1098 // * MVT::i32 - prmt
1099 // * MVT::Other - internal.addrspace.wrap
1100 setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::i32, MVT::i128, MVT::Other},
1101 Custom);
1102}
1103
1104const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
1105
1106#define MAKE_CASE(V) \
1107 case V: \
1108 return #V;
1109
1110 switch ((NVPTXISD::NodeType)Opcode) {
1112 break;
1113
1166 MAKE_CASE(
1168 MAKE_CASE(
1180 MAKE_CASE(
1182 MAKE_CASE(
1184 }
1185 return nullptr;
1186
1187#undef MAKE_CASE
1188}
1189
1190TargetLoweringBase::LegalizeTypeAction
1191NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
1192 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1193 VT.getScalarType() == MVT::i1)
1194 return TypeSplitVector;
1195 return TargetLoweringBase::getPreferredVectorAction(VT);
1196}
1197
1198SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
1199 int Enabled, int &ExtraSteps,
1200 bool &UseOneConst,
1201 bool Reciprocal) const {
1204 return SDValue();
1205
1206 if (ExtraSteps == ReciprocalEstimate::Unspecified)
1207 ExtraSteps = 0;
1208
1209 SDLoc DL(Operand);
1210 EVT VT = Operand.getValueType();
1211 bool Ftz = useF32FTZ(DAG.getMachineFunction());
1212
1213 auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1214 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1215 DAG.getConstant(IID, DL, MVT::i32), Operand);
1216 };
1217
1218 // The sqrt and rsqrt refinement processes assume we always start out with an
1219 // approximation of the rsqrt. Therefore, if we're going to do any refinement
1220 // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
1221 // any refinement, we must return a regular sqrt.
1222 if (Reciprocal || ExtraSteps > 0) {
1223 if (VT == MVT::f32)
1224 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1225 : Intrinsic::nvvm_rsqrt_approx_f);
1226 else if (VT == MVT::f64)
1227 return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1228 else
1229 return SDValue();
1230 } else {
1231 if (VT == MVT::f32)
1232 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1233 : Intrinsic::nvvm_sqrt_approx_f);
1234 else {
1235 // There's no sqrt.approx.f64 instruction, so we emit
1236 // reciprocal(rsqrt(x)). This is faster than
1237 // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
1238 // x * rsqrt(x).)
1239 return DAG.getNode(
1241 DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1242 MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1243 }
1244 }
1245}
1246
1247std::string NVPTXTargetLowering::getPrototype(
1248 const DataLayout &DL, Type *RetTy, const ArgListTy &Args,
1249 const SmallVectorImpl<ISD::OutputArg> &Outs,
1250 std::optional<unsigned> FirstVAArg, const CallBase &CB,
1251 unsigned UniqueCallSite) const {
1252 auto PtrVT = getPointerTy(DL);
1253
1254 std::string Prototype;
1255 raw_string_ostream O(Prototype);
1256 O << "prototype_" << UniqueCallSite << " : .callprototype ";
1257
1258 if (RetTy->isVoidTy()) {
1259 O << "()";
1260 } else {
1261 O << "(";
1262 if (shouldPassAsArray(RetTy)) {
1263 const Align RetAlign = getArgumentAlignment(&CB, RetTy, 0, DL);
1264 O << ".param .align " << RetAlign.value() << " .b8 _["
1265 << DL.getTypeAllocSize(RetTy) << "]";
1266 } else if (RetTy->isFloatingPointTy() || RetTy->isIntegerTy()) {
1267 unsigned size = 0;
1268 if (auto *ITy = dyn_cast<IntegerType>(RetTy)) {
1269 size = ITy->getBitWidth();
1270 } else {
1271 assert(RetTy->isFloatingPointTy() &&
1272 "Floating point type expected here");
1273 size = RetTy->getPrimitiveSizeInBits();
1274 }
1275 // PTX ABI requires all scalar return values to be at least 32
1276 // bits in size. fp16 normally uses .b16 as its storage type in
1277 // PTX, so its size must be adjusted here, too.
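 // Editorial example (not in the original source): an i16 or half return
 // value is therefore declared as ".param .b32 _" in the prototype string.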
1278 size = promoteScalarArgumentSize(size);
1279
1280 O << ".param .b" << size << " _";
1281 } else if (isa<PointerType>(RetTy)) {
1282 O << ".param .b" << PtrVT.getSizeInBits() << " _";
1283 } else {
1284 llvm_unreachable("Unknown return type");
1285 }
1286 O << ") ";
1287 }
1288 O << "_ (";
1289
1290 bool first = true;
1291
1292 const unsigned NumArgs = FirstVAArg.value_or(Args.size());
1293 auto AllOuts = ArrayRef(Outs);
1294 for (const unsigned I : llvm::seq(NumArgs)) {
1295 const auto ArgOuts =
1296 AllOuts.take_while([I](auto O) { return O.OrigArgIndex == I; });
1297 AllOuts = AllOuts.drop_front(ArgOuts.size());
1298
1299 Type *Ty = Args[I].Ty;
1300 if (!first) {
1301 O << ", ";
1302 }
1303 first = false;
1304
1305 if (ArgOuts[0].Flags.isByVal()) {
1306 // Indirect calls need strict ABI alignment so we disable optimizations by
1307 // not providing a function to optimize.
1308 Type *ETy = Args[I].IndirectType;
1309 Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1310 Align ParamByValAlign =
1311 getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL);
1312
1313 O << ".param .align " << ParamByValAlign.value() << " .b8 _["
1314 << ArgOuts[0].Flags.getByValSize() << "]";
1315 } else {
1316 if (shouldPassAsArray(Ty)) {
1317 Align ParamAlign =
1318 getArgumentAlignment(&CB, Ty, I + AttributeList::FirstArgIndex, DL);
1319 O << ".param .align " << ParamAlign.value() << " .b8 _["
1320 << DL.getTypeAllocSize(Ty) << "]";
1321 continue;
1322 }
1323 // i8 types in IR will be i16 types in SDAG
1324 assert((getValueType(DL, Ty) == ArgOuts[0].VT ||
1325 (getValueType(DL, Ty) == MVT::i8 && ArgOuts[0].VT == MVT::i16)) &&
1326 "type mismatch between callee prototype and arguments");
1327 // scalar type
1328 unsigned sz = 0;
1329 if (auto *ITy = dyn_cast<IntegerType>(Ty)) {
1330 sz = promoteScalarArgumentSize(ITy->getBitWidth());
1331 } else if (isa<PointerType>(Ty)) {
1332 sz = PtrVT.getSizeInBits();
1333 } else {
1334 sz = Ty->getPrimitiveSizeInBits();
1335 }
1336 O << ".param .b" << sz << " _";
1337 }
1338 }
1339
1340 if (FirstVAArg)
1341 O << (first ? "" : ",") << " .param .align "
1342 << STI.getMaxRequiredAlignment() << " .b8 _[]";
1343 O << ")";
1344 if (shouldEmitPTXNoReturn(&CB, *nvTM))
1345 O << " .noreturn";
1346 O << ";";
1347
1348 return Prototype;
1349}
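// Editorial example (not in the original source): for an indirect call that
// returns i32 and takes a single i32, the string built above looks like
//   prototype_1 : .callprototype (.param .b32 _) _ (.param .b32 _);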
1350
1351Align NVPTXTargetLowering::getFunctionArgumentAlignment(
1352 const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const {
1353 return getAlign(*F, Idx).value_or(getFunctionParamOptimizedAlign(F, Ty, DL));
1354}
1355
1356Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
1357 unsigned Idx,
1358 const DataLayout &DL) const {
1359 if (!CB) {
1360 // CallSite is null; fall back to ABI type alignment
1361 return DL.getABITypeAlign(Ty);
1362 }
1363
1364 const Function *DirectCallee = CB->getCalledFunction();
1365
1366 if (!DirectCallee) {
1367 // We don't have a direct function symbol, but that may be because of
1368 // constant cast instructions in the call.
1369
1370 // With bitcast'd call targets, the instruction will be the call
1371 if (const auto *CI = dyn_cast<CallInst>(CB)) {
1372 // Check if we have call alignment metadata
1373 if (MaybeAlign StackAlign = getAlign(*CI, Idx))
1374 return StackAlign.value();
1375 }
1376 DirectCallee = getMaybeBitcastedCallee(CB);
1377 }
1378
1379 // Check for function alignment information if we found that the
1380 // ultimate target is a Function
1381 if (DirectCallee)
1382 return getFunctionArgumentAlignment(DirectCallee, Ty, Idx, DL);
1383
1384 // Call is indirect, fall back to the ABI type alignment
1385 return DL.getABITypeAlign(Ty);
1386}
1387
1388static bool shouldConvertToIndirectCall(const CallBase *CB,
1389 const GlobalAddressSDNode *Func) {
1390 if (!Func)
1391 return false;
1392 if (auto *CalleeFunc = dyn_cast<Function>(Func->getGlobal()))
1393 return CB->getFunctionType() != CalleeFunc->getFunctionType();
1394 return false;
1395}
1396
1397static MachinePointerInfo refinePtrAS(SDValue &Ptr, SelectionDAG &DAG,
1398 const DataLayout &DL,
1399 const TargetLowering &TL) {
1400 if (Ptr->getOpcode() == ISD::FrameIndex) {
1401 auto Ty = TL.getPointerTy(DL, ADDRESS_SPACE_LOCAL);
1402 Ptr = DAG.getAddrSpaceCast(SDLoc(), Ty, Ptr, ADDRESS_SPACE_GENERIC,
1403 ADDRESS_SPACE_LOCAL);
1404
1405 return MachinePointerInfo(ADDRESS_SPACE_LOCAL);
1406 }
1407
1408 // Peel off an addrspacecast to generic and load directly from the specific
1409 // address space.
1410 if (Ptr->getOpcode() == ISD::ADDRSPACECAST) {
1411 const auto *ASC = cast<AddrSpaceCastSDNode>(Ptr);
1412 if (ASC->getDestAddressSpace() == ADDRESS_SPACE_GENERIC) {
1413 Ptr = ASC->getOperand(0);
1414 return MachinePointerInfo(ASC->getSrcAddressSpace());
1415 }
1416 }
1417
1418 return MachinePointerInfo();
1419}
1420
1421static ISD::NodeType getExtOpcode(const ISD::ArgFlagsTy &Flags) {
1422 if (Flags.isSExt())
1423 return ISD::SIGN_EXTEND;
1424 if (Flags.isZExt())
1425 return ISD::ZERO_EXTEND;
1426 return ISD::ANY_EXTEND;
1427}
1428
1429static SDValue correctParamType(SDValue V, EVT ExpectedVT,
1430 ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
1431 SDLoc dl) {
1432 const EVT ActualVT = V.getValueType();
1433 assert((ActualVT == ExpectedVT ||
1434 (ExpectedVT.isInteger() && ActualVT.isInteger())) &&
1435 "Non-integer argument type size mismatch");
1436 if (ExpectedVT.bitsGT(ActualVT))
1437 return DAG.getNode(getExtOpcode(Flags), dl, ExpectedVT, V);
1438 if (ExpectedVT.bitsLT(ActualVT))
1439 return DAG.getNode(ISD::TRUNCATE, dl, ExpectedVT, V);
1440
1441 return V;
1442}
1443
1444SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1445 SmallVectorImpl<SDValue> &InVals) const {
1446
1447 if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30))
1448 report_fatal_error(
1449 "Support for variadic functions (unsized array parameter) introduced "
1450 "in PTX ISA version 6.0 and requires target sm_30.");
1451
1452 SelectionDAG &DAG = CLI.DAG;
1453 SDLoc dl = CLI.DL;
1454 const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1455 SDValue Callee = CLI.Callee;
1456 ArgListTy &Args = CLI.getArgs();
1457 Type *RetTy = CLI.RetTy;
1458 const CallBase *CB = CLI.CB;
1459 const DataLayout &DL = DAG.getDataLayout();
1460 LLVMContext &Ctx = *DAG.getContext();
1461
1462 const auto GetI32 = [&](const unsigned I) {
1463 return DAG.getConstant(I, dl, MVT::i32);
1464 };
1465
1466 const unsigned UniqueCallSite = GlobalUniqueCallSite++;
1467 const SDValue CallChain = CLI.Chain;
1468 const SDValue StartChain =
1469 DAG.getCALLSEQ_START(CallChain, UniqueCallSite, 0, dl);
1470 SDValue DeclareGlue = StartChain.getValue(1);
1471
1472 SmallVector<SDValue, 16> CallPrereqs{StartChain};
1473
1474 const auto MakeDeclareScalarParam = [&](SDValue Symbol, unsigned Size) {
1475 // PTX ABI requires integral types to be at least 32 bits in size. FP16 is
1476 // loaded/stored using i16, so it's handled here as well.
1477 const unsigned SizeBits = promoteScalarArgumentSize(Size * 8);
1478 SDValue Declare =
1479 DAG.getNode(NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue},
1480 {StartChain, Symbol, GetI32(SizeBits), DeclareGlue});
1481 CallPrereqs.push_back(Declare);
1482 DeclareGlue = Declare.getValue(1);
1483 return Declare;
1484 };
1485
1486 const auto MakeDeclareArrayParam = [&](SDValue Symbol, Align Align,
1487 unsigned Size) {
1488 SDValue Declare = DAG.getNode(
1489 NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue},
1490 {StartChain, Symbol, GetI32(Align.value()), GetI32(Size), DeclareGlue});
1491 CallPrereqs.push_back(Declare);
1492 DeclareGlue = Declare.getValue(1);
1493 return Declare;
1494 };
1495
1496 // Variadic arguments.
1497 //
1498 // Normally, for each argument, we declare a param scalar or a param
1499 // byte array in the .param space, and store the argument value to that
1500 // param scalar or array starting at offset 0.
1501 //
1502 // In the case of the first variadic argument, we declare a vararg byte array
1503 // with size 0. The exact size of this array isn't known at this point, so
1504 // it'll be patched later. All the variadic arguments will be stored to this
1505 // array at a certain offset (which gets tracked by 'VAOffset'). The offset is
1506 // initially set to 0, so it can be used for non-variadic arguments (which use
1507 // 0 offset) to simplify the code.
1508 //
1509 // After all variadic arguments are processed, 'VAOffset' holds the size of
1510 // vararg byte array.
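 // Editorial example (not in the original source): for a call such as
 // printf("%d %f", i, d), the variadic i32 lands at offset 0 of the vararg
 // array and the double is aligned up to offset 8, leaving VAOffset == 16.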
1511 assert((CLI.IsVarArg || CLI.Args.size() == CLI.NumFixedArgs) &&
1512 "Non-VarArg function with extra arguments");
1513
1514 const unsigned FirstVAArg = CLI.NumFixedArgs; // position of first variadic
1515 unsigned VAOffset = 0; // current offset in the param array
1516
1517 const SDValue VADeclareParam =
1518 CLI.Args.size() > FirstVAArg
1519 ? MakeDeclareArrayParam(getCallParamSymbol(DAG, FirstVAArg, MVT::i32),
1520 Align(STI.getMaxRequiredAlignment()), 0)
1521 : SDValue();
1522
1523 // Args.size() and Outs.size() need not match.
1524 // Outs.size() will be larger
1525 // * if there is an aggregate argument with multiple fields (each field
1526 // showing up separately in Outs)
1527 // * if there is a vector argument with more than typical vector-length
1528 // elements (generally if more than 4) where each vector element is
1529 // individually present in Outs.
1530 // So a different index should be used for indexing into Outs/OutVals.
1531 // See similar issue in LowerFormalArguments.
1532 auto AllOuts = ArrayRef(CLI.Outs);
1533 auto AllOutVals = ArrayRef(CLI.OutVals);
1534 assert(AllOuts.size() == AllOutVals.size() &&
1535 "Outs and OutVals must be the same size");
1536 // Declare the .param or .reg parameters needed to pass values
1537 // to the function.
1538 for (const auto E : llvm::enumerate(Args)) {
1539 const auto ArgI = E.index();
1540 const auto Arg = E.value();
1541 const auto ArgOuts =
1542 AllOuts.take_while([&](auto O) { return O.OrigArgIndex == ArgI; });
1543 const auto ArgOutVals = AllOutVals.take_front(ArgOuts.size());
1544 AllOuts = AllOuts.drop_front(ArgOuts.size());
1545 AllOutVals = AllOutVals.drop_front(ArgOuts.size());
1546
1547 const bool IsVAArg = (ArgI >= FirstVAArg);
1548 const bool IsByVal = Arg.IsByVal;
1549
1550 const SDValue ParamSymbol =
1551 getCallParamSymbol(DAG, IsVAArg ? FirstVAArg : ArgI, MVT::i32);
1552
1553 assert((!IsByVal || Arg.IndirectType) &&
1554 "byval arg must have indirect type");
1555 Type *ETy = (IsByVal ? Arg.IndirectType : Arg.Ty);
1556
1557 const Align ArgAlign = [&]() {
1558 if (IsByVal) {
1559 // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1560 // so we don't need to worry whether it's naturally aligned or not.
1561 // See TargetLowering::LowerCallTo().
1562 const Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1563 return getFunctionByValParamAlign(CB->getCalledFunction(), ETy,
1564 InitialAlign, DL);
1564 InitialAlign, DL);
1565 }
1566 return getArgumentAlignment(CB, Arg.Ty, ArgI + 1, DL);
1567 }();
1568
1569 const unsigned TySize = DL.getTypeAllocSize(ETy);
1570 assert((!IsByVal || TySize == ArgOuts[0].Flags.getByValSize()) &&
1571 "type size mismatch");
1572
1573 const SDValue ArgDeclare = [&]() {
1574 if (IsVAArg)
1575 return VADeclareParam;
1576
1577 if (IsByVal || shouldPassAsArray(Arg.Ty))
1578 return MakeDeclareArrayParam(ParamSymbol, ArgAlign, TySize);
1579
1580 assert(ArgOuts.size() == 1 && "We must pass only one value as non-array");
1581 assert((ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint()) &&
1582 "Only int and float types are supported as non-array arguments");
1583
1584 return MakeDeclareScalarParam(ParamSymbol, TySize);
1585 }();
1586
1587 if (IsByVal) {
1588 assert(ArgOutVals.size() == 1 && "We must pass only one value as byval");
1589 SDValue SrcPtr = ArgOutVals[0];
1590 const auto PointerInfo = refinePtrAS(SrcPtr, DAG, DL, *this);
1591 const Align BaseSrcAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1592
1593 if (IsVAArg)
1594 VAOffset = alignTo(VAOffset, ArgAlign);
1595
1596 SmallVector<EVT, 4> ValueVTs, MemVTs;
1597 SmallVector<TypeSize, 4> Offsets;
1598 ComputeValueVTs(*this, DL, ETy, ValueVTs, &MemVTs, &Offsets);
1599
1600 unsigned J = 0;
1601 const auto VI = VectorizePTXValueVTs(MemVTs, Offsets, ArgAlign, IsVAArg);
1602 for (const unsigned NumElts : VI) {
1603 EVT LoadVT = getVectorizedVT(MemVTs[J], NumElts, Ctx);
1604 Align SrcAlign = commonAlignment(BaseSrcAlign, Offsets[J]);
1605 SDValue SrcAddr = DAG.getObjectPtrOffset(dl, SrcPtr, Offsets[J]);
1606 SDValue SrcLoad =
1607 DAG.getLoad(LoadVT, dl, CallChain, SrcAddr, PointerInfo, SrcAlign);
1608
1609 TypeSize ParamOffset = Offsets[J].getWithIncrement(VAOffset);
1610 Align ParamAlign = commonAlignment(ArgAlign, ParamOffset);
1611 SDValue ParamAddr =
1612 DAG.getObjectPtrOffset(dl, ParamSymbol, ParamOffset);
1613 SDValue StoreParam =
1614 DAG.getStore(ArgDeclare, dl, SrcLoad, ParamAddr,
1615 MachinePointerInfo(ADDRESS_SPACE_PARAM), ParamAlign);
1616 CallPrereqs.push_back(StoreParam);
1617
1618 J += NumElts;
1619 }
1620 if (IsVAArg)
1621 VAOffset += TySize;
1622 } else {
1623 SmallVector<EVT, 16> VTs;
1624 SmallVector<uint64_t, 16> Offsets;
1625 ComputePTXValueVTs(*this, DL, Ctx, CLI.CallConv, Arg.Ty, VTs, Offsets,
1626 VAOffset);
1627 assert(VTs.size() == Offsets.size() && "Size mismatch");
1628 assert(VTs.size() == ArgOuts.size() && "Size mismatch");
1629
1630 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1631 // than 32-bits are sign extended or zero extended, depending on
1632 // whether they are signed or unsigned types. This case applies
1633 // only to scalar parameters and not to aggregate values.
1634 const bool ExtendIntegerParam =
1635 Arg.Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Arg.Ty) < 32;
1636
1637 const auto GetStoredValue = [&](const unsigned I) {
1638 SDValue StVal = ArgOutVals[I];
1639 assert(promoteScalarIntegerPTX(StVal.getValueType()) ==
1640 StVal.getValueType() &&
1641 "OutVal type should always be legal");
1642
1643 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
1644 const EVT StoreVT =
1645 ExtendIntegerParam ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
1646
1647 return correctParamType(StVal, StoreVT, ArgOuts[I].Flags, DAG, dl);
1648 };
1649
1650 unsigned J = 0;
1651 const auto VI = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
1652 for (const unsigned NumElts : VI) {
1653 const EVT EltVT = promoteScalarIntegerPTX(VTs[J]);
1654
1655 unsigned Offset;
1656 if (IsVAArg) {
1657 // TODO: We may need to support vector types that can be passed
1658 // as scalars in variadic arguments.
1659 assert(NumElts == 1 &&
1660 "Vectorization should be disabled for vaargs.");
1661
1662 // Align each part of the variadic argument to their type.
1663 VAOffset = alignTo(VAOffset, DAG.getEVTAlign(EltVT));
1664 Offset = VAOffset;
1665
1666 const EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
1667 VAOffset += DL.getTypeAllocSize(TheStoreType.getTypeForEVT(Ctx));
1668 } else {
1669 assert(VAOffset == 0 && "VAOffset must be 0 for non-VA args");
1670 Offset = Offsets[J];
1671 }
1672
1673 SDValue Ptr =
1674 DAG.getObjectPtrOffset(dl, ParamSymbol, TypeSize::getFixed(Offset));
1675
1676 const MaybeAlign CurrentAlign = ExtendIntegerParam
1677 ? MaybeAlign(std::nullopt)
1678 : commonAlignment(ArgAlign, Offset);
1679
1680 SDValue Val =
1681 getBuildVectorizedValue(NumElts, dl, DAG, [&](unsigned K) {
1682 return GetStoredValue(J + K);
1683 });
1684
1685 SDValue StoreParam =
1686 DAG.getStore(ArgDeclare, dl, Val, Ptr,
1687 MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign);
1688 CallPrereqs.push_back(StoreParam);
1689
1690 J += NumElts;
1691 }
1692 }
1693 }
1694
1695 // Handle Result
1696 if (!Ins.empty()) {
1697 const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
1698 const unsigned ResultSize = DL.getTypeAllocSize(RetTy);
1699 if (shouldPassAsArray(RetTy)) {
1700 const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
1701 MakeDeclareArrayParam(RetSymbol, RetAlign, ResultSize);
1702 } else {
1703 MakeDeclareScalarParam(RetSymbol, ResultSize);
1704 }
1705 }
1706
1707 // Set the size of the vararg param byte array if the callee is a variadic
1708 // function and the variadic part is not empty.
1709 if (VADeclareParam) {
1710 SDValue DeclareParamOps[] = {VADeclareParam.getOperand(0),
1711 VADeclareParam.getOperand(1),
1712 VADeclareParam.getOperand(2), GetI32(VAOffset),
1713 VADeclareParam.getOperand(4)};
1714 DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),
1715 VADeclareParam->getVTList(), DeclareParamOps);
1716 }
1717
1718 const auto *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1719 // If the type of the callsite does not match that of the function, convert
1720 // the callsite to an indirect call.
1721 const bool ConvertToIndirectCall = shouldConvertToIndirectCall(CB, Func);
1722
1723 // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1724 // between them we must rely on the call site value which is valid for
1725 // indirect calls but is always null for libcalls.
1726 const bool IsIndirectCall = (!Func && CB) || ConvertToIndirectCall;
1727
1728 if (isa<ExternalSymbolSDNode>(Callee)) {
1729 Function* CalleeFunc = nullptr;
1730
1731 // Try to find the callee in the current module.
1732 Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
1733 assert(CalleeFunc != nullptr && "Libcall callee must be set.");
1734
1735 // Set the "libcall callee" attribute to indicate that the function
1736 // must always have a declaration.
1737 CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
1738 }
1739
1740 if (IsIndirectCall) {
1741 // This is indirect function call case : PTX requires a prototype of the
1742 // form
1743 // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1744 // to be emitted, and the label has to be used as the last arg of the call
1745 // instruction.
1746 // The prototype is embedded in a string and put as the operand for a
1747 // CallPrototype SDNode which will print out to the value of the string.
1748 const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
1749 std::string Proto =
1750 getPrototype(DL, RetTy, Args, CLI.Outs,
1751 HasVAArgs ? std::optional(FirstVAArg) : std::nullopt, *CB,
1752 UniqueCallSite);
1753 const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
1754 const SDValue PrototypeDeclare = DAG.getNode(
1755 NVPTXISD::CallPrototype, dl, MVT::Other,
1756 {StartChain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32)});
1757 CallPrereqs.push_back(PrototypeDeclare);
1758 }
1759
1760 const unsigned Proto = IsIndirectCall ? UniqueCallSite : 0;
1761 const unsigned NumArgs =
1762 std::min<unsigned>(CLI.NumFixedArgs + 1, Args.size());
1763 /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns,
1764 /// NumParams, Callee, Proto)
1765 const SDValue CallToken = DAG.getTokenFactor(dl, CallPrereqs);
1766 const SDValue Call = DAG.getNode(
1767 NVPTXISD::CALL, dl, MVT::Other,
1768 {CallToken, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall),
1769 GetI32(Ins.empty() ? 0 : 1), GetI32(NumArgs), Callee, GetI32(Proto)});
1770
1771 SmallVector<SDValue, 16> LoadChains{Call};
1772 SmallVector<SDValue, 16> ProxyRegOps;
1773 if (!Ins.empty()) {
1774 SmallVector<EVT, 16> VTs;
1775 SmallVector<uint64_t, 16> Offsets;
1776 ComputePTXValueVTs(*this, DL, Ctx, CLI.CallConv, RetTy, VTs, Offsets);
1777 assert(VTs.size() == Ins.size() && "Bad value decomposition");
1778
1779 const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
1780 const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
1781
1782 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
1783 // 32-bits are sign extended or zero extended, depending on whether
1784 // they are signed or unsigned types.
1785 const bool ExtendIntegerRetVal =
1786 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
1787
1788 unsigned I = 0;
1789 const auto VI = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
1790 for (const unsigned NumElts : VI) {
1791 const MaybeAlign CurrentAlign =
1792 ExtendIntegerRetVal ? MaybeAlign(std::nullopt)
1793 : commonAlignment(RetAlign, Offsets[I]);
1794
1795 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
1796 const EVT LoadVT =
1797 ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
1798 const EVT VecVT = getVectorizedVT(LoadVT, NumElts, Ctx);
1799 SDValue Ptr =
1800 DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));
1801
1802 SDValue R =
1803 DAG.getLoad(VecVT, dl, Call, Ptr,
1804 MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign);
1805
1806 LoadChains.push_back(R.getValue(1));
1807 for (const unsigned J : llvm::seq(NumElts))
1808 ProxyRegOps.push_back(getExtractVectorizedValue(R, J, LoadVT, dl, DAG));
1809 I += NumElts;
1810 }
1811 }
1812
1813 const SDValue EndToken = DAG.getTokenFactor(dl, LoadChains);
1814 const SDValue CallEnd = DAG.getCALLSEQ_END(EndToken, UniqueCallSite,
1815 UniqueCallSite + 1, SDValue(), dl);
1816
1817 // Append ProxyReg instructions to the chain to make sure that `callseq_end`
1818 // will not get lost. Otherwise, during libcalls expansion, the nodes can become
1819 // dangling.
1820 for (const auto [I, Reg] : llvm::enumerate(ProxyRegOps)) {
1821 SDValue Proxy =
1822 DAG.getNode(NVPTXISD::ProxyReg, dl, Reg.getValueType(), {CallEnd, Reg});
1823 SDValue Ret = correctParamType(Proxy, Ins[I].VT, Ins[I].Flags, DAG, dl);
1824 InVals.push_back(Ret);
1825 }
1826
1827 // Set IsTailCall to false for now, until we figure out how to express
1828 // tail call optimization in PTX.
1829 CLI.IsTailCall = false;
1830 return CallEnd;
1831}
1832
1833 SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1834 SelectionDAG &DAG) const {
1835
1836 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1837 const Function &Fn = DAG.getMachineFunction().getFunction();
1838
1839 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
1840 Fn,
1841 "Support for dynamic alloca requires PTX ISA version >= 7.3 and "
1842 "target >= sm_52.",
1843 SDLoc(Op).getDebugLoc()));
1844 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()),
1845 Op.getOperand(0)};
1846 return DAG.getMergeValues(Ops, SDLoc());
1847 }
1848
1849 SDLoc DL(Op.getNode());
1850 SDValue Chain = Op.getOperand(0);
1851 SDValue Size = Op.getOperand(1);
1852 uint64_t Align = Op.getConstantOperandVal(2);
1853
1854 // The alignment on an ISD::DYNAMIC_STACKALLOC node may be 0 to indicate that
1855 // the default stack alignment should be used.
1856 if (Align == 0)
1858
1859 // The size operand of the PTX alloca instruction is 64-bit for m64 and 32-bit for m32.
1860 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1861
1862 SDValue Alloc =
1863 DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL, {LocalVT, MVT::Other},
1864 {Chain, DAG.getZExtOrTrunc(Size, DL, LocalVT),
1865 DAG.getTargetConstant(Align, DL, MVT::i32)});
1866
1867 SDValue ASC = DAG.getAddrSpaceCast(
1868 DL, Op.getValueType(), Alloc, ADDRESS_SPACE_LOCAL, ADDRESS_SPACE_GENERIC);
1869
1870 return DAG.getMergeValues({ASC, SDValue(Alloc.getNode(), 1)}, DL);
1871}
1872
1873 SDValue NVPTXTargetLowering::LowerSTACKRESTORE(SDValue Op,
1874 SelectionDAG &DAG) const {
1875 SDLoc DL(Op.getNode());
1876 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1877 const Function &Fn = DAG.getMachineFunction().getFunction();
1878
1879 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
1880 Fn,
1881 "Support for stackrestore requires PTX ISA version >= 7.3 and target "
1882 ">= sm_52.",
1883 DL.getDebugLoc()));
1884 return Op.getOperand(0);
1885 }
1886
1887 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1888 SDValue Chain = Op.getOperand(0);
1889 SDValue Ptr = Op.getOperand(1);
1890 SDValue ASC = DAG.getAddrSpaceCast(DL, LocalVT, Ptr, ADDRESS_SPACE_GENERIC,
1891 ADDRESS_SPACE_LOCAL);
1892 return DAG.getNode(NVPTXISD::STACKRESTORE, DL, MVT::Other, {Chain, ASC});
1893}
1894
1895 SDValue NVPTXTargetLowering::LowerSTACKSAVE(SDValue Op,
1896 SelectionDAG &DAG) const {
1897 SDLoc DL(Op.getNode());
1898 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1899 const Function &Fn = DAG.getMachineFunction().getFunction();
1900
1901 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
1902 Fn,
1903 "Support for stacksave requires PTX ISA version >= 7.3 and target >= "
1904 "sm_52.",
1905 DL.getDebugLoc()));
1906 auto Ops = {DAG.getConstant(0, DL, Op.getValueType()), Op.getOperand(0)};
1907 return DAG.getMergeValues(Ops, DL);
1908 }
1909
1910 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1911 SDValue Chain = Op.getOperand(0);
1912 SDValue SS =
1913 DAG.getNode(NVPTXISD::STACKSAVE, DL, {LocalVT, MVT::Other}, Chain);
1914 SDValue ASC = DAG.getAddrSpaceCast(
1915 DL, Op.getValueType(), SS, ADDRESS_SPACE_LOCAL, ADDRESS_SPACE_GENERIC);
1916 return DAG.getMergeValues({ASC, SDValue(SS.getNode(), 1)}, DL);
1917}
1918
1919 // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
1920 // (see LegalizeDAG.cpp). This is slow and uses local memory. Instead, we use
1921 // extract/insert/build-vector, just as LegalizeOp() did in LLVM 2.5.
1922SDValue
1923NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
1924 SDNode *Node = Op.getNode();
1925 SDLoc dl(Node);
1926 SmallVector<SDValue, 8> Ops;
1927 unsigned NumOperands = Node->getNumOperands();
1928 for (unsigned i = 0; i < NumOperands; ++i) {
1929 SDValue SubOp = Node->getOperand(i);
1930 EVT VVT = SubOp.getNode()->getValueType(0);
1931 EVT EltVT = VVT.getVectorElementType();
1932 unsigned NumSubElem = VVT.getVectorNumElements();
1933 for (unsigned j = 0; j < NumSubElem; ++j) {
1934 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
1935 DAG.getIntPtrConstant(j, dl)));
1936 }
1937 }
1938 return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
1939}
1940
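/// Build an NVPTXISD::PRMT node. PRMT treats {B, A} as an 8-byte pool (bytes
/// 0-3 come from A, bytes 4-7 from B); each 4-bit field of the selector picks
/// one byte of the 32-bit result.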
1941 static SDValue getPRMT(SDValue A, SDValue B, SDValue Selector, const SDLoc &DL,
1942 SelectionDAG &DAG,
1943 unsigned Mode = NVPTX::PTXPrmtMode::NONE) {
1944 assert(A.getValueType() == MVT::i32 && B.getValueType() == MVT::i32 &&
1945 Selector.getValueType() == MVT::i32 && "PRMT must have i32 operands");
1946 return DAG.getNode(NVPTXISD::PRMT, DL, MVT::i32,
1947 {A, B, Selector, DAG.getConstant(Mode, DL, MVT::i32)});
1948}
1949
1950 static SDValue getPRMT(SDValue A, SDValue B, uint64_t Selector, const SDLoc &DL,
1951 SelectionDAG &DAG,
1952 unsigned Mode = NVPTX::PTXPrmtMode::NONE) {
1953 return getPRMT(A, B, DAG.getConstant(Selector, DL, MVT::i32), DL, DAG, Mode);
1954}
1955
1956 /// Reduces the elements using the scalar operations provided. The operations
1957 /// are sorted in descending order of the number of inputs they take. The flags
1958 /// on the original reduction operation will be propagated to each scalar operation.
1959 /// Nearby elements are grouped in tree reduction, unlike the shuffle reduction
1960 /// used in ExpandReductions and SelectionDAG.
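/// For example, with Ops = {{fmax3, 3}, {fmax, 2}} and 7 elements, the first
/// level produces two fmax3 results plus one leftover element, and the second
/// level combines those three values with a single fmax3.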
1961 static SDValue buildTreeReduction(
1962 const SmallVector<SDValue> &Elements, EVT EltTy,
1963 ArrayRef<std::pair<unsigned /*NodeType*/, unsigned /*NumInputs*/>> Ops,
1964 const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG) {
1965 // Build the reduction tree at each level, starting with all the elements.
1966 SmallVector<SDValue> Level = Elements;
1967
1968 unsigned OpIdx = 0;
1969 while (Level.size() > 1) {
1970 // Try to reduce this level using the current operator.
1971 const auto [Op, NumInputs] = Ops[OpIdx];
1972
1973 // Build the next level by partially reducing all elements.
1974 SmallVector<SDValue> ReducedLevel;
1975 unsigned I = 0, E = Level.size();
1976 for (; I + NumInputs <= E; I += NumInputs) {
1977 // Reduce elements in groups of [NumInputs], as much as possible.
1978 ReducedLevel.push_back(DAG.getNode(
1979 Op, DL, EltTy, ArrayRef<SDValue>(Level).slice(I, NumInputs), Flags));
1980 }
1981
1982 if (I < E) {
1983 // Handle leftover elements.
1984
1985 if (ReducedLevel.empty()) {
1986 // We didn't reduce anything at this level. We need to pick a smaller
1987 // operator.
1988 ++OpIdx;
1989 assert(OpIdx < Ops.size() && "no smaller operators for reduction");
1990 continue;
1991 }
1992
1993 // We reduced some things but there's still more left, meaning the
1994 // operator's number of inputs doesn't evenly divide this level size. Move
1995 // these elements to the next level.
1996 for (; I < E; ++I)
1997 ReducedLevel.push_back(Level[I]);
1998 }
1999
2000 // Process the next level.
2001 Level = ReducedLevel;
2002 }
2003
2004 return *Level.begin();
2005}
2006
2007// Get scalar reduction opcode
2008static ISD::NodeType getScalarOpcodeForReduction(unsigned ReductionOpcode) {
2009 switch (ReductionOpcode) {
2010 case ISD::VECREDUCE_FMAX:
2011 return ISD::FMAXNUM;
2012 case ISD::VECREDUCE_FMIN:
2013 return ISD::FMINNUM;
2014 case ISD::VECREDUCE_FMAXIMUM:
2015 return ISD::FMAXIMUM;
2016 case ISD::VECREDUCE_FMINIMUM:
2017 return ISD::FMINIMUM;
2018 default:
2019 llvm_unreachable("unhandled reduction opcode");
2020 }
2021}
2022
2023/// Get 3-input scalar reduction opcode
2024static std::optional<NVPTXISD::NodeType>
2025getScalar3OpcodeForReduction(unsigned ReductionOpcode) {
2026 switch (ReductionOpcode) {
2027 case ISD::VECREDUCE_FMAX:
2028 return NVPTXISD::FMAXNUM3;
2029 case ISD::VECREDUCE_FMIN:
2030 return NVPTXISD::FMINNUM3;
2031 case ISD::VECREDUCE_FMAXIMUM:
2032 return NVPTXISD::FMAXIMUM3;
2033 case ISD::VECREDUCE_FMINIMUM:
2034 return NVPTXISD::FMINIMUM3;
2035 default:
2036 return std::nullopt;
2037 }
2038}
2039
2040/// Lower reductions to either a sequence of operations or a tree if
2041/// reassociations are allowed. This method will use larger operations like
2042/// max3/min3 when the target supports them.
2043SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
2044 SelectionDAG &DAG) const {
2045 SDLoc DL(Op);
2046 const SDNodeFlags Flags = Op->getFlags();
2047 SDValue Vector = Op.getOperand(0);
2048
2049 const unsigned Opcode = Op->getOpcode();
2050 const EVT EltTy = Vector.getValueType().getVectorElementType();
2051
2052 // Whether we can use 3-input min/max when expanding the reduction.
2053 const bool CanUseMinMax3 =
2054 EltTy == MVT::f32 && STI.getSmVersion() >= 100 &&
2055 STI.getPTXVersion() >= 88 &&
2056 (Opcode == ISD::VECREDUCE_FMAX || Opcode == ISD::VECREDUCE_FMIN ||
2057 Opcode == ISD::VECREDUCE_FMAXIMUM || Opcode == ISD::VECREDUCE_FMINIMUM);
2058
2059 // A list of SDNode opcodes with equivalent semantics, sorted descending by
2060 // number of inputs they take.
2061 SmallVector<std::pair<unsigned /*Op*/, unsigned /*NumIn*/>, 2> ScalarOps;
2062
2063 if (auto Opcode3Elem = getScalar3OpcodeForReduction(Opcode);
2064 CanUseMinMax3 && Opcode3Elem)
2065 ScalarOps.push_back({*Opcode3Elem, 3});
2066 ScalarOps.push_back({getScalarOpcodeForReduction(Opcode), 2});
2067
2069 DAG.ExtractVectorElements(Vector, Elements);
2070
2071 return buildTreeReduction(Elements, EltTy, ScalarOps, DL, Flags, DAG);
2072}
2073
2074SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
2075 // Handle bitcasting from v2i8 without hitting the default promotion
2076 // strategy which goes through stack memory.
2077 EVT FromVT = Op->getOperand(0)->getValueType(0);
2078 if (FromVT != MVT::v2i8) {
2079 return Op;
2080 }
2081
2082 // Pack vector elements into i16 and bitcast to final type
2083 SDLoc DL(Op);
2084 SDValue Vec0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
2085 Op->getOperand(0), DAG.getIntPtrConstant(0, DL));
2086 SDValue Vec1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
2087 Op->getOperand(0), DAG.getIntPtrConstant(1, DL));
2088 SDValue Extend0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec0);
2089 SDValue Extend1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec1);
2090 SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
2091 SDValue AsInt = DAG.getNode(
2092 ISD::OR, DL, MVT::i16,
2093 {Extend0, DAG.getNode(ISD::SHL, DL, MVT::i16, {Extend1, Const8})});
2094 EVT ToVT = Op->getValueType(0);
2095 return DAG.getBitcast(ToVT, AsInt);
2096}
2097
2098 // We can initialize a constant f16x2/v2i16/v4i8 with a single .b32 move.
2099 // Normally it would be lowered as two constant loads and a vector-packing
2100 // move. Instead we want just a constant move:
2101 // mov.b32 %r2, 0x40003C00
2102SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
2103 SelectionDAG &DAG) const {
2104 EVT VT = Op->getValueType(0);
2105 if (!(NVPTX::isPackedVectorTy(VT) && VT.is32BitVector()))
2106 return Op;
2107 SDLoc DL(Op);
2108
2109 if (!llvm::all_of(Op->ops(), [](SDValue Operand) {
2110 return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
2111 isa<ConstantFPSDNode>(Operand);
2112 })) {
2113 if (VT != MVT::v4i8)
2114 return Op;
2115 // Lower a non-constant v4i8 vector as a byte-wise constructed i32, which
2116 // allows us to optimize the calculation of its constant parts.
2117 auto GetPRMT = [&](const SDValue Left, const SDValue Right, bool Cast,
2118 uint64_t SelectionValue) -> SDValue {
2119 SDValue L = Left;
2120 SDValue R = Right;
2121 if (Cast) {
2122 L = DAG.getAnyExtOrTrunc(L, DL, MVT::i32);
2123 R = DAG.getAnyExtOrTrunc(R, DL, MVT::i32);
2124 }
2125 return getPRMT(L, R, SelectionValue, DL, DAG);
2126 };
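// PRMT__10 packs the low bytes of operands 0 and 1 into result bytes 0-1
// (selector 0x3340), PRMT__32 does the same for operands 2 and 3, and the
// final PRMT (selector 0x5410) merges the two halves into {b3, b2, b1, b0}.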
2127 auto PRMT__10 = GetPRMT(Op->getOperand(0), Op->getOperand(1), true, 0x3340);
2128 auto PRMT__32 = GetPRMT(Op->getOperand(2), Op->getOperand(3), true, 0x3340);
2129 auto PRMT3210 = GetPRMT(PRMT__10, PRMT__32, false, 0x5410);
2130 return DAG.getBitcast(VT, PRMT3210);
2131 }
2132
2133 // Get value or the Nth operand as an APInt(32). Undef values treated as 0.
2134 auto GetOperand = [](SDValue Op, int N) -> APInt {
2135 const SDValue &Operand = Op->getOperand(N);
2136 EVT VT = Op->getValueType(0);
2137 if (Operand->isUndef())
2138 return APInt(32, 0);
2139 APInt Value;
2140 if (VT == MVT::v2f16 || VT == MVT::v2bf16)
2141 Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();
2142 else if (VT == MVT::v2i16 || VT == MVT::v4i8)
2143 Value = Operand->getAsAPIntVal();
2144 else
2145 llvm_unreachable("Unsupported type");
2146 // i8 values are carried around as i16, so we need to zero out upper bits,
2147 // so they do not get in the way of combining individual byte values
2148 if (VT == MVT::v4i8)
2149 Value = Value.trunc(8);
2150 return Value.zext(32);
2151 };
2152
2153 // Construct a 32-bit constant by shifting into place smaller values
2154 // (elements of the vector type VT).
2155 // For example, if VT has 2 elements, then N == 2:
2156 // ShiftAmount = 32 / N = 16
2157 // Value |= Op0 (b16) << 0
2158 // Value |= Op1 (b16) << 16
2159 // If N == 4:
2160 // ShiftAmount = 32 / N = 8
2161 // Value |= Op0 (b8) << 0
2162 // Value |= Op1 (b8) << 8
2163 // Value |= Op2 (b8) << 16
2164 // Value |= Op3 (b8) << 24
2165 // ...etc
2166 APInt Value(32, 0);
2167 const unsigned NumElements = VT.getVectorNumElements();
2168 assert(32 % NumElements == 0 && "must evenly divide bit length");
2169 const unsigned ShiftAmount = 32 / NumElements;
2170 for (unsigned ElementNo : seq(NumElements))
2171 Value |= GetOperand(Op, ElementNo).shl(ElementNo * ShiftAmount);
2172 SDValue Const = DAG.getConstant(Value, DL, MVT::i32);
2173 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), Const);
2174}
2175
2176SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
2177 SelectionDAG &DAG) const {
2178 SDValue Index = Op->getOperand(1);
2179 SDValue Vector = Op->getOperand(0);
2180 SDLoc DL(Op);
2181 EVT VectorVT = Vector.getValueType();
2182
2183 if (VectorVT == MVT::v4i8) {
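// Move the requested byte into the low byte of an i32 with PRMT. The
// selector (Index | 0x7770) picks byte 'Index' of the vector for the low
// result byte and byte 7 of the zero operand for the upper three bytes,
// so the extracted element is effectively zero-extended.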
2184 SDValue Selector = DAG.getNode(ISD::OR, DL, MVT::i32,
2185 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2186 DAG.getConstant(0x7770, DL, MVT::i32));
2187 SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, Vector),
2188 DAG.getConstant(0, DL, MVT::i32), Selector, DL, DAG);
2189 SDValue Ext = DAG.getAnyExtOrTrunc(PRMT, DL, Op->getValueType(0));
2190 SDNodeFlags Flags;
2191 Flags.setNoSignedWrap(Ext.getScalarValueSizeInBits() > 8);
2192 Flags.setNoUnsignedWrap(Ext.getScalarValueSizeInBits() >= 8);
2193 Ext->setFlags(Flags);
2194 return Ext;
2195 }
2196
2197 // Constant index will be matched by tablegen.
2198 if (isa<ConstantSDNode>(Index.getNode()))
2199 return Op;
2200
2201 // Extract individual elements and select one of them.
2202 assert(NVPTX::isPackedVectorTy(VectorVT) &&
2203 VectorVT.getVectorNumElements() == 2 && "Unexpected vector type.");
2204 EVT EltVT = VectorVT.getVectorElementType();
2205
2206 SDLoc dl(Op.getNode());
2207 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2208 DAG.getIntPtrConstant(0, dl));
2209 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2210 DAG.getIntPtrConstant(1, dl));
2211 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
2212 ISD::SETEQ);
2213}
2214
2215SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
2216 SelectionDAG &DAG) const {
2217 SDValue Vector = Op->getOperand(0);
2218 EVT VectorVT = Vector.getValueType();
2219
2220 if (VectorVT != MVT::v4i8)
2221 return Op;
2222 SDLoc DL(Op);
2223 SDValue Value = Op->getOperand(1);
2224 if (Value->isUndef())
2225 return Vector;
2226
2227 SDValue Index = Op->getOperand(2);
2228
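// Use a bit-field insert: replace the 8 bits of the packed i32 starting at
// bit offset Index * 8 with the (zero-extended) new value.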
2229 SDValue BFI =
2230 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2231 {DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector,
2232 DAG.getNode(ISD::MUL, DL, MVT::i32,
2233 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2234 DAG.getConstant(8, DL, MVT::i32)),
2235 DAG.getConstant(8, DL, MVT::i32)});
2236 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI);
2237}
2238
2239SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
2240 SelectionDAG &DAG) const {
2241 SDValue V1 = Op.getOperand(0);
2242 EVT VectorVT = V1.getValueType();
2243 if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8)
2244 return Op;
2245
2246 // Lower shuffle to PRMT instruction.
2247 const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
2248 SDValue V2 = Op.getOperand(1);
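// Each mask entry is a byte index into the 8-byte pool {V2, V1} (bytes 0-3
// from V1, bytes 4-7 from V2) and becomes one 4-bit field of the selector.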
2249 uint32_t Selector = 0;
2250 for (auto I : llvm::enumerate(SVN->getMask())) {
2251 if (I.value() != -1) // -1 is a placeholder for undef.
2252 Selector |= (I.value() << (I.index() * 4));
2253 }
2254
2255 SDLoc DL(Op);
2256 SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, V1),
2257 DAG.getBitcast(MVT::i32, V2), Selector, DL, DAG);
2258 return DAG.getBitcast(Op.getValueType(), PRMT);
2259}
2260/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
2261/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
2262/// amount, or
2263/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
2264/// amount.
2265SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
2266 SelectionDAG &DAG) const {
2267 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2268 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
2269
2270 EVT VT = Op.getValueType();
2271 unsigned VTBits = VT.getSizeInBits();
2272 SDLoc dl(Op);
2273 SDValue ShOpLo = Op.getOperand(0);
2274 SDValue ShOpHi = Op.getOperand(1);
2275 SDValue ShAmt = Op.getOperand(2);
2276 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
2277
2278 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2279 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2280 // {dHi, dLo} = {aHi, aLo} >> Amt
2281 // dHi = aHi >> Amt
2282 // dLo = shf.r.clamp aLo, aHi, Amt
2283
2284 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2285 SDValue Lo =
2286 DAG.getNode(NVPTXISD::FSHR_CLAMP, dl, VT, ShOpHi, ShOpLo, ShAmt);
2287
2288 SDValue Ops[2] = { Lo, Hi };
2289 return DAG.getMergeValues(Ops, dl);
2290 }
2291 else {
2292 // {dHi, dLo} = {aHi, aLo} >> Amt
2293 // - if (Amt>=size) then
2294 // dLo = aHi >> (Amt-size)
2295 // dHi = aHi >> Amt (this is either all 0 or all 1)
2296 // else
2297 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
2298 // dHi = aHi >> Amt
2299
2300 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2301 DAG.getConstant(VTBits, dl, MVT::i32),
2302 ShAmt);
2303 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
2304 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2305 DAG.getConstant(VTBits, dl, MVT::i32));
2306 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
2307 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2308 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
2309
2310 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2311 DAG.getConstant(VTBits, dl, MVT::i32),
2312 ISD::SETGE);
2313 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2314 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2315
2316 SDValue Ops[2] = { Lo, Hi };
2317 return DAG.getMergeValues(Ops, dl);
2318 }
2319}
2320
2321/// LowerShiftLeftParts - Lower SHL_PARTS, which
2322/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
2323/// amount, or
2324/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
2325/// amount.
2326SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2327 SelectionDAG &DAG) const {
2328 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2329 assert(Op.getOpcode() == ISD::SHL_PARTS);
2330
2331 EVT VT = Op.getValueType();
2332 unsigned VTBits = VT.getSizeInBits();
2333 SDLoc dl(Op);
2334 SDValue ShOpLo = Op.getOperand(0);
2335 SDValue ShOpHi = Op.getOperand(1);
2336 SDValue ShAmt = Op.getOperand(2);
2337
2338 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2339 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2340 // {dHi, dLo} = {aHi, aLo} << Amt
2341 // dHi = shf.l.clamp aLo, aHi, Amt
2342 // dLo = aLo << Amt
2343
2344 SDValue Hi =
2345 DAG.getNode(NVPTXISD::FSHL_CLAMP, dl, VT, ShOpHi, ShOpLo, ShAmt);
2346 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2347
2348 SDValue Ops[2] = { Lo, Hi };
2349 return DAG.getMergeValues(Ops, dl);
2350 }
2351 else {
2352 // {dHi, dLo} = {aHi, aLo} << Amt
2353 // - if (Amt>=size) then
2354 // dLo = aLo << Amt (all 0)
2355 // dLo = aLo << (Amt-size)
2356 // else
2357 // dLo = aLo << Amt
2358 // dHi = (aHi << Amt) | (aLo >> (size-Amt))
2359
2360 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2361 DAG.getConstant(VTBits, dl, MVT::i32),
2362 ShAmt);
2363 SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2364 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2365 DAG.getConstant(VTBits, dl, MVT::i32));
2366 SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2367 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2368 SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2369
2370 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2371 DAG.getConstant(VTBits, dl, MVT::i32),
2372 ISD::SETGE);
2373 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2374 SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2375
2376 SDValue Ops[2] = { Lo, Hi };
2377 return DAG.getMergeValues(Ops, dl);
2378 }
2379}
2380
2381 /// If the types match, convert the generic copysign to the NVPTXISD version,
2382 /// otherwise bail, ensuring that mismatched cases are properly expanded.
2383SDValue NVPTXTargetLowering::LowerFCOPYSIGN(SDValue Op,
2384 SelectionDAG &DAG) const {
2385 EVT VT = Op.getValueType();
2386 SDLoc DL(Op);
2387
2388 SDValue In1 = Op.getOperand(0);
2389 SDValue In2 = Op.getOperand(1);
2390 EVT SrcVT = In2.getValueType();
2391
2392 if (!SrcVT.bitsEq(VT))
2393 return SDValue();
2394
2395 return DAG.getNode(NVPTXISD::FCOPYSIGN, DL, VT, In1, In2);
2396}
2397
2398SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2399 EVT VT = Op.getValueType();
2400
2401 if (VT == MVT::f32)
2402 return LowerFROUND32(Op, DAG);
2403
2404 if (VT == MVT::f64)
2405 return LowerFROUND64(Op, DAG);
2406
2407 llvm_unreachable("unhandled type");
2408}
2409
2410 // This is the rounding method used in CUDA libdevice, in C-like code:
2411// float roundf(float A)
2412// {
2413// float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2414// RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2415// return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2416// }
2417SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2418 SelectionDAG &DAG) const {
2419 SDLoc SL(Op);
2420 SDValue A = Op.getOperand(0);
2421 EVT VT = Op.getValueType();
2422
2423 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2424
2425 // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2426 SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2427 const unsigned SignBitMask = 0x80000000;
2428 SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2429 DAG.getConstant(SignBitMask, SL, MVT::i32));
2430 const unsigned PointFiveInBits = 0x3F000000;
2431 SDValue PointFiveWithSignRaw =
2432 DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2433 DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2434 SDValue PointFiveWithSign =
2435 DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2436 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2437 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2438
2439 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2440 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2441 SDValue IsLarge =
2442 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2443 ISD::SETOGT);
2444 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2445
2446 // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2447 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2448 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2449 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
2450 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
2451}
2452
2453// The implementation of round(double) is similar to that of round(float) in
2454// that they both separate the value range into three regions and use a method
2455// specific to the region to round the values. However, round(double) first
2456// calculates the round of the absolute value and then adds the sign back while
2457// round(float) directly rounds the value with sign.
2458SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
2459 SelectionDAG &DAG) const {
2460 SDLoc SL(Op);
2461 SDValue A = Op.getOperand(0);
2462 EVT VT = Op.getValueType();
2463
2464 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2465
2466 // double RoundedA = (double) (int) (abs(A) + 0.5f);
2467 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
2468 DAG.getConstantFP(0.5, SL, VT));
2469 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2470
2471 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2472 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2473 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2474 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2475 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
2476 DAG.getConstantFP(0, SL, VT),
2477 RoundedA);
2478
2479 // Add sign to rounded_A
2480 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
2481 DAG.getNode(ISD::FTRUNC, SL, VT, A);
2482
2483 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2484 SDValue IsLarge =
2485 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
2486 ISD::SETOGT);
2487 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2488}
2489
2490 static SDValue PromoteBinOpToF32(SDNode *N, SelectionDAG &DAG) {
2491 EVT VT = N->getValueType(0);
2492 EVT NVT = MVT::f32;
2493 if (VT.isVector()) {
2494 NVT = EVT::getVectorVT(*DAG.getContext(), NVT, VT.getVectorElementCount());
2495 }
2496 SDLoc DL(N);
2497 SDValue Tmp0 = DAG.getFPExtendOrRound(N->getOperand(0), DL, NVT);
2498 SDValue Tmp1 = DAG.getFPExtendOrRound(N->getOperand(1), DL, NVT);
2499 SDValue Res = DAG.getNode(N->getOpcode(), DL, NVT, Tmp0, Tmp1, N->getFlags());
2500 return DAG.getFPExtendOrRound(Res, DL, VT);
2501}
2502
2503SDValue NVPTXTargetLowering::PromoteBinOpIfF32FTZ(SDValue Op,
2504 SelectionDAG &DAG) const {
2505 if (useF32FTZ(DAG.getMachineFunction())) {
2506 return PromoteBinOpToF32(Op.getNode(), DAG);
2507 }
2508 return Op;
2509}
2510
2511SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op,
2512 SelectionDAG &DAG) const {
2513 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2514
2515 if (Op.getValueType() == MVT::bf16) {
2516 SDLoc Loc(Op);
2517 return DAG.getNode(
2518 ISD::FP_ROUND, Loc, MVT::bf16,
2519 DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)),
2520 DAG.getIntPtrConstant(0, Loc, /*isTarget=*/true));
2521 }
2522
2523 // Everything else is considered legal.
2524 return Op;
2525}
2526
2527SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op,
2528 SelectionDAG &DAG) const {
2529 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2530
2531 if (Op.getOperand(0).getValueType() == MVT::bf16) {
2532 SDLoc Loc(Op);
2533 return DAG.getNode(
2534 Op.getOpcode(), Loc, Op.getValueType(),
2535 DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0)));
2536 }
2537
2538 // Everything else is considered legal.
2539 return Op;
2540}
2541
2542SDValue NVPTXTargetLowering::LowerFP_ROUND(SDValue Op,
2543 SelectionDAG &DAG) const {
2544 EVT NarrowVT = Op.getValueType();
2545 SDValue Wide = Op.getOperand(0);
2546 EVT WideVT = Wide.getValueType();
2547 if (NarrowVT.getScalarType() == MVT::bf16) {
2548 const TargetLowering *TLI = STI.getTargetLowering();
2549 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 70) {
2550 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2551 }
2552 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
2553 // This combination was the first to support f32 -> bf16.
2554 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70) {
2555 if (WideVT.getScalarType() == MVT::f32) {
2556 return Op;
2557 }
2558 if (WideVT.getScalarType() == MVT::f64) {
2559 SDLoc Loc(Op);
2560 // Round-inexact-to-odd f64 to f32, then do the final rounding using
2561 // the hardware f32 -> bf16 instruction.
2562 SDValue rod = TLI->expandRoundInexactToOdd(
2563 WideVT.isVector() ? WideVT.changeVectorElementType(MVT::f32)
2564 : MVT::f32,
2565 Wide, Loc, DAG);
2566 return DAG.getFPExtendOrRound(rod, Loc, NarrowVT);
2567 }
2568 }
2569 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2570 }
2571 }
2572
2573 // Everything else is considered legal.
2574 return Op;
2575}
2576
2577SDValue NVPTXTargetLowering::LowerFP_EXTEND(SDValue Op,
2578 SelectionDAG &DAG) const {
2579 SDValue Narrow = Op.getOperand(0);
2580 EVT NarrowVT = Narrow.getValueType();
2581 EVT WideVT = Op.getValueType();
2582 if (NarrowVT.getScalarType() == MVT::bf16) {
2583 if (WideVT.getScalarType() == MVT::f32 &&
2584 (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71)) {
2585 SDLoc Loc(Op);
2586 return DAG.getNode(ISD::BF16_TO_FP, Loc, WideVT, Narrow);
2587 }
2588 if (WideVT.getScalarType() == MVT::f64 &&
2589 (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) {
2590 EVT F32 = NarrowVT.isVector() ? NarrowVT.changeVectorElementType(MVT::f32)
2591 : MVT::f32;
2592 SDLoc Loc(Op);
2593 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 71) {
2594 Op = DAG.getNode(ISD::FP_EXTEND, Loc, F32, Narrow);
2595 } else {
2596 Op = DAG.getNode(ISD::BF16_TO_FP, Loc, F32, Narrow);
2597 }
2598 return DAG.getNode(ISD::FP_EXTEND, Loc, WideVT, Op);
2599 }
2600 }
2601
2602 // Everything else is considered legal.
2603 return Op;
2604}
2605
2606 static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) {
2607 SDLoc DL(Op);
2608 if (Op.getValueType() != MVT::v2i16)
2609 return Op;
2610 EVT EltVT = Op.getValueType().getVectorElementType();
2611 SmallVector<SDValue> VecElements;
2612 for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) {
2613 SmallVector<SDValue> ScalarArgs;
2614 llvm::transform(Op->ops(), std::back_inserter(ScalarArgs),
2615 [&](const SDUse &O) {
2616 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
2617 O.get(), DAG.getIntPtrConstant(I, DL));
2618 });
2619 VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs));
2620 }
2621 SDValue V =
2622 DAG.getNode(ISD::BUILD_VECTOR, DL, Op.getValueType(), VecElements);
2623 return V;
2624}
2625
2626 static SDValue lowerTcgen05St(SDValue Op, SelectionDAG &DAG) {
2627 SDNode *N = Op.getNode();
2628 SDLoc DL(N);
2630
2631 // split the vector argument
2632 for (size_t I = 0; I < N->getNumOperands(); I++) {
2633 SDValue Val = N->getOperand(I);
2634 EVT ValVT = Val.getValueType();
2635 if (ValVT.isVector()) {
2636 EVT EltVT = ValVT.getVectorElementType();
2637 for (unsigned J = 0, NElts = ValVT.getVectorNumElements(); J < NElts; J++)
2638 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2639 DAG.getIntPtrConstant(J, DL)));
2640 } else
2641 Ops.push_back(Val);
2642 }
2643
2644 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
2645 SDValue Tcgen05StNode =
2646 DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, N->getVTList(), Ops,
2647 MemSD->getMemoryVT(), MemSD->getMemOperand());
2648
2649 return Tcgen05StNode;
2650}
2651
2652static unsigned getTcgen05MMADisableOutputLane(unsigned IID) {
2653 switch (IID) {
2654 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
2656 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2:
2658 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1:
2660 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2:
2662 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1:
2664 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2:
2666 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1:
2668 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2:
2670 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift:
2672 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift:
2674 case Intrinsic::
2675 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift:
2677 case Intrinsic::
2678 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift:
2680 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1:
2682 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2:
2684 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1:
2686 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2:
2688 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1:
2690 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2:
2692 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift:
2694 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift:
2696 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1:
2698 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2:
2700 case Intrinsic::
2701 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift:
2702 return NVPTXISD::
2703 TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT;
2704 case Intrinsic::
2705 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift:
2706 return NVPTXISD::
2707 TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT;
2708 };
2709 llvm_unreachable("unhandled tcgen05.mma.disable_output_lane intrinsic");
2710}
2711
2713 SDNode *N = Op.getNode();
2714 SDLoc DL(N);
2715 unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2716
2718 // split the vector argument
2719 for (size_t I = 0; I < N->getNumOperands(); I++) {
2720 if (I == 1)
2721 continue; // skip IID
2722 SDValue Val = N->getOperand(I);
2723 EVT ValVT = Val.getValueType();
2724 if (ValVT.isVector()) {
2725 EVT EltVT = ValVT.getVectorElementType();
2726 for (unsigned J = 0, NElts = ValVT.getVectorNumElements(); J < NElts; J++)
2727 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2728 DAG.getIntPtrConstant(J, DL)));
2729 } else
2730 Ops.push_back(Val);
2731 }
2732
2733 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
2734 SDValue Tcgen05MMANode = DAG.getMemIntrinsicNode(
2735 getTcgen05MMADisableOutputLane(IID), DL, N->getVTList(), Ops,
2736 MemSD->getMemoryVT(), MemSD->getMemOperand());
2737
2738 return Tcgen05MMANode;
2739}
2740
2741// Lower vector return type of tcgen05.ld intrinsics
2742static std::optional<std::pair<SDValue, SDValue>>
2743lowerTcgen05Ld(SDNode *N, SelectionDAG &DAG, bool HasOffset = false) {
2744 SDLoc DL(N);
2745 EVT ResVT = N->getValueType(0);
2746 if (!ResVT.isVector())
2747 return {}; // already legalized.
2748
2749 const unsigned NumElts = ResVT.getVectorNumElements();
2750
2751 // Create the return type of the instructions
2752 SmallVector<EVT, 5> ListVTs;
2753 for (unsigned i = 0; i < NumElts; ++i)
2754 ListVTs.push_back(MVT::i32);
2755
2756 ListVTs.push_back(N->getValueType(1)); // Chain
2757
2758 SDVTList ResVTs = DAG.getVTList(ListVTs);
2759
2760 SmallVector<SDValue, 8> Ops{N->getOperand(0), N->getOperand(1),
2761 N->getOperand(2)};
2762
2763 if (HasOffset) {
2764 Ops.push_back(N->getOperand(3)); // offset
2765 Ops.push_back(N->getOperand(4)); // Pack flag
2766 } else
2767 Ops.push_back(N->getOperand(3)); // Pack flag
2768
2769 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
2770 SDValue NewNode =
2771 DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, ResVTs, Ops,
2772 MemSD->getMemoryVT(), MemSD->getMemOperand());
2773
2774 // split the vector result
2775 SmallVector<SDValue, 4> ScalarRes;
2776 for (unsigned i = 0; i < NumElts; ++i) {
2777 SDValue Res = NewNode.getValue(i);
2778 ScalarRes.push_back(Res);
2779 }
2780
2781 SDValue Chain = NewNode.getValue(NumElts);
2782 SDValue BuildVector = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
2783 return {{BuildVector, Chain}};
2784}
2785
2786 static SDValue lowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG) {
2787 SDNode *N = Op.getNode();
2788 SDValue Intrin = N->getOperand(1);
2789
2790 // Get the intrinsic ID
2791 unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
2792 switch (IntrinNo) {
2793 default:
2794 break;
2795 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
2796 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
2797 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
2798 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
2799 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
2800 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
2801 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
2802 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
2803 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
2804 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
2805 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
2806 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
2807 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
2808 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
2809 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
2810 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
2811 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
2812 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
2813 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
2814 case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
2815 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
2816 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
2817 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
2818 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
2819 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
2820 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
2821 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
2822 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128:
2823 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
2824 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
2825 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
2826 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
2827 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
2828 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
2829 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
2830 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
2831 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
2832 return lowerTcgen05St(Op, DAG);
2833 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
2834 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2:
2835 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1:
2836 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2:
2837 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1:
2838 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2:
2839 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1:
2840 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2:
2841 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1:
2842 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2:
2843 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1:
2844 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2:
2845 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1:
2846 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2:
2847 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1:
2848 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2:
2849 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift:
2850 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift:
2851 case Intrinsic::
2852 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift:
2853 case Intrinsic::
2854 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift:
2855 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift:
2856 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift:
2857 case Intrinsic::
2858 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift:
2859 case Intrinsic::
2860 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift:
2862 }
2863 return Op;
2864}
2865
2867 SelectionDAG &DAG) {
2868
2869 SDNode *N = Op.getNode();
2870 if (N->getOperand(1).getValueType() != MVT::i128) {
2871 // Return if the operand is already lowered.
2872 return SDValue();
2873 }
2874
2875 unsigned IID =
2876 cast<ConstantSDNode>(N->getOperand(0).getNode())->getZExtValue();
2877 auto Opcode = [&]() {
2878 switch (IID) {
2879 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled:
2881 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_x:
2883 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y:
2885 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z:
2887 default:
2888 llvm_unreachable("unsupported/unhandled intrinsic");
2889 }
2890 }();
2891
2892 SDLoc DL(N);
2893 SDValue TryCancelResponse = N->getOperand(1);
2894 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TryCancelResponse);
2895 SDValue TryCancelResponse0 =
2896 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
2897 DAG.getIntPtrConstant(0, DL));
2898 SDValue TryCancelResponse1 =
2899 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
2900 DAG.getIntPtrConstant(1, DL));
2901
2902 return DAG.getNode(Opcode, DL, N->getVTList(),
2903 {TryCancelResponse0, TryCancelResponse1});
2904}
2905
2906 static SDValue lowerPrmtIntrinsic(SDValue Op, SelectionDAG &DAG) {
2907 const unsigned Mode = [&]() {
2908 switch (Op->getConstantOperandVal(0)) {
2909 case Intrinsic::nvvm_prmt:
2910 return NVPTX::PTXPrmtMode::NONE;
2911 case Intrinsic::nvvm_prmt_b4e:
2912 return NVPTX::PTXPrmtMode::B4E;
2913 case Intrinsic::nvvm_prmt_ecl:
2914 return NVPTX::PTXPrmtMode::ECL;
2915 case Intrinsic::nvvm_prmt_ecr:
2916 return NVPTX::PTXPrmtMode::ECR;
2917 case Intrinsic::nvvm_prmt_f4e:
2918 return NVPTX::PTXPrmtMode::F4E;
2919 case Intrinsic::nvvm_prmt_rc16:
2920 return NVPTX::PTXPrmtMode::RC16;
2921 case Intrinsic::nvvm_prmt_rc8:
2922 return NVPTX::PTXPrmtMode::RC8;
2923 default:
2924 llvm_unreachable("unsupported/unhandled intrinsic");
2925 }
2926 }();
2927 SDLoc DL(Op);
2928 SDValue A = Op->getOperand(1);
2929 SDValue B = Op.getNumOperands() == 4 ? Op.getOperand(2)
2930 : DAG.getConstant(0, DL, MVT::i32);
2931 SDValue Selector = (Op->op_end() - 1)->get();
2932 return getPRMT(A, B, Selector, DL, DAG, Mode);
2933}
2934
2935 static SDValue lowerIntrinsicWChain(SDValue Op, SelectionDAG &DAG) {
2936 switch (Op->getConstantOperandVal(1)) {
2937 default:
2938 return Op;
2939
2940 // These tcgen05 intrinsics return a v2i32, which is legal, so we have to
2941 // lower them through LowerOperation() instead of ReplaceNodeResults().
2942 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
2943 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
2944 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
2945 if (auto Res = lowerTcgen05Ld(Op.getNode(), DAG))
2946 return DAG.getMergeValues({Res->first, Res->second}, SDLoc(Op));
2947 return SDValue();
2948
2949 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
2950 if (auto Res = lowerTcgen05Ld(Op.getNode(), DAG, /*HasOffset=*/true))
2951 return DAG.getMergeValues({Res->first, Res->second}, SDLoc(Op));
2952 return SDValue();
2953 }
2954}
2955
2956 static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG) {
2957 switch (Op->getConstantOperandVal(0)) {
2958 default:
2959 return Op;
2960 case Intrinsic::nvvm_prmt:
2961 case Intrinsic::nvvm_prmt_b4e:
2962 case Intrinsic::nvvm_prmt_ecl:
2963 case Intrinsic::nvvm_prmt_ecr:
2964 case Intrinsic::nvvm_prmt_f4e:
2965 case Intrinsic::nvvm_prmt_rc16:
2966 case Intrinsic::nvvm_prmt_rc8:
2967 return lowerPrmtIntrinsic(Op, DAG);
2968 case Intrinsic::nvvm_internal_addrspace_wrap:
2969 return Op.getOperand(1);
2970 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled:
2971 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_x:
2972 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y:
2973 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z:
2975 }
2976}
2977
2978// In PTX 64-bit CTLZ and CTPOP are supported, but they return a 32-bit value.
2979// Lower these into a node returning the correct type which is zero-extended
2980// back to the correct size.
2981 static SDValue lowerCTLZCTPOP(SDValue Op, SelectionDAG &DAG) {
2982 SDValue V = Op->getOperand(0);
2983 assert(V.getValueType() == MVT::i64 &&
2984 "Unexpected CTLZ/CTPOP type to legalize");
2985
2986 SDLoc DL(Op);
2987 SDValue CT = DAG.getNode(Op->getOpcode(), DL, MVT::i32, V);
2988 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CT, SDNodeFlags::NonNeg);
2989}
2990
2991 static SDValue expandFSH64(SDValue A, SDValue B, SDValue ShiftAmount, SDLoc DL,
2992 unsigned Opcode, SelectionDAG &DAG) {
2993 assert(A.getValueType() == MVT::i64 && B.getValueType() == MVT::i64);
2994
2995 const auto *AmtConst = dyn_cast<ConstantSDNode>(ShiftAmount);
2996 if (!AmtConst)
2997 return SDValue();
2998 const auto Amt = AmtConst->getZExtValue() & 63;
2999
3000 SDValue UnpackA =
3001 DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, A);
3002 SDValue UnpackB =
3003 DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, B);
3004
3005 // Arch is little endian: 0 = low bits, 1 = high bits
3006 SDValue ALo = UnpackA.getValue(0);
3007 SDValue AHi = UnpackA.getValue(1);
3008 SDValue BLo = UnpackB.getValue(0);
3009 SDValue BHi = UnpackB.getValue(1);
3010
3011 // The bitfield consists of { AHi : ALo : BHi : BLo }
3012 //
3013 // * FSHL, Amt < 32 - The window will contain { AHi : ALo : BHi }
3014 // * FSHL, Amt >= 32 - The window will contain { ALo : BHi : BLo }
3015 // * FSHR, Amt < 32 - The window will contain { ALo : BHi : BLo }
3016 // * FSHR, Amt >= 32 - The window will contain { AHi : ALo : BHi }
3017 //
3018 // Note that Amt = 0 and Amt = 32 are special cases where 32-bit funnel shifts
3019 // are not needed at all. Amt = 0 is a no-op producing either A or B depending
3020 // on the direction. Amt = 32 can be implemented by a packing and unpacking
3021 // move to select and arrange the 32bit values. For simplicity, these cases
3022 // are not handled here explicitly and instead we rely on DAGCombiner to
3023 // remove the no-op funnel shifts we insert.
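// For example, fshl(A, B, 40) selects the window { ALo : BHi : BLo } and
// shifts within it by 40 & 31 = 8, giving RHi = fshl(ALo, BHi, 8) and
// RLo = fshl(BHi, BLo, 8).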
3024 auto [High, Mid, Low] = ((Opcode == ISD::FSHL) == (Amt < 32))
3025 ? std::make_tuple(AHi, ALo, BHi)
3026 : std::make_tuple(ALo, BHi, BLo);
3027
3028 SDValue NewAmt = DAG.getConstant(Amt & 31, DL, MVT::i32);
3029 SDValue RHi = DAG.getNode(Opcode, DL, MVT::i32, {High, Mid, NewAmt});
3030 SDValue RLo = DAG.getNode(Opcode, DL, MVT::i32, {Mid, Low, NewAmt});
3031
3032 return DAG.getNode(NVPTXISD::BUILD_VECTOR, DL, MVT::i64, {RLo, RHi});
3033}
3034
3035 static SDValue lowerFSH(SDValue Op, SelectionDAG &DAG) {
3036 return expandFSH64(Op->getOperand(0), Op->getOperand(1), Op->getOperand(2),
3037 SDLoc(Op), Op->getOpcode(), DAG);
3038}
3039
3040 static SDValue lowerROT(SDValue Op, SelectionDAG &DAG) {
3041 unsigned Opcode = Op->getOpcode() == ISD::ROTL ? ISD::FSHL : ISD::FSHR;
3042 return expandFSH64(Op->getOperand(0), Op->getOperand(0), Op->getOperand(1),
3043 SDLoc(Op), Opcode, DAG);
3044}
3045
3046 static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG) {
3047 // Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
3048 // i.e. "poor man's fmod()". When y is infinite, x is returned. This matches
3049 // the semantics of LLVM's frem.
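// For example, frem(5.5, 2.0) = 5.5 - ftrunc(2.75) * 2.0 = 5.5 - 4.0 = 1.5.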
3050 SDLoc DL(Op);
3051 SDValue X = Op->getOperand(0);
3052 SDValue Y = Op->getOperand(1);
3053 EVT Ty = Op.getValueType();
3054 SDNodeFlags Flags = Op->getFlags();
3055
3056 SDValue Div = DAG.getNode(ISD::FDIV, DL, Ty, X, Y, Flags);
3057 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, Ty, Div, Flags);
3058 SDValue Mul = DAG.getNode(ISD::FMUL, DL, Ty, Trunc, Y,
3060 SDValue Sub = DAG.getNode(ISD::FSUB, DL, Ty, X, Mul,
3062
3063 if (Flags.hasNoInfs())
3064 return Sub;
3065
3066 // If Y is infinite, return X
3067 SDValue AbsY = DAG.getNode(ISD::FABS, DL, Ty, Y);
3068 SDValue Inf =
3069 DAG.getConstantFP(APFloat::getInf(Ty.getFltSemantics()), DL, Ty);
3070 SDValue IsInf = DAG.getSetCC(DL, MVT::i1, AbsY, Inf, ISD::SETEQ);
3071 return DAG.getSelect(DL, Ty, IsInf, X, Sub);
3072}
3073
3074 static SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) {
3075 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
3076
3077 SDValue Cond = Op->getOperand(0);
3078 SDValue TrueVal = Op->getOperand(1);
3079 SDValue FalseVal = Op->getOperand(2);
3080 SDLoc DL(Op);
3081
3082 // If both operands are truncated, we push the select through the truncates.
3083 if (TrueVal.getOpcode() == ISD::TRUNCATE &&
3084 FalseVal.getOpcode() == ISD::TRUNCATE) {
3085 TrueVal = TrueVal.getOperand(0);
3086 FalseVal = FalseVal.getOperand(0);
3087
3088 EVT VT = TrueVal.getSimpleValueType().bitsLE(FalseVal.getSimpleValueType())
3089 ? TrueVal.getValueType()
3090 : FalseVal.getValueType();
3091 TrueVal = DAG.getAnyExtOrTrunc(TrueVal, DL, VT);
3092 FalseVal = DAG.getAnyExtOrTrunc(FalseVal, DL, VT);
3093 SDValue Select = DAG.getSelect(DL, VT, Cond, TrueVal, FalseVal);
3094 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
3095 }
3096
3097 // Otherwise, expand the select into a series of logical operations. These
3098 // often can be folded into other operations either by us or ptxas.
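// For i1 values: select(c, t, f) == (c & t) | (!c & f).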
3099 TrueVal = DAG.getFreeze(TrueVal);
3100 FalseVal = DAG.getFreeze(FalseVal);
3101 SDValue And1 = DAG.getNode(ISD::AND, DL, MVT::i1, Cond, TrueVal);
3102 SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
3103 SDValue And2 = DAG.getNode(ISD::AND, DL, MVT::i1, NotCond, FalseVal);
3104 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i1, And1, And2);
3105 return Or;
3106}
3107
3108SDValue
3109 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
3110 switch (Op.getOpcode()) {
3111 case ISD::RETURNADDR:
3112 return SDValue();
3113 case ISD::FRAMEADDR:
3114 return SDValue();
3115 case ISD::ADDRSPACECAST:
3116 return LowerADDRSPACECAST(Op, DAG);
3117 case ISD::INTRINSIC_W_CHAIN:
3118 return lowerIntrinsicWChain(Op, DAG);
3119 case ISD::INTRINSIC_WO_CHAIN:
3120 return lowerIntrinsicWOChain(Op, DAG);
3121 case ISD::INTRINSIC_VOID:
3122 return lowerIntrinsicVoid(Op, DAG);
3123 case ISD::BUILD_VECTOR:
3124 return LowerBUILD_VECTOR(Op, DAG);
3125 case ISD::BITCAST:
3126 return LowerBITCAST(Op, DAG);
3127 case ISD::EXTRACT_SUBVECTOR:
3128 return Op;
3129 case ISD::EXTRACT_VECTOR_ELT:
3130 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
3131 case ISD::INSERT_VECTOR_ELT:
3132 return LowerINSERT_VECTOR_ELT(Op, DAG);
3133 case ISD::VECTOR_SHUFFLE:
3134 return LowerVECTOR_SHUFFLE(Op, DAG);
3135 case ISD::CONCAT_VECTORS:
3136 return LowerCONCAT_VECTORS(Op, DAG);
3137 case ISD::VECREDUCE_FMAX:
3138 case ISD::VECREDUCE_FMIN:
3139 case ISD::VECREDUCE_FMAXIMUM:
3140 case ISD::VECREDUCE_FMINIMUM:
3141 return LowerVECREDUCE(Op, DAG);
3142 case ISD::STORE:
3143 return LowerSTORE(Op, DAG);
3144 case ISD::LOAD:
3145 return LowerLOAD(Op, DAG);
3146 case ISD::SHL_PARTS:
3147 return LowerShiftLeftParts(Op, DAG);
3148 case ISD::SRA_PARTS:
3149 case ISD::SRL_PARTS:
3150 return LowerShiftRightParts(Op, DAG);
3151 case ISD::SELECT:
3152 return lowerSELECT(Op, DAG);
3153 case ISD::FROUND:
3154 return LowerFROUND(Op, DAG);
3155 case ISD::FCOPYSIGN:
3156 return LowerFCOPYSIGN(Op, DAG);
3157 case ISD::SINT_TO_FP:
3158 case ISD::UINT_TO_FP:
3159 return LowerINT_TO_FP(Op, DAG);
3160 case ISD::FP_TO_SINT:
3161 case ISD::FP_TO_UINT:
3162 return LowerFP_TO_INT(Op, DAG);
3163 case ISD::FP_ROUND:
3164 return LowerFP_ROUND(Op, DAG);
3165 case ISD::FP_EXTEND:
3166 return LowerFP_EXTEND(Op, DAG);
3167 case ISD::BR_JT:
3168 return LowerBR_JT(Op, DAG);
3169 case ISD::VAARG:
3170 return LowerVAARG(Op, DAG);
3171 case ISD::VASTART:
3172 return LowerVASTART(Op, DAG);
3173 case ISD::FSHL:
3174 case ISD::FSHR:
3175 return lowerFSH(Op, DAG);
3176 case ISD::ROTL:
3177 case ISD::ROTR:
3178 return lowerROT(Op, DAG);
3179 case ISD::ABS:
3180 case ISD::SMIN:
3181 case ISD::SMAX:
3182 case ISD::UMIN:
3183 case ISD::UMAX:
3184 case ISD::ADD:
3185 case ISD::SUB:
3186 case ISD::MUL:
3187 case ISD::SHL:
3188 case ISD::SREM:
3189 case ISD::UREM:
3190 return LowerVectorArith(Op, DAG);
3191 case ISD::DYNAMIC_STACKALLOC:
3192 return LowerDYNAMIC_STACKALLOC(Op, DAG);
3193 case ISD::STACKRESTORE:
3194 return LowerSTACKRESTORE(Op, DAG);
3195 case ISD::STACKSAVE:
3196 return LowerSTACKSAVE(Op, DAG);
3197 case ISD::CopyToReg:
3198 return LowerCopyToReg_128(Op, DAG);
3199 case ISD::FADD:
3200 case ISD::FSUB:
3201 case ISD::FMUL:
3202 // Used only for bf16 on SM80, where we select fma for non-ftz operation
3203 return PromoteBinOpIfF32FTZ(Op, DAG);
3204 case ISD::CTPOP:
3205 case ISD::CTLZ:
3206 return lowerCTLZCTPOP(Op, DAG);
3207 case ISD::FREM:
3208 return lowerFREM(Op, DAG);
3209
3210 default:
3211 llvm_unreachable("Custom lowering not defined for operation");
3212 }
3213}
3214
3215SDValue NVPTXTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
3216 SDLoc DL(Op);
3217 SDValue Chain = Op.getOperand(0);
3218 const auto *JT = cast<JumpTableSDNode>(Op.getOperand(1));
3219 SDValue Index = Op.getOperand(2);
3220
3221 unsigned JId = JT->getIndex();
3222 MachineJumpTableInfo *MJTI = DAG.getMachineFunction().getJumpTableInfo();
3223 ArrayRef<MachineBasicBlock *> MBBs = MJTI->getJumpTables()[JId].MBBs;
3224
3225 SDValue IdV = DAG.getConstant(JId, DL, MVT::i32);
3226
3227 // Generate BrxStart node
3228 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
3229 Chain = DAG.getNode(NVPTXISD::BrxStart, DL, VTs, Chain, IdV);
3230
3231 // Generate BrxItem nodes
3232 assert(!MBBs.empty());
3233 for (MachineBasicBlock *MBB : MBBs.drop_back())
3234 Chain = DAG.getNode(NVPTXISD::BrxItem, DL, VTs, Chain.getValue(0),
3235 DAG.getBasicBlock(MBB), Chain.getValue(1));
3236
3237 // Generate BrxEnd nodes
3238 SDValue EndOps[] = {Chain.getValue(0), DAG.getBasicBlock(MBBs.back()), Index,
3239 IdV, Chain.getValue(1)};
3240 SDValue BrxEnd = DAG.getNode(NVPTXISD::BrxEnd, DL, VTs, EndOps);
3241
3242 return BrxEnd;
3243}
3244
3245// This will prevent AsmPrinter from trying to print the jump tables itself.
3246 unsigned NVPTXTargetLowering::getJumpTableEncoding() const {
3247 return MachineJumpTableInfo::EK_Inline;
3248}
3249
3250SDValue NVPTXTargetLowering::LowerADDRSPACECAST(SDValue Op,
3251 SelectionDAG &DAG) const {
3252 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
3253 unsigned SrcAS = N->getSrcAddressSpace();
3254 unsigned DestAS = N->getDestAddressSpace();
3255 if (SrcAS != llvm::ADDRESS_SPACE_GENERIC &&
3256 DestAS != llvm::ADDRESS_SPACE_GENERIC) {
3257 // Shared and SharedCluster can be converted to each other through generic
3258 // space
3259 if ((SrcAS == llvm::ADDRESS_SPACE_SHARED &&
3260 DestAS == llvm::ADDRESS_SPACE_SHARED_CLUSTER) ||
3261 (SrcAS == llvm::ADDRESS_SPACE_SHARED_CLUSTER &&
3262 DestAS == llvm::ADDRESS_SPACE_SHARED)) {
3263 SDLoc DL(Op.getNode());
3264 const MVT GenerictVT =
3265 getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_GENERIC);
3266 SDValue GenericConversion = DAG.getAddrSpaceCast(
3267 DL, GenerictVT, Op.getOperand(0), SrcAS, ADDRESS_SPACE_GENERIC);
3268 SDValue SharedClusterConversion =
3269 DAG.getAddrSpaceCast(DL, Op.getValueType(), GenericConversion,
3270 ADDRESS_SPACE_GENERIC, DestAS);
3271 return SharedClusterConversion;
3272 }
3273
3274 return DAG.getUNDEF(Op.getValueType());
3275 }
3276
3277 return Op;
3278}
3279
3280// This function is almost a copy of SelectionDAG::expandVAArg().
3281 // The only difference is that this one produces loads from the local address space.
3282SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
3283 const TargetLowering *TLI = STI.getTargetLowering();
3284 SDLoc DL(Op);
3285
3286 SDNode *Node = Op.getNode();
3287 const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
3288 EVT VT = Node->getValueType(0);
3289 auto *Ty = VT.getTypeForEVT(*DAG.getContext());
3290 SDValue Tmp1 = Node->getOperand(0);
3291 SDValue Tmp2 = Node->getOperand(1);
3292 const MaybeAlign MA(Node->getConstantOperandVal(3));
3293
3294 SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL,
3295 Tmp1, Tmp2, MachinePointerInfo(V));
3296 SDValue VAList = VAListLoad;
3297
3298 if (MA && *MA > TLI->getMinStackArgumentAlignment()) {
3299 VAList = DAG.getNode(
3300 ISD::ADD, DL, VAList.getValueType(), VAList,
3301 DAG.getConstant(MA->value() - 1, DL, VAList.getValueType()));
3302
3303 VAList = DAG.getNode(ISD::AND, DL, VAList.getValueType(), VAList,
3304 DAG.getSignedConstant(-(int64_t)MA->value(), DL,
3305 VAList.getValueType()));
3306 }
3307
3308 // Increment the pointer, VAList, to the next vaarg
3309 Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
3311 DL, VAList.getValueType()));
3312
3313 // Store the incremented VAList to the legalized pointer
3314 Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2,
3315 MachinePointerInfo(V));
3316
3317 const Value *SrcV = Constant::getNullValue(
3319
3320 // Load the actual argument out of the pointer VAList
3321 return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV));
3322}
3323
3324SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
3325 const TargetLowering *TLI = STI.getTargetLowering();
3326 SDLoc DL(Op);
3327 EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout());
3328
3329 // Store the address of unsized array <function>_vararg[] in the ap object.
3330 SDValue VAReg = getParamSymbol(DAG, /* vararg */ -1, PtrVT);
3331
3332 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
3333 return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1),
3334 MachinePointerInfo(SV));
3335}
3336
3337/// replaceLoadVector - Convert vector loads into multi-output scalar loads.
3338static std::optional<std::pair<SDValue, SDValue>>
3339 replaceLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI) {
3340 LoadSDNode *LD = cast<LoadSDNode>(N);
3341 const EVT ResVT = LD->getValueType(0);
3342 const EVT MemVT = LD->getMemoryVT();
3343
3344 // If we're doing sign/zero extension as part of the load, avoid lowering to
3345 // a LoadV node. TODO: consider relaxing this restriction.
3346 if (ResVT != MemVT)
3347 return std::nullopt;
3348
3349 const auto NumEltsAndEltVT =
3350 getVectorLoweringShape(ResVT, STI, LD->getAddressSpace());
3351 if (!NumEltsAndEltVT)
3352 return std::nullopt;
3353 const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
3354
3355 Align Alignment = LD->getAlign();
3356 const auto &TD = DAG.getDataLayout();
3357 Align PrefAlign = TD.getPrefTypeAlign(MemVT.getTypeForEVT(*DAG.getContext()));
3358 if (Alignment < PrefAlign) {
3359 // This load is not sufficiently aligned, so bail out and let this vector
3360 // load be scalarized. Note that we may still be able to emit smaller
3361 // vector loads. For example, if we are loading a <4 x float> with an
3362 // alignment of 8, this check will fail but the legalizer will try again
3363 // with 2 x <2 x float>, which will succeed with an alignment of 8.
3364 return std::nullopt;
3365 }
3366
3367 // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
3368 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
3369 // loaded type to i16 and propagate the "real" type as the memory type.
3370 const MVT LoadEltVT = (EltVT.getSizeInBits() < 16) ? MVT::i16 : EltVT;
3371
3372 unsigned Opcode;
3373 switch (NumElts) {
3374 default:
3375 return std::nullopt;
3376 case 2:
3377 Opcode = NVPTXISD::LoadV2;
3378 break;
3379 case 4:
3380 Opcode = NVPTXISD::LoadV4;
3381 break;
3382 case 8:
3383 Opcode = NVPTXISD::LoadV8;
3384 break;
3385 }
3386 auto ListVTs = SmallVector<EVT, 9>(NumElts, LoadEltVT);
3387 ListVTs.push_back(MVT::Other);
3388 SDVTList LdResVTs = DAG.getVTList(ListVTs);
3389
3390 SDLoc DL(LD);
3391
3392 // Copy regular operands
3393 SmallVector<SDValue, 8> OtherOps(LD->ops());
3394
3395 // The select routine does not have access to the LoadSDNode instance, so
3396 // pass along the extension information
3397 OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
3398
3399 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, MemVT,
3400 LD->getMemOperand());
3401
3402 SmallVector<SDValue> ScalarRes;
3403 if (EltVT.isVector()) {
3405 assert(NumElts * EltVT.getVectorNumElements() ==
3406 ResVT.getVectorNumElements());
3407 // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
3408 // into individual elements.
3409 for (const unsigned I : llvm::seq(NumElts)) {
3410 SDValue SubVector = NewLD.getValue(I);
3411 DAG.ExtractVectorElements(SubVector, ScalarRes);
3412 }
3413 } else {
3414 for (const unsigned I : llvm::seq(NumElts)) {
3415 SDValue Res = NewLD.getValue(I);
3416 if (LoadEltVT != EltVT)
3417 Res = DAG.getNode(ISD::TRUNCATE, DL, EltVT, Res);
3418 ScalarRes.push_back(Res);
3419 }
3420 }
3421
3422 SDValue LoadChain = NewLD.getValue(NumElts);
3423
3424 const MVT BuildVecVT =
3425 MVT::getVectorVT(EltVT.getScalarType(), ScalarRes.size());
3426 SDValue BuildVec = DAG.getBuildVector(BuildVecVT, DL, ScalarRes);
3427 SDValue LoadValue = DAG.getBitcast(ResVT, BuildVec);
3428
3429 return {{LoadValue, LoadChain}};
3430}
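// Editorial illustration (not part of this file; names are made up): the two
// decisions above are (1) bail out when the load is under-aligned for the full
// vector and (2) map the element count onto a LoadV2/LoadV4/LoadV8-style
// opcode. A standalone sketch of that selection logic:
#include <cstdio>
#include <optional>

enum class VecLoadKind { V2, V4, V8 };

// Returns the vector-load flavor to use, or nullopt to fall back to
// scalarization (or a retry with smaller vectors by the legalizer).
static std::optional<VecLoadKind>
pickVectorLoad(unsigned NumElts, unsigned AlignBytes, unsigned PrefAlignBytes) {
  if (AlignBytes < PrefAlignBytes)
    return std::nullopt;
  switch (NumElts) {
  case 2:
    return VecLoadKind::V2;
  case 4:
    return VecLoadKind::V4;
  case 8:
    return VecLoadKind::V8;
  default:
    return std::nullopt;
  }
}

int main() {
  // A <4 x float> load (preferred alignment 16) with only 8-byte alignment
  // gets no vector load; at 16-byte alignment a LoadV4-style opcode is used.
  std::printf("%d %d\n", pickVectorLoad(4, 8, 16).has_value(),
              pickVectorLoad(4, 16, 16).has_value()); // prints "0 1"
  return 0;
}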
3431
3432static void replaceLoadVector(SDNode *N, SelectionDAG &DAG,
3433 SmallVectorImpl<SDValue> &Results,
3434 const NVPTXSubtarget &STI) {
3435 if (auto Res = replaceLoadVector(N, DAG, STI))
3436 Results.append({Res->first, Res->second});
3437}
3438
3440 const NVPTXSubtarget &STI) {
3441 if (auto Res = replaceLoadVector(N, DAG, STI))
3442 return DAG.getMergeValues({Res->first, Res->second}, SDLoc(N));
3443 return SDValue();
3444}
3445
3446// v = ld i1* addr
3447// =>
3448// v1 = ld i8* addr (-> i16)
3449// v = trunc i16 to i1
3450static SDValue lowerLOADi1(LoadSDNode *LD, SelectionDAG &DAG) {
3451 SDLoc dl(LD);
3452 assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
3453 assert(LD->getValueType(0) == MVT::i1 && "Custom lowering for i1 load only");
3454 SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(),
3455 LD->getBasePtr(), LD->getPointerInfo(),
3456 MVT::i8, LD->getAlign(),
3457 LD->getMemOperand()->getFlags());
3458 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
3459 // The legalizer (the caller) is expecting two values from the legalized
3460 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
3461 // in LegalizeDAG.cpp which also uses MergeValues.
3462 return DAG.getMergeValues({result, LD->getChain()}, dl);
3463}
3464
3465SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
3466 LoadSDNode *LD = cast<LoadSDNode>(Op);
3467
3468 if (Op.getValueType() == MVT::i1)
3469 return lowerLOADi1(LD, DAG);
3470
3471 // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
3472 // how they'll be lowered in ISel anyway, and by doing this a little earlier
3473 // we allow for more DAG combine opportunities.
3474 if (LD->getExtensionType() == ISD::EXTLOAD) {
3475 assert(LD->getValueType(0).isInteger() && LD->getMemoryVT().isInteger() &&
3476 "Unexpected fpext-load");
3477 return DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Op), Op.getValueType(),
3478 LD->getChain(), LD->getBasePtr(), LD->getMemoryVT(),
3479 LD->getMemOperand());
3480 }
3481
3482 llvm_unreachable("Unexpected custom lowering for load");
3483}
3484
3485static SDValue lowerSTOREVector(SDValue Op, SelectionDAG &DAG,
3486 const NVPTXSubtarget &STI) {
3487 MemSDNode *N = cast<MemSDNode>(Op.getNode());
3488 SDValue Val = N->getOperand(1);
3489 SDLoc DL(N);
3490 const EVT ValVT = Val.getValueType();
3491 const EVT MemVT = N->getMemoryVT();
3492
3493 // If we're truncating as part of the store, avoid lowering to a StoreV node.
3494 // TODO: consider relaxing this restriction.
3495 if (ValVT != MemVT)
3496 return SDValue();
3497
3498 const auto NumEltsAndEltVT =
3499 getVectorLoweringShape(ValVT, STI, N->getAddressSpace());
3500 if (!NumEltsAndEltVT)
3501 return SDValue();
3502 const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
3503
3504 const DataLayout &TD = DAG.getDataLayout();
3505
3506 Align Alignment = N->getAlign();
3507 Align PrefAlign = TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
3508 if (Alignment < PrefAlign) {
3509 // This store is not sufficiently aligned, so bail out and let this vector
3510 // store be scalarized. Note that we may still be able to emit smaller
3511 // vector stores. For example, if we are storing a <4 x float> with an
3512 // alignment of 8, this check will fail but the legalizer will try again
3513 // with 2 x <2 x float>, which will succeed with an alignment of 8.
3514 return SDValue();
3515 }
3516
3517 unsigned Opcode;
3518 switch (NumElts) {
3519 default:
3520 return SDValue();
3521 case 2:
3522 Opcode = NVPTXISD::StoreV2;
3523 break;
3524 case 4:
3525 Opcode = NVPTXISD::StoreV4;
3526 break;
3527 case 8:
3528 Opcode = NVPTXISD::StoreV8;
3529 break;
3530 }
3531
3532 SmallVector<SDValue, 8> Ops;
3533
3534 // First is the chain
3535 Ops.push_back(N->getOperand(0));
3536
3537 // Then the split values
3538 if (EltVT.isVector()) {
3540 assert(NumElts * EltVT.getVectorNumElements() ==
3541 ValVT.getVectorNumElements());
3542 // Combine individual elements into v2[i,f,bf]16/v4i8 subvectors to be
3543 // stored as b32s
3544 const unsigned NumEltsPerSubVector = EltVT.getVectorNumElements();
3545 for (const unsigned I : llvm::seq(NumElts)) {
3546 SmallVector<SDValue, 4> SubVectorElts;
3547 DAG.ExtractVectorElements(Val, SubVectorElts, I * NumEltsPerSubVector,
3548 NumEltsPerSubVector);
3549 Ops.push_back(DAG.getBuildVector(EltVT, DL, SubVectorElts));
3550 }
3551 } else {
3552 SDValue V = DAG.getBitcast(MVT::getVectorVT(EltVT, NumElts), Val);
3553 for (const unsigned I : llvm::seq(NumElts)) {
3554 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, V,
3555 DAG.getIntPtrConstant(I, DL));
3556
3557 // Since StoreV2 is a target node, we cannot rely on DAG type
3558 // legalization. Therefore, we must ensure the type is legal. For i1 and
3559 // i8, we set the stored type to i16 and propagate the "real" type as the
3560 // memory type.
3561 if (EltVT.getSizeInBits() < 16)
3562 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
3563 Ops.push_back(ExtVal);
3564 }
3565 }
3566
3567 // Then any remaining arguments
3568 Ops.append(N->op_begin() + 2, N->op_end());
3569
3570 SDValue NewSt =
3571 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
3572 N->getMemoryVT(), N->getMemOperand());
3573
3574 // return DCI.CombineTo(N, NewSt, true);
3575 return NewSt;
3576}
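// Worked example (inferred from the code above, not verified against emitted
// PTX): if getVectorLoweringShape splits a v8f16 value into four v2f16 parts,
// the EltVT.isVector() branch above rebuilds those four v2f16 subvectors and
// they become the value operands of a StoreV4 node, each stored as a 32-bit
// word.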
3577
3578SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
3579 StoreSDNode *Store = cast<StoreSDNode>(Op);
3580 EVT VT = Store->getMemoryVT();
3581
3582 if (VT == MVT::i1)
3583 return LowerSTOREi1(Op, DAG);
3584
3585 // Lower store of any other vector type, including v2f32 as we want to break
3586 // it apart since this is not a widely-supported type.
3587 return lowerSTOREVector(Op, DAG, STI);
3588}
3589
3590// st i1 v, addr
3591// =>
3592 // v1 = zext v to i16
3593// st.u8 i16, addr
3594SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
3595 SDNode *Node = Op.getNode();
3596 SDLoc dl(Node);
3597 StoreSDNode *ST = cast<StoreSDNode>(Node);
3598 SDValue Tmp1 = ST->getChain();
3599 SDValue Tmp2 = ST->getBasePtr();
3600 SDValue Tmp3 = ST->getValue();
3601 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
3602 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
3603 SDValue Result =
3604 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
3605 ST->getAlign(), ST->getMemOperand()->getFlags());
3606 return Result;
3607}
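// Editorial illustration (not part of this file): a byte-level sketch of the
// i1 convention implemented by LowerSTOREi1 and lowerLOADi1 above. The boolean
// is zero-extended and stored as a byte, and a later load widens the byte to
// 16 bits before truncating back to the low bit.
#include <cassert>
#include <cstdint>

int main() {
  bool In = true;
  uint8_t Byte = static_cast<uint8_t>(In); // zero-extend i1 -> i8 for st.u8
  uint16_t Widened = Byte;                 // the i1 load first produces an i16
  bool Out = (Widened & 1) != 0;           // truncate i16 -> i1
  assert(Out == In);
  return 0;
}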
3608
3609SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op,
3610 SelectionDAG &DAG) const {
3611 // Change the CopyToReg to take in two 64-bit operands instead of a 128-bit
3612 // operand so that it can pass the legalization.
3613
3614 assert(Op.getOperand(1).getValueType() == MVT::i128 &&
3615 "Custom lowering for 128-bit CopyToReg only");
3616
3617 SDNode *Node = Op.getNode();
3618 SDLoc DL(Node);
3619
3620 SDValue Cast = DAG.getBitcast(MVT::v2i64, Op->getOperand(2));
3621 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
3622 DAG.getIntPtrConstant(0, DL));
3623 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
3624 DAG.getIntPtrConstant(1, DL));
3625
3626 SmallVector<SDValue, 5> NewOps(Op->getNumOperands() + 1);
3627 SmallVector<EVT, 3> ResultsType(Node->values());
3628
3629 NewOps[0] = Op->getOperand(0); // Chain
3630 NewOps[1] = Op->getOperand(1); // Dst Reg
3631 NewOps[2] = Lo; // Lower 64-bit
3632 NewOps[3] = Hi; // Higher 64-bit
3633 if (Op.getNumOperands() == 4)
3634 NewOps[4] = Op->getOperand(3); // Glue if exists
3635
3636 return DAG.getNode(ISD::CopyToReg, DL, ResultsType, NewOps);
3637}
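// Editorial illustration (not part of this file): the split above views the
// i128 operand as v2i64 and extracts element 0 (low half) and element 1 (high
// half), matching the little-endian layout NVPTX uses. A standalone sketch
// using the Clang/GCC __int128 extension in place of SDValues:
#include <cassert>
#include <cstdint>

int main() {
  unsigned __int128 V =
      (static_cast<unsigned __int128>(0x0123456789abcdefULL) << 64) |
      0xfedcba9876543210ULL;
  uint64_t Lo = static_cast<uint64_t>(V);       // element 0 of the v2i64 view
  uint64_t Hi = static_cast<uint64_t>(V >> 64); // element 1 of the v2i64 view
  assert(Lo == 0xfedcba9876543210ULL && Hi == 0x0123456789abcdefULL);
  return 0;
}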
3638
3639unsigned NVPTXTargetLowering::getNumRegisters(
3640 LLVMContext &Context, EVT VT,
3641 std::optional<MVT> RegisterVT = std::nullopt) const {
3642 if (VT == MVT::i128 && RegisterVT == MVT::i128)
3643 return 1;
3644 return TargetLoweringBase::getNumRegisters(Context, VT, RegisterVT);
3645}
3646
3647bool NVPTXTargetLowering::splitValueIntoRegisterParts(
3648 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
3649 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
3650 if (Val.getValueType() == MVT::i128 && NumParts == 1) {
3651 Parts[0] = Val;
3652 return true;
3653 }
3654 return false;
3655}
3656
3657 // This creates a target external symbol for a function parameter.
3658 // The symbol's name is composed from the parameter's index and the function name.
3659 // A negative index corresponds to the special parameter (an unsized array) used
3660 // for passing variable arguments.
3661SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int I,
3662 EVT T) const {
3663 StringRef SavedStr = nvTM->getStrPool().save(
3665 return DAG.getExternalSymbol(SavedStr.data(), T);
3666}
3667
3668SDValue NVPTXTargetLowering::getCallParamSymbol(SelectionDAG &DAG, int I,
3669 EVT T) const {
3670 const StringRef SavedStr = nvTM->getStrPool().save("param" + Twine(I));
3671 return DAG.getExternalSymbol(SavedStr.data(), T);
3672}
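// Editorial note (inferred from the surrounding code and comments, not from
// emitted PTX): getCallParamSymbol produces names such as "param0" and
// "param1" for outgoing call arguments, while getParamSymbol names incoming
// parameters after the enclosing function, with the negative index reserved
// for the unsized "<function>_vararg" array mentioned in LowerVASTART above.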
3673
3674SDValue NVPTXTargetLowering::LowerFormalArguments(
3675 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3676 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3677 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3678 const DataLayout &DL = DAG.getDataLayout();
3679 LLVMContext &Ctx = *DAG.getContext();
3680 auto PtrVT = getPointerTy(DAG.getDataLayout());
3681
3682 const Function &F = DAG.getMachineFunction().getFunction();
3683
3684 SDValue Root = DAG.getRoot();
3685 SmallVector<SDValue, 16> OutChains;
3686
3687 // The number of IR arguments (F.args().size()) and Ins.size() need not match.
3688 // Ins.size() will be larger
3689 // * if there is an aggregate argument with multiple fields (each field
3690 // showing up separately in Ins)
3691 // * if there is a vector argument with more than typical vector-length
3692 // elements (generally if more than 4) where each vector element is
3693 // individually present in Ins.
3694 // So a different index should be used for indexing into Ins.
3695 // See similar issue in LowerCall.
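// For example, an aggregate IR argument such as {i32, float} contributes two
// ISD::InputArg entries to Ins, both carrying the same OrigArgIndex, which is
// why the loop below groups the Ins entries belonging to each argument with
// take_while instead of assuming a 1:1 mapping.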
3696
3697 auto AllIns = ArrayRef(Ins);
3698 for (const auto &Arg : F.args()) {
3699 const auto ArgIns = AllIns.take_while(
3700 [&](auto I) { return I.OrigArgIndex == Arg.getArgNo(); });
3701 AllIns = AllIns.drop_front(ArgIns.size());
3702
3703 Type *Ty = Arg.getType();
3704
3705 if (ArgIns.empty())
3706 report_fatal_error("Empty parameter types are not supported");
3707
3708 if (Arg.use_empty()) {
3709 // argument is dead
3710 for (const auto &In : ArgIns) {
3711 assert(!In.Used && "Arg.use_empty() is true but Arg is used?");
3712 InVals.push_back(DAG.getUNDEF(In.VT));
3713 }
3714 continue;
3715 }
3716
3717 SDValue ArgSymbol = getParamSymbol(DAG, Arg.getArgNo(), PtrVT);
3718
3719 // In the following cases, assign a node order of "i+1"
3720 // to newly created nodes. The SDNodes for params have to
3721 // appear in the same order as their order of appearance
3722 // in the original function. "i+1" holds that order.
3723 if (Arg.hasByValAttr()) {
3724 // Param has ByVal attribute
3725 // Return MoveParam(param symbol).
3726 // Ideally, the param symbol could be returned directly,
3727 // but when the SDNode builder decides to use it in a CopyToReg(),
3728 // machine-instruction creation fails because a TargetExternalSymbol
3729 // (not yet lowered) is target dependent, and CopyToReg assumes
3730 // the source is already lowered.
3731 assert(ArgIns.size() == 1 && "ByVal argument must be a pointer");
3732 const auto &ByvalIn = ArgIns[0];
3733 assert(getValueType(DL, Ty) == ByvalIn.VT &&
3734 "Ins type did not match function type");
3735 assert(ByvalIn.VT == PtrVT && "ByVal argument must be a pointer");
3736
3737 SDValue P;
3738 if (isKernelFunction(F)) {
3739 P = ArgSymbol;
3740 P.getNode()->setIROrder(Arg.getArgNo() + 1);
3741 } else {
3742 P = DAG.getNode(NVPTXISD::MoveParam, dl, ByvalIn.VT, ArgSymbol);
3743 P.getNode()->setIROrder(Arg.getArgNo() + 1);
3744 P = DAG.getAddrSpaceCast(dl, ByvalIn.VT, P, ADDRESS_SPACE_LOCAL,
3745 ADDRESS_SPACE_GENERIC);
3746 }
3747 InVals.push_back(P);
3748 } else {
3749 SmallVector<EVT, 16> VTs;
3750 SmallVector<uint64_t, 16> Offsets;
3751 ComputePTXValueVTs(*this, DL, Ctx, CallConv, Ty, VTs, Offsets);
3752 assert(VTs.size() == ArgIns.size() && "Size mismatch");
3753 assert(VTs.size() == Offsets.size() && "Size mismatch");
3754
3755 const Align ArgAlign = getFunctionArgumentAlignment(
3756 &F, Ty, Arg.getArgNo() + AttributeList::FirstArgIndex, DL);
3757
3758 unsigned I = 0;
3759 const auto VI = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
3760 for (const unsigned NumElts : VI) {
3761 // i1 is loaded/stored as i8
3762 const EVT LoadVT = VTs[I] == MVT::i1 ? MVT::i8 : VTs[I];
3763 const EVT VecVT = getVectorizedVT(LoadVT, NumElts, Ctx);
3764
3765 SDValue VecAddr = DAG.getObjectPtrOffset(
3766 dl, ArgSymbol, TypeSize::getFixed(Offsets[I]));
3767
3768 const Align PartAlign = commonAlignment(ArgAlign, Offsets[I]);
3769 SDValue P =
3770 DAG.getLoad(VecVT, dl, Root, VecAddr,
3771 MachinePointerInfo(ADDRESS_SPACE_PARAM), PartAlign,
3772 MachineMemOperand::MODereferenceable |
3773 MachineMemOperand::MOInvariant);
3774 P.getNode()->setIROrder(Arg.getArgNo() + 1);
3775 for (const unsigned J : llvm::seq(NumElts)) {
3776 SDValue Elt = getExtractVectorizedValue(P, J, LoadVT, dl, DAG);
3777
3778 Elt = correctParamType(Elt, ArgIns[I + J].VT, ArgIns[I + J].Flags,
3779 DAG, dl);
3780 InVals.push_back(Elt);
3781 }
3782 I += NumElts;
3783 }
3784 }
3785 }
3786
3787 if (!OutChains.empty())
3788 DAG.setRoot(DAG.getTokenFactor(dl, OutChains));
3789
3790 return Chain;
3791}
3792
3793SDValue
3794NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3795 bool isVarArg,
3796 const SmallVectorImpl<ISD::OutputArg> &Outs,
3797 const SmallVectorImpl<SDValue> &OutVals,
3798 const SDLoc &dl, SelectionDAG &DAG) const {
3799 const Function &F = DAG.getMachineFunction().getFunction();
3800 Type *RetTy = F.getReturnType();
3801
3802 if (RetTy->isVoidTy()) {
3803 assert(OutVals.empty() && Outs.empty() && "Return value expected for void");
3804 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
3805 }
3806
3807 const DataLayout &DL = DAG.getDataLayout();
3808 LLVMContext &Ctx = *DAG.getContext();
3809
3810 const SDValue RetSymbol = DAG.getExternalSymbol("func_retval0", MVT::i32);
3811 const auto RetAlign = getFunctionParamOptimizedAlign(&F, RetTy, DL);
3812
3813 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
3814 // 32-bits are sign extended or zero extended, depending on whether
3815 // they are signed or unsigned types.
3816 const bool ExtendIntegerRetVal =
3817 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
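// E.g., under this rule an i1, i8, or i16 scalar integer return value is
// widened and written to func_retval0 as a full 32-bit value; other return
// types keep their natural store type (with any i1 component stored as i8),
// as selected by GetRetVal below.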
3818
3819 SmallVector<EVT, 16> VTs;
3820 SmallVector<uint64_t, 16> Offsets;
3821 ComputePTXValueVTs(*this, DL, Ctx, CallConv, RetTy, VTs, Offsets);
3822 assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
3823
3824 const auto GetRetVal = [&](unsigned I) -> SDValue {
3825 SDValue RetVal = OutVals[I];
3826 assert(promoteScalarIntegerPTX(RetVal.getValueType()) ==
3827 RetVal.getValueType() &&
3828 "OutVal type should always be legal");
3829
3830 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
3831 const EVT StoreVT =
3832 ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
3833 return correctParamType(RetVal, StoreVT, Outs[I].Flags, DAG, dl);
3834 };
3835
3836 unsigned I = 0;
3837 const auto VI = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
3838 for (const unsigned NumElts : VI) {
3839 const MaybeAlign CurrentAlign = ExtendIntegerRetVal
3840 ? MaybeAlign(std::nullopt)
3841 : commonAlignment(RetAlign, Offsets[I]);
3842
3843 SDValue Val = getBuildVectorizedValue(
3844 NumElts, dl, DAG, [&](unsigned K) { return GetRetVal(I + K); });
3845
3846 SDValue Ptr =
3847 DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));
3848
3849 Chain = DAG.getStore(Chain, dl, Val, Ptr,
3850 MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign);
3851
3852 I += NumElts;
3853 }
3854
3855 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
3856}
3857
3858void NVPTXTargetLowering::LowerAsmOperandForConstraint(
3859 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
3860 SelectionDAG &DAG) const {
3861 if (Constraint.size() > 1)
3862 return;
3863 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
3864}
3865
3866// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
3867// TgtMemIntrinsic
3868// because we need the information that is only available in the "Value" type
3869// of destination
3870// pointer. In particular, the address space information.
3871bool NVPTXTargetLowering::getTgtMemIntrinsic(
3872 IntrinsicInfo &Info, const CallInst &I,
3873 MachineFunction &MF, unsigned Intrinsic) const {
3874 switch (Intrinsic) {
3875 default:
3876 return false;
3877 case Intrinsic::nvvm_match_all_sync_i32p:
3878 case Intrinsic::nvvm_match_all_sync_i64p:
3879 Info.opc = ISD::INTRINSIC_W_CHAIN;
3880 // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
3881 // in order to model data exchange with other threads, but perform no real
3882 // memory accesses.
3883 Info.memVT = MVT::i1;
3884
3885 // Our result depends on both our and other thread's arguments.
3886 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
3887 return true;
3888 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
3889 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
3890 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
3891 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
3892 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
3893 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
3894 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
3895 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
3896 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
3897 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
3898 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
3899 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
3900 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
3901 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
3902 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
3903 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
3904 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
3905 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
3906 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
3907 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
3908 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
3909 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
3910 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
3911 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
3912 Info.opc = ISD::INTRINSIC_W_CHAIN;
3913 Info.memVT = MVT::v8f16;
3914 Info.ptrVal = I.getArgOperand(0);
3915 Info.offset = 0;
3916 Info.flags = MachineMemOperand::MOLoad;
3917 Info.align = Align(16);
3918 return true;
3919 }
3920 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
3921 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
3922 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
3923 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
3924 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
3925 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
3926 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
3927 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
3928 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
3929 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
3930 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
3931 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
3932 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
3933 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
3934 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
3935 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
3936 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
3937 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
3938 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
3939 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
3940 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
3941 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
3942 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
3943 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
3944 Info.opc = ISD::INTRINSIC_W_CHAIN;
3945 Info.memVT = MVT::v2i32;
3946 Info.ptrVal = I.getArgOperand(0);
3947 Info.offset = 0;
3948 Info.flags = MachineMemOperand::MOLoad;
3949 Info.align = Align(8);
3950 return true;
3951 }
3952
3953 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
3954 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
3955 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
3956 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
3957 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
3958 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
3959 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
3960 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
3961 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
3962 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
3963 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
3964 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
3965 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
3966 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
3967 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
3968 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
3969
3970 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
3971 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
3972 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
3973 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
3974 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
3975 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
3976 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
3977 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
3978 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
3979 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
3980 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
3981 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
3982 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
3983 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
3984 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
3985 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
3986 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
3987 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16:
3988 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8:
3989 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8x16_b4x16_p64:
3990 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8x16_b6x16_p32:
3991 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x4_b8x16_b4x16_p64:
3992 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x4_b8x16_b6x16_p32: {
3993 Info.opc = ISD::INTRINSIC_W_CHAIN;
3994 Info.memVT = MVT::v4i32;
3995 Info.ptrVal = I.getArgOperand(0);
3996 Info.offset = 0;
3997 Info.flags = MachineMemOperand::MOLoad;
3998 Info.align = Align(16);
3999 return true;
4000 }
4001
4002 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
4003 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
4004 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
4005 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
4006 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
4007 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
4008 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
4009 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
4010
4011 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
4012 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
4013 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
4014 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
4015 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
4016 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
4017 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
4018 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
4019 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
4020 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
4021 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
4022 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
4023 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
4024 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
4025 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
4026 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
4027 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
4028 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
4029 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
4030 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
4031 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
4032 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16:
4033 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x1_b8x16_b4x16_p64:
4034 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x1_b8x16_b6x16_p32: {
4035 Info.opc = ISD::INTRINSIC_W_CHAIN;
4036 Info.memVT = MVT::i32;
4037 Info.ptrVal = I.getArgOperand(0);
4038 Info.offset = 0;
4039 Info.flags = MachineMemOperand::MOLoad;
4040 Info.align = Align(4);
4041 return true;
4042 }
4043
4044 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
4045 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
4046 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
4047 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
4048 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
4049 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
4050 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
4051 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
4052 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
4053 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
4054 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
4055 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
4056 Info.opc = ISD::INTRINSIC_W_CHAIN;
4057 Info.memVT = MVT::v4f16;
4058 Info.ptrVal = I.getArgOperand(0);
4059 Info.offset = 0;
4060 Info.flags = MachineMemOperand::MOLoad;
4061 Info.align = Align(16);
4062 return true;
4063 }
4064
4065 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
4066 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
4067 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
4068 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
4069 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
4070 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
4071 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
4072 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
4073 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
4074 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
4075 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
4076 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
4077 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
4078 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
4079 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
4080 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
4081 Info.opc = ISD::INTRINSIC_W_CHAIN;
4082 Info.memVT = MVT::v8f32;
4083 Info.ptrVal = I.getArgOperand(0);
4084 Info.offset = 0;
4085 Info.flags = MachineMemOperand::MOLoad;
4086 Info.align = Align(16);
4087 return true;
4088 }
4089
4090 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
4091 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
4092 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
4093 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
4094
4095 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
4096 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
4097 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
4098 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
4099
4100 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
4101 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
4102 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
4103 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
4104 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
4105 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
4106 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
4107 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
4108 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
4109 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
4110 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
4111 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
4112 Info.opc = ISD::INTRINSIC_W_CHAIN;
4113 Info.memVT = MVT::v8i32;
4114 Info.ptrVal = I.getArgOperand(0);
4115 Info.offset = 0;
4116 Info.flags = MachineMemOperand::MOLoad;
4117 Info.align = Align(16);
4118 return true;
4119 }
4120
4121 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
4122 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
4123 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
4124 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
4125 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
4126 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
4127 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
4128 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
4129 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
4130 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16:
4131 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8:
4132 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8x16_b4x16_p64:
4133 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8x16_b6x16_p32:
4134 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x2_b8x16_b4x16_p64:
4135 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x2_b8x16_b6x16_p32: {
4136 Info.opc = ISD::INTRINSIC_W_CHAIN;
4137 Info.memVT = MVT::v2i32;
4138 Info.ptrVal = I.getArgOperand(0);
4139 Info.offset = 0;
4140 Info.flags = MachineMemOperand::MOLoad;
4141 Info.align = Align(8);
4142 return true;
4143 }
4144
4145 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
4146 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
4147 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
4148 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
4149
4150 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
4151 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
4152 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
4153 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
4154 Info.opc = ISD::INTRINSIC_W_CHAIN;
4155 Info.memVT = MVT::f64;
4156 Info.ptrVal = I.getArgOperand(0);
4157 Info.offset = 0;
4158 Info.flags = MachineMemOperand::MOLoad;
4159 Info.align = Align(8);
4160 return true;
4161 }
4162
4163 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
4164 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
4165 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
4166 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
4167 Info.opc = ISD::INTRINSIC_W_CHAIN;
4168 Info.memVT = MVT::v2f64;
4169 Info.ptrVal = I.getArgOperand(0);
4170 Info.offset = 0;
4171 Info.flags = MachineMemOperand::MOLoad;
4172 Info.align = Align(16);
4173 return true;
4174 }
4175
4176 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
4177 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
4178 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
4179 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
4180 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
4181 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
4182 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
4183 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
4184 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
4185 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
4186 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
4187 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
4188 Info.opc = ISD::INTRINSIC_VOID;
4189 Info.memVT = MVT::v4f16;
4190 Info.ptrVal = I.getArgOperand(0);
4191 Info.offset = 0;
4192 Info.flags = MachineMemOperand::MOStore;
4193 Info.align = Align(16);
4194 return true;
4195 }
4196
4197 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
4198 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
4199 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
4200 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
4201 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
4202 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
4203 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
4204 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
4205 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
4206 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
4207 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
4208 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
4209 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
4210 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
4211 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
4212 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
4213 Info.opc = ISD::INTRINSIC_VOID;
4214 Info.memVT = MVT::v8f32;
4215 Info.ptrVal = I.getArgOperand(0);
4216 Info.offset = 0;
4217 Info.flags = MachineMemOperand::MOStore;
4218 Info.align = Align(16);
4219 return true;
4220 }
4221
4222 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
4223 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
4224 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
4225 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
4226 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
4227 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
4228 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
4229 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
4230 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
4231 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
4232 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
4233 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
4234 Info.opc = ISD::INTRINSIC_VOID;
4235 Info.memVT = MVT::v8i32;
4236 Info.ptrVal = I.getArgOperand(0);
4237 Info.offset = 0;
4238 Info.flags = MachineMemOperand::MOStore;
4239 Info.align = Align(16);
4240 return true;
4241 }
4242
4243 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
4244 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
4245 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
4246 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
4247 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
4248 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
4249 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
4250 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride:
4251 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x2_b16:
4252 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x2_trans_b16:
4253 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x2_trans_b8: {
4254 Info.opc = ISD::INTRINSIC_VOID;
4255 Info.memVT = MVT::v2i32;
4256 Info.ptrVal = I.getArgOperand(0);
4257 Info.offset = 0;
4258 Info.flags = MachineMemOperand::MOStore;
4259 Info.align = Align(8);
4260 return true;
4261 }
4262
4263 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
4264 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
4265 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
4266 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
4267 Info.opc = ISD::INTRINSIC_VOID;
4268 Info.memVT = MVT::v2f64;
4269 Info.ptrVal = I.getArgOperand(0);
4270 Info.offset = 0;
4271 Info.flags = MachineMemOperand::MOStore;
4272 Info.align = Align(16);
4273 return true;
4274 }
4275
4276 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x1_b16:
4277 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x1_trans_b16:
4278 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x1_trans_b8: {
4279 Info.opc = ISD::INTRINSIC_VOID;
4280 Info.memVT = MVT::i32;
4281 Info.ptrVal = I.getArgOperand(0);
4282 Info.offset = 0;
4283 Info.flags = MachineMemOperand::MOStore;
4284 Info.align = Align(4);
4285 return true;
4286 }
4287
4288 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x4_b16:
4289 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x4_trans_b16:
4290 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x4_trans_b8: {
4291 Info.opc = ISD::INTRINSIC_VOID;
4292 Info.memVT = MVT::v4i32;
4293 Info.ptrVal = I.getArgOperand(0);
4294 Info.offset = 0;
4295 Info.flags = MachineMemOperand::MOStore;
4296 Info.align = Align(16);
4297 return true;
4298 }
4299
4300 case Intrinsic::nvvm_atomic_add_gen_f_cta:
4301 case Intrinsic::nvvm_atomic_add_gen_f_sys:
4302 case Intrinsic::nvvm_atomic_add_gen_i_cta:
4303 case Intrinsic::nvvm_atomic_add_gen_i_sys:
4304 case Intrinsic::nvvm_atomic_and_gen_i_cta:
4305 case Intrinsic::nvvm_atomic_and_gen_i_sys:
4306 case Intrinsic::nvvm_atomic_cas_gen_i_cta:
4307 case Intrinsic::nvvm_atomic_cas_gen_i_sys:
4308 case Intrinsic::nvvm_atomic_dec_gen_i_cta:
4309 case Intrinsic::nvvm_atomic_dec_gen_i_sys:
4310 case Intrinsic::nvvm_atomic_inc_gen_i_cta:
4311 case Intrinsic::nvvm_atomic_inc_gen_i_sys:
4312 case Intrinsic::nvvm_atomic_max_gen_i_cta:
4313 case Intrinsic::nvvm_atomic_max_gen_i_sys:
4314 case Intrinsic::nvvm_atomic_min_gen_i_cta:
4315 case Intrinsic::nvvm_atomic_min_gen_i_sys:
4316 case Intrinsic::nvvm_atomic_or_gen_i_cta:
4317 case Intrinsic::nvvm_atomic_or_gen_i_sys:
4318 case Intrinsic::nvvm_atomic_exch_gen_i_cta:
4319 case Intrinsic::nvvm_atomic_exch_gen_i_sys:
4320 case Intrinsic::nvvm_atomic_xor_gen_i_cta:
4321 case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
4322 auto &DL = I.getDataLayout();
4323 Info.opc = ISD::INTRINSIC_W_CHAIN;
4324 Info.memVT = getValueType(DL, I.getType());
4325 Info.ptrVal = I.getArgOperand(0);
4326 Info.offset = 0;
4327 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
4328 Info.align.reset();
4329 return true;
4330 }
4331
4332 case Intrinsic::nvvm_prefetch_tensormap: {
4333 auto &DL = I.getDataLayout();
4334 Info.opc = ISD::INTRINSIC_VOID;
4335 Info.memVT = getPointerTy(DL);
4336 Info.ptrVal = I.getArgOperand(0);
4337 Info.offset = 0;
4338 Info.flags =
4340 Info.align.reset();
4341 return true;
4342 }
4343
4344 case Intrinsic::nvvm_ldu_global_i:
4345 case Intrinsic::nvvm_ldu_global_f:
4346 case Intrinsic::nvvm_ldu_global_p: {
4347 Info.opc = ISD::INTRINSIC_W_CHAIN;
4348 Info.memVT = getValueType(I.getDataLayout(), I.getType());
4349 Info.ptrVal = I.getArgOperand(0);
4350 Info.offset = 0;
4351 Info.flags = MachineMemOperand::MOLoad;
4352 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4353
4354 return true;
4355 }
4356 case Intrinsic::nvvm_tex_1d_v4f32_s32:
4357 case Intrinsic::nvvm_tex_1d_v4f32_f32:
4358 case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
4359 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
4360 case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
4361 case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
4362 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
4363 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
4364 case Intrinsic::nvvm_tex_2d_v4f32_s32:
4365 case Intrinsic::nvvm_tex_2d_v4f32_f32:
4366 case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
4367 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
4368 case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
4369 case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
4370 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
4371 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
4372 case Intrinsic::nvvm_tex_3d_v4f32_s32:
4373 case Intrinsic::nvvm_tex_3d_v4f32_f32:
4374 case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
4375 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
4376 case Intrinsic::nvvm_tex_cube_v4f32_f32:
4377 case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
4378 case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
4379 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
4380 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
4381 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
4382 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
4383 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
4384 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
4385 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
4386 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
4387 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
4388 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
4389 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
4390 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
4391 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
4392 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
4393 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
4394 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
4395 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
4396 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
4397 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
4398 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
4399 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
4400 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
4401 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
4402 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
4403 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
4404 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
4405 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
4406 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
4407 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
4408 case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
4409 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
4410 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
4411 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
4412 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
4413 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
4414 Info.opc = ISD::INTRINSIC_W_CHAIN;
4415 Info.memVT = MVT::v4f32;
4416 Info.ptrVal = nullptr;
4417 Info.offset = 0;
4418 Info.flags = MachineMemOperand::MOLoad;
4419 Info.align = Align(16);
4420 return true;
4421
4422 case Intrinsic::nvvm_tex_1d_v4s32_s32:
4423 case Intrinsic::nvvm_tex_1d_v4s32_f32:
4424 case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
4425 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
4426 case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
4427 case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
4428 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
4429 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
4430 case Intrinsic::nvvm_tex_2d_v4s32_s32:
4431 case Intrinsic::nvvm_tex_2d_v4s32_f32:
4432 case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
4433 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
4434 case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
4435 case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
4436 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
4437 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
4438 case Intrinsic::nvvm_tex_3d_v4s32_s32:
4439 case Intrinsic::nvvm_tex_3d_v4s32_f32:
4440 case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
4441 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
4442 case Intrinsic::nvvm_tex_cube_v4s32_f32:
4443 case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
4444 case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
4445 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
4446 case Intrinsic::nvvm_tex_cube_v4u32_f32:
4447 case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
4448 case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
4449 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
4450 case Intrinsic::nvvm_tex_1d_v4u32_s32:
4451 case Intrinsic::nvvm_tex_1d_v4u32_f32:
4452 case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
4453 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
4454 case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
4455 case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
4456 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
4457 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
4458 case Intrinsic::nvvm_tex_2d_v4u32_s32:
4459 case Intrinsic::nvvm_tex_2d_v4u32_f32:
4460 case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
4461 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
4462 case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
4463 case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
4464 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
4465 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
4466 case Intrinsic::nvvm_tex_3d_v4u32_s32:
4467 case Intrinsic::nvvm_tex_3d_v4u32_f32:
4468 case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
4469 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
4470 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
4471 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
4472 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
4473 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
4474 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
4475 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
4476 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
4477 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
4478 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
4479 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
4480 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
4481 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
4482 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
4483 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
4484 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
4485 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
4486 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
4487 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
4488 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
4489 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
4490 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
4491 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
4492 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
4493 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
4494 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
4495 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
4496 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
4497 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
4498 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
4499 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
4500 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
4501 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
4502 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
4503 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
4504 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
4505 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
4506 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
4507 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
4508 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
4509 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
4510 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
4511 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
4512 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
4513 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
4514 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
4515 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
4516 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
4517 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
4518 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
4519 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
4520 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
4521 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
4522 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
4523 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
4524 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
4525 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
4526 case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
4527 case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
4528 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
4529 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
4530 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
4531 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
4532 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
4533 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
4534 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
4535 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
4536 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
4537 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
4538 Info.opc = ISD::INTRINSIC_W_CHAIN;
4539 Info.memVT = MVT::v4i32;
4540 Info.ptrVal = nullptr;
4541 Info.offset = 0;
4542 Info.flags = MachineMemOperand::MOLoad;
4543 Info.align = Align(16);
4544 return true;
4545
4546 case Intrinsic::nvvm_suld_1d_i8_clamp:
4547 case Intrinsic::nvvm_suld_1d_v2i8_clamp:
4548 case Intrinsic::nvvm_suld_1d_v4i8_clamp:
4549 case Intrinsic::nvvm_suld_1d_array_i8_clamp:
4550 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
4551 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
4552 case Intrinsic::nvvm_suld_2d_i8_clamp:
4553 case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4554 case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4555 case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4556 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4557 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4558 case Intrinsic::nvvm_suld_3d_i8_clamp:
4559 case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4560 case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4561 case Intrinsic::nvvm_suld_1d_i8_trap:
4562 case Intrinsic::nvvm_suld_1d_v2i8_trap:
4563 case Intrinsic::nvvm_suld_1d_v4i8_trap:
4564 case Intrinsic::nvvm_suld_1d_array_i8_trap:
4565 case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4566 case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4567 case Intrinsic::nvvm_suld_2d_i8_trap:
4568 case Intrinsic::nvvm_suld_2d_v2i8_trap:
4569 case Intrinsic::nvvm_suld_2d_v4i8_trap:
4570 case Intrinsic::nvvm_suld_2d_array_i8_trap:
4571 case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4572 case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4573 case Intrinsic::nvvm_suld_3d_i8_trap:
4574 case Intrinsic::nvvm_suld_3d_v2i8_trap:
4575 case Intrinsic::nvvm_suld_3d_v4i8_trap:
4576 case Intrinsic::nvvm_suld_1d_i8_zero:
4577 case Intrinsic::nvvm_suld_1d_v2i8_zero:
4578 case Intrinsic::nvvm_suld_1d_v4i8_zero:
4579 case Intrinsic::nvvm_suld_1d_array_i8_zero:
4580 case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4581 case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4582 case Intrinsic::nvvm_suld_2d_i8_zero:
4583 case Intrinsic::nvvm_suld_2d_v2i8_zero:
4584 case Intrinsic::nvvm_suld_2d_v4i8_zero:
4585 case Intrinsic::nvvm_suld_2d_array_i8_zero:
4586 case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4587 case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4588 case Intrinsic::nvvm_suld_3d_i8_zero:
4589 case Intrinsic::nvvm_suld_3d_v2i8_zero:
4590 case Intrinsic::nvvm_suld_3d_v4i8_zero:
4591 Info.opc = ISD::INTRINSIC_W_CHAIN;
4592 Info.memVT = MVT::i8;
4593 Info.ptrVal = nullptr;
4594 Info.offset = 0;
4595 Info.flags = MachineMemOperand::MOLoad;
4596 Info.align = Align(16);
4597 return true;
4598
4599 case Intrinsic::nvvm_suld_1d_i16_clamp:
4600 case Intrinsic::nvvm_suld_1d_v2i16_clamp:
4601 case Intrinsic::nvvm_suld_1d_v4i16_clamp:
4602 case Intrinsic::nvvm_suld_1d_array_i16_clamp:
4603 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
4604 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
4605 case Intrinsic::nvvm_suld_2d_i16_clamp:
4606 case Intrinsic::nvvm_suld_2d_v2i16_clamp:
4607 case Intrinsic::nvvm_suld_2d_v4i16_clamp:
4608 case Intrinsic::nvvm_suld_2d_array_i16_clamp:
4609 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
4610 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
4611 case Intrinsic::nvvm_suld_3d_i16_clamp:
4612 case Intrinsic::nvvm_suld_3d_v2i16_clamp:
4613 case Intrinsic::nvvm_suld_3d_v4i16_clamp:
4614 case Intrinsic::nvvm_suld_1d_i16_trap:
4615 case Intrinsic::nvvm_suld_1d_v2i16_trap:
4616 case Intrinsic::nvvm_suld_1d_v4i16_trap:
4617 case Intrinsic::nvvm_suld_1d_array_i16_trap:
4618 case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
4619 case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
4620 case Intrinsic::nvvm_suld_2d_i16_trap:
4621 case Intrinsic::nvvm_suld_2d_v2i16_trap:
4622 case Intrinsic::nvvm_suld_2d_v4i16_trap:
4623 case Intrinsic::nvvm_suld_2d_array_i16_trap:
4624 case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4625 case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4626 case Intrinsic::nvvm_suld_3d_i16_trap:
4627 case Intrinsic::nvvm_suld_3d_v2i16_trap:
4628 case Intrinsic::nvvm_suld_3d_v4i16_trap:
4629 case Intrinsic::nvvm_suld_1d_i16_zero:
4630 case Intrinsic::nvvm_suld_1d_v2i16_zero:
4631 case Intrinsic::nvvm_suld_1d_v4i16_zero:
4632 case Intrinsic::nvvm_suld_1d_array_i16_zero:
4633 case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4634 case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4635 case Intrinsic::nvvm_suld_2d_i16_zero:
4636 case Intrinsic::nvvm_suld_2d_v2i16_zero:
4637 case Intrinsic::nvvm_suld_2d_v4i16_zero:
4638 case Intrinsic::nvvm_suld_2d_array_i16_zero:
4639 case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4640 case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4641 case Intrinsic::nvvm_suld_3d_i16_zero:
4642 case Intrinsic::nvvm_suld_3d_v2i16_zero:
4643 case Intrinsic::nvvm_suld_3d_v4i16_zero:
4644 Info.opc = ISD::INTRINSIC_W_CHAIN;
4645 Info.memVT = MVT::i16;
4646 Info.ptrVal = nullptr;
4647 Info.offset = 0;
4648 Info.flags = MachineMemOperand::MOLoad;
4649 Info.align = Align(16);
4650 return true;
4651
4652 case Intrinsic::nvvm_suld_1d_i32_clamp:
4653 case Intrinsic::nvvm_suld_1d_v2i32_clamp:
4654 case Intrinsic::nvvm_suld_1d_v4i32_clamp:
4655 case Intrinsic::nvvm_suld_1d_array_i32_clamp:
4656 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
4657 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
4658 case Intrinsic::nvvm_suld_2d_i32_clamp:
4659 case Intrinsic::nvvm_suld_2d_v2i32_clamp:
4660 case Intrinsic::nvvm_suld_2d_v4i32_clamp:
4661 case Intrinsic::nvvm_suld_2d_array_i32_clamp:
4662 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
4663 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
4664 case Intrinsic::nvvm_suld_3d_i32_clamp:
4665 case Intrinsic::nvvm_suld_3d_v2i32_clamp:
4666 case Intrinsic::nvvm_suld_3d_v4i32_clamp:
4667 case Intrinsic::nvvm_suld_1d_i32_trap:
4668 case Intrinsic::nvvm_suld_1d_v2i32_trap:
4669 case Intrinsic::nvvm_suld_1d_v4i32_trap:
4670 case Intrinsic::nvvm_suld_1d_array_i32_trap:
4671 case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
4672 case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
4673 case Intrinsic::nvvm_suld_2d_i32_trap:
4674 case Intrinsic::nvvm_suld_2d_v2i32_trap:
4675 case Intrinsic::nvvm_suld_2d_v4i32_trap:
4676 case Intrinsic::nvvm_suld_2d_array_i32_trap:
4677 case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
4678 case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
4679 case Intrinsic::nvvm_suld_3d_i32_trap:
4680 case Intrinsic::nvvm_suld_3d_v2i32_trap:
4681 case Intrinsic::nvvm_suld_3d_v4i32_trap:
4682 case Intrinsic::nvvm_suld_1d_i32_zero:
4683 case Intrinsic::nvvm_suld_1d_v2i32_zero:
4684 case Intrinsic::nvvm_suld_1d_v4i32_zero:
4685 case Intrinsic::nvvm_suld_1d_array_i32_zero:
4686 case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
4687 case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
4688 case Intrinsic::nvvm_suld_2d_i32_zero:
4689 case Intrinsic::nvvm_suld_2d_v2i32_zero:
4690 case Intrinsic::nvvm_suld_2d_v4i32_zero:
4691 case Intrinsic::nvvm_suld_2d_array_i32_zero:
4692 case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4693 case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4694 case Intrinsic::nvvm_suld_3d_i32_zero:
4695 case Intrinsic::nvvm_suld_3d_v2i32_zero:
4696 case Intrinsic::nvvm_suld_3d_v4i32_zero:
4697 Info.opc = ISD::INTRINSIC_W_CHAIN;
4698 Info.memVT = MVT::i32;
4699 Info.ptrVal = nullptr;
4700 Info.offset = 0;
4701 Info.flags = MachineMemOperand::MOLoad;
4702 Info.align = Align(16);
4703 return true;
4704
4705 case Intrinsic::nvvm_suld_1d_i64_clamp:
4706 case Intrinsic::nvvm_suld_1d_v2i64_clamp:
4707 case Intrinsic::nvvm_suld_1d_array_i64_clamp:
4708 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
4709 case Intrinsic::nvvm_suld_2d_i64_clamp:
4710 case Intrinsic::nvvm_suld_2d_v2i64_clamp:
4711 case Intrinsic::nvvm_suld_2d_array_i64_clamp:
4712 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
4713 case Intrinsic::nvvm_suld_3d_i64_clamp:
4714 case Intrinsic::nvvm_suld_3d_v2i64_clamp:
4715 case Intrinsic::nvvm_suld_1d_i64_trap:
4716 case Intrinsic::nvvm_suld_1d_v2i64_trap:
4717 case Intrinsic::nvvm_suld_1d_array_i64_trap:
4718 case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
4719 case Intrinsic::nvvm_suld_2d_i64_trap:
4720 case Intrinsic::nvvm_suld_2d_v2i64_trap:
4721 case Intrinsic::nvvm_suld_2d_array_i64_trap:
4722 case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
4723 case Intrinsic::nvvm_suld_3d_i64_trap:
4724 case Intrinsic::nvvm_suld_3d_v2i64_trap:
4725 case Intrinsic::nvvm_suld_1d_i64_zero:
4726 case Intrinsic::nvvm_suld_1d_v2i64_zero:
4727 case Intrinsic::nvvm_suld_1d_array_i64_zero:
4728 case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
4729 case Intrinsic::nvvm_suld_2d_i64_zero:
4730 case Intrinsic::nvvm_suld_2d_v2i64_zero:
4731 case Intrinsic::nvvm_suld_2d_array_i64_zero:
4732 case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
4733 case Intrinsic::nvvm_suld_3d_i64_zero:
4734 case Intrinsic::nvvm_suld_3d_v2i64_zero:
4735 Info.opc = ISD::INTRINSIC_W_CHAIN;
4736 Info.memVT = MVT::i64;
4737 Info.ptrVal = nullptr;
4738 Info.offset = 0;
4739 Info.flags = MachineMemOperand::MOLoad;
4740 Info.align = Align(16);
4741 return true;
4742
4743 case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
4744 case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
4745 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1: {
4746 Info.opc = ISD::INTRINSIC_W_CHAIN;
4747 Info.memVT = MVT::v1i32;
4748 Info.ptrVal = I.getArgOperand(0);
4749 Info.offset = 0;
4750 Info.flags = MachineMemOperand::MOLoad;
4751 Info.align.reset();
4752 return true;
4753 }
4754
4755 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
4756 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
4757 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
4758 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2: {
4759 Info.opc = ISD::INTRINSIC_W_CHAIN;
4760 Info.memVT = MVT::v2i32;
4761 Info.ptrVal = I.getArgOperand(0);
4762 Info.offset = 0;
4763 Info.flags = MachineMemOperand::MOLoad;
4764 Info.align.reset();
4765 return true;
4766 }
4767
4768 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
4769 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
4770 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
4771 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
4772 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4: {
4773 Info.opc = ISD::INTRINSIC_W_CHAIN;
4774 Info.memVT = MVT::v4i32;
4775 Info.ptrVal = I.getArgOperand(0);
4776 Info.offset = 0;
4777 Info.flags = MachineMemOperand::MOLoad;
4778 Info.align.reset();
4779 return true;
4780 }
4781
4782 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
4783 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
4784 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
4785 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
4786 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8: {
4787 Info.opc = ISD::INTRINSIC_W_CHAIN;
4788 Info.memVT = MVT::v8i32;
4789 Info.ptrVal = I.getArgOperand(0);
4790 Info.offset = 0;
4791 Info.flags = MachineMemOperand::MOLoad;
4792 Info.align.reset();
4793 return true;
4794 }
4795
4796 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
4797 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
4798 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
4799 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
4800 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16: {
4801 Info.opc = ISD::INTRINSIC_W_CHAIN;
4802 Info.memVT = MVT::v16i32;
4803 Info.ptrVal = I.getArgOperand(0);
4804 Info.offset = 0;
4805 Info.flags = MachineMemOperand::MOLoad;
4806 Info.align.reset();
4807 return true;
4808 }
4809
4810 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
4811 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
4812 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
4813 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
4814 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32: {
4815 Info.opc = ISD::INTRINSIC_W_CHAIN;
4816 Info.memVT = MVT::v32i32;
4817 Info.ptrVal = I.getArgOperand(0);
4818 Info.offset = 0;
4819 Info.flags = MachineMemOperand::MOLoad;
4820 Info.align.reset();
4821 return true;
4822 }
4823
4824 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
4825 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
4826 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
4827 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
4828 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64: {
4829 Info.opc = ISD::INTRINSIC_W_CHAIN;
4830 Info.memVT = MVT::v64i32;
4831 Info.ptrVal = I.getArgOperand(0);
4832 Info.offset = 0;
4833 Info.flags = MachineMemOperand::MOLoad;
4834 Info.align.reset();
4835 return true;
4836 }
4837
4838 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
4839 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
4840 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
4841 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
4842 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128: {
4843 Info.opc = ISD::INTRINSIC_W_CHAIN;
4844 Info.memVT = MVT::v128i32;
4845 Info.ptrVal = I.getArgOperand(0);
4846 Info.offset = 0;
4847 Info.flags = MachineMemOperand::MOLoad;
4848 Info.align.reset();
4849 return true;
4850 }
4851
4852 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
4853 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
4854 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1: {
4855 Info.opc = ISD::INTRINSIC_VOID;
4856 Info.memVT = MVT::i32;
4857 Info.ptrVal = I.getArgOperand(0);
4858 Info.offset = 0;
4859 Info.flags = MachineMemOperand::MOStore;
4860 Info.align.reset();
4861 return true;
4862 }
4863
4864 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
4865 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
4866 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
4867 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2: {
4868 Info.opc = ISD::INTRINSIC_VOID;
4869 Info.memVT = MVT::v2i32;
4870 Info.ptrVal = I.getArgOperand(0);
4871 Info.offset = 0;
4872 Info.flags = MachineMemOperand::MOStore;
4873 Info.align.reset();
4874 return true;
4875 }
4876
4877 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
4878 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
4879 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
4880 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
4881 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4: {
4882 Info.opc = ISD::INTRINSIC_VOID;
4883 Info.memVT = MVT::v4i32;
4884 Info.ptrVal = I.getArgOperand(0);
4885 Info.offset = 0;
4886 Info.flags = MachineMemOperand::MOStore;
4887 Info.align.reset();
4888 return true;
4889 }
4890
4891 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
4892 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
4893 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
4894 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
4895 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8: {
4896 Info.opc = ISD::INTRINSIC_VOID;
4897 Info.memVT = MVT::v8i32;
4898 Info.ptrVal = I.getArgOperand(0);
4899 Info.offset = 0;
4900 Info.flags = MachineMemOperand::MOStore;
4901 Info.align.reset();
4902 return true;
4903 }
4904
4905 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
4906 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
4907 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
4908 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
4909 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16: {
4910 Info.opc = ISD::INTRINSIC_VOID;
4911 Info.memVT = MVT::v16i32;
4912 Info.ptrVal = I.getArgOperand(0);
4913 Info.offset = 0;
4914 Info.flags = MachineMemOperand::MOStore;
4915 Info.align.reset();
4916 return true;
4917 }
4918
4919 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
4920 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
4921 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
4922 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
4923 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32: {
4924 Info.opc = ISD::INTRINSIC_VOID;
4925 Info.memVT = MVT::v32i32;
4926 Info.ptrVal = I.getArgOperand(0);
4927 Info.offset = 0;
4928 Info.flags = MachineMemOperand::MOStore;
4929 Info.align.reset();
4930 return true;
4931 }
4932
4933 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
4934 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
4935 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
4936 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
4937 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64: {
4938 Info.opc = ISD::INTRINSIC_VOID;
4939 Info.memVT = MVT::v64i32;
4940 Info.ptrVal = I.getArgOperand(0);
4941 Info.offset = 0;
4942 Info.flags = MachineMemOperand::MOStore;
4943 Info.align.reset();
4944 return true;
4945 }
4946
4947 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
4948 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
4949 case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
4950 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
4951 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128: {
4952 Info.opc = ISD::INTRINSIC_VOID;
4953 Info.memVT = MVT::v128i32;
4954 Info.ptrVal = I.getArgOperand(0);
4955 Info.offset = 0;
4956 Info.flags = MachineMemOperand::MOStore;
4957 Info.align.reset();
4958 return true;
4959 }
4960 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
4961 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1:
4962 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1:
4963 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1:
4964 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1:
4965 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1:
4966 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift:
4967 case Intrinsic::
4968 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift:
4969 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1:
4970 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1:
4971 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift:
4972 case Intrinsic::
4973 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift: {
4974 // We are reading and writing back to TMem
4975 Info.opc = ISD::INTRINSIC_VOID;
4976 Info.memVT = MVT::v4i32;
4977 Info.ptrVal = I.getArgOperand(0);
4978     Info.offset = 0;
4979     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
4980     Info.align = Align(16);
4981 return true;
4982 }
4983
4984 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2:
4985 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2:
4986 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2:
4987 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2:
4988 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2:
4989 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2:
4990 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2:
4991 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2:
4992 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift:
4993 case Intrinsic::
4994 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift:
4995 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift:
4996 case Intrinsic::
4997 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift: {
4998 // We are reading and writing back to TMem
4999 Info.opc = ISD::INTRINSIC_VOID;
5000 Info.memVT = MVT::v8i32;
5001 Info.ptrVal = I.getArgOperand(0);
5002     Info.offset = 0;
5003     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
5004     Info.align = Align(16);
5005 return true;
5006 }
5007 }
5008 return false;
5009}
5010
5011/// getFunctionParamOptimizedAlign - since function arguments are passed via
5012/// .param space, we may want to increase their alignment in a way that
5013/// ensures that we can effectively vectorize their loads & stores. We can
5014 /// increase alignment only if the function has internal or private
5015 /// linkage, as for other linkage types callers may already rely on the default
5016/// alignment. To allow using 128-bit vectorized loads/stores, this function
5017/// ensures that alignment is 16 or greater.
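/// For illustration (a hypothetical case, not taken from the source): for an
/// internal device helper whose aggregate argument only has 4-byte ABI
/// alignment, raising the .param alignment to 16 lets the backend read the
/// argument with a single 128-bit vectorized ld.param instead of several
/// scalar loads.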
5018 Align NVPTXTargetLowering::getFunctionParamOptimizedAlign(
5019     const Function *F, Type *ArgTy, const DataLayout &DL) const {
5020 // Capping the alignment to 128 bytes as that is the maximum alignment
5021 // supported by PTX.
5022 const Align ABITypeAlign = std::min(Align(128), DL.getABITypeAlign(ArgTy));
5023
5024 // If a function has linkage different from internal or private, we
5025 // must use default ABI alignment as external users rely on it. Same
5026 // for a function that may be called from a function pointer.
5027 if (!F || !F->hasLocalLinkage() ||
5028 F->hasAddressTaken(/*Users=*/nullptr,
5029 /*IgnoreCallbackUses=*/false,
5030 /*IgnoreAssumeLikeCalls=*/true,
5031 /*IgnoreLLVMUsed=*/true))
5032 return ABITypeAlign;
5033
5034 assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage");
5035 return std::max(Align(16), ABITypeAlign);
5036}
5037
5038/// Helper for computing alignment of a device function byval parameter.
5039 Align NVPTXTargetLowering::getFunctionByValParamAlign(
5040     const Function *F, Type *ArgTy, Align InitialAlign,
5041 const DataLayout &DL) const {
5042 Align ArgAlign = InitialAlign;
5043 // Try to increase alignment to enhance vectorization options.
5044 if (F)
5045 ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(F, ArgTy, DL));
5046
5047   // Old ptxas versions have a bug. When PTX code takes the address of a
5048   // byval parameter with alignment < 4, ptxas generates code to
5049   // spill the argument into memory. Alas, on sm_50+ ptxas generates
5050   // SASS code that fails with a misaligned access. To work around
5051 // the problem, make sure that we align byval parameters by at
5052 // least 4. This bug seems to be fixed at least starting from
5053 // ptxas > 9.0.
5054 // TODO: remove this after verifying the bug is not reproduced
5055 // on non-deprecated ptxas versions.
5056   if (ForceMinByValParamAlign)
5057     ArgAlign = std::max(ArgAlign, Align(4));
5058
5059 return ArgAlign;
5060}
5061
5062// Helper for getting a function parameter name. Name is composed from
5063// its index and the function name. Negative index corresponds to special
5064// parameter (unsized array) used for passing variable arguments.
5065 std::string NVPTXTargetLowering::getParamName(const Function *F,
5066                                               int Idx) const {
5067 std::string ParamName;
5068 raw_string_ostream ParamStr(ParamName);
5069
5070 ParamStr << getTargetMachine().getSymbol(F)->getName();
5071 if (Idx < 0)
5072 ParamStr << "_vararg";
5073 else
5074 ParamStr << "_param_" << Idx;
5075
5076 return ParamName;
5077}
5078
5079/// isLegalAddressingMode - Return true if the addressing mode represented
5080/// by AM is legal for this target, for a load/store of the specified type.
5081/// Used to guide target specific optimizations, like loop strength reduction
5082/// (LoopStrengthReduce.cpp) and memory optimization for address mode
5083/// (CodeGenPrepare.cpp)
5084 bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
5085                                                 const AddrMode &AM, Type *Ty,
5086 unsigned AS, Instruction *I) const {
5087 // AddrMode - This represents an addressing mode of:
5088 // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
5089 //
5090 // The legal address modes are
5091 // - [avar]
5092 // - [areg]
5093 // - [areg+immoff]
5094 // - [immAddr]
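  //
  // For illustration, roughly how those forms relate to the AddrMode fields
  // checked below (an informal reading, not a complete specification):
  //   [avar]        : BaseGV set, no offset, no base register, no scale
  //   [areg]        : HasBaseReg with BaseOffs == 0
  //   [areg+immoff] : HasBaseReg plus a BaseOffs that fits in a signed 32-bit int
  //   [immAddr]     : only BaseOffs set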
5095
5096 // immoff must fit in a signed 32-bit int
5097 if (!APInt(64, AM.BaseOffs).isSignedIntN(32))
5098 return false;
5099
5100 if (AM.BaseGV)
5101 return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
5102
5103 switch (AM.Scale) {
5104 case 0: // "r", "r+i" or "i" is allowed
5105 break;
5106 case 1:
5107 if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
5108 return false;
5109 // Otherwise we have r+i.
5110 break;
5111 default:
5112 // No scale > 1 is allowed
5113 return false;
5114 }
5115 return true;
5116}
5117
5118//===----------------------------------------------------------------------===//
5119// NVPTX Inline Assembly Support
5120//===----------------------------------------------------------------------===//
5121
5122/// getConstraintType - Given a constraint letter, return the type of
5123/// constraint it is for this target.
5124 NVPTXTargetLowering::ConstraintType
5125 NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
5126   if (Constraint.size() == 1) {
5127 switch (Constraint[0]) {
5128 default:
5129 break;
5130 case 'b':
5131 case 'r':
5132 case 'h':
5133 case 'c':
5134 case 'l':
5135 case 'f':
5136 case 'd':
5137 case 'q':
5138 case '0':
5139 case 'N':
5140 return C_RegisterClass;
5141 }
5142 }
5143 return TargetLowering::getConstraintType(Constraint);
5144}
5145
5146std::pair<unsigned, const TargetRegisterClass *>
5148 StringRef Constraint,
5149 MVT VT) const {
5150 if (Constraint.size() == 1) {
5151 switch (Constraint[0]) {
5152 case 'b':
5153 return std::make_pair(0U, &NVPTX::B1RegClass);
5154 case 'c':
5155 case 'h':
5156 return std::make_pair(0U, &NVPTX::B16RegClass);
5157 case 'r':
5158 case 'f':
5159 return std::make_pair(0U, &NVPTX::B32RegClass);
5160 case 'l':
5161 case 'N':
5162 case 'd':
5163 return std::make_pair(0U, &NVPTX::B64RegClass);
5164 case 'q': {
5165 if (STI.getSmVersion() < 70)
5166 report_fatal_error("Inline asm with 128 bit operands is only "
5167 "supported for sm_70 and higher!");
5168 return std::make_pair(0U, &NVPTX::B128RegClass);
5169 }
5170 }
5171 }
5172 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
5173}
5174
5175//===----------------------------------------------------------------------===//
5176// NVPTX DAG Combining
5177//===----------------------------------------------------------------------===//
5178
5179 bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
5180                                    CodeGenOptLevel OptLevel) const {
5181 // Always honor command-line argument
5182 if (FMAContractLevelOpt.getNumOccurrences() > 0)
5183 return FMAContractLevelOpt > 0;
5184
5185 // Do not contract if we're not optimizing the code.
5186 if (OptLevel == CodeGenOptLevel::None)
5187 return false;
5188
5189 // Honor TargetOptions flags that explicitly say fusion is okay.
5190   if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
5191     return true;
5192
5193 return false;
5194}
5195
5196static bool isConstZero(const SDValue &Operand) {
5197 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
5198 return Const && Const->getZExtValue() == 0;
5199}
5200
5201/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
5202/// operands N0 and N1. This is a helper for PerformADDCombine that is
5203/// called with the default operands, and if that fails, with commuted
5204/// operands.
5205static SDValue
5206 PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
5207                               TargetLowering::DAGCombinerInfo &DCI) {
5208   EVT VT = N0.getValueType();
5209
5210 // Since integer multiply-add costs the same as integer multiply
5211 // but is more costly than integer add, do the fusion only when
5212 // the mul is only used in the add.
5213 // TODO: this may not be true for later architectures, consider relaxing this
5214 if (!N0.getNode()->hasOneUse())
5215 return SDValue();
5216
5217 // fold (add (select cond, 0, (mul a, b)), c)
5218 // -> (select cond, c, (add (mul a, b), c))
5219 //
5220 if (N0.getOpcode() == ISD::SELECT) {
5221 unsigned ZeroOpNum;
5222 if (isConstZero(N0->getOperand(1)))
5223 ZeroOpNum = 1;
5224 else if (isConstZero(N0->getOperand(2)))
5225 ZeroOpNum = 2;
5226 else
5227 return SDValue();
5228
5229 SDValue M = N0->getOperand((ZeroOpNum == 1) ? 2 : 1);
5230 if (M->getOpcode() != ISD::MUL || !M.getNode()->hasOneUse())
5231 return SDValue();
5232
5233 SDLoc DL(N);
5234 SDValue Mul =
5235 DCI.DAG.getNode(ISD::MUL, DL, VT, M->getOperand(0), M->getOperand(1));
5236 SDValue MAD = DCI.DAG.getNode(ISD::ADD, DL, VT, Mul, N1);
5237 return DCI.DAG.getSelect(SDLoc(N), VT, N0->getOperand(0),
5238 ((ZeroOpNum == 1) ? N1 : MAD),
5239 ((ZeroOpNum == 1) ? MAD : N1));
5240 }
5241
5242 return SDValue();
5243}
5244
5245static SDValue
5246 PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
5247                                TargetLowering::DAGCombinerInfo &DCI,
5248                                CodeGenOptLevel OptLevel) {
5249 EVT VT = N0.getValueType();
5250 if (N0.getOpcode() == ISD::FMUL) {
5251 const auto *TLI = static_cast<const NVPTXTargetLowering *>(
5252 &DCI.DAG.getTargetLoweringInfo());
5253 if (!(TLI->allowFMA(DCI.DAG.getMachineFunction(), OptLevel) ||
5254 (N->getFlags().hasAllowContract() &&
5255 N0->getFlags().hasAllowContract())))
5256 return SDValue();
5257
5258 // For floating point:
5259     // Do the fusion only when the mul has fewer than 5 uses and all
5260     // of them are adds.
5261     // The heuristic is that if a use is not an add, then that use
5262     // cannot be fused into an fma, so the mul is still needed anyway.
5263     // If there are more than 4 uses, even if they are all adds, fusing
5264     // them will increase register pressure.
5265 //
5266 int numUses = 0;
5267 int nonAddCount = 0;
5268 for (const SDNode *User : N0.getNode()->users()) {
5269 numUses++;
5270 if (User->getOpcode() != ISD::FADD)
5271 ++nonAddCount;
5272 if (numUses >= 5)
5273 return SDValue();
5274 }
5275 if (nonAddCount) {
5276 int orderNo = N->getIROrder();
5277 int orderNo2 = N0.getNode()->getIROrder();
5278       // Simple heuristic here for considering potential register
5279       // pressure: the difference in IR order is used to measure the
5280       // distance between the def and its use; the longer the distance,
5281       // the more likely it is to cause register pressure.
5282 if (orderNo - orderNo2 < 500)
5283 return SDValue();
5284
5285 // Now, check if at least one of the FMUL's operands is live beyond the
5286 // node N, which guarantees that the FMA will not increase register
5287 // pressure at node N.
5288 bool opIsLive = false;
5289 const SDNode *left = N0.getOperand(0).getNode();
5290 const SDNode *right = N0.getOperand(1).getNode();
5291
5292 if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
5293 opIsLive = true;
5294
5295 if (!opIsLive)
5296 for (const SDNode *User : left->users()) {
5297 int orderNo3 = User->getIROrder();
5298 if (orderNo3 > orderNo) {
5299 opIsLive = true;
5300 break;
5301 }
5302 }
5303
5304 if (!opIsLive)
5305 for (const SDNode *User : right->users()) {
5306 int orderNo3 = User->getIROrder();
5307 if (orderNo3 > orderNo) {
5308 opIsLive = true;
5309 break;
5310 }
5311 }
5312
5313 if (!opIsLive)
5314 return SDValue();
5315 }
5316
5317 return DCI.DAG.getNode(ISD::FMA, SDLoc(N), VT, N0.getOperand(0),
5318 N0.getOperand(1), N1);
5319 }
5320
5321 return SDValue();
5322}
5323
5324/// Fold unpacking movs into a load by increasing the number of return values.
5325///
5326/// ex:
5327/// L: v2f16,ch = load <p>
5328/// a: f16 = extractelt L:0, 0
5329/// b: f16 = extractelt L:0, 1
5330/// use(a, b)
5331///
5332/// ...is turned into...
5333///
5334/// L: f16,f16,ch = LoadV2 <p>
5335/// use(L:0, L:1)
5336static SDValue
5337 combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
5338   // Don't run this optimization before the legalizer
5339 if (!DCI.isAfterLegalizeDAG())
5340 return SDValue();
5341
5342 EVT ElementVT = N->getValueType(0);
5343 // Avoid non-packed types and v4i8
5344 if (!NVPTX::isPackedVectorTy(ElementVT) || ElementVT == MVT::v4i8)
5345 return SDValue();
5346
5347 SmallVector<SDNode *> DeadCopyToRegs;
5348
5349 // Check whether all outputs are either used by an extractelt or are
5350 // glue/chain nodes
5351 if (!all_of(N->uses(), [&](SDUse &U) {
5352 // Skip glue, chain nodes
5353 if (U.getValueType() == MVT::Glue || U.getValueType() == MVT::Other)
5354 return true;
5355 if (U.getUser()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
5356 if (N->getOpcode() != ISD::LOAD)
5357 return true;
5358 // Since this is an ISD::LOAD, check all extractelts are used. If
5359 // any are not used, we don't want to defeat another optimization that
5360 // will narrow the load.
5361 //
5362 // For example:
5363 //
5364 // L: v2f16,ch = load <p>
5365 // e0: f16 = extractelt L:0, 0
5366 // e1: f16 = extractelt L:0, 1 <-- unused
5367 // store e0
5368 //
5369 // Can be optimized by DAGCombiner to:
5370 //
5371 // L: f16,ch = load <p>
5372 // store L:0
5373 return !U.getUser()->use_empty();
5374 }
5375
5376 // Otherwise, this use prevents us from splitting a value.
5377 return false;
5378 }))
5379 return SDValue();
5380
5381 auto *LD = cast<MemSDNode>(N);
5382 SDLoc DL(LD);
5383
5384 // the new opcode after we double the number of operands
5385 NVPTXISD::NodeType Opcode;
5386   SmallVector<SDValue> Operands(LD->ops());
5387   unsigned OldNumOutputs; // non-glue, non-chain outputs
5388 switch (LD->getOpcode()) {
5389 case ISD::LOAD:
5390 OldNumOutputs = 1;
5391 // Any packed type is legal, so the legalizer will not have lowered
5392 // ISD::LOAD -> NVPTXISD::Load (unless it's under-aligned). We have to do it
5393 // here.
5394 Opcode = NVPTXISD::LoadV2;
5395 Operands.push_back(DCI.DAG.getIntPtrConstant(
5396 cast<LoadSDNode>(LD)->getExtensionType(), DL));
5397 break;
5398 case NVPTXISD::LoadV2:
5399 OldNumOutputs = 2;
5400 Opcode = NVPTXISD::LoadV4;
5401 break;
5402 case NVPTXISD::LoadV4:
5403 // V8 is only supported for f32. Don't forget, we're not changing the load
5404 // size here. This is already a 256-bit load.
5405 if (ElementVT != MVT::v2f32)
5406 return SDValue();
5407 OldNumOutputs = 4;
5408 Opcode = NVPTXISD::LoadV8;
5409 break;
5410 case NVPTXISD::LoadV8:
5411 // PTX doesn't support the next doubling of outputs
5412 return SDValue();
5413 }
5414
5415 // the non-glue, non-chain outputs in the new load
5416 const unsigned NewNumOutputs = OldNumOutputs * 2;
5417 SmallVector<EVT> NewVTs(NewNumOutputs, ElementVT.getVectorElementType());
5418 // add remaining chain and glue values
5419 NewVTs.append(LD->value_begin() + OldNumOutputs, LD->value_end());
5420
5421 // Create the new load
5422 SDValue NewLoad = DCI.DAG.getMemIntrinsicNode(
5423 Opcode, DL, DCI.DAG.getVTList(NewVTs), Operands, LD->getMemoryVT(),
5424 LD->getMemOperand());
5425
5426 // Now we use a combination of BUILD_VECTORs and a MERGE_VALUES node to keep
5427 // the outputs the same. These nodes will be optimized away in later
5428 // DAGCombiner iterations.
5429   SmallVector<SDValue> Results;
5430   for (unsigned I : seq(OldNumOutputs))
5431 Results.push_back(DCI.DAG.getBuildVector(
5432 ElementVT, DL, {NewLoad.getValue(I * 2), NewLoad.getValue(I * 2 + 1)}));
5433 // Add remaining chain and glue nodes
5434 for (unsigned I : seq(NewLoad->getNumValues() - NewNumOutputs))
5435 Results.push_back(NewLoad.getValue(NewNumOutputs + I));
5436
5437 return DCI.DAG.getMergeValues(Results, DL);
5438}
5439
5440/// Fold packing movs into a store.
5441///
5442/// ex:
5443/// v1: v2f16 = BUILD_VECTOR a:f16, b:f16
5444/// v2: v2f16 = BUILD_VECTOR c:f16, d:f16
5445/// StoreV2 v1, v2
5446///
5447/// ...is turned into...
5448///
5449/// StoreV4 a, b, c, d
5450 static SDValue combinePackingMovIntoStore(SDNode *N,
5451                                           TargetLowering::DAGCombinerInfo &DCI,
5452                                           unsigned Front, unsigned Back) {
5453 // We want to run this as late as possible since other optimizations may
5454 // eliminate the BUILD_VECTORs.
5455 if (!DCI.isAfterLegalizeDAG())
5456 return SDValue();
5457
5458 // Get the type of the operands being stored.
5459 EVT ElementVT = N->getOperand(Front).getValueType();
5460
5461 // Avoid non-packed types and v4i8
5462 if (!NVPTX::isPackedVectorTy(ElementVT) || ElementVT == MVT::v4i8)
5463 return SDValue();
5464
5465 auto *ST = cast<MemSDNode>(N);
5466
5467 // The new opcode after we double the number of operands.
5468 NVPTXISD::NodeType Opcode;
5469 switch (N->getOpcode()) {
5470 case ISD::STORE:
5471 // Any packed type is legal, so the legalizer will not have lowered
5472 // ISD::STORE -> NVPTXISD::Store (unless it's under-aligned). We have to do
5473 // it here.
5474 Opcode = NVPTXISD::StoreV2;
5475 break;
5476 case NVPTXISD::StoreV2:
5477 Opcode = NVPTXISD::StoreV4;
5478 break;
5479 case NVPTXISD::StoreV4:
5480 // V8 is only supported for f32. Don't forget, we're not changing the store
5481 // size here. This is already a 256-bit store.
5482 if (ElementVT != MVT::v2f32)
5483 return SDValue();
5484 Opcode = NVPTXISD::StoreV8;
5485 break;
5486 case NVPTXISD::StoreV8:
5487 // PTX doesn't support the next doubling of operands
5488 return SDValue();
5489 default:
5490 llvm_unreachable("Unhandled store opcode");
5491 }
5492
5493 // Scan the operands and if they're all BUILD_VECTORs, we'll have gathered
5494 // their elements.
5495 SmallVector<SDValue, 4> Operands(N->ops().take_front(Front));
5496 for (SDValue BV : N->ops().drop_front(Front).drop_back(Back)) {
5497 if (BV.getOpcode() != ISD::BUILD_VECTOR)
5498 return SDValue();
5499
5500 // If the operand has multiple uses, this optimization can increase register
5501 // pressure.
5502 if (!BV.hasOneUse())
5503 return SDValue();
5504
5505 // DAGCombiner visits nodes bottom-up. Check the BUILD_VECTOR operands for
5506 // any signs they may be folded by some other pattern or rule.
5507 for (SDValue Op : BV->ops()) {
5508 // Peek through bitcasts
5509 if (Op.getOpcode() == ISD::BITCAST)
5510 Op = Op.getOperand(0);
5511
5512 // This may be folded into a PRMT.
5513 if (Op.getValueType() == MVT::i16 && Op.getOpcode() == ISD::TRUNCATE &&
5514 Op->getOperand(0).getValueType() == MVT::i32)
5515 return SDValue();
5516
5517 // This may be folded into cvt.bf16x2
5518 if (Op.getOpcode() == ISD::FP_ROUND)
5519 return SDValue();
5520 }
5521 Operands.append({BV.getOperand(0), BV.getOperand(1)});
5522 }
5523 Operands.append(N->op_end() - Back, N->op_end());
5524
5525 // Now we replace the store
5526 return DCI.DAG.getMemIntrinsicNode(Opcode, SDLoc(N), N->getVTList(), Operands,
5527 ST->getMemoryVT(), ST->getMemOperand());
5528}
5529
5530 static SDValue combineSTORE(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
5531                             const NVPTXSubtarget &STI) {
5532
5533 if (DCI.isBeforeLegalize() && N->getOpcode() == ISD::STORE) {
5534 // Here is our chance to custom lower a store with a non-simple type.
5535 // Unfortunately, we can't do this in the legalizer because there is no
5536     // way to setOperationAction for a non-simple type.
5537     auto *ST = cast<StoreSDNode>(N);
5538     if (!ST->getValue().getValueType().isSimple())
5539 return lowerSTOREVector(SDValue(ST, 0), DCI.DAG, STI);
5540 }
5541
5542 return combinePackingMovIntoStore(N, DCI, 1, 2);
5543}
5544
5545 static SDValue combineLOAD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
5546                            const NVPTXSubtarget &STI) {
5547 if (DCI.isBeforeLegalize() && N->getOpcode() == ISD::LOAD) {
5548 // Here is our chance to custom lower a load with a non-simple type.
5549 // Unfortunately, we can't do this in the legalizer because there is no
5550     // way to setOperationAction for a non-simple type.
5551 if (!N->getValueType(0).isSimple())
5552 return lowerLoadVector(N, DCI.DAG, STI);
5553 }
5554
5555 return combineUnpackingMovIntoLoad(N, DCI);
5556}
5557
5558/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
5559///
5560 static SDValue PerformADDCombine(SDNode *N,
5561                                  TargetLowering::DAGCombinerInfo &DCI,
5562                                  CodeGenOptLevel OptLevel) {
5563 if (OptLevel == CodeGenOptLevel::None)
5564 return SDValue();
5565
5566 SDValue N0 = N->getOperand(0);
5567 SDValue N1 = N->getOperand(1);
5568
5569 // Skip non-integer, non-scalar case
5570 EVT VT = N0.getValueType();
5571 if (VT.isVector() || VT != MVT::i32)
5572 return SDValue();
5573
5574 // First try with the default operand order.
5575 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI))
5576 return Result;
5577
5578 // If that didn't work, try again with the operands commuted.
5579 return PerformADDCombineWithOperands(N, N1, N0, DCI);
5580}
5581
5582/// PerformFADDCombine - Target-specific dag combine xforms for ISD::FADD.
5583///
5584 static SDValue PerformFADDCombine(SDNode *N,
5585                                   TargetLowering::DAGCombinerInfo &DCI,
5586                                   CodeGenOptLevel OptLevel) {
5587 SDValue N0 = N->getOperand(0);
5588 SDValue N1 = N->getOperand(1);
5589
5590 EVT VT = N0.getValueType();
5591 if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64))
5592 return SDValue();
5593
5594 // First try with the default operand order.
5595 if (SDValue Result = PerformFADDCombineWithOperands(N, N0, N1, DCI, OptLevel))
5596 return Result;
5597
5598 // If that didn't work, try again with the operands commuted.
5599 return PerformFADDCombineWithOperands(N, N1, N0, DCI, OptLevel);
5600}
5601
5602/// Get 3-input version of a 2-input min/max opcode
5603static NVPTXISD::NodeType getMinMax3Opcode(unsigned MinMax2Opcode) {
5604 switch (MinMax2Opcode) {
5605 case ISD::FMAXNUM:
5606 case ISD::FMAXIMUMNUM:
5607 return NVPTXISD::FMAXNUM3;
5608 case ISD::FMINNUM:
5609 case ISD::FMINIMUMNUM:
5610 return NVPTXISD::FMINNUM3;
5611 case ISD::FMAXIMUM:
5612 return NVPTXISD::FMAXIMUM3;
5613 case ISD::FMINIMUM:
5614 return NVPTXISD::FMINIMUM3;
5615 default:
5616 llvm_unreachable("Invalid 2-input min/max opcode");
5617 }
5618}
5619
5620/// PerformFMinMaxCombine - Combine (fmaxnum (fmaxnum a, b), c) into
5621/// (fmaxnum3 a, b, c). Also covers other llvm min/max intrinsics.
5622 static SDValue PerformFMinMaxCombine(SDNode *N,
5623                                      TargetLowering::DAGCombinerInfo &DCI,
5624                                      unsigned PTXVersion, unsigned SmVersion) {
5625
5626 // 3-input min/max requires PTX 8.8+ and SM_100+, and only supports f32s
5627 EVT VT = N->getValueType(0);
5628 if (VT != MVT::f32 || PTXVersion < 88 || SmVersion < 100)
5629 return SDValue();
5630
5631 SDValue Op0 = N->getOperand(0);
5632 SDValue Op1 = N->getOperand(1);
5633 unsigned MinMaxOp2 = N->getOpcode();
5634 NVPTXISD::NodeType MinMaxOp3 = getMinMax3Opcode(MinMaxOp2);
5635
5636 if (Op0.getOpcode() == MinMaxOp2 && Op0.hasOneUse()) {
5637 // (maxnum (maxnum a, b), c) -> (maxnum3 a, b, c)
5638 SDValue A = Op0.getOperand(0);
5639 SDValue B = Op0.getOperand(1);
5640 SDValue C = Op1;
5641 return DCI.DAG.getNode(MinMaxOp3, SDLoc(N), VT, A, B, C, N->getFlags());
5642 } else if (Op1.getOpcode() == MinMaxOp2 && Op1.hasOneUse()) {
5643 // (maxnum a, (maxnum b, c)) -> (maxnum3 a, b, c)
5644 SDValue A = Op0;
5645 SDValue B = Op1.getOperand(0);
5646 SDValue C = Op1.getOperand(1);
5647 return DCI.DAG.getNode(MinMaxOp3, SDLoc(N), VT, A, B, C, N->getFlags());
5648 }
5649 return SDValue();
5650}
5651
5652 static SDValue PerformREMCombine(SDNode *N,
5653                                  TargetLowering::DAGCombinerInfo &DCI,
5654                                  CodeGenOptLevel OptLevel) {
5655 assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
5656
5657 // Don't do anything at less than -O2.
5658 if (OptLevel < CodeGenOptLevel::Default)
5659 return SDValue();
5660
5661 SelectionDAG &DAG = DCI.DAG;
5662 SDLoc DL(N);
5663 EVT VT = N->getValueType(0);
5664 bool IsSigned = N->getOpcode() == ISD::SREM;
5665 unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
5666
5667 const SDValue &Num = N->getOperand(0);
5668 const SDValue &Den = N->getOperand(1);
5669
5670 for (const SDNode *U : Num->users()) {
5671 if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
5672 U->getOperand(1) == Den) {
5673 // Num % Den -> Num - (Num / Den) * Den
5674 return DAG.getNode(ISD::SUB, DL, VT, Num,
5675 DAG.getNode(ISD::MUL, DL, VT,
5676 DAG.getNode(DivOpc, DL, VT, Num, Den),
5677 Den));
5678 }
5679 }
5680 return SDValue();
5681}
5682
5683// (sign_extend|zero_extend (mul|shl) x, y) -> (mul.wide x, y)
5684 static SDValue combineMulWide(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
5685                               CodeGenOptLevel OptLevel) {
5686 if (OptLevel == CodeGenOptLevel::None)
5687 return SDValue();
5688
5689 SDValue Op = N->getOperand(0);
5690 if (!Op.hasOneUse())
5691 return SDValue();
5692 EVT ToVT = N->getValueType(0);
5693 EVT FromVT = Op.getValueType();
5694 if (!((ToVT == MVT::i32 && FromVT == MVT::i16) ||
5695 (ToVT == MVT::i64 && FromVT == MVT::i32)))
5696 return SDValue();
5697 if (!(Op.getOpcode() == ISD::MUL ||
5698 (Op.getOpcode() == ISD::SHL && isa<ConstantSDNode>(Op.getOperand(1)))))
5699 return SDValue();
5700
5701 SDLoc DL(N);
5702 unsigned ExtOpcode = N->getOpcode();
5703 unsigned Opcode = 0;
5704   if (ExtOpcode == ISD::SIGN_EXTEND && Op->getFlags().hasNoSignedWrap())
5705     Opcode = NVPTXISD::MUL_WIDE_SIGNED;
5706   else if (ExtOpcode == ISD::ZERO_EXTEND && Op->getFlags().hasNoUnsignedWrap())
5707     Opcode = NVPTXISD::MUL_WIDE_UNSIGNED;
5708   else
5709 return SDValue();
5710 SDValue RHS = Op.getOperand(1);
5711 if (Op.getOpcode() == ISD::SHL) {
5712 const auto ShiftAmt = Op.getConstantOperandVal(1);
5713 const auto MulVal = APInt(ToVT.getSizeInBits(), 1) << ShiftAmt;
5714 RHS = DCI.DAG.getConstant(MulVal, DL, ToVT);
5715 }
5716 return DCI.DAG.getNode(Opcode, DL, ToVT, Op.getOperand(0), RHS);
5717}
5718
5719 enum OperandSignedness {
5720   Signed = 0,
5721   Unsigned,
5722   Unknown
5723 };
5724
5725/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
5726/// that can be demoted to \p OptSize bits without loss of information. The
5727/// signedness of the operand, if determinable, is placed in \p S.
5728 static bool IsMulWideOperandDemotable(SDValue Op,
5729                                       unsigned OptSize,
5730 OperandSignedness &S) {
5731 S = Unknown;
5732
5733 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
5734 Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
5735 EVT OrigVT = Op.getOperand(0).getValueType();
5736 if (OrigVT.getFixedSizeInBits() <= OptSize) {
5737 S = Signed;
5738 return true;
5739 }
5740 } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
5741 EVT OrigVT = Op.getOperand(0).getValueType();
5742 if (OrigVT.getFixedSizeInBits() <= OptSize) {
5743 S = Unsigned;
5744 return true;
5745 }
5746 }
5747
5748 return false;
5749}
5750
5751/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
5752/// be demoted to \p OptSize bits without loss of information. If the operands
5753/// contain a constant, it should appear as the RHS operand. The signedness of
5754/// the operands is placed in \p IsSigned.
5755 static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
5756                                         unsigned OptSize,
5757 bool &IsSigned) {
5758 OperandSignedness LHSSign;
5759
5760 // The LHS operand must be a demotable op
5761 if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
5762 return false;
5763
5764 // We should have been able to determine the signedness from the LHS
5765 if (LHSSign == Unknown)
5766 return false;
5767
5768 IsSigned = (LHSSign == Signed);
5769
5770 // The RHS can be a demotable op or a constant
5771   if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
5772     const APInt &Val = CI->getAPIntValue();
5773 if (LHSSign == Unsigned) {
5774 return Val.isIntN(OptSize);
5775 } else {
5776 return Val.isSignedIntN(OptSize);
5777 }
5778 } else {
5779 OperandSignedness RHSSign;
5780 if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
5781 return false;
5782
5783 return LHSSign == RHSSign;
5784 }
5785}
5786
5787/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
5788/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
5789/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
5790/// amount.
5791 static SDValue TryMULWIDECombine(SDNode *N,
5792                                  TargetLowering::DAGCombinerInfo &DCI) {
5793   EVT MulType = N->getValueType(0);
5794 if (MulType != MVT::i32 && MulType != MVT::i64) {
5795 return SDValue();
5796 }
5797
5798 SDLoc DL(N);
5799 unsigned OptSize = MulType.getSizeInBits() >> 1;
5800 SDValue LHS = N->getOperand(0);
5801 SDValue RHS = N->getOperand(1);
5802
5803 // Canonicalize the multiply so the constant (if any) is on the right
5804 if (N->getOpcode() == ISD::MUL) {
5805 if (isa<ConstantSDNode>(LHS)) {
5806 std::swap(LHS, RHS);
5807 }
5808 }
5809
5810 // If we have a SHL, determine the actual multiply amount
5811 if (N->getOpcode() == ISD::SHL) {
5812     ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
5813     if (!ShlRHS) {
5814 return SDValue();
5815 }
5816
5817 APInt ShiftAmt = ShlRHS->getAPIntValue();
5818 unsigned BitWidth = MulType.getSizeInBits();
5819 if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
5820 APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
5821 RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
5822 } else {
5823 return SDValue();
5824 }
5825 }
5826
5827 bool Signed;
5828 // Verify that our operands are demotable
5829 if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
5830 return SDValue();
5831 }
5832
5833 EVT DemotedVT;
5834 if (MulType == MVT::i32) {
5835 DemotedVT = MVT::i16;
5836 } else {
5837 DemotedVT = MVT::i32;
5838 }
5839
5840 // Truncate the operands to the correct size. Note that these are just for
5841 // type consistency and will (likely) be eliminated in later phases.
5842 SDValue TruncLHS =
5843 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
5844 SDValue TruncRHS =
5845 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
5846
5847 unsigned Opc;
5848 if (Signed) {
5849     Opc = NVPTXISD::MUL_WIDE_SIGNED;
5850   } else {
5851     Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
5852   }
5853
5854 return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
5855}
5856
5857static bool isConstOne(const SDValue &Operand) {
5858 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
5859 return Const && Const->getZExtValue() == 1;
5860}
5861
5862 static SDValue matchMADConstOnePattern(SDValue Add) {
5863   if (Add->getOpcode() != ISD::ADD)
5864 return SDValue();
5865
5866 if (isConstOne(Add->getOperand(0)))
5867 return Add->getOperand(1);
5868
5869 if (isConstOne(Add->getOperand(1)))
5870 return Add->getOperand(0);
5871
5872 return SDValue();
5873}
5874
5875 static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL,
5876                                   TargetLowering::DAGCombinerInfo &DCI) {
5877
5878   if (SDValue Y = matchMADConstOnePattern(Add)) {
5879     SDValue Mul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
5880 return DCI.DAG.getNode(ISD::ADD, DL, VT, Mul, X);
5881 }
5882
5883 return SDValue();
5884}
5885
5886 static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT,
5887                                         SDLoc DL,
5888                                         TargetLowering::DAGCombinerInfo &DCI) {
5889   if (Select->getOpcode() != ISD::SELECT)
5890 return SDValue();
5891
5892 SDValue Cond = Select->getOperand(0);
5893
5894 unsigned ConstOpNo;
5895 if (isConstOne(Select->getOperand(1)))
5896 ConstOpNo = 1;
5897 else if (isConstOne(Select->getOperand(2)))
5898 ConstOpNo = 2;
5899 else
5900 return SDValue();
5901
5902 SDValue Y = Select->getOperand((ConstOpNo == 1) ? 2 : 1);
5903
5904 // Do not combine if the resulting sequence is not obviously profitable.
5905   if (!matchMADConstOnePattern(Y))
5906     return SDValue();
5907
5908 SDValue NewMul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
5909
5910 return DCI.DAG.getNode(ISD::SELECT, DL, VT, Cond,
5911 (ConstOpNo == 1) ? X : NewMul,
5912 (ConstOpNo == 1) ? NewMul : X);
5913}
5914
5915static SDValue
5916 PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
5917                               TargetLowering::DAGCombinerInfo &DCI) {
5918
5919 EVT VT = N0.getValueType();
5920 if (VT.isVector())
5921 return SDValue();
5922
5923 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
5924 return SDValue();
5925
5926 SDLoc DL(N);
5927
5928 // (mul x, (add y, 1)) -> (add (mul x, y), x)
5929 if (SDValue Res = combineMADConstOne(N0, N1, VT, DL, DCI))
5930 return Res;
5931 if (SDValue Res = combineMADConstOne(N1, N0, VT, DL, DCI))
5932 return Res;
5933
5934 // (mul x, (select y, 1)) -> (select (mul x, y), x)
5935 if (SDValue Res = combineMulSelectConstOne(N0, N1, VT, DL, DCI))
5936 return Res;
5937 if (SDValue Res = combineMulSelectConstOne(N1, N0, VT, DL, DCI))
5938 return Res;
5939
5940 return SDValue();
5941}
5942
5943/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
5944 static SDValue PerformMULCombine(SDNode *N,
5945                                  TargetLowering::DAGCombinerInfo &DCI,
5946                                  CodeGenOptLevel OptLevel) {
5947 if (OptLevel == CodeGenOptLevel::None)
5948 return SDValue();
5949
5950 if (SDValue Ret = TryMULWIDECombine(N, DCI))
5951 return Ret;
5952
5953 SDValue N0 = N->getOperand(0);
5954 SDValue N1 = N->getOperand(1);
5955 return PerformMULCombineWithOperands(N, N0, N1, DCI);
5956}
5957
5958/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
5959 static SDValue PerformSHLCombine(SDNode *N,
5960                                  TargetLowering::DAGCombinerInfo &DCI,
5961                                  CodeGenOptLevel OptLevel) {
5962 if (OptLevel > CodeGenOptLevel::None) {
5963 // Try mul.wide combining at OptLevel > 0
5964 if (SDValue Ret = TryMULWIDECombine(N, DCI))
5965 return Ret;
5966 }
5967
5968 return SDValue();
5969}
5970
5971 static SDValue PerformSETCCCombine(SDNode *N,
5972                                    TargetLowering::DAGCombinerInfo &DCI,
5973                                    unsigned int SmVersion) {
5974 EVT CCType = N->getValueType(0);
5975 SDValue A = N->getOperand(0);
5976 SDValue B = N->getOperand(1);
5977
5978 EVT AType = A.getValueType();
5979 if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16)))
5980 return SDValue();
5981
5982 if (A.getValueType() == MVT::v2bf16 && SmVersion < 90)
5983 return SDValue();
5984
5985 SDLoc DL(N);
5986 // setp.f16x2 returns two scalar predicates, which we need to
5987 // convert back to v2i1. The returned result will be scalarized by
5988 // the legalizer, but the comparison will remain a single vector
5989 // instruction.
5990 SDValue CCNode = DCI.DAG.getNode(
5991 A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2
5992                                      : NVPTXISD::SETP_BF16X2,
5993       DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)});
5994 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
5995 CCNode.getValue(1));
5996}
5997
5998 static SDValue PerformEXTRACTCombine(SDNode *N,
5999                                      TargetLowering::DAGCombinerInfo &DCI) {
6000   SDValue Vector = N->getOperand(0);
6001 if (Vector->getOpcode() == ISD::FREEZE)
6002 Vector = Vector->getOperand(0);
6003 SDLoc DL(N);
6004 EVT VectorVT = Vector.getValueType();
6005 if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
6006 IsPTXVectorType(VectorVT.getSimpleVT()))
6007 return SDValue(); // Native vector loads already combine nicely w/
6008 // extract_vector_elt.
6009 // Don't mess with singletons or packed types (v2*32, v2*16, v4i8 and v8i8),
6010 // we already handle them OK.
6011 if (VectorVT.getVectorNumElements() == 1 ||
6012 NVPTX::isPackedVectorTy(VectorVT) || VectorVT == MVT::v8i8)
6013 return SDValue();
6014
6015 // Don't mess with undef values as sra may be simplified to 0, not undef.
6016 if (Vector->isUndef() || ISD::allOperandsUndef(Vector.getNode()))
6017 return SDValue();
6018
6019 uint64_t VectorBits = VectorVT.getSizeInBits();
6020 // We only handle the types we can extract in-register.
6021 if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))
6022 return SDValue();
6023
6024 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1));
6025 // Index == 0 is handled by generic DAG combiner.
6026 if (!Index || Index->getZExtValue() == 0)
6027 return SDValue();
6028
6029 MVT IVT = MVT::getIntegerVT(VectorBits);
6030 EVT EltVT = VectorVT.getVectorElementType();
6031 EVT EltIVT = EltVT.changeTypeToInteger();
6032 uint64_t EltBits = EltVT.getScalarSizeInBits();
6033
6034 SDValue Result = DCI.DAG.getNode(
6035 ISD::TRUNCATE, DL, EltIVT,
6036 DCI.DAG.getNode(
6037 ISD::SRA, DL, IVT, DCI.DAG.getNode(ISD::BITCAST, DL, IVT, Vector),
6038 DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT)));
6039
6040 // If element has non-integer type, bitcast it back to the expected type.
6041 if (EltVT != EltIVT)
6042 Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
6043   // Past the legalizer, we may need to extend i8 -> i16 to match the register type.
6044 if (EltVT != N->getValueType(0))
6045 Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);
6046
6047 return Result;
6048}
6049
6050 static SDValue PerformVSELECTCombine(SDNode *N,
6051                                      TargetLowering::DAGCombinerInfo &DCI) {
6052   SDValue VA = N->getOperand(1);
6053 EVT VectorVT = VA.getValueType();
6054 if (VectorVT != MVT::v4i8)
6055 return SDValue();
6056
6057   // We need to split the vselect into individual per-element operations.
6058   // Because we use BFE/BFI instructions for byte extraction/insertion, we end
6059   // up with 32-bit values anyway, so we may as well do the comparison as i32 to
6060   // avoid the conversions to/from i16 normally used for i8 values.
6061   SmallVector<SDValue, 4> E;
6062   SDLoc DL(N);
6063 SDValue VCond = N->getOperand(0);
6064 SDValue VB = N->getOperand(2);
6065 for (int I = 0; I < 4; ++I) {
6066 SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond,
6067 DCI.DAG.getConstant(I, DL, MVT::i32));
6068 SDValue EA = DCI.DAG.getAnyExtOrTrunc(
6069 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA,
6070 DCI.DAG.getConstant(I, DL, MVT::i32)),
6071 DL, MVT::i32);
6072 SDValue EB = DCI.DAG.getAnyExtOrTrunc(
6073 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB,
6074 DCI.DAG.getConstant(I, DL, MVT::i32)),
6075 DL, MVT::i32);
6076 E.push_back(DCI.DAG.getAnyExtOrTrunc(
6077 DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8));
6078 }
6079 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
6080}
6081
6082static SDValue
6083 PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
6084   auto VT = N->getValueType(0);
6085 if (!DCI.isAfterLegalizeDAG() ||
6086 // only process v2*16 types
6087 !(NVPTX::isPackedVectorTy(VT) && VT.is32BitVector() &&
6088 VT.getVectorNumElements() == 2))
6089 return SDValue();
6090
6091 auto Op0 = N->getOperand(0);
6092 auto Op1 = N->getOperand(1);
6093
6094 // Start out by assuming we want to take the lower 2 bytes of each i32
6095 // operand.
6096 uint64_t Op0Bytes = 0x10;
6097 uint64_t Op1Bytes = 0x54;
6098
6099 std::pair<SDValue *, uint64_t *> OpData[2] = {{&Op0, &Op0Bytes},
6100 {&Op1, &Op1Bytes}};
6101
6102 // Check that each operand is an i16, truncated from an i32 operand. We'll
6103 // select individual bytes from those original operands. Optionally, fold in a
6104 // shift right of that original operand.
6105 for (auto &[Op, OpBytes] : OpData) {
6106 // Eat up any bitcast
6107 if (Op->getOpcode() == ISD::BITCAST)
6108 *Op = Op->getOperand(0);
6109
6110 if (!(Op->getValueType() == MVT::i16 && Op->getOpcode() == ISD::TRUNCATE &&
6111 Op->getOperand(0).getValueType() == MVT::i32))
6112 return SDValue();
6113
6114 // If the truncate has multiple uses, this optimization can increase
6115 // register pressure
6116 if (!Op->hasOneUse())
6117 return SDValue();
6118
6119 *Op = Op->getOperand(0);
6120
6121 // Optionally, fold in a shift-right of the original operand and let permute
6122 // pick the two higher bytes of the original value directly.
6123 if (Op->getOpcode() == ISD::SRL && isa<ConstantSDNode>(Op->getOperand(1))) {
6124 if (cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue() == 16) {
6125 // Shift the PRMT byte selector to pick upper bytes from each respective
6126 // value, instead of the lower ones: 0x10 -> 0x32, 0x54 -> 0x76
6127 assert((*OpBytes == 0x10 || *OpBytes == 0x54) &&
6128 "PRMT selector values out of range");
6129 *OpBytes += 0x22;
6130 *Op = Op->getOperand(0);
6131 }
6132 }
6133 }
6134
6135 SDLoc DL(N);
6136 auto &DAG = DCI.DAG;
6137
6138 auto PRMT =
6139 getPRMT(DAG.getBitcast(MVT::i32, Op0), DAG.getBitcast(MVT::i32, Op1),
6140 (Op1Bytes << 8) | Op0Bytes, DL, DAG);
6141 return DAG.getBitcast(VT, PRMT);
6142}
6143
6144 static SDValue combineADDRSPACECAST(SDNode *N,
6145                                     TargetLowering::DAGCombinerInfo &DCI) {
6146   auto *ASCN1 = cast<AddrSpaceCastSDNode>(N);
6147
6148 if (auto *ASCN2 = dyn_cast<AddrSpaceCastSDNode>(ASCN1->getOperand(0))) {
6149 assert(ASCN2->getDestAddressSpace() == ASCN1->getSrcAddressSpace());
6150
6151 // Fold asc[B -> A](asc[A -> B](x)) -> x
6152 if (ASCN1->getDestAddressSpace() == ASCN2->getSrcAddressSpace())
6153 return ASCN2->getOperand(0);
6154 }
6155
6156 return SDValue();
6157}
6158
6159// Given a constant selector value and a prmt mode, return the selector value
6160// normalized to the generic prmt mode. See the PTX ISA documentation for more
6161// details:
6162// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt
6163static APInt getPRMTSelector(const APInt &Selector, unsigned Mode) {
6164 assert(Selector.getBitWidth() == 32 && "PRMT must have i32 operands");
6165
6166   if (Mode == NVPTX::PTXPrmtMode::NONE)
6167     return Selector;
6168
6169 const unsigned V = Selector.trunc(2).getZExtValue();
6170
6171 const auto GetSelector = [](unsigned S0, unsigned S1, unsigned S2,
6172 unsigned S3) {
6173 return APInt(32, S0 | (S1 << 4) | (S2 << 8) | (S3 << 12));
6174 };
6175
6176 switch (Mode) {
6178 return GetSelector(V, V + 1, V + 2, V + 3);
6180 return GetSelector(V, (V - 1) & 7, (V - 2) & 7, (V - 3) & 7);
6182 return GetSelector(V, V, V, V);
6184 return GetSelector(V, std::max(V, 1U), std::max(V, 2U), 3U);
6186 return GetSelector(0, std::min(V, 1U), std::min(V, 2U), V);
6188 unsigned V1 = (V & 1) << 1;
6189 return GetSelector(V1, V1 + 1, V1, V1 + 1);
6190 }
6191 default:
6192 llvm_unreachable("Invalid PRMT mode");
6193 }
6194}
6195
6196static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode) {
6197 assert(A.getBitWidth() == 32 && B.getBitWidth() == 32 &&
6198 Selector.getBitWidth() == 32 && "PRMT must have i32 operands");
6199 // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
6200 APInt BitField = B.concat(A);
6201 APInt SelectorVal = getPRMTSelector(Selector, Mode);
6202 APInt Result(32, 0);
6203 for (unsigned I : llvm::seq(4U)) {
6204 APInt Sel = SelectorVal.extractBits(4, I * 4);
6205 unsigned Idx = Sel.getLoBits(3).getZExtValue();
6206 unsigned Sign = Sel.getHiBits(1).getZExtValue();
6207 APInt Byte = BitField.extractBits(8, Idx * 8);
6208 if (Sign)
6209 Byte = Byte.ashr(8);
6210 Result.insertBits(Byte, I * 8);
6211 }
6212 return Result;
6213}
6214
6215 static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
6216                            CodeGenOptLevel OptLevel) {
6217 if (OptLevel == CodeGenOptLevel::None)
6218 return SDValue();
6219
6220 // Constant fold PRMT
6221 if (isa<ConstantSDNode>(N->getOperand(0)) &&
6222 isa<ConstantSDNode>(N->getOperand(1)) &&
6223 isa<ConstantSDNode>(N->getOperand(2)))
6224 return DCI.DAG.getConstant(computePRMT(N->getConstantOperandAPInt(0),
6225 N->getConstantOperandAPInt(1),
6226 N->getConstantOperandAPInt(2),
6227 N->getConstantOperandVal(3)),
6228 SDLoc(N), N->getValueType(0));
6229 return SDValue();
6230}
6231
6232// During call lowering we wrap the return values in a ProxyReg node which
6233 // depends on the chain value produced by the completed call. This ensures that
6234// the full call is emitted in cases where libcalls are used to legalize
6235// operations. To improve the functioning of other DAG combines we pull all
6236// operations we can through one of these nodes, ensuring that the ProxyReg
6237// directly wraps a load. That is:
6238//
6239// (ProxyReg (zext (load retval0))) => (zext (ProxyReg (load retval0)))
6240//
6241 static SDValue sinkProxyReg(SDValue R, SDValue Chain,
6242                             TargetLowering::DAGCombinerInfo &DCI) {
6243   switch (R.getOpcode()) {
6244 case ISD::TRUNCATE:
6245 case ISD::ANY_EXTEND:
6246 case ISD::SIGN_EXTEND:
6247 case ISD::ZERO_EXTEND:
6248 case ISD::BITCAST: {
6249 if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI))
6250 return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), V);
6251 return SDValue();
6252 }
6253 case ISD::SHL:
6254 case ISD::SRL:
6255 case ISD::SRA:
6256 case ISD::OR: {
6257 if (SDValue A = sinkProxyReg(R.getOperand(0), Chain, DCI))
6258 if (SDValue B = sinkProxyReg(R.getOperand(1), Chain, DCI))
6259 return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), A, B);
6260 return SDValue();
6261 }
6262 case ISD::Constant:
6263 return R;
6264 case ISD::LOAD:
6265 case NVPTXISD::LoadV2:
6266 case NVPTXISD::LoadV4: {
6267 return DCI.DAG.getNode(NVPTXISD::ProxyReg, SDLoc(R), R.getValueType(),
6268 {Chain, R});
6269 }
6270 case ISD::BUILD_VECTOR: {
6271 if (DCI.isBeforeLegalize())
6272 return SDValue();
6273
6274     SmallVector<SDValue> Ops;
6275     for (auto &Op : R->ops()) {
6276 SDValue V = sinkProxyReg(Op, Chain, DCI);
6277 if (!V)
6278 return SDValue();
6279 Ops.push_back(V);
6280 }
6281 return DCI.DAG.getNode(ISD::BUILD_VECTOR, SDLoc(R), R.getValueType(), Ops);
6282 }
6283   case ISD::EXTRACT_VECTOR_ELT: {
6284     if (DCI.isBeforeLegalize())
6285 return SDValue();
6286
6287 if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI))
6288       return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(R),
6289                              R.getValueType(), V, R.getOperand(1));
6290 return SDValue();
6291 }
6292 default:
6293 return SDValue();
6294 }
6295}
6296
6297 static SDValue combineProxyReg(SDNode *N,
6298                                TargetLowering::DAGCombinerInfo &DCI) {
6299
6300 SDValue Chain = N->getOperand(0);
6301 SDValue Reg = N->getOperand(1);
6302
6303 // If the ProxyReg is not wrapping a load, try to pull the operations through
6304 // the ProxyReg.
6305 if (Reg.getOpcode() != ISD::LOAD) {
6306 if (SDValue V = sinkProxyReg(Reg, Chain, DCI))
6307 return V;
6308 }
6309
6310 return SDValue();
6311}
6312
6313SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
6314 DAGCombinerInfo &DCI) const {
6315   CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
6316   switch (N->getOpcode()) {
6317 default:
6318 break;
6319 case ISD::ADD:
6320 return PerformADDCombine(N, DCI, OptLevel);
6321 case ISD::ADDRSPACECAST:
6322 return combineADDRSPACECAST(N, DCI);
6323 case ISD::SIGN_EXTEND:
6324 case ISD::ZERO_EXTEND:
6325 return combineMulWide(N, DCI, OptLevel);
6326 case ISD::BUILD_VECTOR:
6327 return PerformBUILD_VECTORCombine(N, DCI);
6328   case ISD::EXTRACT_VECTOR_ELT:
6329     return PerformEXTRACTCombine(N, DCI);
6330 case ISD::FADD:
6331 return PerformFADDCombine(N, DCI, OptLevel);
6332 case ISD::FMAXNUM:
6333 case ISD::FMINNUM:
6334 case ISD::FMAXIMUM:
6335 case ISD::FMINIMUM:
6336 case ISD::FMAXIMUMNUM:
6337 case ISD::FMINIMUMNUM:
6338 return PerformFMinMaxCombine(N, DCI, STI.getPTXVersion(),
6339 STI.getSmVersion());
6340 case ISD::LOAD:
6341 case NVPTXISD::LoadV2:
6342 case NVPTXISD::LoadV4:
6343 return combineLOAD(N, DCI, STI);
6344 case ISD::MUL:
6345 return PerformMULCombine(N, DCI, OptLevel);
6346 case NVPTXISD::PRMT:
6347 return combinePRMT(N, DCI, OptLevel);
6348 case NVPTXISD::ProxyReg:
6349 return combineProxyReg(N, DCI);
6350 case ISD::SETCC:
6351 return PerformSETCCCombine(N, DCI, STI.getSmVersion());
6352 case ISD::SHL:
6353 return PerformSHLCombine(N, DCI, OptLevel);
6354 case ISD::SREM:
6355 case ISD::UREM:
6356 return PerformREMCombine(N, DCI, OptLevel);
6357 case ISD::STORE:
6358 case NVPTXISD::StoreV2:
6359 case NVPTXISD::StoreV4:
6360 return combineSTORE(N, DCI, STI);
6361 case ISD::VSELECT:
6362 return PerformVSELECTCombine(N, DCI);
6363 }
6364 return SDValue();
6365}
6366
6367 static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG,
6368                            SmallVectorImpl<SDValue> &Results) {
6369   // Handle bitcasting to v2i8 without hitting the default promotion
6370 // strategy which goes through stack memory.
6371 SDValue Op(Node, 0);
6372 EVT ToVT = Op->getValueType(0);
6373 if (ToVT != MVT::v2i8) {
6374 return;
6375 }
6376
6377 // Bitcast to i16 and unpack elements into a vector
6378 SDLoc DL(Node);
6379 SDValue AsInt = DAG.getBitcast(MVT::i16, Op->getOperand(0));
6380 SDValue Vec0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, AsInt);
6381 SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
6382 SDValue Vec1 =
6383 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
6384 DAG.getNode(ISD::SRL, DL, MVT::i16, {AsInt, Const8}));
6385 Results.push_back(
6386 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i8, {Vec0, Vec1}));
6387}
6388
6389 static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
6390                                      SmallVectorImpl<SDValue> &Results) {
6391   SDValue Chain = N->getOperand(0);
6392 SDValue Intrin = N->getOperand(1);
6393 SDLoc DL(N);
6394
6395 // Get the intrinsic ID
6396 unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
6397 switch (IntrinNo) {
6398 default:
6399 return;
6400 case Intrinsic::nvvm_ldu_global_i:
6401 case Intrinsic::nvvm_ldu_global_f:
6402 case Intrinsic::nvvm_ldu_global_p: {
6403 EVT ResVT = N->getValueType(0);
6404
6405 if (ResVT.isVector()) {
6406 // Vector LDG/LDU
6407
6408 unsigned NumElts = ResVT.getVectorNumElements();
6409 EVT EltVT = ResVT.getVectorElementType();
6410
6411 // Since LDU/LDG are target nodes, we cannot rely on DAG type
6412 // legalization.
6413 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
6414 // loaded type to i16 and propagate the "real" type as the memory type.
6415 bool NeedTrunc = false;
6416 if (EltVT.getSizeInBits() < 16) {
6417 EltVT = MVT::i16;
6418 NeedTrunc = true;
6419 }
6420
6421 unsigned Opcode = 0;
6422 SDVTList LdResVTs;
6423
6424 switch (NumElts) {
6425 default:
6426 return;
6427 case 2:
6428 Opcode = NVPTXISD::LDUV2;
6429 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
6430 break;
6431 case 4: {
6432 Opcode = NVPTXISD::LDUV4;
6433 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
6434 LdResVTs = DAG.getVTList(ListVTs);
6435 break;
6436 }
6437 }
6438
6439 SmallVector<SDValue, 8> OtherOps;
6440
6441 // Copy regular operands
6442
6443 OtherOps.push_back(Chain); // Chain
6444 // Skip operand 1 (intrinsic ID)
6445 // Others
6446 OtherOps.append(N->op_begin() + 2, N->op_end());
6447
6449
6450 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
6451 MemSD->getMemoryVT(),
6452 MemSD->getMemOperand());
6453
6454 SmallVector<SDValue, 4> ScalarRes;
6455
6456 for (unsigned i = 0; i < NumElts; ++i) {
6457 SDValue Res = NewLD.getValue(i);
6458 if (NeedTrunc)
6459 Res =
6460 DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
6461 ScalarRes.push_back(Res);
6462 }
6463
6464 SDValue LoadChain = NewLD.getValue(NumElts);
6465
6466 SDValue BuildVec =
6467 DAG.getBuildVector(ResVT, DL, ScalarRes);
6468
6469 Results.push_back(BuildVec);
6470 Results.push_back(LoadChain);
6471 } else {
6472 // i8 LDG/LDU
6473 assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
6474 "Custom handling of non-i8 ldu/ldg?");
6475
6476 // Just copy all operands as-is
6478
6479 // Force output to i16
6480 SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
6481
6483
6484 // We make sure the memory type is i8, which will be used during isel
6485 // to select the proper instruction.
6486 SDValue NewLD =
6488 MVT::i8, MemSD->getMemOperand());
6489
6490 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
6491 NewLD.getValue(0)));
6492 Results.push_back(NewLD.getValue(1));
6493 }
6494 return;
6495 }
6496
6497 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
6498 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
6499 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
6500 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
6501 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
6502 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
6503 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
6504 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
6505 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
6506 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
6507 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
6508 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
6509 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
6510 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
6511 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
6512 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
6513 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
6514 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
6515 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
6516 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
6517 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
6518 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
6519 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
6520 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
6521 if (auto Res = lowerTcgen05Ld(N, DAG)) {
6522 Results.push_back(Res->first);
6523 Results.push_back(Res->second);
6524 }
6525 return;
6526
6527 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
6528 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
6529 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
6530 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
6531 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
6532 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
6533 if (auto Res = lowerTcgen05Ld(N, DAG, /*HasOffset=*/true)) {
6534 Results.push_back(Res->first);
6535 Results.push_back(Res->second);
6536 }
6537 return;
6538 }
6539}
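For the sub-16-bit element case above, every lane comes back from the LDU node as an i16 and is truncated before the BUILD_VECTOR. A scalar sketch of that per-lane step (hypothetical helper, assuming a 4-element i8 result):

#include <cstdint>

// Sketch only: widened i16 lanes are narrowed back to the real element type,
// one ISD::TRUNCATE per lane, before the vector is rebuilt.
void narrowWidenedLanes(const uint16_t Widened[4], uint8_t Out[4]) {
  for (int I = 0; I < 4; ++I)
    Out[I] = static_cast<uint8_t>(Widened[I]);
}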
6540
6541 static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG,
6542                                    SmallVectorImpl<SDValue> &Results) {
6543 // Change the CopyFromReg to output 2 64-bit results instead of a 128-bit
6544 // result so that it can pass the legalization
6545 SDLoc DL(N);
6546 SDValue Chain = N->getOperand(0);
6547 SDValue Reg = N->getOperand(1);
6548 SDValue Glue = N->getOperand(2);
6549
6550 assert(Reg.getValueType() == MVT::i128 &&
6551 "Custom lowering for CopyFromReg with 128-bit reg only");
6552 SmallVector<EVT, 4> ResultsType = {MVT::i64, MVT::i64, N->getValueType(1),
6553 N->getValueType(2)};
6554 SmallVector<SDValue, 3> NewOps = {Chain, Reg, Glue};
6555
6556 SDValue NewValue = DAG.getNode(ISD::CopyFromReg, DL, ResultsType, NewOps);
6557 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
6558 {NewValue.getValue(0), NewValue.getValue(1)});
6559
6560 Results.push_back(Pair);
6561 Results.push_back(NewValue.getValue(2));
6562 Results.push_back(NewValue.getValue(3));
6563}
6564
6565 static void replaceProxyReg(SDNode *N, SelectionDAG &DAG,
6566                             const TargetLowering &TLI,
6567                             SmallVectorImpl<SDValue> &Results) {
6568 SDValue Chain = N->getOperand(0);
6569 SDValue Reg = N->getOperand(1);
6570
6571 MVT VT = TLI.getRegisterType(*DAG.getContext(), Reg.getValueType());
6572
6573 SDValue NewReg = DAG.getAnyExtOrTrunc(Reg, SDLoc(N), VT);
6574 SDValue NewProxy =
6575 DAG.getNode(NVPTXISD::ProxyReg, SDLoc(N), VT, {Chain, NewReg});
6576 SDValue Res = DAG.getAnyExtOrTrunc(NewProxy, SDLoc(N), N->getValueType(0));
6577
6578 Results.push_back(Res);
6579}
6580
6581 static void replaceAtomicSwap128(SDNode *N, SelectionDAG &DAG,
6582                                  const NVPTXSubtarget &STI,
6583                                  SmallVectorImpl<SDValue> &Results) {
6584 assert(N->getValueType(0) == MVT::i128 &&
6585 "Custom lowering for atomic128 only supports i128");
6586
6587 AtomicSDNode *AN = cast<AtomicSDNode>(N);
6588 SDLoc dl(N);
6589
6590 if (!STI.hasAtomSwap128()) {
6591 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
6592 DAG.getMachineFunction().getFunction(),
6593 "Support for b128 atomics introduced in PTX ISA version 8.3 and "
6594 "requires target sm_90.",
6595 dl.getDebugLoc()));
6596
6597 Results.push_back(DAG.getUNDEF(MVT::i128));
6598 Results.push_back(AN->getOperand(0)); // Chain
6599 return;
6600 }
6601
6603 Ops.push_back(AN->getOperand(0)); // Chain
6604 Ops.push_back(AN->getOperand(1)); // Ptr
6605 for (const auto &Op : AN->ops().drop_front(2)) {
6606 // Low part
6607 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op,
6608 DAG.getIntPtrConstant(0, dl)));
6609 // High part
6610 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op,
6611 DAG.getIntPtrConstant(1, dl)));
6612 }
6613 unsigned Opcode = N->getOpcode() == ISD::ATOMIC_SWAP
6616 SDVTList Tys = DAG.getVTList(MVT::i64, MVT::i64, MVT::Other);
6617 SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, MVT::i128,
6618 AN->getMemOperand());
6619 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i128,
6620 {Result.getValue(0), Result.getValue(1)}));
6621 Results.push_back(Result.getValue(2));
6622}
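The EXTRACT_ELEMENT / BUILD_PAIR pattern above is a 128-bit value split into a low and a high i64 and glued back together. A scalar sketch (hypothetical helpers; unsigned __int128 is used purely for illustration):

#include <cstdint>

using U128 = unsigned __int128;

// Sketch only: EXTRACT_ELEMENT index 0 yields the low half, index 1 the high half.
void splitI128(U128 V, uint64_t &Lo, uint64_t &Hi) {
  Lo = static_cast<uint64_t>(V);
  Hi = static_cast<uint64_t>(V >> 64);
}

// Sketch only: BUILD_PAIR reassembles the i128 from the two i64 results.
U128 buildPairI128(uint64_t Lo, uint64_t Hi) {
  return (static_cast<U128>(Hi) << 64) | Lo;
}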
6623
6624void NVPTXTargetLowering::ReplaceNodeResults(
6625     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
6626 switch (N->getOpcode()) {
6627 default:
6628 report_fatal_error("Unhandled custom legalization");
6629 case ISD::BITCAST:
6630 ReplaceBITCAST(N, DAG, Results);
6631 return;
6632 case ISD::LOAD:
6633 replaceLoadVector(N, DAG, Results, STI);
6634 return;
6635 case ISD::INTRINSIC_W_CHAIN:
6636 ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
6637 return;
6638 case ISD::CopyFromReg:
6639 ReplaceCopyFromReg_128(N, DAG, Results);
6640 return;
6641 case NVPTXISD::ProxyReg:
6642 replaceProxyReg(N, DAG, *this, Results);
6643 return;
6644 case ISD::ATOMIC_CMP_SWAP:
6645 case ISD::ATOMIC_SWAP:
6646 replaceAtomicSwap128(N, DAG, STI, Results);
6647 return;
6648 }
6649}
6650
6651 NVPTXTargetLowering::AtomicExpansionKind
6652 NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
6653 Type *Ty = AI->getValOperand()->getType();
6654
6655 if (AI->isFloatingPointOperation()) {
6656 if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
6657 if (Ty->isHalfTy() && STI.getSmVersion() >= 70 &&
6658 STI.getPTXVersion() >= 63)
6659 return AtomicExpansionKind::None;
6660 if (Ty->isBFloatTy() && STI.getSmVersion() >= 90 &&
6661 STI.getPTXVersion() >= 78)
6662 return AtomicExpansionKind::None;
6663 if (Ty->isFloatTy())
6664 return AtomicExpansionKind::None;
6665 if (Ty->isDoubleTy() && STI.hasAtomAddF64())
6666 return AtomicExpansionKind::None;
6667 }
6668 return AtomicExpansionKind::CmpXChg;
6669 }
6670
6671 assert(Ty->isIntegerTy() && "Ty should be integer at this point");
6672 const unsigned BitWidth = cast<IntegerType>(Ty)->getBitWidth();
6673
6674 switch (AI->getOperation()) {
6675 default:
6676 return AtomicExpansionKind::CmpXChg;
6677 case AtomicRMWInst::BinOp::Xchg:
6678 if (BitWidth == 128)
6679 return AtomicExpansionKind::None;
6680 [[fallthrough]];
6681 case AtomicRMWInst::BinOp::And:
6682 case AtomicRMWInst::BinOp::Or:
6683 case AtomicRMWInst::BinOp::Xor:
6684 switch (BitWidth) {
6685 case 8:
6686 case 16:
6687 return AtomicExpansionKind::CmpXChg;
6688 case 32:
6689 return AtomicExpansionKind::None;
6690 case 64:
6691 if (STI.hasAtomBitwise64())
6692 return AtomicExpansionKind::None;
6693 return AtomicExpansionKind::CmpXChg;
6694 case 128:
6695 return AtomicExpansionKind::CmpXChg;
6696 default:
6697 llvm_unreachable("unsupported width encountered");
6698 }
6699 case AtomicRMWInst::BinOp::Add:
6700 case AtomicRMWInst::BinOp::Sub:
6701 case AtomicRMWInst::BinOp::Max:
6702 case AtomicRMWInst::BinOp::Min:
6703 case AtomicRMWInst::BinOp::UMax:
6704 case AtomicRMWInst::BinOp::UMin:
6705 switch (BitWidth) {
6706 case 8:
6707 case 16:
6708 return AtomicExpansionKind::CmpXChg;
6709 case 32:
6710 return AtomicExpansionKind::None;
6711 case 64:
6712 if (STI.hasAtomMinMax64())
6713 return AtomicExpansionKind::None;
6714 return AtomicExpansionKind::CmpXChg;
6715 case 128:
6716 return AtomicExpansionKind::CmpXChg;
6717 default:
6718 llvm_unreachable("unsupported width encountered");
6719 }
6720 case AtomicRMWInst::BinOp::UIncWrap:
6721 case AtomicRMWInst::BinOp::UDecWrap:
6722 switch (BitWidth) {
6723 case 32:
6724 return AtomicExpansionKind::None;
6725 case 8:
6726 case 16:
6727 case 64:
6728 case 128:
6729 return AtomicExpansionKind::CmpXChg;
6730 default:
6731 llvm_unreachable("unsupported width encountered");
6732 }
6733 }
6734
6735 return AtomicExpansionKind::CmpXChg;
6736 }
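Returning AtomicExpansionKind::CmpXChg hands the RMW to the AtomicExpand pass, which rewrites it as a compare-exchange retry loop. A rough C++ analogue of that rewrite for a hypothetical unsigned-min RMW the hardware cannot perform directly:

#include <atomic>
#include <cstdint>

// Sketch only: the original atomic umin becomes a CAS loop; compare_exchange_weak
// refreshes Old on failure, so the loop retries with the latest value.
uint64_t atomicUMinViaCAS(std::atomic<uint64_t> &Addr, uint64_t Val) {
  uint64_t Old = Addr.load();
  while (!Addr.compare_exchange_weak(Old, Old < Val ? Old : Val)) {
  }
  return Old;
}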
6737
6738 bool NVPTXTargetLowering::shouldInsertFencesForAtomic(
6739     const Instruction *I) const {
6740 auto *CI = dyn_cast<AtomicCmpXchgInst>(I);
6741 // When CAS bitwidth is not supported on the hardware, the CAS is emulated
6742 // using a retry loop that uses a higher-bitwidth monotonic CAS. We enforce
6743 // the memory order using explicit fences around the retry loop.
6744 // The memory order of natively supported CAS operations can be enforced
6745 // by lowering to an atom.cas with the right memory synchronizing effect.
6746 // However, atom.cas only supports relaxed, acquire, release and acq_rel.
6747 // So we also use explicit fences for enforcing memory order for
6748 // seq_cst CAS with natively-supported bitwidths.
6749 return CI &&
6750 (cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() <
6751 STI.getMinCmpXchgSizeInBits() ||
6752 CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent);
6753}
6754
6755 AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit(
6756     const Instruction *I) const {
6757 auto *CI = dyn_cast<AtomicCmpXchgInst>(I);
6758 bool BitwidthSupportedAndIsSeqCst =
6759 CI && CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent &&
6760 cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() >=
6761 STI.getMinCmpXchgSizeInBits();
6762 return BitwidthSupportedAndIsSeqCst ? AtomicOrdering::Acquire
6764}
6765
6766 Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
6767                                                    Instruction *Inst,
6768 AtomicOrdering Ord) const {
6769 if (!isa<AtomicCmpXchgInst>(Inst))
6770 return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);
6771
6772 // Specialize for cmpxchg
6773 // Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated
6774 SyncScope::ID SSID = cast<AtomicCmpXchgInst>(Inst)->getSyncScopeID();
6775 if (isReleaseOrStronger(Ord))
6776 return Builder.CreateFence(Ord == AtomicOrdering::SequentiallyConsistent
6777 ? Ord
6778 : AtomicOrdering::Release,
6779 SSID);
6780
6781 return nullptr;
6782}
6783
6784 Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
6785                                                     Instruction *Inst,
6786 AtomicOrdering Ord) const {
6787 // Specialize for cmpxchg
6788 if (!isa<AtomicCmpXchgInst>(Inst))
6789 return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);
6790
6791 auto *CI = cast<AtomicCmpXchgInst>(Inst);
6792 auto CASWidth =
6793 cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth();
6794 SyncScope::ID SSID = CI->getSyncScopeID();
6795 // Do not emit a trailing fence for cmpxchg seq_cst which are not emulated
6796 if (isAcquireOrStronger(Ord) &&
6797 (Ord != AtomicOrdering::SequentiallyConsistent ||
6798 CASWidth < STI.getMinCmpXchgSizeInBits()))
6799 return Builder.CreateFence(AtomicOrdering::Acquire, SSID);
6800
6801 return nullptr;
6802}
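Taken together, emitLeadingFence and emitTrailingFence bracket a cmpxchg whose ordering cannot be carried by the atom.cas itself with explicit fences. A rough, hypothetical C++ analogue of the shape this produces for an emulated seq_cst cmpxchg:

#include <atomic>
#include <cstdint>

// Sketch only: a seq_cst CAS that has to be emulated keeps its ordering through
// a leading seq_cst fence and a trailing acquire fence around a relaxed CAS.
bool emulatedSeqCstCAS(std::atomic<uint32_t> &Word, uint32_t &Expected,
                       uint32_t Desired) {
  std::atomic_thread_fence(std::memory_order_seq_cst); // leading fence
  bool Ok = Word.compare_exchange_strong(Expected, Desired,
                                         std::memory_order_relaxed,
                                         std::memory_order_relaxed);
  std::atomic_thread_fence(std::memory_order_acquire); // trailing fence
  return Ok;
}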
6803
6804// Rather than default to SINT when both UINT and SINT are custom, we only
6805// change the opcode when UINT is not legal and SINT is. UINT is preferred when
6806// both are custom since unsigned CVT instructions can lead to slightly better
6807// SASS code with fewer instructions.
6808 unsigned NVPTXTargetLowering::getPreferredFPToIntOpcode(unsigned Op, EVT FromVT,
6809                                                          EVT ToVT) const {
6810 if (isOperationLegal(Op, ToVT))
6811 return Op;
6812 switch (Op) {
6813 case ISD::FP_TO_UINT:
6814 if (isOperationLegal(ISD::FP_TO_SINT, ToVT))
6815 return ISD::FP_TO_SINT;
6816 break;
6817 case ISD::STRICT_FP_TO_UINT:
6818 if (isOperationLegal(ISD::STRICT_FP_TO_SINT, ToVT))
6819 return ISD::STRICT_FP_TO_SINT;
6820 break;
6821 case ISD::VP_FP_TO_UINT:
6822 if (isOperationLegal(ISD::VP_FP_TO_SINT, ToVT))
6823 return ISD::VP_FP_TO_SINT;
6824 break;
6825 default:
6826 break;
6827 }
6828 return Op;
6829}
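The preference above, stated on its own: keep the unsigned conversion whenever it is legal, and switch to the signed form only when the unsigned one is not legal but the signed one is. A hypothetical standalone sketch of that decision (not the real hook):

// Sketch only: mirrors the preference order described in the comment above.
enum class FPToInt { ToUInt, ToSInt };
FPToInt preferFPToIntOpcode(bool UIntLegal, bool SIntLegal) {
  if (UIntLegal)
    return FPToInt::ToUInt; // unsigned CVT tends to give tighter SASS
  return SIntLegal ? FPToInt::ToSInt : FPToInt::ToUInt;
}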
6830
6831// Pin NVPTXTargetObjectFile's vtables to this file.
6832 NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {}
6833
6834 MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
6835     const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
6836   return getDataSection();
6837 }
6838
6839 static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known,
6840                                     const SelectionDAG &DAG, unsigned Depth) {
6841 SDValue A = Op.getOperand(0);
6842 SDValue B = Op.getOperand(1);
6843 ConstantSDNode *Selector = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6844 unsigned Mode = Op.getConstantOperandVal(3);
6845
6846 if (!Selector)
6847 return;
6848
6849 KnownBits AKnown = DAG.computeKnownBits(A, Depth);
6850 KnownBits BKnown = DAG.computeKnownBits(B, Depth);
6851
6852 // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
6853 assert(AKnown.getBitWidth() == 32 && BKnown.getBitWidth() == 32 &&
6854 "PRMT must have i32 operands");
6855 assert(Known.getBitWidth() == 32 && "PRMT must have i32 result");
6856 KnownBits BitField = BKnown.concat(AKnown);
6857
6858 APInt SelectorVal = getPRMTSelector(Selector->getAPIntValue(), Mode);
6859 for (unsigned I : llvm::seq(4)) {
6860 APInt Sel = SelectorVal.extractBits(4, I * 4);
6861 unsigned Idx = Sel.getLoBits(3).getZExtValue();
6862 unsigned Sign = Sel.getHiBits(1).getZExtValue();
6863 KnownBits Byte = BitField.extractBits(8, Idx * 8);
6864 if (Sign)
6865 Byte = KnownBits::ashr(Byte, 8);
6866 Known.insertBits(Byte, I * 8);
6867 }
6868}
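The loop above follows the PTX prmt.b32 byte-select semantics. A rough scalar model of the default mode (an illustrative sketch, not the in-tree computePRMT helper): each selector nibble picks one of the eight bytes of the {b, a} field, and nibble bit 3 replicates that byte's sign across the result byte, which is what the KnownBits::ashr(Byte, 8) step models:

#include <cstdint>

// Sketch only: scalar model of the byte permute the known-bits code reasons about.
uint32_t prmtModel(uint32_t A, uint32_t B, uint32_t Selector) {
  uint64_t Field = (static_cast<uint64_t>(B) << 32) | A; // {b7..b4, b3..b0}
  uint32_t Result = 0;
  for (unsigned I = 0; I < 4; ++I) {
    uint32_t Nibble = (Selector >> (4 * I)) & 0xF;
    uint8_t Byte = static_cast<uint8_t>(Field >> ((Nibble & 0x7) * 8));
    if (Nibble & 0x8) // sign-replicate bit
      Byte = (Byte & 0x80) ? 0xFF : 0x00;
    Result |= static_cast<uint32_t>(Byte) << (8 * I);
  }
  return Result;
}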
6869
6870static void computeKnownBitsForLoadV(const SDValue Op, KnownBits &Known) {
6871 const auto *LD = cast<MemSDNode>(Op);
6872
6873 // We can't do anything without knowing the sign bit.
6874 auto ExtType = LD->getConstantOperandVal(LD->getNumOperands() - 1);
6875 if (ExtType == ISD::SEXTLOAD)
6876 return;
6877
6878 // ExtLoading to vector types is weird and may not work well with known bits.
6879 auto DestVT = LD->getValueType(0);
6880 if (DestVT.isVector())
6881 return;
6882
6883 assert(Known.getBitWidth() == DestVT.getSizeInBits());
6884 auto ElementBitWidth = NVPTXDAGToDAGISel::getFromTypeWidthForLoad(LD);
6885 Known.Zero.setHighBits(Known.getBitWidth() - ElementBitWidth);
6886}
6887
6888 void NVPTXTargetLowering::computeKnownBitsForTargetNode(
6889     const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
6890 const SelectionDAG &DAG, unsigned Depth) const {
6891 Known.resetAll();
6892
6893 switch (Op.getOpcode()) {
6894 case NVPTXISD::PRMT:
6895 computeKnownBitsForPRMT(Op, Known, DAG, Depth);
6896 break;
6897 case NVPTXISD::LoadV2:
6898 case NVPTXISD::LoadV4:
6899 case NVPTXISD::LoadV8:
6900 computeKnownBitsForLoadV(Op, Known);
6901 break;
6902 default:
6903 break;
6904 }
6905}
6906
6907static std::pair<APInt, APInt> getPRMTDemandedBits(const APInt &SelectorVal,
6908 const APInt &DemandedBits) {
6909 APInt DemandedLHS = APInt(32, 0);
6910 APInt DemandedRHS = APInt(32, 0);
6911
6912 for (unsigned I : llvm::seq(4)) {
6913 if (DemandedBits.extractBits(8, I * 8).isZero())
6914 continue;
6915
6916 APInt Sel = SelectorVal.extractBits(4, I * 4);
6917 unsigned Idx = Sel.getLoBits(3).getZExtValue();
6918 unsigned Sign = Sel.getHiBits(1).getZExtValue();
6919
6920 APInt &Src = Idx < 4 ? DemandedLHS : DemandedRHS;
6921 unsigned ByteStart = (Idx % 4) * 8;
6922 if (Sign)
6923 Src.setBit(ByteStart + 7);
6924 else
6925 Src.setBits(ByteStart, ByteStart + 8);
6926 }
6927
6928 return {DemandedLHS, DemandedRHS};
6929}
6930
6931// Replace undef with 0 as this is easier for other optimizations such as
6932// known bits.
6933 static SDValue canonicalizePRMTInput(SDValue Op, SelectionDAG &DAG) {
6934 if (!Op)
6935 return SDValue();
6936 if (Op.isUndef())
6937 return DAG.getConstant(0, SDLoc(), MVT::i32);
6938 return Op;
6939}
6940
6941 static SDValue simplifyDemandedBitsForPRMT(SDValue PRMT,
6942                                            const APInt &DemandedBits,
6943 SelectionDAG &DAG,
6944 const TargetLowering &TLI,
6945 unsigned Depth) {
6946 assert(PRMT.getOpcode() == NVPTXISD::PRMT);
6947 SDValue Op0 = PRMT.getOperand(0);
6948 SDValue Op1 = PRMT.getOperand(1);
6949 auto *SelectorConst = dyn_cast<ConstantSDNode>(PRMT.getOperand(2));
6950 if (!SelectorConst)
6951 return SDValue();
6952
6953 unsigned Mode = PRMT.getConstantOperandVal(3);
6954 const APInt Selector = getPRMTSelector(SelectorConst->getAPIntValue(), Mode);
6955
6956 // Try to simplify the PRMT to one of the inputs if the used bytes are all
6957 // from the same input in the correct order.
6958 const unsigned LeadingBytes = DemandedBits.countLeadingZeros() / 8;
6959 const unsigned SelBits = (4 - LeadingBytes) * 4;
6960 if (Selector.getLoBits(SelBits) == APInt(32, 0x3210).getLoBits(SelBits))
6961 return Op0;
6962 if (Selector.getLoBits(SelBits) == APInt(32, 0x7654).getLoBits(SelBits))
6963 return Op1;
6964
6965 auto [DemandedLHS, DemandedRHS] = getPRMTDemandedBits(Selector, DemandedBits);
6966
6967 // Attempt to avoid multi-use ops if we don't need anything from them.
6968 SDValue DemandedOp0 =
6969 TLI.SimplifyMultipleUseDemandedBits(Op0, DemandedLHS, DAG, Depth + 1);
6970 SDValue DemandedOp1 =
6971 TLI.SimplifyMultipleUseDemandedBits(Op1, DemandedRHS, DAG, Depth + 1);
6972
6973 DemandedOp0 = canonicalizePRMTInput(DemandedOp0, DAG);
6974 DemandedOp1 = canonicalizePRMTInput(DemandedOp1, DAG);
6975 if ((DemandedOp0 && DemandedOp0 != Op0) ||
6976 (DemandedOp1 && DemandedOp1 != Op1)) {
6977 Op0 = DemandedOp0 ? DemandedOp0 : Op0;
6978 Op1 = DemandedOp1 ? DemandedOp1 : Op1;
6979 return getPRMT(Op0, Op1, Selector.getZExtValue(), SDLoc(PRMT), DAG);
6980 }
6981
6982 return SDValue();
6983}
6984
6985 bool NVPTXTargetLowering::SimplifyDemandedBitsForTargetNode(
6986     SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
6987 KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const {
6988 Known.resetAll();
6989
6990 switch (Op.getOpcode()) {
6991 case NVPTXISD::PRMT:
6992 if (SDValue Result = simplifyDemandedBitsForPRMT(Op, DemandedBits, TLO.DAG,
6993                                                  *this, Depth)) {
6994 TLO.CombineTo(Op, Result);
6995 return true;
6996 }
6997 break;
6998 default:
6999 break;
7000 }
7001
7002 computeKnownBitsForTargetNode(Op, Known, DemandedElts, TLO.DAG, Depth);
7003 return false;
7004}
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
constexpr LLT S1
constexpr LLT F32
AMDGPU Register Bank Select
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition Compiler.h:404
This file contains the declarations for the subclasses of Constant, which represent the different fla...
This file contains the declarations of entities that describe floating point environment and related ...
Module.h This file contains the declarations for the Module class.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
mir Rename Register Operands
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
#define MAKE_CASE(V)
Register Reg
Register const TargetRegisterInfo * TRI
#define T
NVPTX address space definition.
static bool shouldConvertToIndirectCall(const CallBase *CB, const GlobalAddressSDNode *Func)
static SDValue combineADDRSPACECAST(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static cl::opt< bool > sched4reg("nvptx-sched4reg", cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false))
static SDValue lowerTcgen05St(SDValue Op, SelectionDAG &DAG)
static SDValue PerformEXTRACTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static cl::opt< NVPTX::DivPrecisionLevel > UsePrecDivF32("nvptx-prec-divf32", cl::Hidden, cl::desc("NVPTX Specific: Override the precision of the lowering for f32 fdiv"), cl::values(clEnumValN(NVPTX::DivPrecisionLevel::Approx, "0", "Use div.approx"), clEnumValN(NVPTX::DivPrecisionLevel::Full, "1", "Use div.full"), clEnumValN(NVPTX::DivPrecisionLevel::IEEE754, "2", "Use IEEE Compliant F32 div.rnd if available (default)"), clEnumValN(NVPTX::DivPrecisionLevel::IEEE754_NoFTZ, "3", "Use IEEE Compliant F32 div.rnd if available, no FTZ")), cl::init(NVPTX::DivPrecisionLevel::IEEE754))
static bool isConstOne(const SDValue &Operand)
static cl::opt< unsigned > FMAContractLevelOpt("nvptx-fma-level", cl::Hidden, cl::desc("NVPTX Specific: FMA contraction (0: don't do it" " 1: do it 2: do it aggressively"), cl::init(2))
static bool IsPTXVectorType(MVT VT)
static SDValue lowerLOADi1(LoadSDNode *LD, SelectionDAG &DAG)
static SDValue lowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG)
static MachinePointerInfo refinePtrAS(SDValue &Ptr, SelectionDAG &DAG, const DataLayout &DL, const TargetLowering &TL)
static SDValue lowerROT(SDValue Op, SelectionDAG &DAG)
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, LLVMContext &Ctx, CallingConv::ID CallConv, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< uint64_t > &Offsets, uint64_t StartingOffset=0)
ComputePTXValueVTs - For the given Type Ty, returns the set of primitive legal-ish MVTs that compose ...
static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static void replaceAtomicSwap128(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI, SmallVectorImpl< SDValue > &Results)
static SDValue lowerSTOREVector(SDValue Op, SelectionDAG &DAG, const NVPTXSubtarget &STI)
static SDValue lowerLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI)
static void replaceProxyReg(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI, SmallVectorImpl< SDValue > &Results)
static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static SDValue lowerCTLZCTPOP(SDValue Op, SelectionDAG &DAG)
static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue combinePackingMovIntoStore(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned Front, unsigned Back)
Fold packing movs into a store.
static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static SDValue getBuildVectorizedValue(unsigned N, const SDLoc &dl, SelectionDAG &DAG, T GetElement)
static SDValue getExtractVectorizedValue(SDValue V, unsigned I, EVT VT, const SDLoc &dl, SelectionDAG &DAG)
static NVPTXISD::NodeType getMinMax3Opcode(unsigned MinMax2Opcode)
Get 3-input version of a 2-input min/max opcode.
static unsigned canMergeParamLoadStoresStartingAt(unsigned Idx, uint32_t AccessSize, const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< T > &Offsets, Align ParamAlignment)
static EVT getVectorizedVT(EVT VT, unsigned N, LLVMContext &C)
static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG)
static SDValue PerformFMinMaxCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned PTXVersion, unsigned SmVersion)
PerformFMinMaxCombine - Combine (fmaxnum (fmaxnum a, b), c) into (fmaxnum3 a, b, c).
static SDValue combineMulWide(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue lowerIntrinsicWChain(SDValue Op, SelectionDAG &DAG)
static bool isConstZero(const SDValue &Operand)
static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG)
static SDValue LowerTcgen05MMADisableOutputLane(SDValue Op, SelectionDAG &DAG)
static bool IsMulWideOperandDemotable(SDValue Op, unsigned OptSize, OperandSignedness &S)
IsMulWideOperandDemotable - Checks if the provided DAG node is an operand that can be demoted to OptS...
static unsigned getTcgen05MMADisableOutputLane(unsigned IID)
static std::pair< APInt, APInt > getPRMTDemandedBits(const APInt &SelectorVal, const APInt &DemandedBits)
static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode)
static ISD::NodeType getScalarOpcodeForReduction(unsigned ReductionOpcode)
static SDValue PerformREMCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI)
static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known, const SelectionDAG &DAG, unsigned Depth)
static SDValue combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Fold unpacking movs into a load by increasing the number of return values.
static SDValue LowerClusterLaunchControlQueryCancel(SDValue Op, SelectionDAG &DAG)
static std::optional< std::pair< SDValue, SDValue > > lowerTcgen05Ld(SDNode *N, SelectionDAG &DAG, bool HasOffset=false)
static std::optional< NVPTXISD::NodeType > getScalar3OpcodeForReduction(unsigned ReductionOpcode)
Get 3-input scalar reduction opcode.
static std::optional< std::pair< SDValue, SDValue > > replaceLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI)
replaceLoadVector - Convert vector loads into multi-output scalar loads.
static SDValue expandFSH64(SDValue A, SDValue B, SDValue ShiftAmount, SDLoc DL, unsigned Opcode, SelectionDAG &DAG)
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, unsigned OptSize, bool &IsSigned)
AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can be demoted to OptSize bits...
static SDValue TryMULWIDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply of M/2 bits that produces...
static SDValue lowerPrmtIntrinsic(SDValue Op, SelectionDAG &DAG)
static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static SDValue buildTreeReduction(const SmallVector< SDValue > &Elements, EVT EltTy, ArrayRef< std::pair< unsigned, unsigned > > Ops, const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG)
Reduces the elements using the scalar operations provided.
static SDValue combineProxyReg(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SmallVector< unsigned, 16 > VectorizePTXValueVTs(const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< T > &Offsets, Align ParamAlignment, bool IsVAArg=false)
static SDValue getPRMT(SDValue A, SDValue B, SDValue Selector, SDLoc DL, SelectionDAG &DAG, unsigned Mode=NVPTX::PTXPrmtMode::NONE)
static SDValue matchMADConstOnePattern(SDValue Add)
static SDValue correctParamType(SDValue V, EVT ExpectedVT, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, SDLoc dl)
static ISD::NodeType getExtOpcode(const ISD::ArgFlagsTy &Flags)
static cl::opt< bool > UsePrecSqrtF32("nvptx-prec-sqrtf32", cl::Hidden, cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), cl::init(true))
static void computeKnownBitsForLoadV(const SDValue Op, KnownBits &Known)
static APInt getPRMTSelector(const APInt &Selector, unsigned Mode)
static EVT promoteScalarIntegerPTX(const EVT VT)
PromoteScalarIntegerPTX Used to make sure the arguments/returns are suitable for passing and promote ...
static SDValue simplifyDemandedBitsForPRMT(SDValue PRMT, const APInt &DemandedBits, SelectionDAG &DAG, const TargetLowering &TLI, unsigned Depth)
static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG)
static SDValue canonicalizePRMTInput(SDValue Op, SelectionDAG &DAG)
static SDValue sinkProxyReg(SDValue R, SDValue Chain, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerFSH(SDValue Op, SelectionDAG &DAG)
static SDValue PromoteBinOpToF32(SDNode *N, SelectionDAG &DAG)
static SDValue PerformSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned int SmVersion)
static std::optional< std::pair< unsigned int, MVT > > getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI, unsigned AddressSpace)
static cl::opt< bool > ForceMinByValParamAlign("nvptx-force-min-byval-param-align", cl::Hidden, cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval" " params of device functions."), cl::init(false))
static cl::opt< bool > UseApproxLog2F32("nvptx-approx-log2f32", cl::desc("NVPTX Specific: whether to use lg2.approx for log2"), cl::init(false))
Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it does NOT use lg2....
static SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG)
static SDValue combineLOAD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const NVPTXSubtarget &STI)
static SDValue combineSTORE(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const NVPTXSubtarget &STI)
static SDValue PerformSHLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
MachineInstr unsigned OpIdx
uint64_t High
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
This file describes how to lower LLVM code to machine code.
Value * RHS
Value * LHS
BinaryOperator * Mul
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1098
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt getLoBits(unsigned numBits) const
Compute an APInt containing numBits lowbits from this APInt.
Definition APInt.cpp:644
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
LLVM_ABI APInt getHiBits(unsigned numBits) const
Compute an APInt containing numBits highbits from this APInt.
Definition APInt.cpp:639
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1330
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition APInt.h:435
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition APInt.h:1130
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:482
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition APInt.h:432
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1237
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
const T & back() const
back - Get the last element.
Definition ArrayRef.h:156
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition ArrayRef.h:206
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:191
an instruction that atomically reads a memory location, combines it with another value,...
@ Add
*p = old + v
@ FAdd
*p = old + v
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ UMax
*p = old >unsigned v ? old : v
@ UDecWrap
Decrement one until a minimum value or zero.
bool isFloatingPointOperation() const
BinOp getOperation() const
This is an SDNode representing atomic operations.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
FunctionType * getFunctionType() const
This class represents a function call, abstracting a target machine's calling convention.
const APInt & getAPIntValue() const
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Diagnostic information for unsupported feature in backend.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:637
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
MCSection * getDataSection() const
Instances of this class represent a uniqued identifier for a section in the current translation unit.
Definition MCSection.h:521
StringRef getName() const
getName - Get the symbol name.
Definition MCSymbol.h:188
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
static auto fp_fixedlen_vector_valuetypes()
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineJumpTableInfo * getJumpTableInfo() const
getJumpTableInfo - Return the jump table info object for the current function.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
const std::vector< MachineJumpTableEntry > & getJumpTables() const
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
EVT getMemoryVT() const
Return the type of the in-memory value.
static unsigned getFromTypeWidthForLoad(const MemSDNode *Mem)
bool hasAtomSwap128() const
bool hasF32x2Instructions() const
bool has256BitVectorLoadStore(unsigned AS) const
AtomicOrdering atomicOperationOrderAfterFenceSplit(const Instruction *I) const override
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
const NVPTXTargetMachine * nvTM
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
NVPTXTargetLowering(const NVPTXTargetMachine &TM, const NVPTXSubtarget &STI)
std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &, const SmallVectorImpl< ISD::OutputArg > &, std::optional< unsigned > FirstVAArg, const CallBase &CB, unsigned UniqueCallSite) const
unsigned getPreferredFPToIntOpcode(unsigned Op, EVT FromVT, EVT ToVT) const override
bool useF32FTZ(const MachineFunction &MF) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
Align getFunctionArgumentAlignment(const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps, bool &UseOneConst, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &dl, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const
Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
std::string getParamName(const Function *F, int Idx) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
NVPTX::DivPrecisionLevel getDivF32Level(const MachineFunction &MF, const SDNode &N) const
bool shouldInsertFencesForAtomic(const Instruction *) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
Align getFunctionParamOptimizedAlign(const Function *F, Type *ArgTy, const DataLayout &DL) const
getFunctionParamOptimizedAlign - since function arguments are passed via .param space,...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, EVT VT) const override
Return the ValueType of the result of SETCC operations.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
Inserts in the IR a target-specific intrinsic specifying a fence.
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Align getFunctionByValParamAlign(const Function *F, Type *ArgTy, Align InitialAlign, const DataLayout &DL) const
Helper for computing alignment of a device function byval parameter.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const
bool usePrecSqrtF32(const SDNode *N=nullptr) const
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
MCSection * SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const override
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
bool hasOneUse() const
Return true if there is exactly one use of this node.
unsigned getIROrder() const
Return the node ordering.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
const SDValue & getOperand(unsigned i) const
unsigned getOpcode() const
SectionKind - This is a simple POD value that classifies the properties of a section.
Definition SectionKind.h:22
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const TargetSubtargetInfo & getSubtarget() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getSymbolFunctionGlobalAddress(SDValue Op, Function **TargetFunction=nullptr)
Return a GlobalAddress of the function from the current module with name matching the given ExternalS...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI Align getEVTAlign(EVT MemoryVT) const
Compute the default alignment value for the given type.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI SDNode * MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef< SDValue > Ops)
This mutates the specified node to have the specified return type, opcode, and operands.
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI SDValue getBasicBlock(MachineBasicBlock *MBB)
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
MachineFunction & getMachineFunction() const
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
ArrayRef< int > getMask() const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class is used to represent ISD::STORE nodes.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition StringRef.h:140
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
Align getMinStackArgumentAlignment() const
Return the minimum stack alignment of an argument.
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
std::vector< ArgListEntry > ArgListTy
virtual Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const
virtual Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const
Inserts in the IR a target-specific intrinsic specifying a fence.
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
TargetLowering(const TargetLowering &)=delete
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
SDValue expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const
Expand round(fp) to fp conversion.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
TargetOptions Options
MCSymbol * getSymbol(const GlobalValue *GV) const
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetFrameLowering * getFrameLowering() const
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:344
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
A raw_ostream that writes to an std::string.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt pow(const APInt &X, int64_t N)
Compute X^N for N>=0.
Definition APInt.cpp:3155
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:289
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ SSUBO
Same as [SU]ADDO, but for subtraction.
Definition ISDOpcodes.h:347
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
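Hedged sketch of building such a node during custom lowering; DAG, DL, VT and the operand SDValues are assumed to already be in scope:
  // Sel = (LHS < RHS) ? TrueV : FalseV, expressed as a single SELECT_CC node.
  SDValue Sel = DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, TrueV, FalseV,
                            DAG.getCondCode(ISD::SETLT));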
@ SSHLSAT
RESULT = [US]SHLSAT(LHS, RHS) - Perform saturation left shift.
Definition ISDOpcodes.h:379
@ SMULO
Same as [SU]ADDO, but for multiplication.
Definition ISDOpcodes.h:351
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:471
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:470
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:299
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
Definition ISDOpcodes.h:236
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:333
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
LLVM_ABI bool allOperandsUndef(const SDNode *N)
Return true if the node has at least one operand and all operands of the specified node are ISD::UNDE...
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
@ Bitcast
Perform the operation on a different, but equivalently sized type.
@ TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2
@ TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT
@ TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1
@ CLUSTERLAUNCHCONTROL_QUERY_CANCEL_IS_CANCELED
@ TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2
@ CALL
This node represents a PTX call instruction.
@ TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT
@ TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2
@ TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG2
@ TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2
@ TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2
@ CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_X
@ TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT
@ TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT
@ UNPACK_VECTOR
This node is the inverse of NVPTX::BUILD_VECTOR.
@ CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Y
@ TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT
@ TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT
@ DeclareScalarParam
These nodes represent a parameter declaration.
@ CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Z
@ ATOMIC_CMP_SWAP_B128
These nodes are used to lower atomic instructions with i128 type.
@ BUILD_VECTOR
This node is similar to ISD::BUILD_VECTOR except that the output may be implicitly bitcast to a scala...
@ TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG2
@ TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT
@ TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT
@ TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2
bool isPackedVectorTy(EVT VT)
DivPrecisionLevel
Definition NVPTX.h:251
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
This is an optimization pass for GlobalISel generic memory operations.
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:262
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:831
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool shouldEmitPTXNoReturn(const Value *V, const TargetMachine &TM)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1705
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1657
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2452
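Hedged usage sketch; Outs and the ProcessOutput helper are hypothetical:
  // Walk the outgoing argument list together with its index.
  for (const auto &[Idx, Out] : llvm::enumerate(Outs))
    ProcessOutput(Idx, Out.VT);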
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:644
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:396
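Hedged arithmetic example:
  uint64_t P = llvm::PowerOf2Ceil(24); // P == 32
  uint64_t Q = llvm::PowerOf2Ceil(32); // Q == 32 (already a power of two)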
bool isReleaseOrStronger(AtomicOrdering AO)
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:1948
unsigned promoteScalarArgumentSize(unsigned size)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
bool shouldPassAsArray(Type *Ty)
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
@ Default
-O2, -Os
Definition CodeGen.h:85
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
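Hedged arithmetic example:
  // Round a 10-byte object size up to a 4-byte boundary.
  uint64_t Padded = llvm::alignTo(10, llvm::Align(4)); // Padded == 12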
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
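Hedged usage sketch; TLI, DL and RetTy are assumed to be in scope:
  // Split an IR type into the list of EVTs it lowers to.
  SmallVector<EVT, 4> ValueVTs;
  ComputeValueVTs(TLI, DL, RetTy, ValueVTs);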
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
bool isKernelFunction(const Function &F)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:560
Function * getMaybeBitcastedCallee(const CallBase *CB)
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
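Hedged arithmetic example:
  // A pointer known to be 16-byte aligned, advanced by 4 bytes, is only
  // guaranteed to be 4-byte aligned.
  Align A = commonAlignment(Align(16), /*Offset=*/4); // A == Align(4)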
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
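Hedged usage sketch; UseLane is a hypothetical callback:
  // Visits 0, 1, 2, 3 -- the End value is excluded.
  for (unsigned I : llvm::seq(0u, 4u))
    UseLane(I);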
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
static LLVM_ABI const fltSemantics & IEEEsingle() LLVM_READNONE
Definition APFloat.cpp:266
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environme...
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
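Hedged usage sketch; Ctx is an assumed LLVMContext reference:
  // Build the EVT for a <2 x half> vector.
  EVT V2F16 = EVT::getVectorVT(Ctx, MVT::f16, 2);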
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
bool is32BitVector() const
Return true if this is a 32-bit vector type.
Definition ValueTypes.h:197
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
static LLVM_ABI KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
KnownBits concat(const KnownBits &Lo) const
Concatenate the bits from Lo onto the bottom of *this.
Definition KnownBits.h:233
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
void insertBits(const KnownBits &SubBits, unsigned BitPosition)
Insert the bits from a smaller known bits starting at bitPosition.
Definition KnownBits.h:219
This class contains a discriminated union of information about pointers in memory operands,...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
Type * RetTy
Same as OrigRetTy, or partially legalized for soft float libcalls.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...