1//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that NVPTX uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "NVPTXISelLowering.h"
16#include "NVPTX.h"
17#include "NVPTXISelDAGToDAG.h"
18#include "NVPTXSubtarget.h"
19#include "NVPTXTargetMachine.h"
21#include "NVPTXUtilities.h"
22#include "llvm/ADT/APFloat.h"
23#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/STLExtras.h"
26#include "llvm/ADT/StringRef.h"
38#include "llvm/IR/Argument.h"
39#include "llvm/IR/Attributes.h"
40#include "llvm/IR/Constants.h"
41#include "llvm/IR/DataLayout.h"
44#include "llvm/IR/FPEnv.h"
45#include "llvm/IR/Function.h"
46#include "llvm/IR/GlobalValue.h"
47#include "llvm/IR/IRBuilder.h"
48#include "llvm/IR/Instruction.h"
50#include "llvm/IR/IntrinsicsNVPTX.h"
51#include "llvm/IR/Module.h"
52#include "llvm/IR/Type.h"
53#include "llvm/IR/Value.h"
65#include <algorithm>
66#include <cassert>
67#include <cmath>
68#include <cstdint>
69#include <iterator>
70#include <optional>
71#include <string>
72#include <tuple>
73#include <utility>
74#include <vector>
75
76#define DEBUG_TYPE "nvptx-lower"
77
78using namespace llvm;
79
81 "nvptx-sched4reg",
82 cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
83
85 "nvptx-fma-level", cl::Hidden,
86 cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
87 " 1: do it 2: do it aggressively"),
88 cl::init(2));
89
91 "nvptx-prec-divf32", cl::Hidden,
93 "NVPTX Specific: Override the precision of the lowering for f32 fdiv"),
95 clEnumValN(NVPTX::DivPrecisionLevel::Approx, "0", "Use div.approx"),
96 clEnumValN(NVPTX::DivPrecisionLevel::Full, "1", "Use div.full"),
98 "Use IEEE Compliant F32 div.rnd if available (default)"),
100 "Use IEEE Compliant F32 div.rnd if available, no FTZ")),
102
104 "nvptx-prec-sqrtf32", cl::Hidden,
105 cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
106 cl::init(true));
107
108/// Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it
109/// does NOT use lg2.approx for log2, so this is disabled by default.
111 "nvptx-approx-log2f32",
112 cl::desc("NVPTX Specific: whether to use lg2.approx for log2"),
113 cl::init(false));
114
116 "nvptx-force-min-byval-param-align", cl::Hidden,
117 cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
118 " params of device functions."),
119 cl::init(false));
120
121NVPTX::DivPrecisionLevel
122NVPTXTargetLowering::getDivF32Level(const MachineFunction &MF,
123 const SDNode &N) const {
124 // If nvptx-prec-div32=N is used on the command-line, always honor it
125 if (UsePrecDivF32.getNumOccurrences() > 0)
126 return UsePrecDivF32;
127
128 const SDNodeFlags Flags = N.getFlags();
129 if (Flags.hasApproximateFuncs())
130 return NVPTX::DivPrecisionLevel::Approx;
131
132 return NVPTX::DivPrecisionLevel::IEEE754;
133}
134
135bool NVPTXTargetLowering::usePrecSqrtF32(const SDNode *N) const {
136 // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
137 if (UsePrecSqrtF32.getNumOccurrences() > 0)
138 return UsePrecSqrtF32;
139
140 if (N) {
141 const SDNodeFlags Flags = N->getFlags();
142 if (Flags.hasApproximateFuncs())
143 return false;
144 }
145
146 return true;
147}
148
153
154static bool IsPTXVectorType(MVT VT) {
155 switch (VT.SimpleTy) {
156 default:
157 return false;
158 case MVT::v2i1:
159 case MVT::v4i1:
160 case MVT::v2i8:
161 case MVT::v4i8:
162 case MVT::v8i8: // <2 x i8x4>
163 case MVT::v16i8: // <4 x i8x4>
164 case MVT::v2i16:
165 case MVT::v4i16:
166 case MVT::v8i16: // <4 x i16x2>
167 case MVT::v2i32:
168 case MVT::v4i32:
169 case MVT::v2i64:
170 case MVT::v2f16:
171 case MVT::v4f16:
172 case MVT::v8f16: // <4 x f16x2>
173 case MVT::v2bf16:
174 case MVT::v4bf16:
175 case MVT::v8bf16: // <4 x bf16x2>
176 case MVT::v2f32:
177 case MVT::v4f32:
178 case MVT::v2f64:
179 case MVT::v4i64:
180 case MVT::v4f64:
181 case MVT::v8i32:
182 case MVT::v8f32:
183 case MVT::v16f16: // <8 x f16x2>
184 case MVT::v16bf16: // <8 x bf16x2>
185 case MVT::v16i16: // <8 x i16x2>
186 case MVT::v32i8: // <8 x i8x4>
187 return true;
188 }
189}
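// Note (illustrative): this predicate is consulted further down, in the
// NVPTXTargetLowering constructor, to decide which vector types get custom
// lowering for the LDU intrinsics.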
190
191// When legalizing vector loads/stores, this function is called, which does two
192// things:
193// 1. Determines whether the vector is something we want to custom lower;
194//    std::nullopt is returned if we do not want to custom lower it.
195// 2. If we do want to handle it, returns two parameters:
196// - unsigned int NumElts - The number of elements in the final vector
197// - EVT EltVT - The type of the elements in the final vector
198static std::optional<std::pair<unsigned int, MVT>>
199getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI,
200 unsigned AddressSpace) {
201 const bool CanLowerTo256Bit = STI.has256BitVectorLoadStore(AddressSpace);
202
203 if (CanLowerTo256Bit && VectorEVT.isScalarInteger() &&
204 VectorEVT.getSizeInBits() == 256)
205 return {{4, MVT::i64}};
206
207 if (!VectorEVT.isSimple())
208 return std::nullopt;
209 const MVT VectorVT = VectorEVT.getSimpleVT();
210
211 if (!VectorVT.isVector()) {
212 if (VectorVT == MVT::i128 || VectorVT == MVT::f128)
213 return {{2, MVT::i64}};
214 return std::nullopt;
215 }
216
217 const MVT EltVT = VectorVT.getVectorElementType();
218 const unsigned NumElts = VectorVT.getVectorNumElements();
219
220 // The size of the PTX virtual register that holds a packed type.
221 unsigned PackRegSize;
222
223 // We only handle "native" vector sizes for now, e.g. <4 x double> is not
224 // legal. We can (and should) split that into 2 stores of <2 x double> here
225 // but I'm leaving that as a TODO for now.
226 switch (VectorVT.SimpleTy) {
227 default:
228 return std::nullopt;
229
230 case MVT::v4i64:
231 case MVT::v4f64:
232 // This is a "native" vector type iff the address space is global and the
233 // target supports 256-bit loads/stores
234 if (!CanLowerTo256Bit)
235 return std::nullopt;
236 [[fallthrough]];
237 case MVT::v2i8:
238 case MVT::v2i64:
239 case MVT::v2f64:
240 // This is a "native" vector type
241 return std::pair(NumElts, EltVT);
242
243 case MVT::v16f16: // <8 x f16x2>
244 case MVT::v16bf16: // <8 x bf16x2>
245 case MVT::v16i16: // <8 x i16x2>
246 case MVT::v32i8: // <8 x i8x4>
247 // This can be upsized into a "native" vector type iff the address space is
248 // global and the target supports 256-bit loads/stores.
249 if (!CanLowerTo256Bit)
250 return std::nullopt;
251 [[fallthrough]];
252 case MVT::v2i16: // <1 x i16x2>
253 case MVT::v2f16: // <1 x f16x2>
254 case MVT::v2bf16: // <1 x bf16x2>
255 case MVT::v4i8: // <1 x i8x4>
256 case MVT::v4i16: // <2 x i16x2>
257 case MVT::v4f16: // <2 x f16x2>
258 case MVT::v4bf16: // <2 x bf16x2>
259 case MVT::v8i8: // <2 x i8x4>
260 case MVT::v8f16: // <4 x f16x2>
261 case MVT::v8bf16: // <4 x bf16x2>
262 case MVT::v8i16: // <4 x i16x2>
263 case MVT::v16i8: // <4 x i8x4>
264 PackRegSize = 32;
265 break;
266
267 case MVT::v8f32: // <4 x f32x2>
268 case MVT::v8i32: // <4 x i32x2>
269 // This is a "native" vector type iff the address space is global and the
270 // target supports 256-bit loads/stores
271 if (!CanLowerTo256Bit)
272 return std::nullopt;
273 [[fallthrough]];
274 case MVT::v2f32: // <1 x f32x2>
275 case MVT::v4f32: // <2 x f32x2>
276 case MVT::v2i32: // <1 x i32x2>
277 case MVT::v4i32: // <2 x i32x2>
278 if (!STI.hasF32x2Instructions())
279 return std::pair(NumElts, EltVT);
280 PackRegSize = 64;
281 break;
282 }
283
284 // If we reach here, then we can pack 2 or more elements into a single 32-bit
285 // or 64-bit PTX register and treat the vector as a new vector containing
286 // packed elements.
287
288 // Number of elements to pack in one word.
289 const unsigned NPerReg = PackRegSize / EltVT.getSizeInBits();
290
291 return std::pair(NumElts / NPerReg, MVT::getVectorVT(EltVT, NPerReg));
292}
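// Worked example (illustrative): for MVT::v8f16 the element type is f16
// (16 bits) and PackRegSize is 32, so NPerReg == 2 and we return
// {4, MVT::v2f16}, i.e. the <8 x half> access is treated as four packed f16x2
// registers. The same packing applies to MVT::v8f32 when the target supports
// both 256-bit loads/stores and f32x2 instructions, giving {4, MVT::v2f32}.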
293
294/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
295/// legal-ish MVTs that compose it. Unlike ComputeValueVTs, this will legalize
296/// the types as required by the calling convention (with special handling for
297/// i8s).
298/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
299/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
300/// LowerCall, and LowerReturn.
301static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
302 LLVMContext &Ctx, CallingConv::ID CallConv,
303 Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
304 SmallVectorImpl<uint64_t> &Offsets,
305 uint64_t StartingOffset = 0) {
306 SmallVector<EVT, 16> TempVTs;
307 SmallVector<uint64_t, 16> TempOffsets;
308 ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
309
310 for (const auto [VT, Off] : zip(TempVTs, TempOffsets)) {
311 MVT RegisterVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
312 unsigned NumRegs = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
313
314 // Since we actually can load/store b8, we need to ensure that we'll use
315 // the original sized type for any i8s or i8 vectors.
316 if (VT.getScalarType() == MVT::i8) {
317 if (RegisterVT == MVT::i16)
318 RegisterVT = MVT::i8;
319 else if (RegisterVT == MVT::v2i16)
320 RegisterVT = MVT::v2i8;
321 else
322 assert(RegisterVT == MVT::v4i8 &&
323 "Expected v4i8, v2i16, or i16 for i8 RegisterVT");
324 }
325
326 // TODO: This is horribly incorrect for cases where the vector elements are
327 // not a multiple of bytes (ex i1) and legal or i8. However, this problem
328 // has existed for as long as NVPTX has and no one has complained, so we'll
329 // leave it for now.
330 for (unsigned I : seq(NumRegs)) {
331 ValueVTs.push_back(RegisterVT);
332 Offsets.push_back(Off + I * RegisterVT.getStoreSize());
333 }
334 }
335}
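// Worked example (illustrative): for a scalar i8 argument, ComputeValueVTs
// yields an i8 piece whose calling-convention register type would be MVT::i16;
// the special case above forces RegisterVT back to MVT::i8 so the recorded
// offsets and store sizes stay byte-accurate for .b8 loads/stores.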
336
337// We return an EVT that can hold N VTs
338// If the VT is a vector, the resulting EVT is a flat vector with the same
339// element type as VT's element type.
340static EVT getVectorizedVT(EVT VT, unsigned N, LLVMContext &C) {
341 if (N == 1)
342 return VT;
343
344 return VT.isVector() ? EVT::getVectorVT(C, VT.getScalarType(),
345 VT.getVectorNumElements() * N)
346 : EVT::getVectorVT(C, VT, N);
347}
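// Illustrative examples: getVectorizedVT(MVT::f32, 4, Ctx) yields v4f32, while
// getVectorizedVT(MVT::v2f16, 2, Ctx) flattens to v4f16 rather than producing
// a vector of vectors.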
348
349static SDValue getExtractVectorizedValue(SDValue V, unsigned I, EVT VT,
350 const SDLoc &dl, SelectionDAG &DAG) {
351 if (V.getValueType() == VT) {
352 assert(I == 0 && "Index must be 0 for scalar value");
353 return V;
354 }
355
356 if (!VT.isVector())
357 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, V,
358 DAG.getVectorIdxConstant(I, dl));
359
360 return DAG.getNode(
361 ISD::EXTRACT_SUBVECTOR, dl, VT, V,
362 DAG.getVectorIdxConstant(I * VT.getVectorNumElements(), dl));
363}
364
365template <typename T>
366static inline SDValue getBuildVectorizedValue(unsigned N, const SDLoc &dl,
367 SelectionDAG &DAG, T GetElement) {
368 if (N == 1)
369 return GetElement(0);
370
372 for (const unsigned I : llvm::seq(N)) {
373 SDValue Val = GetElement(I);
374 if (Val.getValueType().isVector())
375 DAG.ExtractVectorElements(Val, Values);
376 else
377 Values.push_back(Val);
378 }
379
380 EVT VT = EVT::getVectorVT(*DAG.getContext(), Values[0].getValueType(),
381 Values.size());
382 return DAG.getBuildVector(VT, dl, Values);
383}
384
385/// PromoteScalarIntegerPTX
386/// Used to make sure the arguments/returns are suitable for passing
387/// and promote them to a larger size if they're not.
388///
389/// Returns the promoted type, or \p VT unchanged if no promotion is needed.
390static EVT promoteScalarIntegerPTX(const EVT VT) {
391 if (VT.isScalarInteger()) {
392 switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
393 default:
394 llvm_unreachable(
395 "Promotion is not suitable for scalars of size larger than 64-bits");
396 case 1:
397 return MVT::i1;
398 case 2:
399 case 4:
400 case 8:
401 return MVT::i8;
402 case 16:
403 return MVT::i16;
404 case 32:
405 return MVT::i32;
406 case 64:
407 return MVT::i64;
408 }
409 }
410 return VT;
411}
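// Illustrative examples: an i24 has PowerOf2Ceil(24) == 32 and is promoted to
// MVT::i32, an i3 is promoted to MVT::i8, and non-integer or already power-of-
// two sized types (f32, i64, ...) are returned unchanged.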
412
413// Check whether we can merge loads/stores of some of the pieces of a
414// flattened function parameter or return value into a single vector
415// load/store.
416//
417// The flattened parameter is represented as a list of EVTs and
418// offsets, and the whole structure is aligned to ParamAlignment. This
419// function determines whether we can load/store pieces of the
420// parameter starting at index Idx using a single vectorized op of
421// size AccessSize. If so, it returns the number of param pieces
422// covered by the vector op. Otherwise, it returns 1.
423template <typename T>
424static unsigned canMergeParamLoadStoresStartingAt(
425 unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
426 const SmallVectorImpl<T> &Offsets, Align ParamAlignment) {
427
428 // Can't vectorize if param alignment is not sufficient.
429 if (ParamAlignment < AccessSize)
430 return 1;
431 // Can't vectorize if offset is not aligned.
432 if (Offsets[Idx] & (AccessSize - 1))
433 return 1;
434
435 EVT EltVT = ValueVTs[Idx];
436 unsigned EltSize = EltVT.getStoreSize();
437
438 // Element is too large to vectorize.
439 if (EltSize >= AccessSize)
440 return 1;
441
442 unsigned NumElts = AccessSize / EltSize;
443 // Can't vectorize if AccessSize is not a multiple of EltSize.
444 if (AccessSize != EltSize * NumElts)
445 return 1;
446
447 // We don't have enough elements to vectorize.
448 if (Idx + NumElts > ValueVTs.size())
449 return 1;
450
451 // PTX ISA can only deal with 2- and 4-element vector ops.
452 if (NumElts != 4 && NumElts != 2)
453 return 1;
454
455 for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
456 // Types do not match.
457 if (ValueVTs[j] != EltVT)
458 return 1;
459
460 // Elements are not contiguous.
461 if (Offsets[j] - Offsets[j - 1] != EltSize)
462 return 1;
463 }
464 // OK. We can vectorize ValueVTs[Idx..Idx+NumElts).
465 return NumElts;
466}
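// Worked example (illustrative): four f32 pieces at offsets {0, 4, 8, 12}
// with ParamAlignment == 16 can be covered by a single 16-byte access starting
// at Idx == 0, so this returns 4. With ParamAlignment == 8 the 16-byte access
// is rejected and an 8-byte access merges only the first two pieces,
// returning 2.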
467
468// Computes whether and how we can vectorize the loads/stores of a
469// flattened function parameter or return value.
470//
471// The flattened parameter is represented as the list of ValueVTs and
472// Offsets, and is aligned to ParamAlignment bytes. We return a vector of
473// group sizes: each entry is the number of consecutive pieces (1, 2, or 4)
474// covered by a single load/store, and the entries sum to ValueVTs.size().
475// For varargs, every piece is handled as a scalar.
476template <typename T>
477static SmallVector<unsigned, 16>
478VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
479 const SmallVectorImpl<T> &Offsets, Align ParamAlignment,
480 bool IsVAArg = false) {
481 // Set vector size to match ValueVTs and mark all elements as
482 // scalars by default.
483
484 if (IsVAArg)
485 return SmallVector<unsigned>(ValueVTs.size(), 1);
486
487 SmallVector<unsigned, 16> VectorInfo;
488
489 const auto GetNumElts = [&](unsigned I) -> unsigned {
490 for (const unsigned AccessSize : {16, 8, 4, 2}) {
491 const unsigned NumElts = canMergeParamLoadStoresStartingAt(
492 I, AccessSize, ValueVTs, Offsets, ParamAlignment);
493 assert((NumElts == 1 || NumElts == 2 || NumElts == 4) &&
494 "Unexpected vectorization size");
495 if (NumElts != 1)
496 return NumElts;
497 }
498 return 1;
499 };
500
501 // Check what we can vectorize using 128/64/32-bit accesses.
502 for (unsigned I = 0, E = ValueVTs.size(); I != E;) {
503 const unsigned NumElts = GetNumElts(I);
504 VectorInfo.push_back(NumElts);
505 I += NumElts;
506 }
507 assert(std::accumulate(VectorInfo.begin(), VectorInfo.end(), 0u) ==
508 ValueVTs.size());
509 return VectorInfo;
510}
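// Worked example (illustrative): for four f32 pieces at offsets {0, 4, 8, 12}
// a 16-byte ParamAlignment produces {4} (one v4f32-sized access), while an
// 8-byte alignment produces {2, 2}. In every case the entries sum to
// ValueVTs.size(), which the assert above checks.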
511
512// NVPTXTargetLowering Constructor.
513NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
514 const NVPTXSubtarget &STI)
515 : TargetLowering(TM), nvTM(&TM), STI(STI), GlobalUniqueCallSite(0) {
516 // Always lower memset, memcpy, and memmove intrinsics to load/store
517 // instructions, rather than generating calls to memset, memcpy, or
518 // memmove.
522
525
526 // Jump is Expensive. Don't create extra control flow for 'and', 'or'
527 // condition branches.
528 setJumpIsExpensive(true);
529
530 // Wide divides are _very_ slow. Try to reduce the width of the divide if
531 // possible.
532 addBypassSlowDiv(64, 32);
533
534 // By default, use the Source scheduling
535 if (sched4reg)
536 setSchedulingPreference(Sched::RegPressure);
537 else
538 setSchedulingPreference(Sched::Source);
539
540 auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
541 LegalizeAction NoF16Action) {
542 bool IsOpSupported = STI.allowFP16Math();
543 switch (Op) {
544 // Several FP16 instructions are available on sm_80 only.
545 case ISD::FMINNUM:
546 case ISD::FMAXNUM:
547 case ISD::FMAXNUM_IEEE:
548 case ISD::FMINNUM_IEEE:
549 case ISD::FMAXIMUM:
550 case ISD::FMINIMUM:
551 case ISD::FMAXIMUMNUM:
552 case ISD::FMINIMUMNUM:
553 IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
554 break;
555 case ISD::FEXP2:
556 IsOpSupported &= STI.getSmVersion() >= 75 && STI.getPTXVersion() >= 70;
557 break;
558 }
559 setOperationAction(Op, VT, IsOpSupported ? Action : NoF16Action);
560 };
561
562 auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
563 LegalizeAction NoBF16Action) {
564 bool IsOpSupported = STI.hasNativeBF16Support(Op);
565 setOperationAction(
566 Op, VT, IsOpSupported ? Action : NoBF16Action);
567 };
568
569 auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
570 LegalizeAction NoI16x2Action) {
571 bool IsOpSupported = false;
572 // instructions are available on sm_90 only
573 switch (Op) {
574 case ISD::ADD:
575 case ISD::SMAX:
576 case ISD::SMIN:
577 case ISD::UMIN:
578 case ISD::UMAX:
579 IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;
580 break;
581 }
582 setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action);
583 };
584
585 addRegisterClass(MVT::i1, &NVPTX::B1RegClass);
586 addRegisterClass(MVT::i16, &NVPTX::B16RegClass);
587 addRegisterClass(MVT::v2i16, &NVPTX::B32RegClass);
588 addRegisterClass(MVT::v4i8, &NVPTX::B32RegClass);
589 addRegisterClass(MVT::i32, &NVPTX::B32RegClass);
590 addRegisterClass(MVT::i64, &NVPTX::B64RegClass);
591 addRegisterClass(MVT::f32, &NVPTX::B32RegClass);
592 addRegisterClass(MVT::f64, &NVPTX::B64RegClass);
593 addRegisterClass(MVT::f16, &NVPTX::B16RegClass);
594 addRegisterClass(MVT::v2f16, &NVPTX::B32RegClass);
595 addRegisterClass(MVT::bf16, &NVPTX::B16RegClass);
596 addRegisterClass(MVT::v2bf16, &NVPTX::B32RegClass);
597
598 if (STI.hasF32x2Instructions()) {
599 addRegisterClass(MVT::v2f32, &NVPTX::B64RegClass);
600 addRegisterClass(MVT::v2i32, &NVPTX::B64RegClass);
601 }
602
603 // Conversion to/from FP16/FP16x2 is always legal.
608
609 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
610 if (STI.getSmVersion() >= 30 && STI.getPTXVersion() > 31)
611 setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
612
613 setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
614 setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
615
616 // Conversion to/from BF16/BF16x2 is always legal.
621
622 setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);
623 setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);
624 if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote)
625 AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32);
626
627 // Conversion to/from i16/i16x2 is always legal.
632
637
638 // No support for these operations with v2f32/v2i32
639 setOperationAction(ISD::INSERT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32}, Expand);
640 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2f32, MVT::v2i32}, Expand);
641
644 MVT::v2i32, Expand);
645
646 // Need custom lowering in case the index is dynamic.
647 if (STI.hasF32x2Instructions())
648 setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32},
649 Custom);
650
651 // Custom conversions to/from v2i8.
652 setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);
653
654 // Only logical ops can be done on v4i8 directly, others must be done
655 // elementwise.
672 MVT::v4i8, Expand);
673
674 // Operations not directly supported by NVPTX.
675 for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
676 MVT::v2f32, MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16,
677 MVT::v4i8, MVT::i32, MVT::v2i32, MVT::i64}) {
679 setOperationAction(ISD::BR_CC, VT, Expand);
680 }
681
682 // We don't want ops like FMINIMUM or UMAX to be lowered to SETCC+VSELECT.
683 setOperationAction(ISD::VSELECT, {MVT::v2f32, MVT::v2i32}, Expand);
684
685 // Some SIGN_EXTEND_INREG can be done using cvt instruction.
686 // For others we will expand to a SHL/SRA pair.
693
700
703
705 {MVT::i8, MVT::i16, MVT::v2i16, MVT::i32, MVT::i64},
706 Expand);
707
708 if (STI.hasHWROT32()) {
711 Custom);
712 }
713
715
716 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
717 setOperationAction(ISD::BRIND, MVT::Other, Expand);
718
719 // We want to legalize constant-related memmove and memcpy
720 // intrinsics.
722
723 // FP extload/truncstore is not legal in PTX. We need to expand all these.
724 for (auto FloatVTs :
726 for (MVT ValVT : FloatVTs) {
727 for (MVT MemVT : FloatVTs) {
728 setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Expand);
729 setTruncStoreAction(ValVT, MemVT, Expand);
730 }
731 }
732 }
733
734 // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
735 // how they'll be lowered in ISel anyway, and by doing this a little earlier
736 // we allow for more DAG combine opportunities.
737 for (auto IntVTs :
739 for (MVT ValVT : IntVTs)
740 for (MVT MemVT : IntVTs)
741 if (isTypeLegal(ValVT))
742 setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Custom);
743
744 // PTX does not support load / store predicate registers
745 setOperationAction({ISD::LOAD, ISD::STORE}, MVT::i1, Custom);
746 for (MVT VT : MVT::integer_valuetypes()) {
748 Promote);
749 setTruncStoreAction(VT, MVT::i1, Expand);
750 }
751
752 // Disable generation of extload/truncstore for v2i16/v2i8. The generic
753 // expansion for these nodes when they are unaligned is incorrect if the
754 // type is a vector.
755 //
756 // TODO: Fix the generic expansion for these nodes found in
757 // TargetLowering::expandUnalignedLoad/Store.
759 MVT::v2i8, Expand);
760 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
761
762 // Register custom handling for illegal type loads/stores. We'll try to custom
763 // lower almost all illegal types and logic in the lowering will discard cases
764 // we can't handle.
765 setOperationAction({ISD::LOAD, ISD::STORE}, {MVT::i128, MVT::f128}, Custom);
767 if (!isTypeLegal(VT) && VT.getStoreSizeInBits() <= 256)
768 setOperationAction({ISD::STORE, ISD::LOAD}, VT, Custom);
769
770 // Custom legalization for LDU intrinsics.
771 // TODO: The logic to lower these is not very robust and we should rewrite it.
772 // Perhaps LDU should not be represented as an intrinsic at all.
775 if (IsPTXVectorType(VT))
777
781 MVT::i1, Expand);
782
783 // This is legal in NVPTX
788
789 setOperationAction(ISD::DYNAMIC_STACKALLOC, {MVT::i32, MVT::i64}, Custom);
790 setOperationAction({ISD::STACKRESTORE, ISD::STACKSAVE}, MVT::Other, Custom);
791
792 // TRAP can be lowered to PTX trap
793 setOperationAction(ISD::TRAP, MVT::Other, Legal);
794 // DEBUGTRAP can be lowered to PTX brkpt
795 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
796
797 // Support varargs.
798 setOperationAction(ISD::VASTART, MVT::Other, Custom);
799 setOperationAction(ISD::VAARG, MVT::Other, Custom);
800 setOperationAction(ISD::VACOPY, MVT::Other, Expand);
801 setOperationAction(ISD::VAEND, MVT::Other, Expand);
802
804 {MVT::i16, MVT::i32, MVT::i64}, Legal);
805
807 Promote);
810
811 setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
812 setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
813 setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);
814 setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);
815 setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);
816 setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);
817 setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);
818
819 setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);
820 setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);
821 setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);
822 setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);
823 setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);
824 setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);
825
826 // Other arithmetic and logic ops are unsupported.
830 {MVT::v2i16, MVT::v2i32}, Expand);
831
832 // v2i32 is not supported for any arithmetic operations
837 MVT::v2i32, Expand);
838
843 if (STI.getPTXVersion() >= 43) {
848 }
849
851 setOperationAction(ISD::CTTZ, {MVT::v2i16, MVT::v2i32}, Expand);
854
855 // PTX does not directly support SELP of i1, so promote to i32 first
857
858 // PTX cannot multiply two i64s in a single instruction.
861
862 // We have some custom DAG combine patterns for these nodes
865 ISD::FADD, ISD::FMAXNUM, ISD::FMINNUM,
866 ISD::FMAXIMUM, ISD::FMINIMUM, ISD::FMAXIMUMNUM,
867 ISD::FMINIMUMNUM, ISD::MUL, ISD::SHL,
869 ISD::BUILD_VECTOR, ISD::ADDRSPACECAST, ISD::LOAD,
870 ISD::STORE, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND});
871
872 // setcc for f16x2 and bf16x2 needs special handling to prevent
873 // legalizer's attempt to scalarize it due to v2i1 not being legal.
874 if (STI.allowFP16Math() || STI.hasBF16Math())
876
877 // Vector reduction operations. These may be turned into shuffle or tree
878 // reductions depending on what instructions are available for each type.
880 MVT EltVT = VT.getVectorElementType();
881 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
882 setOperationAction({ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMIN,
883 ISD::VECREDUCE_FMAXIMUM, ISD::VECREDUCE_FMINIMUM},
884 VT, Custom);
885 }
886 }
887
888 // Promote fp16 arithmetic if fp16 hardware isn't available or the
889 // user passed --nvptx-no-fp16-math. The flag is useful because,
890 // although sm_53+ GPUs have some sort of FP16 support in
891 // hardware, only sm_53 and sm_60 have full implementation. Others
892 // only have a token amount of hardware and are likely to run faster
893 // by using fp32 units instead.
894 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
895 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
896 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
897 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
898 // bf16 must be promoted to f32.
899 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
900 if (getOperationAction(Op, MVT::bf16) == Promote)
901 AddPromotedToType(Op, MVT::bf16, MVT::f32);
902 setOperationAction(Op, MVT::v2f32,
903 STI.hasF32x2Instructions() ? Legal : Expand);
904 }
905
906 // On SM80, we select add/mul/sub as fma to avoid promotion to float
907 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB}) {
908 for (const auto &VT : {MVT::bf16, MVT::v2bf16}) {
909 if (!STI.hasNativeBF16Support(Op) && STI.hasNativeBF16Support(ISD::FMA)) {
911 }
912 }
913 }
914
915 // f16/f16x2 neg was introduced in PTX 60, SM_53.
916 const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
917 STI.getPTXVersion() >= 60 &&
918 STI.allowFP16Math();
919 for (const auto &VT : {MVT::f16, MVT::v2f16})
920 setOperationAction(ISD::FNEG, VT,
921 IsFP16FP16x2NegAvailable ? Legal : Expand);
922
923 setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);
924 setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);
925 setOperationAction(ISD::FNEG, MVT::v2f32, Expand);
926 // (would be) Library functions.
927
928 // These map to conversion instructions for scalar FP types.
929 for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
930 ISD::FROUNDEVEN, ISD::FTRUNC}) {
931 setOperationAction(Op, MVT::f16, Legal);
932 setOperationAction(Op, MVT::f32, Legal);
933 setOperationAction(Op, MVT::f64, Legal);
934 setOperationAction(Op, MVT::v2f16, Expand);
935 setOperationAction(Op, MVT::v2bf16, Expand);
936 setOperationAction(Op, MVT::v2f32, Expand);
937 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
938 if (getOperationAction(Op, MVT::bf16) == Promote)
939 AddPromotedToType(Op, MVT::bf16, MVT::f32);
940 }
941
942 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71) {
943 setOperationAction(ISD::BF16_TO_FP, MVT::f32, Expand);
944 }
945 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
946 for (MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) {
947 setOperationAction(ISD::FP_EXTEND, VT, Custom);
949 }
950 }
951
952 // Expand v2f32 = fp_extend
953 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
954 // Expand v2[b]f16 = fp_round v2f32
955 setOperationAction(ISD::FP_ROUND, {MVT::v2bf16, MVT::v2f16}, Expand);
956
957 // sm_80 only has conversions between f32 and bf16. Custom lower all other
958 // bf16 conversions.
959 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
960 for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {
963 VT, Custom);
964 }
967 MVT::bf16, Custom);
968 }
969
970 setOperationAction(ISD::FROUND, MVT::f16, Promote);
971 setOperationAction(ISD::FROUND, MVT::v2f16, Expand);
972 setOperationAction(ISD::FROUND, MVT::v2bf16, Expand);
973 setOperationAction(ISD::FROUND, MVT::f32, Custom);
974 setOperationAction(ISD::FROUND, MVT::f64, Custom);
975 setOperationAction(ISD::FROUND, MVT::bf16, Promote);
976 AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32);
977
978 // 'Expand' implements FCOPYSIGN without calling an external library.
985
986 // These map to corresponding instructions for f32/f64. f16 must be
987 // promoted to f32. v2f16 is expanded to f16, which is then promoted
988 // to f32.
989 for (const auto &Op :
990 {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FTANH}) {
991 setOperationAction(Op, MVT::f16, Promote);
992 setOperationAction(Op, MVT::f32, Legal);
993 // only div/rem/sqrt are legal for f64
994 if (Op == ISD::FDIV || Op == ISD::FREM || Op == ISD::FSQRT) {
995 setOperationAction(Op, MVT::f64, Legal);
996 }
997 setOperationAction(Op, {MVT::v2f16, MVT::v2bf16, MVT::v2f32}, Expand);
998 setOperationAction(Op, MVT::bf16, Promote);
999 AddPromotedToType(Op, MVT::bf16, MVT::f32);
1000 }
1001 setOperationAction(ISD::FREM, {MVT::f32, MVT::f64}, Custom);
1002
1003 setOperationAction(ISD::FABS, {MVT::f32, MVT::f64}, Legal);
1004 setOperationAction(ISD::FABS, MVT::v2f32, Expand);
1005 if (STI.getPTXVersion() >= 65) {
1006 setFP16OperationAction(ISD::FABS, MVT::f16, Legal, Promote);
1007 setFP16OperationAction(ISD::FABS, MVT::v2f16, Legal, Expand);
1008 } else {
1009 setOperationAction(ISD::FABS, MVT::f16, Promote);
1010 setOperationAction(ISD::FABS, MVT::v2f16, Expand);
1011 }
1012 setBF16OperationAction(ISD::FABS, MVT::v2bf16, Legal, Expand);
1013 setBF16OperationAction(ISD::FABS, MVT::bf16, Legal, Promote);
1014 if (getOperationAction(ISD::FABS, MVT::bf16) == Promote)
1015 AddPromotedToType(ISD::FABS, MVT::bf16, MVT::f32);
1016
1017 for (const auto &Op :
1018 {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM}) {
1019 setOperationAction(Op, MVT::f32, Legal);
1020 setOperationAction(Op, MVT::f64, Legal);
1021 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
1022 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
1023 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
1024 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
1025 if (getOperationAction(Op, MVT::bf16) == Promote)
1026 AddPromotedToType(Op, MVT::bf16, MVT::f32);
1027 setOperationAction(Op, MVT::v2f32, Expand);
1028 }
1029 bool SupportsF32MinMaxNaN =
1030 STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
1031 for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
1032 setOperationAction(Op, MVT::f32, SupportsF32MinMaxNaN ? Legal : Expand);
1033 setFP16OperationAction(Op, MVT::f16, Legal, Expand);
1034 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
1035 setBF16OperationAction(Op, MVT::bf16, Legal, Expand);
1036 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
1037 setOperationAction(Op, MVT::v2f32, Expand);
1038 }
1039
1040 // Custom lowering for inline asm with 128-bit operands
1043
1044 // FEXP2 support:
1045 // - f32
1046 // - f16/f16x2 (sm_70+, PTX 7.0+)
1047 // - bf16/bf16x2 (sm_90+, PTX 7.8+)
1048 // When f16/bf16 types aren't supported, they are promoted/expanded to f32.
1049 setOperationAction(ISD::FEXP2, MVT::f32, Legal);
1050 setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
1051 setFP16OperationAction(ISD::FEXP2, MVT::f16, Legal, Promote);
1052 setFP16OperationAction(ISD::FEXP2, MVT::v2f16, Legal, Expand);
1053 setBF16OperationAction(ISD::FEXP2, MVT::bf16, Legal, Promote);
1054 setBF16OperationAction(ISD::FEXP2, MVT::v2bf16, Legal, Expand);
1055
1056 // FLOG2 supports f32 only
1057 // f16/bf16 types aren't supported, but they are promoted/expanded to f32.
1058 if (UseApproxLog2F32) {
1059 setOperationAction(ISD::FLOG2, MVT::f32, Legal);
1060 setOperationPromotedToType(ISD::FLOG2, MVT::f16, MVT::f32);
1061 setOperationPromotedToType(ISD::FLOG2, MVT::bf16, MVT::f32);
1062 setOperationAction(ISD::FLOG2, {MVT::v2f16, MVT::v2bf16, MVT::v2f32},
1063 Expand);
1064 }
1065
1066 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
1067
1068 setOperationAction(ISD::ATOMIC_LOAD_SUB, {MVT::i32, MVT::i64}, Expand);
1069
1070 // atom.b128 is legal in PTX but since we don't represent i128 as a legal
1071 // type, we need to custom lower it.
1072 setOperationAction({ISD::ATOMIC_CMP_SWAP, ISD::ATOMIC_SWAP}, MVT::i128,
1073 Custom);
1074
1075 // Now deduce the information based on the above mentioned
1076 // actions
1077 computeRegisterProperties(STI.getRegisterInfo());
1078
1079 // PTX support for 16-bit CAS is emulated. Only use 32+
1080 setMinCmpXchgSizeInBits(STI.getMinCmpXchgSizeInBits());
1081 setMaxAtomicSizeInBitsSupported(STI.hasAtomSwap128() ? 128 : 64);
1083
1084 // Custom lowering for tcgen05.ld vector operands
1086 {MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32,
1087 MVT::v32i32, MVT::v64i32, MVT::v128i32},
1088 Custom);
1089
1090 // Custom lowering for tcgen05.st vector operands
1092 {MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32,
1093 MVT::v32i32, MVT::v64i32, MVT::v128i32, MVT::Other},
1094 Custom);
1095
1096 // Enable custom lowering for the following:
1097 // * MVT::i128 - clusterlaunchcontrol
1098 // * MVT::i32 - prmt
1099 // * MVT::v4f32 - cvt_rs fp{4/6/8}x4 intrinsics
1100 // * MVT::Other - internal.addrspace.wrap
1102 {MVT::i32, MVT::i128, MVT::v4f32, MVT::Other}, Custom);
1103}
1104
1105const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
1106
1107#define MAKE_CASE(V) \
1108 case V: \
1109 return #V;
1110
1111 switch ((NVPTXISD::NodeType)Opcode) {
1113 break;
1114
1167 MAKE_CASE(
1169 MAKE_CASE(
1181 MAKE_CASE(
1183 MAKE_CASE(
1190 }
1191 return nullptr;
1192
1193#undef MAKE_CASE
1194}
1195
1196TargetLoweringBase::LegalizeTypeAction
1197NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
1198 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1199 VT.getScalarType() == MVT::i1)
1200 return TypeSplitVector;
1201 return TargetLoweringBase::getPreferredVectorAction(VT);
1202}
1203
1204SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
1205 int Enabled, int &ExtraSteps,
1206 bool &UseOneConst,
1207 bool Reciprocal) const {
1210 return SDValue();
1211
1212 if (ExtraSteps == ReciprocalEstimate::Unspecified)
1213 ExtraSteps = 0;
1214
1215 SDLoc DL(Operand);
1216 EVT VT = Operand.getValueType();
1217 bool Ftz = useF32FTZ(DAG.getMachineFunction());
1218
1219 auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1220 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1221 DAG.getConstant(IID, DL, MVT::i32), Operand);
1222 };
1223
1224 // The sqrt and rsqrt refinement processes assume we always start out with an
1225 // approximation of the rsqrt. Therefore, if we're going to do any refinement
1226 // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
1227 // any refinement, we must return a regular sqrt.
1228 if (Reciprocal || ExtraSteps > 0) {
1229 if (VT == MVT::f32)
1230 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1231 : Intrinsic::nvvm_rsqrt_approx_f);
1232 else if (VT == MVT::f64)
1233 return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1234 else
1235 return SDValue();
1236 } else {
1237 if (VT == MVT::f32)
1238 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1239 : Intrinsic::nvvm_sqrt_approx_f);
1240 else {
1241 // There's no sqrt.approx.f64 instruction, so we emit
1242 // reciprocal(rsqrt(x)). This is faster than
1243 // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
1244 // x * rsqrt(x).)
1245 return DAG.getNode(
1247 DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1248 MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1249 }
1250 }
1251}
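// Illustrative mapping: a fast-math f32 sqrt with no refinement steps is
// emitted as sqrt.approx.f32 (or sqrt.approx.ftz.f32 under FTZ); an f32 rsqrt,
// or any request that needs refinement, uses rsqrt.approx.f32. For f64 there
// is no sqrt.approx instruction, so a plain sqrt becomes rcp.approx.ftz.f64
// applied to rsqrt.approx.f64, and an f64 rsqrt maps to rsqrt.approx.f64.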
1252
1253std::string NVPTXTargetLowering::getPrototype(
1254 const DataLayout &DL, Type *RetTy, const ArgListTy &Args,
1255 const SmallVectorImpl<ISD::OutputArg> &Outs,
1256 std::optional<unsigned> FirstVAArg, const CallBase &CB,
1257 unsigned UniqueCallSite) const {
1258 auto PtrVT = getPointerTy(DL);
1259
1260 std::string Prototype;
1261 raw_string_ostream O(Prototype);
1262 O << "prototype_" << UniqueCallSite << " : .callprototype ";
1263
1264 if (RetTy->isVoidTy()) {
1265 O << "()";
1266 } else {
1267 O << "(";
1268 if (shouldPassAsArray(RetTy)) {
1269 const Align RetAlign = getArgumentAlignment(&CB, RetTy, 0, DL);
1270 O << ".param .align " << RetAlign.value() << " .b8 _["
1271 << DL.getTypeAllocSize(RetTy) << "]";
1272 } else if (RetTy->isFloatingPointTy() || RetTy->isIntegerTy()) {
1273 unsigned size = 0;
1274 if (auto *ITy = dyn_cast<IntegerType>(RetTy)) {
1275 size = ITy->getBitWidth();
1276 } else {
1277 assert(RetTy->isFloatingPointTy() &&
1278 "Floating point type expected here");
1279 size = RetTy->getPrimitiveSizeInBits();
1280 }
1281 // PTX ABI requires all scalar return values to be at least 32
1282 // bits in size. fp16 normally uses .b16 as its storage type in
1283 // PTX, so its size must be adjusted here, too.
1284 size = promoteScalarArgumentSize(size);
1285
1286 O << ".param .b" << size << " _";
1287 } else if (isa<PointerType>(RetTy)) {
1288 O << ".param .b" << PtrVT.getSizeInBits() << " _";
1289 } else {
1290 llvm_unreachable("Unknown return type");
1291 }
1292 O << ") ";
1293 }
1294 O << "_ (";
1295
1296 bool first = true;
1297
1298 const unsigned NumArgs = FirstVAArg.value_or(Args.size());
1299 auto AllOuts = ArrayRef(Outs);
1300 for (const unsigned I : llvm::seq(NumArgs)) {
1301 const auto ArgOuts =
1302 AllOuts.take_while([I](auto O) { return O.OrigArgIndex == I; });
1303 AllOuts = AllOuts.drop_front(ArgOuts.size());
1304
1305 Type *Ty = Args[I].Ty;
1306 if (!first) {
1307 O << ", ";
1308 }
1309 first = false;
1310
1311 if (ArgOuts[0].Flags.isByVal()) {
1312 // Indirect calls need strict ABI alignment so we disable optimizations by
1313 // not providing a function to optimize.
1314 Type *ETy = Args[I].IndirectType;
1315 Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1316 Align ParamByValAlign =
1317 getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL);
1318
1319 O << ".param .align " << ParamByValAlign.value() << " .b8 _["
1320 << ArgOuts[0].Flags.getByValSize() << "]";
1321 } else {
1322 if (shouldPassAsArray(Ty)) {
1323 Align ParamAlign =
1324 getArgumentAlignment(&CB, Ty, I + AttributeList::FirstArgIndex, DL);
1325 O << ".param .align " << ParamAlign.value() << " .b8 _["
1326 << DL.getTypeAllocSize(Ty) << "]";
1327 continue;
1328 }
1329 // i8 types in IR will be i16 types in SDAG
1330 assert((getValueType(DL, Ty) == ArgOuts[0].VT ||
1331 (getValueType(DL, Ty) == MVT::i8 && ArgOuts[0].VT == MVT::i16)) &&
1332 "type mismatch between callee prototype and arguments");
1333 // scalar type
1334 unsigned sz = 0;
1335 if (auto *ITy = dyn_cast<IntegerType>(Ty)) {
1336 sz = promoteScalarArgumentSize(ITy->getBitWidth());
1337 } else if (isa<PointerType>(Ty)) {
1338 sz = PtrVT.getSizeInBits();
1339 } else {
1340 sz = Ty->getPrimitiveSizeInBits();
1341 }
1342 O << ".param .b" << sz << " _";
1343 }
1344 }
1345
1346 if (FirstVAArg)
1347 O << (first ? "" : ",") << " .param .align "
1348 << STI.getMaxRequiredAlignment() << " .b8 _[]";
1349 O << ")";
1350 if (shouldEmitPTXNoReturn(&CB, *nvTM))
1351 O << " .noreturn";
1352 O << ";";
1353
1354 return Prototype;
1355}
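// Example of the emitted string (illustrative, assuming a 64-bit pointer
// target): for an indirect call to a function with C prototype
//   float f(int, float *);
// this produces roughly
//   prototype_<N> : .callprototype (.param .b32 _) _ (.param .b32 _, .param .b64 _);
// where <N> is the unique call-site id and scalar return/argument sizes have
// been promoted to at least 32 bits.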
1356
1357Align NVPTXTargetLowering::getFunctionArgumentAlignment(
1358 const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const {
1359 return getAlign(*F, Idx).value_or(getFunctionParamOptimizedAlign(F, Ty, DL));
1360}
1361
1362Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
1363 unsigned Idx,
1364 const DataLayout &DL) const {
1365 if (!CB) {
1366 // CallSite is zero, fallback to ABI type alignment
1367 return DL.getABITypeAlign(Ty);
1368 }
1369
1370 const Function *DirectCallee = CB->getCalledFunction();
1371
1372 if (!DirectCallee) {
1373 // We don't have a direct function symbol, but that may be because of
1374 // constant cast instructions in the call.
1375
1376 // With bitcast'd call targets, the instruction will be the call
1377 if (const auto *CI = dyn_cast<CallInst>(CB)) {
1378 // Check if we have call alignment metadata
1379 if (MaybeAlign StackAlign = getAlign(*CI, Idx))
1380 return StackAlign.value();
1381 }
1382 DirectCallee = getMaybeBitcastedCallee(CB);
1383 }
1384
1385 // Check for function alignment information if we found that the
1386 // ultimate target is a Function
1387 if (DirectCallee)
1388 return getFunctionArgumentAlignment(DirectCallee, Ty, Idx, DL);
1389
1390 // Call is indirect, fall back to the ABI type alignment
1391 return DL.getABITypeAlign(Ty);
1392}
1393
1394static bool shouldConvertToIndirectCall(const CallBase *CB,
1395 const GlobalAddressSDNode *Func) {
1396 if (!Func)
1397 return false;
1398 if (auto *CalleeFunc = dyn_cast<Function>(Func->getGlobal()))
1399 return CB->getFunctionType() != CalleeFunc->getFunctionType();
1400 return false;
1401}
1402
1403static MachinePointerInfo refinePtrAS(SDValue &Ptr, SelectionDAG &DAG,
1404 const DataLayout &DL,
1405 const TargetLowering &TL) {
1406 if (Ptr->getOpcode() == ISD::FrameIndex) {
1407 auto Ty = TL.getPointerTy(DL, ADDRESS_SPACE_LOCAL);
1408 Ptr = DAG.getAddrSpaceCast(SDLoc(), Ty, Ptr, ADDRESS_SPACE_GENERIC,
1409 ADDRESS_SPACE_LOCAL);
1410
1411 return MachinePointerInfo(ADDRESS_SPACE_LOCAL);
1412 }
1413
1414 // Peel off an addrspacecast to generic and load directly from the specific
1415 // address space.
1416 if (Ptr->getOpcode() == ISD::ADDRSPACECAST) {
1417 const auto *ASC = cast<AddrSpaceCastSDNode>(Ptr);
1418 if (ASC->getDestAddressSpace() == ADDRESS_SPACE_GENERIC) {
1419 Ptr = ASC->getOperand(0);
1420 return MachinePointerInfo(ASC->getSrcAddressSpace());
1421 }
1422 }
1423
1424 return MachinePointerInfo();
1425}
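// For example (illustrative): when a byval source pointer is
// (addrspacecast (global ptr) to generic), the cast is peeled off here so the
// copy in LowerCall loads directly from the global address space instead of
// going through a generic pointer.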
1426
1427static ISD::NodeType getExtOpcode(const ISD::ArgFlagsTy &Flags) {
1428 if (Flags.isSExt())
1429 return ISD::SIGN_EXTEND;
1430 if (Flags.isZExt())
1431 return ISD::ZERO_EXTEND;
1432 return ISD::ANY_EXTEND;
1433}
1434
1435static SDValue correctParamType(SDValue V, EVT ExpectedVT,
1436 ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
1437 SDLoc dl) {
1438 const EVT ActualVT = V.getValueType();
1439 assert((ActualVT == ExpectedVT ||
1440 (ExpectedVT.isInteger() && ActualVT.isInteger())) &&
1441 "Non-integer argument type size mismatch");
1442 if (ExpectedVT.bitsGT(ActualVT))
1443 return DAG.getNode(getExtOpcode(Flags), dl, ExpectedVT, V);
1444 if (ExpectedVT.bitsLT(ActualVT))
1445 return DAG.getNode(ISD::TRUNCATE, dl, ExpectedVT, V);
1446
1447 return V;
1448}
1449
1450SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1451 SmallVectorImpl<SDValue> &InVals) const {
1452
1453 if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30))
1455 "Support for variadic functions (unsized array parameter) introduced "
1456 "in PTX ISA version 6.0 and requires target sm_30.");
1457
1458 SelectionDAG &DAG = CLI.DAG;
1459 SDLoc dl = CLI.DL;
1460 const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1461 SDValue Callee = CLI.Callee;
1462 ArgListTy &Args = CLI.getArgs();
1463 Type *RetTy = CLI.RetTy;
1464 const CallBase *CB = CLI.CB;
1465 const DataLayout &DL = DAG.getDataLayout();
1466 LLVMContext &Ctx = *DAG.getContext();
1467
1468 const auto GetI32 = [&](const unsigned I) {
1469 return DAG.getConstant(I, dl, MVT::i32);
1470 };
1471
1472 const unsigned UniqueCallSite = GlobalUniqueCallSite++;
1473 const SDValue CallChain = CLI.Chain;
1474 const SDValue StartChain =
1475 DAG.getCALLSEQ_START(CallChain, UniqueCallSite, 0, dl);
1476 SDValue DeclareGlue = StartChain.getValue(1);
1477
1478 SmallVector<SDValue, 16> CallPrereqs{StartChain};
1479
1480 const auto MakeDeclareScalarParam = [&](SDValue Symbol, unsigned Size) {
1481 // PTX ABI requires integral types to be at least 32 bits in size. FP16 is
1482 // loaded/stored using i16, so it's handled here as well.
1483 const unsigned SizeBits = promoteScalarArgumentSize(Size * 8);
1484 SDValue Declare =
1485 DAG.getNode(NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue},
1486 {StartChain, Symbol, GetI32(SizeBits), DeclareGlue});
1487 CallPrereqs.push_back(Declare);
1488 DeclareGlue = Declare.getValue(1);
1489 return Declare;
1490 };
1491
1492 const auto MakeDeclareArrayParam = [&](SDValue Symbol, Align Align,
1493 unsigned Size) {
1494 SDValue Declare = DAG.getNode(
1495 NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue},
1496 {StartChain, Symbol, GetI32(Align.value()), GetI32(Size), DeclareGlue});
1497 CallPrereqs.push_back(Declare);
1498 DeclareGlue = Declare.getValue(1);
1499 return Declare;
1500 };
1501
1502 // Variadic arguments.
1503 //
1504 // Normally, for each argument, we declare a param scalar or a param
1505 // byte array in the .param space, and store the argument value to that
1506 // param scalar or array starting at offset 0.
1507 //
1508 // In the case of the first variadic argument, we declare a vararg byte array
1509 // with size 0. The exact size of this array isn't known at this point, so
1510 // it'll be patched later. All the variadic arguments will be stored to this
1511 // array at a certain offset (which gets tracked by 'VAOffset'). The offset is
1512 // initially set to 0, so it can be used for non-variadic arguments (which use
1513 // 0 offset) to simplify the code.
1514 //
1515 // After all variadic arguments are processed, 'VAOffset' holds the size of
1516 // the vararg byte array.
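  // Illustrative example: for a call like printf("%d %f", i, d) both variadic
  // operands land in one vararg byte array: the i32 is stored at offset 0, the
  // double at offset 8 (after being aligned to its type), and the final
  // VAOffset of 16 is patched back into the array's declared size further
  // below.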
1517 assert((CLI.IsVarArg || CLI.Args.size() == CLI.NumFixedArgs) &&
1518 "Non-VarArg function with extra arguments");
1519
1520 const unsigned FirstVAArg = CLI.NumFixedArgs; // position of first variadic
1521 unsigned VAOffset = 0; // current offset in the param array
1522
1523 const SDValue VADeclareParam =
1524 CLI.Args.size() > FirstVAArg
1525 ? MakeDeclareArrayParam(getCallParamSymbol(DAG, FirstVAArg, MVT::i32),
1526 Align(STI.getMaxRequiredAlignment()), 0)
1527 : SDValue();
1528
1529 // Args.size() and Outs.size() need not match.
1530 // Outs.size() will be larger
1531 // * if there is an aggregate argument with multiple fields (each field
1532 // showing up separately in Outs)
1533 // * if there is a vector argument with more than typical vector-length
1534 // elements (generally if more than 4) where each vector element is
1535 // individually present in Outs.
1536 // So a different index should be used for indexing into Outs/OutVals.
1537 // See similar issue in LowerFormalArguments.
1538 auto AllOuts = ArrayRef(CLI.Outs);
1539 auto AllOutVals = ArrayRef(CLI.OutVals);
1540 assert(AllOuts.size() == AllOutVals.size() &&
1541 "Outs and OutVals must be the same size");
1542 // Declare the .param or .reg parameters needed to pass values
1543 // to the function
1544 for (const auto E : llvm::enumerate(Args)) {
1545 const auto ArgI = E.index();
1546 const auto Arg = E.value();
1547 const auto ArgOuts =
1548 AllOuts.take_while([&](auto O) { return O.OrigArgIndex == ArgI; });
1549 const auto ArgOutVals = AllOutVals.take_front(ArgOuts.size());
1550 AllOuts = AllOuts.drop_front(ArgOuts.size());
1551 AllOutVals = AllOutVals.drop_front(ArgOuts.size());
1552
1553 const bool IsVAArg = (ArgI >= FirstVAArg);
1554 const bool IsByVal = Arg.IsByVal;
1555
1556 const SDValue ParamSymbol =
1557 getCallParamSymbol(DAG, IsVAArg ? FirstVAArg : ArgI, MVT::i32);
1558
1559 assert((!IsByVal || Arg.IndirectType) &&
1560 "byval arg must have indirect type");
1561 Type *ETy = (IsByVal ? Arg.IndirectType : Arg.Ty);
1562
1563 const Align ArgAlign = [&]() {
1564 if (IsByVal) {
1565 // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1566 // so we don't need to worry whether it's naturally aligned or not.
1567 // See TargetLowering::LowerCallTo().
1568 const Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1569 return getFunctionByValParamAlign(CB->getCalledFunction(), ETy,
1570 InitialAlign, DL);
1571 }
1572 return getArgumentAlignment(CB, Arg.Ty, ArgI + 1, DL);
1573 }();
1574
1575 const unsigned TySize = DL.getTypeAllocSize(ETy);
1576 assert((!IsByVal || TySize == ArgOuts[0].Flags.getByValSize()) &&
1577 "type size mismatch");
1578
1579 const SDValue ArgDeclare = [&]() {
1580 if (IsVAArg)
1581 return VADeclareParam;
1582
1583 if (IsByVal || shouldPassAsArray(Arg.Ty))
1584 return MakeDeclareArrayParam(ParamSymbol, ArgAlign, TySize);
1585
1586 assert(ArgOuts.size() == 1 && "We must pass only one value as non-array");
1587 assert((ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint()) &&
1588 "Only int and float types are supported as non-array arguments");
1589
1590 return MakeDeclareScalarParam(ParamSymbol, TySize);
1591 }();
1592
1593 if (IsByVal) {
1594 assert(ArgOutVals.size() == 1 && "We must pass only one value as byval");
1595 SDValue SrcPtr = ArgOutVals[0];
1596 const auto PointerInfo = refinePtrAS(SrcPtr, DAG, DL, *this);
1597 const Align BaseSrcAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1598
1599 if (IsVAArg)
1600 VAOffset = alignTo(VAOffset, ArgAlign);
1601
1602 SmallVector<EVT, 4> ValueVTs, MemVTs;
1603 SmallVector<TypeSize, 4> Offsets;
1604 ComputeValueVTs(*this, DL, ETy, ValueVTs, &MemVTs, &Offsets);
1605
1606 unsigned J = 0;
1607 const auto VI = VectorizePTXValueVTs(MemVTs, Offsets, ArgAlign, IsVAArg);
1608 for (const unsigned NumElts : VI) {
1609 EVT LoadVT = getVectorizedVT(MemVTs[J], NumElts, Ctx);
1610 Align SrcAlign = commonAlignment(BaseSrcAlign, Offsets[J]);
1611 SDValue SrcAddr = DAG.getObjectPtrOffset(dl, SrcPtr, Offsets[J]);
1612 SDValue SrcLoad =
1613 DAG.getLoad(LoadVT, dl, CallChain, SrcAddr, PointerInfo, SrcAlign);
1614
1615 TypeSize ParamOffset = Offsets[J].getWithIncrement(VAOffset);
1616 Align ParamAlign = commonAlignment(ArgAlign, ParamOffset);
1617 SDValue ParamAddr =
1618 DAG.getObjectPtrOffset(dl, ParamSymbol, ParamOffset);
1619 SDValue StoreParam =
1620 DAG.getStore(ArgDeclare, dl, SrcLoad, ParamAddr,
1622 CallPrereqs.push_back(StoreParam);
1623
1624 J += NumElts;
1625 }
1626 if (IsVAArg)
1627 VAOffset += TySize;
1628 } else {
1631 ComputePTXValueVTs(*this, DL, Ctx, CLI.CallConv, Arg.Ty, VTs, Offsets,
1632 VAOffset);
1633 assert(VTs.size() == Offsets.size() && "Size mismatch");
1634 assert(VTs.size() == ArgOuts.size() && "Size mismatch");
1635
1636 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1637 // than 32-bits are sign extended or zero extended, depending on
1638 // whether they are signed or unsigned types. This case applies
1639 // only to scalar parameters and not to aggregate values.
1640 const bool ExtendIntegerParam =
1641 Arg.Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Arg.Ty) < 32;
1642
1643 const auto GetStoredValue = [&](const unsigned I) {
1644 SDValue StVal = ArgOutVals[I];
1646 StVal.getValueType() &&
1647 "OutVal type should always be legal");
1648
1649 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
1650 const EVT StoreVT =
1651 ExtendIntegerParam ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
1652
1653 return correctParamType(StVal, StoreVT, ArgOuts[I].Flags, DAG, dl);
1654 };
1655
1656 unsigned J = 0;
1657 const auto VI = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
1658 for (const unsigned NumElts : VI) {
1659 const EVT EltVT = promoteScalarIntegerPTX(VTs[J]);
1660
1661 unsigned Offset;
1662 if (IsVAArg) {
1663 // TODO: We may need to support vector types that can be passed
1664 // as scalars in variadic arguments.
1665 assert(NumElts == 1 &&
1666 "Vectorization should be disabled for vaargs.");
1667
1668 // Align each part of the variadic argument to their type.
1669 VAOffset = alignTo(VAOffset, DAG.getEVTAlign(EltVT));
1670 Offset = VAOffset;
1671
1672 const EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
1673 VAOffset += DL.getTypeAllocSize(TheStoreType.getTypeForEVT(Ctx));
1674 } else {
1675 assert(VAOffset == 0 && "VAOffset must be 0 for non-VA args");
1676 Offset = Offsets[J];
1677 }
1678
1679 SDValue Ptr =
1680 DAG.getObjectPtrOffset(dl, ParamSymbol, TypeSize::getFixed(Offset));
1681
1682 const MaybeAlign CurrentAlign = ExtendIntegerParam
1683 ? MaybeAlign(std::nullopt)
1684 : commonAlignment(ArgAlign, Offset);
1685
1686 SDValue Val =
1687 getBuildVectorizedValue(NumElts, dl, DAG, [&](unsigned K) {
1688 return GetStoredValue(J + K);
1689 });
1690
1691 SDValue StoreParam =
1692 DAG.getStore(ArgDeclare, dl, Val, Ptr,
1694 CallPrereqs.push_back(StoreParam);
1695
1696 J += NumElts;
1697 }
1698 }
1699 }
1700
1701 // Handle Result
1702 if (!Ins.empty()) {
1703 const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
1704 const unsigned ResultSize = DL.getTypeAllocSize(RetTy);
1705 if (shouldPassAsArray(RetTy)) {
1706 const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
1707 MakeDeclareArrayParam(RetSymbol, RetAlign, ResultSize);
1708 } else {
1709 MakeDeclareScalarParam(RetSymbol, ResultSize);
1710 }
1711 }
1712
1713 // Set the size of the vararg param byte array if the callee is a variadic
1714 // function and the variadic part is not empty.
1715 if (VADeclareParam) {
1716 SDValue DeclareParamOps[] = {VADeclareParam.getOperand(0),
1717 VADeclareParam.getOperand(1),
1718 VADeclareParam.getOperand(2), GetI32(VAOffset),
1719 VADeclareParam.getOperand(4)};
1720 DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),
1721 VADeclareParam->getVTList(), DeclareParamOps);
1722 }
1723
1724 const auto *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1725 // If the type of the callsite does not match that of the function, convert
1726 // the callsite to an indirect call.
1727 const bool ConvertToIndirectCall = shouldConvertToIndirectCall(CB, Func);
1728
1729 // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1730 // between them we must rely on the call site value which is valid for
1731 // indirect calls but is always null for libcalls.
1732 const bool IsIndirectCall = (!Func && CB) || ConvertToIndirectCall;
1733
1734 if (isa<ExternalSymbolSDNode>(Callee)) {
1735 Function* CalleeFunc = nullptr;
1736
1737 // Try to find the callee in the current module.
1738 Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
1739 assert(CalleeFunc != nullptr && "Libcall callee must be set.");
1740
1741 // Set the "libcall callee" attribute to indicate that the function
1742 // must always have a declaration.
1743 CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
1744 }
1745
1746 if (IsIndirectCall) {
1747 // This is indirect function call case : PTX requires a prototype of the
1748 // form
1749 // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1750 // to be emitted, and the label has to be used as the last arg of the call
1751 // instruction.
1752 // The prototype is embedded in a string and put as the operand for a
1753 // CallPrototype SDNode which will print out to the value of the string.
1754 const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
1755 std::string Proto =
1756 getPrototype(DL, RetTy, Args, CLI.Outs,
1757 HasVAArgs ? std::optional(FirstVAArg) : std::nullopt, *CB,
1758 UniqueCallSite);
1759 const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
1760 const SDValue PrototypeDeclare = DAG.getNode(
1761 NVPTXISD::CallPrototype, dl, MVT::Other,
1762 {StartChain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32)});
1763 CallPrereqs.push_back(PrototypeDeclare);
1764 }
1765
1766 const unsigned Proto = IsIndirectCall ? UniqueCallSite : 0;
1767 const unsigned NumArgs =
1768 std::min<unsigned>(CLI.NumFixedArgs + 1, Args.size());
1769 /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns,
1770 /// NumParams, Callee, Proto)
1771 const SDValue CallToken = DAG.getTokenFactor(dl, CallPrereqs);
1772 const SDValue Call = DAG.getNode(
1773 NVPTXISD::CALL, dl, MVT::Other,
1774 {CallToken, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall),
1775 GetI32(Ins.empty() ? 0 : 1), GetI32(NumArgs), Callee, GetI32(Proto)});
1776
1777 SmallVector<SDValue, 16> LoadChains{Call};
1778 SmallVector<SDValue, 16> ProxyRegOps;
1779 if (!Ins.empty()) {
1780 SmallVector<EVT, 16> VTs;
1781 SmallVector<uint64_t, 16> Offsets;
1782 ComputePTXValueVTs(*this, DL, Ctx, CLI.CallConv, RetTy, VTs, Offsets);
1783 assert(VTs.size() == Ins.size() && "Bad value decomposition");
1784
1785 const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
1786 const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
1787
1788 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
1789 // 32-bits are sign extended or zero extended, depending on whether
1790 // they are signed or unsigned types.
1791 const bool ExtendIntegerRetVal =
1792 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
1793
1794 unsigned I = 0;
1795 const auto VI = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
1796 for (const unsigned NumElts : VI) {
1797 const MaybeAlign CurrentAlign =
1798 ExtendIntegerRetVal ? MaybeAlign(std::nullopt)
1799 : commonAlignment(RetAlign, Offsets[I]);
1800
1801 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
1802 const EVT LoadVT =
1803 ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
1804 const EVT VecVT = getVectorizedVT(LoadVT, NumElts, Ctx);
1805 SDValue Ptr =
1806 DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));
1807
1808 SDValue R =
1809 DAG.getLoad(VecVT, dl, Call, Ptr,
1811
1812 LoadChains.push_back(R.getValue(1));
1813 for (const unsigned J : llvm::seq(NumElts))
1814 ProxyRegOps.push_back(getExtractVectorizedValue(R, J, LoadVT, dl, DAG));
1815 I += NumElts;
1816 }
1817 }
1818
1819 const SDValue EndToken = DAG.getTokenFactor(dl, LoadChains);
1820 const SDValue CallEnd = DAG.getCALLSEQ_END(EndToken, UniqueCallSite,
1821 UniqueCallSite + 1, SDValue(), dl);
1822
1823 // Append ProxyReg instructions to the chain to make sure that `callseq_end`
1824 // will not get lost. Otherwise, during libcalls expansion, the nodes can become
1825 // dangling.
1826 for (const auto [I, Reg] : llvm::enumerate(ProxyRegOps)) {
1827 SDValue Proxy =
1828 DAG.getNode(NVPTXISD::ProxyReg, dl, Reg.getValueType(), {CallEnd, Reg});
1829 SDValue Ret = correctParamType(Proxy, Ins[I].VT, Ins[I].Flags, DAG, dl);
1830 InVals.push_back(Ret);
1831 }
1832
1833 // set IsTailCall to false for now, until we figure out how to express
1834 // tail call optimization in PTX
1835 CLI.IsTailCall = false;
1836 return CallEnd;
1837}
1838
1839 SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1840 SelectionDAG &DAG) const {
1841
1842 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1843 const Function &Fn = DAG.getMachineFunction().getFunction();
1844
1845 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
1846 Fn,
1847 "Support for dynamic alloca introduced in PTX ISA version 7.3 and "
1848 "requires target sm_52.",
1849 SDLoc(Op).getDebugLoc()));
1850 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()),
1851 Op.getOperand(0)};
1852 return DAG.getMergeValues(Ops, SDLoc());
1853 }
1854
1855 SDLoc DL(Op.getNode());
1856 SDValue Chain = Op.getOperand(0);
1857 SDValue Size = Op.getOperand(1);
1858 uint64_t Align = Op.getConstantOperandVal(2);
1859
1860 // The alignment on a ISD::DYNAMIC_STACKALLOC node may be 0 to indicate that
1861 // the default stack alignment should be used.
1862 if (Align == 0)
1863 Align = DAG.getSubtarget().getFrameLowering()->getStackAlign().value();
1864
1865 // The size for ptx alloca instruction is 64-bit for m64 and 32-bit for m32.
1866 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1867
1868 SDValue Alloc =
1869 DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL, {LocalVT, MVT::Other},
1870 {Chain, DAG.getZExtOrTrunc(Size, DL, LocalVT),
1871 DAG.getTargetConstant(Align, DL, MVT::i32)});
1872
1873 SDValue ASC = DAG.getAddrSpaceCast(
1874 DL, Op.getValueType(), Alloc, ADDRESS_SPACE_LOCAL, ADDRESS_SPACE_GENERIC);
1875
1876 return DAG.getMergeValues({ASC, SDValue(Alloc.getNode(), 1)}, DL);
1877}
1878
1879 SDValue NVPTXTargetLowering::LowerSTACKRESTORE(SDValue Op,
1880 SelectionDAG &DAG) const {
1881 SDLoc DL(Op.getNode());
1882 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1883 const Function &Fn = DAG.getMachineFunction().getFunction();
1884
1885 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
1886 Fn,
1887 "Support for stackrestore requires PTX ISA version >= 7.3 and target "
1888 ">= sm_52.",
1889 DL.getDebugLoc()));
1890 return Op.getOperand(0);
1891 }
1892
1893 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1894 SDValue Chain = Op.getOperand(0);
1895 SDValue Ptr = Op.getOperand(1);
1896 SDValue ASC = DAG.getAddrSpaceCast(DL, LocalVT, Ptr, ADDRESS_SPACE_GENERIC,
1897 ADDRESS_SPACE_LOCAL);
1898 return DAG.getNode(NVPTXISD::STACKRESTORE, DL, MVT::Other, {Chain, ASC});
1899}
1900
1901 SDValue NVPTXTargetLowering::LowerSTACKSAVE(SDValue Op,
1902 SelectionDAG &DAG) const {
1903 SDLoc DL(Op.getNode());
1904 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1905 const Function &Fn = DAG.getMachineFunction().getFunction();
1906
1907 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
1908 Fn,
1909 "Support for stacksave requires PTX ISA version >= 7.3 and target >= "
1910 "sm_52.",
1911 DL.getDebugLoc()));
1912 auto Ops = {DAG.getConstant(0, DL, Op.getValueType()), Op.getOperand(0)};
1913 return DAG.getMergeValues(Ops, DL);
1914 }
1915
1916 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1917 SDValue Chain = Op.getOperand(0);
1918 SDValue SS =
1919 DAG.getNode(NVPTXISD::STACKSAVE, DL, {LocalVT, MVT::Other}, Chain);
1920 SDValue ASC = DAG.getAddrSpaceCast(
1921 DL, Op.getValueType(), SS, ADDRESS_SPACE_LOCAL, ADDRESS_SPACE_GENERIC);
1922 return DAG.getMergeValues({ASC, SDValue(SS.getNode(), 1)}, DL);
1923}
1924
1925// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
1926// (see LegalizeDAG.cpp). This is slow and uses local memory.
1928 // We use extract/insert/build-vector nodes, just as LegalizeOp() did in LLVM 2.5.
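// For example, with this lowering a (v4f16 concat_vectors v2f16:a, v2f16:b)
// becomes four EXTRACT_VECTOR_ELT nodes feeding a single BUILD_VECTOR.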
1928SDValue
1929NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
1930 SDNode *Node = Op.getNode();
1931 SDLoc dl(Node);
1932 SmallVector<SDValue, 8> Ops;
1933 unsigned NumOperands = Node->getNumOperands();
1934 for (unsigned i = 0; i < NumOperands; ++i) {
1935 SDValue SubOp = Node->getOperand(i);
1936 EVT VVT = SubOp.getNode()->getValueType(0);
1937 EVT EltVT = VVT.getVectorElementType();
1938 unsigned NumSubElem = VVT.getVectorNumElements();
1939 for (unsigned j = 0; j < NumSubElem; ++j) {
1940 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
1941 DAG.getIntPtrConstant(j, dl)));
1942 }
1943 }
1944 return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
1945}
1946
1947 static SDValue getPRMT(SDValue A, SDValue B, SDValue Selector, SDLoc DL,
1948 SelectionDAG &DAG,
1949 unsigned Mode = NVPTX::PTXPrmtMode::NONE) {
1950 assert(A.getValueType() == MVT::i32 && B.getValueType() == MVT::i32 &&
1951 Selector.getValueType() == MVT::i32 && "PRMT must have i32 operands");
1952 return DAG.getNode(NVPTXISD::PRMT, DL, MVT::i32,
1953 {A, B, Selector, DAG.getConstant(Mode, DL, MVT::i32)});
1954}
1955
1956 static SDValue getPRMT(SDValue A, SDValue B, uint64_t Selector, SDLoc DL,
1957 SelectionDAG &DAG,
1958 unsigned Mode = NVPTX::PTXPrmtMode::NONE) {
1959 return getPRMT(A, B, DAG.getConstant(Selector, DL, MVT::i32), DL, DAG, Mode);
1960}
1961
1962 /// Reduces the elements using the scalar operations provided. The operations
1963 /// are sorted in descending order of the number of inputs they take. The flags
1964 /// on the original reduction operation are propagated to each scalar operation.
1965 /// Nearby elements are grouped in a tree reduction, unlike the shuffle reduction
1966 /// used in ExpandReductions and SelectionDAG.
1967 static SDValue buildTreeReduction(
1968 const SmallVector<SDValue> &Elements, EVT EltTy,
1969 ArrayRef<std::pair<unsigned /*NodeType*/, unsigned /*NumInputs*/>> Ops,
1970 const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG) {
1971 // Build the reduction tree at each level, starting with all the elements.
1972 SmallVector<SDValue> Level = Elements;
1973
1974 unsigned OpIdx = 0;
1975 while (Level.size() > 1) {
1976 // Try to reduce this level using the current operator.
1977 const auto [Op, NumInputs] = Ops[OpIdx];
1978
1979 // Build the next level by partially reducing all elements.
1980 SmallVector<SDValue> ReducedLevel;
1981 unsigned I = 0, E = Level.size();
1982 for (; I + NumInputs <= E; I += NumInputs) {
1983 // Reduce elements in groups of [NumInputs], as much as possible.
1984 ReducedLevel.push_back(DAG.getNode(
1985 Op, DL, EltTy, ArrayRef<SDValue>(Level).slice(I, NumInputs), Flags));
1986 }
1987
1988 if (I < E) {
1989 // Handle leftover elements.
1990
1991 if (ReducedLevel.empty()) {
1992 // We didn't reduce anything at this level. We need to pick a smaller
1993 // operator.
1994 ++OpIdx;
1995 assert(OpIdx < Ops.size() && "no smaller operators for reduction");
1996 continue;
1997 }
1998
1999 // We reduced some things but there's still more left, meaning the
2000 // operator's number of inputs doesn't evenly divide this level size. Move
2001 // these elements to the next level.
2002 for (; I < E; ++I)
2003 ReducedLevel.push_back(Level[I]);
2004 }
2005
2006 // Process the next level.
2007 Level = ReducedLevel;
2008 }
2009
2010 return *Level.begin();
2011}
2012
2013// Get scalar reduction opcode
2014static ISD::NodeType getScalarOpcodeForReduction(unsigned ReductionOpcode) {
2015 switch (ReductionOpcode) {
2016 case ISD::VECREDUCE_FMAX:
2017 return ISD::FMAXNUM;
2018 case ISD::VECREDUCE_FMIN:
2019 return ISD::FMINNUM;
2020 case ISD::VECREDUCE_FMAXIMUM:
2021 return ISD::FMAXIMUM;
2022 case ISD::VECREDUCE_FMINIMUM:
2023 return ISD::FMINIMUM;
2024 default:
2025 llvm_unreachable("unhandled reduction opcode");
2026 }
2027}
2028
2029/// Get 3-input scalar reduction opcode
2030static std::optional<NVPTXISD::NodeType>
2031getScalar3OpcodeForReduction(unsigned ReductionOpcode) {
2032 switch (ReductionOpcode) {
2033 case ISD::VECREDUCE_FMAX:
2034 return NVPTXISD::FMAXNUM3;
2035 case ISD::VECREDUCE_FMIN:
2036 return NVPTXISD::FMINNUM3;
2037 case ISD::VECREDUCE_FMAXIMUM:
2038 return NVPTXISD::FMAXIMUM3;
2039 case ISD::VECREDUCE_FMINIMUM:
2040 return NVPTXISD::FMINIMUM3;
2041 default:
2042 return std::nullopt;
2043 }
2044}
2045
2046/// Lower reductions to either a sequence of operations or a tree if
2047/// reassociations are allowed. This method will use larger operations like
2048/// max3/min3 when the target supports them.
2049SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
2050 SelectionDAG &DAG) const {
2051 SDLoc DL(Op);
2052 const SDNodeFlags Flags = Op->getFlags();
2053 SDValue Vector = Op.getOperand(0);
2054
2055 const unsigned Opcode = Op->getOpcode();
2056 const EVT EltTy = Vector.getValueType().getVectorElementType();
2057
2058 // Whether we can use 3-input min/max when expanding the reduction.
2059 const bool CanUseMinMax3 =
2060 EltTy == MVT::f32 && STI.getSmVersion() >= 100 &&
2061 STI.getPTXVersion() >= 88 &&
2062 (Opcode == ISD::VECREDUCE_FMAX || Opcode == ISD::VECREDUCE_FMIN ||
2063 Opcode == ISD::VECREDUCE_FMAXIMUM || Opcode == ISD::VECREDUCE_FMINIMUM);
2064
2065 // A list of SDNode opcodes with equivalent semantics, sorted descending by
2066 // number of inputs they take.
2067 SmallVector<std::pair<unsigned /*Op*/, unsigned /*NumIn*/>, 2> ScalarOps;
2068
2069 if (auto Opcode3Elem = getScalar3OpcodeForReduction(Opcode);
2070 CanUseMinMax3 && Opcode3Elem)
2071 ScalarOps.push_back({*Opcode3Elem, 3});
2072 ScalarOps.push_back({getScalarOpcodeForReduction(Opcode), 2});
2073
2075 DAG.ExtractVectorElements(Vector, Elements);
2076
2077 return buildTreeReduction(Elements, EltTy, ScalarOps, DL, Flags, DAG);
2078}
2079
2080SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
2081 // Handle bitcasting from v2i8 without hitting the default promotion
2082 // strategy which goes through stack memory.
2083 EVT FromVT = Op->getOperand(0)->getValueType(0);
2084 if (FromVT != MVT::v2i8) {
2085 return Op;
2086 }
2087
2088 // Pack vector elements into i16 and bitcast to final type
2089 SDLoc DL(Op);
2090 SDValue Vec0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
2091 Op->getOperand(0), DAG.getIntPtrConstant(0, DL));
2092 SDValue Vec1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
2093 Op->getOperand(0), DAG.getIntPtrConstant(1, DL));
2094 SDValue Extend0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec0);
2095 SDValue Extend1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec1);
2096 SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
2097 SDValue AsInt = DAG.getNode(
2098 ISD::OR, DL, MVT::i16,
2099 {Extend0, DAG.getNode(ISD::SHL, DL, MVT::i16, {Extend1, Const8})});
2100 EVT ToVT = Op->getValueType(0);
2101 return DAG.getBitcast(ToVT, AsInt);
2102}
2103
2104// We can init constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it
2105 // would get lowered as two constant loads and a vector-packing move.
2106// Instead we want just a constant move:
2107// mov.b32 %r2, 0x40003C00
2108SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
2109 SelectionDAG &DAG) const {
2110 EVT VT = Op->getValueType(0);
2111 if (!(NVPTX::isPackedVectorTy(VT) && VT.is32BitVector()))
2112 return Op;
2113 SDLoc DL(Op);
2114
2115 if (!llvm::all_of(Op->ops(), [](SDValue Operand) {
2116 return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
2117 isa<ConstantFPSDNode>(Operand);
2118 })) {
2119 if (VT != MVT::v4i8)
2120 return Op;
2121 // Lower a non-constant v4i8 vector as a byte-wise constructed i32, which
2122 // lets us optimize the calculation of any constant parts.
2123 auto GetPRMT = [&](const SDValue Left, const SDValue Right, bool Cast,
2124 uint64_t SelectionValue) -> SDValue {
2125 SDValue L = Left;
2126 SDValue R = Right;
2127 if (Cast) {
2128 L = DAG.getAnyExtOrTrunc(L, DL, MVT::i32);
2129 R = DAG.getAnyExtOrTrunc(R, DL, MVT::i32);
2130 }
2131 return getPRMT(L, R, SelectionValue, DL, DAG);
2132 };
2133 auto PRMT__10 = GetPRMT(Op->getOperand(0), Op->getOperand(1), true, 0x3340);
2134 auto PRMT__32 = GetPRMT(Op->getOperand(2), Op->getOperand(3), true, 0x3340);
2135 auto PRMT3210 = GetPRMT(PRMT__10, PRMT__32, false, 0x5410);
2136 return DAG.getBitcast(VT, PRMT3210);
2137 }
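// Note (illustrative, assuming standard prmt.b32 semantics, where each selector
// nibble picks one byte from the 8-byte pool {B,A}): selector 0x3340 packs the
// low bytes of the two operands into byte lanes 0 and 1, and selector 0x5410
// then combines lanes 0-1 of the two intermediate PRMTs into the final
// {op3, op2, op1, op0} byte layout.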
2138
2139 // Get value or the Nth operand as an APInt(32). Undef values treated as 0.
2140 auto GetOperand = [](SDValue Op, int N) -> APInt {
2141 const SDValue &Operand = Op->getOperand(N);
2142 EVT VT = Op->getValueType(0);
2143 if (Operand->isUndef())
2144 return APInt(32, 0);
2145 APInt Value;
2146 if (VT == MVT::v2f16 || VT == MVT::v2bf16)
2147 Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();
2148 else if (VT == MVT::v2i16 || VT == MVT::v4i8)
2149 Value = Operand->getAsAPIntVal();
2150 else
2151 llvm_unreachable("Unsupported type");
2152 // i8 values are carried around as i16, so we need to zero out the upper bits
2153 // so they do not get in the way when combining the individual byte values.
2154 if (VT == MVT::v4i8)
2155 Value = Value.trunc(8);
2156 return Value.zext(32);
2157 };
2158
2159 // Construct a 32-bit constant by shifting into place smaller values
2160 // (elements of the vector type VT).
2161 // For example, if VT has 2 elements, then N == 2:
2162 // ShiftAmount = 32 / N = 16
2163 // Value |= Op0 (b16) << 0
2164 // Value |= Op1 (b16) << 16
2165 // If N == 4:
2166 // ShiftAmount = 32 / N = 8
2167 // Value |= Op0 (b8) << 0
2168 // Value |= Op1 (b8) << 8
2169 // Value |= Op2 (b8) << 16
2170 // Value |= Op3 (b8) << 24
2171 // ...etc
2172 APInt Value(32, 0);
2173 const unsigned NumElements = VT.getVectorNumElements();
2174 assert(32 % NumElements == 0 && "must evenly divide bit length");
2175 const unsigned ShiftAmount = 32 / NumElements;
2176 for (unsigned ElementNo : seq(NumElements))
2177 Value |= GetOperand(Op, ElementNo).shl(ElementNo * ShiftAmount);
2178 SDValue Const = DAG.getConstant(Value, DL, MVT::i32);
2179 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), Const);
2180}
2181
2182SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
2183 SelectionDAG &DAG) const {
2184 SDValue Index = Op->getOperand(1);
2185 SDValue Vector = Op->getOperand(0);
2186 SDLoc DL(Op);
2187 EVT VectorVT = Vector.getValueType();
2188
2189 if (VectorVT == MVT::v4i8) {
2190 SDValue Selector = DAG.getNode(ISD::OR, DL, MVT::i32,
2191 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2192 DAG.getConstant(0x7770, DL, MVT::i32));
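// Illustrative note (assuming standard prmt.b32 semantics): OR-ing the index
// into selector 0x7770 picks the indexed byte of the vector into lane 0, while
// lanes 1-3 select byte 7, i.e. the top byte of the zero operand, so the
// result is the selected byte zero-extended to 32 bits.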
2193 SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, Vector),
2194 DAG.getConstant(0, DL, MVT::i32), Selector, DL, DAG);
2195 SDValue Ext = DAG.getAnyExtOrTrunc(PRMT, DL, Op->getValueType(0));
2196 SDNodeFlags Flags;
2197 Flags.setNoSignedWrap(Ext.getScalarValueSizeInBits() > 8);
2198 Flags.setNoUnsignedWrap(Ext.getScalarValueSizeInBits() >= 8);
2199 Ext->setFlags(Flags);
2200 return Ext;
2201 }
2202
2203 // Constant index will be matched by tablegen.
2204 if (isa<ConstantSDNode>(Index.getNode()))
2205 return Op;
2206
2207 // Extract individual elements and select one of them.
2208 assert(NVPTX::isPackedVectorTy(VectorVT) &&
2209 VectorVT.getVectorNumElements() == 2 && "Unexpected vector type.");
2210 EVT EltVT = VectorVT.getVectorElementType();
2211
2212 SDLoc dl(Op.getNode());
2213 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2214 DAG.getIntPtrConstant(0, dl));
2215 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2216 DAG.getIntPtrConstant(1, dl));
2217 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
2219}
2220
2221SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
2222 SelectionDAG &DAG) const {
2223 SDValue Vector = Op->getOperand(0);
2224 EVT VectorVT = Vector.getValueType();
2225
2226 if (VectorVT != MVT::v4i8)
2227 return Op;
2228 SDLoc DL(Op);
2229 SDValue Value = Op->getOperand(1);
2230 if (Value->isUndef())
2231 return Vector;
2232
2233 SDValue Index = Op->getOperand(2);
2234
2235 SDValue BFI =
2236 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2237 {DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector,
2238 DAG.getNode(ISD::MUL, DL, MVT::i32,
2239 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2240 DAG.getConstant(8, DL, MVT::i32)),
2241 DAG.getConstant(8, DL, MVT::i32)});
2242 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI);
2243}
2244
2245SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
2246 SelectionDAG &DAG) const {
2247 SDValue V1 = Op.getOperand(0);
2248 EVT VectorVT = V1.getValueType();
2249 if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8)
2250 return Op;
2251
2252 // Lower shuffle to PRMT instruction.
2253 const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
2254 SDValue V2 = Op.getOperand(1);
2255 uint32_t Selector = 0;
2256 for (auto I : llvm::enumerate(SVN->getMask())) {
2257 if (I.value() != -1) // -1 is a placeholder for undef.
2258 Selector |= (I.value() << (I.index() * 4));
2259 }
2260
2261 SDLoc DL(Op);
2262 SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, V1),
2263 DAG.getBitcast(MVT::i32, V2), Selector, DL, DAG);
2264 return DAG.getBitcast(Op.getValueType(), PRMT);
2265}
2266 /// LowerShiftRightParts - Lower SRL_PARTS and SRA_PARTS, which
2267 /// 1) return two i32 values and take a 2 x i32 value to shift plus a shift
2268 /// amount, or
2269 /// 2) return two i64 values and take a 2 x i64 value to shift plus a shift
2270 /// amount.
2271SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
2272 SelectionDAG &DAG) const {
2273 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2274 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
2275
2276 EVT VT = Op.getValueType();
2277 unsigned VTBits = VT.getSizeInBits();
2278 SDLoc dl(Op);
2279 SDValue ShOpLo = Op.getOperand(0);
2280 SDValue ShOpHi = Op.getOperand(1);
2281 SDValue ShAmt = Op.getOperand(2);
2282 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
2283
2284 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2285 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2286 // {dHi, dLo} = {aHi, aLo} >> Amt
2287 // dHi = aHi >> Amt
2288 // dLo = shf.r.clamp aLo, aHi, Amt
2289
2290 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2291 SDValue Lo =
2292 DAG.getNode(NVPTXISD::FSHR_CLAMP, dl, VT, ShOpHi, ShOpLo, ShAmt);
2293
2294 SDValue Ops[2] = { Lo, Hi };
2295 return DAG.getMergeValues(Ops, dl);
2296 }
2297 else {
2298 // {dHi, dLo} = {aHi, aLo} >> Amt
2299 // - if (Amt>=size) then
2300 // dLo = aHi >> (Amt-size)
2301 // dHi = aHi >> Amt (this is either all 0 or all 1)
2302 // else
2303 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
2304 // dHi = aHi >> Amt
2305
2306 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2307 DAG.getConstant(VTBits, dl, MVT::i32),
2308 ShAmt);
2309 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
2310 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2311 DAG.getConstant(VTBits, dl, MVT::i32));
2312 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
2313 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2314 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
2315
2316 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2317 DAG.getConstant(VTBits, dl, MVT::i32),
2318 ISD::SETGE);
2319 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2320 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2321
2322 SDValue Ops[2] = { Lo, Hi };
2323 return DAG.getMergeValues(Ops, dl);
2324 }
2325}
2326
2327 /// LowerShiftLeftParts - Lower SHL_PARTS, which
2328 /// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
2329 /// amount, or
2330 /// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
2331 /// amount.
2332SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2333 SelectionDAG &DAG) const {
2334 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2335 assert(Op.getOpcode() == ISD::SHL_PARTS);
2336
2337 EVT VT = Op.getValueType();
2338 unsigned VTBits = VT.getSizeInBits();
2339 SDLoc dl(Op);
2340 SDValue ShOpLo = Op.getOperand(0);
2341 SDValue ShOpHi = Op.getOperand(1);
2342 SDValue ShAmt = Op.getOperand(2);
2343
2344 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2345 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2346 // {dHi, dLo} = {aHi, aLo} << Amt
2347 // dHi = shf.l.clamp aLo, aHi, Amt
2348 // dLo = aLo << Amt
2349
2350 SDValue Hi =
2351 DAG.getNode(NVPTXISD::FSHL_CLAMP, dl, VT, ShOpHi, ShOpLo, ShAmt);
2352 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2353
2354 SDValue Ops[2] = { Lo, Hi };
2355 return DAG.getMergeValues(Ops, dl);
2356 }
2357 else {
2358 // {dHi, dLo} = {aHi, aLo} << Amt
2359 // - if (Amt>=size) then
2360 // dLo = aLo << Amt (all 0)
2361 // dHi = aLo << (Amt-size)
2362 // else
2363 // dLo = aLo << Amt
2364 // dHi = (aHi << Amt) | (aLo >> (size-Amt))
2365
2366 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2367 DAG.getConstant(VTBits, dl, MVT::i32),
2368 ShAmt);
2369 SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2370 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2371 DAG.getConstant(VTBits, dl, MVT::i32));
2372 SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2373 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2374 SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2375
2376 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2377 DAG.getConstant(VTBits, dl, MVT::i32),
2378 ISD::SETGE);
2379 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2380 SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2381
2382 SDValue Ops[2] = { Lo, Hi };
2383 return DAG.getMergeValues(Ops, dl);
2384 }
2385}
2386
2387 /// If the types match, convert the generic copysign to the NVPTXISD version;
2388 /// otherwise bail, ensuring that mismatched cases are properly expanded.
2389SDValue NVPTXTargetLowering::LowerFCOPYSIGN(SDValue Op,
2390 SelectionDAG &DAG) const {
2391 EVT VT = Op.getValueType();
2392 SDLoc DL(Op);
2393
2394 SDValue In1 = Op.getOperand(0);
2395 SDValue In2 = Op.getOperand(1);
2396 EVT SrcVT = In2.getValueType();
2397
2398 if (!SrcVT.bitsEq(VT))
2399 return SDValue();
2400
2401 return DAG.getNode(NVPTXISD::FCOPYSIGN, DL, VT, In1, In2);
2402}
2403
2404SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2405 EVT VT = Op.getValueType();
2406
2407 if (VT == MVT::f32)
2408 return LowerFROUND32(Op, DAG);
2409
2410 if (VT == MVT::f64)
2411 return LowerFROUND64(Op, DAG);
2412
2413 llvm_unreachable("unhandled type");
2414}
2415
2416 // This is the rounding method used in CUDA libdevice, in C-like code:
2417// float roundf(float A)
2418// {
2419// float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2420// RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2421// return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2422// }
2423SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2424 SelectionDAG &DAG) const {
2425 SDLoc SL(Op);
2426 SDValue A = Op.getOperand(0);
2427 EVT VT = Op.getValueType();
2428
2429 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2430
2431 // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2432 SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2433 const unsigned SignBitMask = 0x80000000;
2434 SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2435 DAG.getConstant(SignBitMask, SL, MVT::i32));
2436 const unsigned PointFiveInBits = 0x3F000000;
2437 SDValue PointFiveWithSignRaw =
2438 DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2439 DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2440 SDValue PointFiveWithSign =
2441 DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2442 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2443 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2444
2445 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2446 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2447 SDValue IsLarge =
2448 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2449 ISD::SETOGT);
2450 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2451
2452 // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2453 SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
2454 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2455 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
2456 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
2457}
2458
2459// The implementation of round(double) is similar to that of round(float) in
2460// that they both separate the value range into three regions and use a method
2461// specific to the region to round the values. However, round(double) first
2462// calculates the round of the absolute value and then adds the sign back while
2463// round(float) directly rounds the value with sign.
2464SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
2465 SelectionDAG &DAG) const {
2466 SDLoc SL(Op);
2467 SDValue A = Op.getOperand(0);
2468 EVT VT = Op.getValueType();
2469
2470 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2471
2472 // double RoundedA = (double) (int) (abs(A) + 0.5f);
2473 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
2474 DAG.getConstantFP(0.5, SL, VT));
2475 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2476
2477 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2478 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2479 SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
2480 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2481 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
2482 DAG.getConstantFP(0, SL, VT),
2483 RoundedA);
2484
2485 // Add sign to rounded_A
2486 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
2487 DAG.getNode(ISD::FTRUNC, SL, VT, A);
2488
2489 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2490 SDValue IsLarge =
2491 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
2492 ISD::SETOGT);
2493 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2494}
2495
2496 static SDValue PromoteBinOpToF32(SDNode *N, SelectionDAG &DAG) {
2497 EVT VT = N->getValueType(0);
2498 EVT NVT = MVT::f32;
2499 if (VT.isVector()) {
2500 NVT = EVT::getVectorVT(*DAG.getContext(), NVT, VT.getVectorElementCount());
2501 }
2502 SDLoc DL(N);
2503 SDValue Tmp0 = DAG.getFPExtendOrRound(N->getOperand(0), DL, NVT);
2504 SDValue Tmp1 = DAG.getFPExtendOrRound(N->getOperand(1), DL, NVT);
2505 SDValue Res = DAG.getNode(N->getOpcode(), DL, NVT, Tmp0, Tmp1, N->getFlags());
2506 return DAG.getFPExtendOrRound(Res, DL, VT);
2507}
2508
2509SDValue NVPTXTargetLowering::PromoteBinOpIfF32FTZ(SDValue Op,
2510 SelectionDAG &DAG) const {
2511 if (useF32FTZ(DAG.getMachineFunction())) {
2512 return PromoteBinOpToF32(Op.getNode(), DAG);
2513 }
2514 return Op;
2515}
2516
2517SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op,
2518 SelectionDAG &DAG) const {
2519 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2520
2521 if (Op.getValueType() == MVT::bf16) {
2522 SDLoc Loc(Op);
2523 return DAG.getNode(
2524 ISD::FP_ROUND, Loc, MVT::bf16,
2525 DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)),
2526 DAG.getIntPtrConstant(0, Loc, /*isTarget=*/true));
2527 }
2528
2529 // Everything else is considered legal.
2530 return Op;
2531}
2532
2533SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op,
2534 SelectionDAG &DAG) const {
2535 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2536
2537 if (Op.getOperand(0).getValueType() == MVT::bf16) {
2538 SDLoc Loc(Op);
2539 return DAG.getNode(
2540 Op.getOpcode(), Loc, Op.getValueType(),
2541 DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0)));
2542 }
2543
2544 // Everything else is considered legal.
2545 return Op;
2546}
2547
2548SDValue NVPTXTargetLowering::LowerFP_ROUND(SDValue Op,
2549 SelectionDAG &DAG) const {
2550 EVT NarrowVT = Op.getValueType();
2551 SDValue Wide = Op.getOperand(0);
2552 EVT WideVT = Wide.getValueType();
2553 if (NarrowVT.getScalarType() == MVT::bf16) {
2554 const TargetLowering *TLI = STI.getTargetLowering();
2555 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 70) {
2556 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2557 }
2558 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
2559 // This combination was the first to support f32 -> bf16.
2560 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70) {
2561 if (WideVT.getScalarType() == MVT::f32) {
2562 return Op;
2563 }
2564 if (WideVT.getScalarType() == MVT::f64) {
2565 SDLoc Loc(Op);
2566 // Round-inexact-to-odd f64 to f32, then do the final rounding using
2567 // the hardware f32 -> bf16 instruction.
2568 SDValue rod = TLI->expandRoundInexactToOdd(
2569 WideVT.isVector() ? WideVT.changeVectorElementType(MVT::f32)
2570 : MVT::f32,
2571 Wide, Loc, DAG);
2572 return DAG.getFPExtendOrRound(rod, Loc, NarrowVT);
2573 }
2574 }
2575 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2576 }
2577 }
2578
2579 // Everything else is considered legal.
2580 return Op;
2581}
2582
2583SDValue NVPTXTargetLowering::LowerFP_EXTEND(SDValue Op,
2584 SelectionDAG &DAG) const {
2585 SDValue Narrow = Op.getOperand(0);
2586 EVT NarrowVT = Narrow.getValueType();
2587 EVT WideVT = Op.getValueType();
2588 if (NarrowVT.getScalarType() == MVT::bf16) {
2589 if (WideVT.getScalarType() == MVT::f32 &&
2590 (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71)) {
2591 SDLoc Loc(Op);
2592 return DAG.getNode(ISD::BF16_TO_FP, Loc, WideVT, Narrow);
2593 }
2594 if (WideVT.getScalarType() == MVT::f64 &&
2595 (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) {
2596 EVT F32 = NarrowVT.isVector() ? NarrowVT.changeVectorElementType(MVT::f32)
2597 : MVT::f32;
2598 SDLoc Loc(Op);
2599 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 71) {
2600 Op = DAG.getNode(ISD::FP_EXTEND, Loc, F32, Narrow);
2601 } else {
2602 Op = DAG.getNode(ISD::BF16_TO_FP, Loc, F32, Narrow);
2603 }
2604 return DAG.getNode(ISD::FP_EXTEND, Loc, WideVT, Op);
2605 }
2606 }
2607
2608 // Everything else is considered legal.
2609 return Op;
2610}
2611
2612 static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) {
2613 SDLoc DL(Op);
2614 if (Op.getValueType() != MVT::v2i16)
2615 return Op;
2616 EVT EltVT = Op.getValueType().getVectorElementType();
2617 SmallVector<SDValue> VecElements;
2618 for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) {
2619 SmallVector<SDValue> ScalarArgs;
2620 llvm::transform(Op->ops(), std::back_inserter(ScalarArgs),
2621 [&](const SDUse &O) {
2622 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
2623 O.get(), DAG.getIntPtrConstant(I, DL));
2624 });
2625 VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs));
2626 }
2627 SDValue V =
2628 DAG.getNode(ISD::BUILD_VECTOR, DL, Op.getValueType(), VecElements);
2629 return V;
2630}
2631
2632 static SDValue lowerTcgen05St(SDValue Op, SelectionDAG &DAG) {
2633 SDNode *N = Op.getNode();
2634 SDLoc DL(N);
2635 SmallVector<SDValue, 8> Ops;
2636
2637 // split the vector argument
2638 for (size_t I = 0; I < N->getNumOperands(); I++) {
2639 SDValue Val = N->getOperand(I);
2640 EVT ValVT = Val.getValueType();
2641 if (ValVT.isVector()) {
2642 EVT EltVT = ValVT.getVectorElementType();
2643 for (unsigned J = 0, NElts = ValVT.getVectorNumElements(); J < NElts; J++)
2644 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2645 DAG.getIntPtrConstant(J, DL)));
2646 } else
2647 Ops.push_back(Val);
2648 }
2649
2650 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
2651 SDValue Tcgen05StNode =
2652 DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, N->getVTList(), Ops,
2653 MemSD->getMemoryVT(), MemSD->getMemOperand());
2654
2655 return Tcgen05StNode;
2656}
2657
2658static unsigned getTcgen05MMADisableOutputLane(unsigned IID) {
2659 switch (IID) {
2660 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
2661 return NVPTXISD::TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG1;
2662 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2:
2663 return NVPTXISD::TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG2;
2664 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1:
2665 return NVPTXISD::TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1;
2666 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2:
2667 return NVPTXISD::TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2;
2668 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1:
2669 return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1;
2670 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2:
2671 return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2;
2672 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1:
2673 return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1;
2674 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2:
2675 return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2;
2676 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift:
2677 return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT;
2678 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift:
2679 return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT;
2680 case Intrinsic::
2681 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift:
2682 return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT;
2683 case Intrinsic::
2684 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift:
2685 return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT;
2686 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1:
2687 return NVPTXISD::TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG1;
2688 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2:
2689 return NVPTXISD::TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG2;
2690 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1:
2691 return NVPTXISD::TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1;
2692 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2:
2693 return NVPTXISD::TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2;
2694 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1:
2695 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1;
2696 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2:
2697 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2;
2698 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift:
2699 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT;
2700 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift:
2701 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT;
2702 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1:
2703 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1;
2704 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2:
2705 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2;
2706 case Intrinsic::
2707 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift:
2708 return NVPTXISD::
2709 TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT;
2710 case Intrinsic::
2711 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift:
2712 return NVPTXISD::
2713 TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT;
2714 };
2715 llvm_unreachable("unhandled tcgen05.mma.disable_output_lane intrinsic");
2716}
2717
2718 static SDValue LowerTcgen05MMADisableOutputLane(SDValue Op, SelectionDAG &DAG) {
2719 SDNode *N = Op.getNode();
2720 SDLoc DL(N);
2721 unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2722
2724 // split the vector argument
2725 for (size_t I = 0; I < N->getNumOperands(); I++) {
2726 if (I == 1)
2727 continue; // skip IID
2728 SDValue Val = N->getOperand(I);
2729 EVT ValVT = Val.getValueType();
2730 if (ValVT.isVector()) {
2731 EVT EltVT = ValVT.getVectorElementType();
2732 for (unsigned J = 0, NElts = ValVT.getVectorNumElements(); J < NElts; J++)
2733 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2734 DAG.getIntPtrConstant(J, DL)));
2735 } else
2736 Ops.push_back(Val);
2737 }
2738
2739 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
2740 SDValue Tcgen05MMANode = DAG.getMemIntrinsicNode(
2741 getTcgen05MMADisableOutputLane(IID), DL, N->getVTList(), Ops,
2742 MemSD->getMemoryVT(), MemSD->getMemOperand());
2743
2744 return Tcgen05MMANode;
2745}
2746
2747// Lower vector return type of tcgen05.ld intrinsics
2748static std::optional<std::pair<SDValue, SDValue>>
2749lowerTcgen05Ld(SDNode *N, SelectionDAG &DAG, bool HasOffset = false) {
2750 SDLoc DL(N);
2751 EVT ResVT = N->getValueType(0);
2752 if (!ResVT.isVector())
2753 return {}; // already legalized.
2754
2755 const unsigned NumElts = ResVT.getVectorNumElements();
2756
2757 // Create the return type of the instructions
2758 SmallVector<EVT, 5> ListVTs;
2759 for (unsigned i = 0; i < NumElts; ++i)
2760 ListVTs.push_back(MVT::i32);
2761
2762 ListVTs.push_back(N->getValueType(1)); // Chain
2763
2764 SDVTList ResVTs = DAG.getVTList(ListVTs);
2765
2766 SmallVector<SDValue, 8> Ops{N->getOperand(0), N->getOperand(1),
2767 N->getOperand(2)};
2768
2769 if (HasOffset) {
2770 Ops.push_back(N->getOperand(3)); // offset
2771 Ops.push_back(N->getOperand(4)); // Pack flag
2772 } else
2773 Ops.push_back(N->getOperand(3)); // Pack flag
2774
2776 SDValue NewNode =
2778 MemSD->getMemoryVT(), MemSD->getMemOperand());
2779
2780 // split the vector result
2781 SmallVector<SDValue, 4> ScalarRes;
2782 for (unsigned i = 0; i < NumElts; ++i) {
2783 SDValue Res = NewNode.getValue(i);
2784 ScalarRes.push_back(Res);
2785 }
2786
2787 SDValue Chain = NewNode.getValue(NumElts);
2788 SDValue BuildVector = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
2789 return {{BuildVector, Chain}};
2790}
2791
2792 static SDValue lowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG) {
2793 SDNode *N = Op.getNode();
2794 SDValue Intrin = N->getOperand(1);
2795
2796 // Get the intrinsic ID
2797 unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
2798 switch (IntrinNo) {
2799 default:
2800 break;
2801 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
2802 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
2803 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
2804 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
2805 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
2806 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
2807 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
2808 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
2809 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
2810 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
2811 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
2812 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
2813 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
2814 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
2815 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
2816 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
2817 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
2818 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
2819 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
2820 case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
2821 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
2822 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
2823 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
2824 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
2825 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
2826 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
2827 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
2828 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128:
2829 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
2830 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
2831 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
2832 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
2833 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
2834 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
2835 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
2836 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
2837 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
2838 return lowerTcgen05St(Op, DAG);
2839 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
2840 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2:
2841 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1:
2842 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2:
2843 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1:
2844 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2:
2845 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1:
2846 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2:
2847 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1:
2848 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2:
2849 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1:
2850 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2:
2851 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1:
2852 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2:
2853 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1:
2854 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2:
2855 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift:
2856 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift:
2857 case Intrinsic::
2858 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift:
2859 case Intrinsic::
2860 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift:
2861 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift:
2862 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift:
2863 case Intrinsic::
2864 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift:
2865 case Intrinsic::
2866 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift:
2867 return LowerTcgen05MMADisableOutputLane(Op, DAG);
2868 }
2869 return Op;
2870}
2871
2872 static SDValue lowerClusterLaunchControlQueryCancel(SDValue Op,
2873 SelectionDAG &DAG) {
2874
2875 SDNode *N = Op.getNode();
2876 if (N->getOperand(1).getValueType() != MVT::i128) {
2877 // Return if the operand is already lowered.
2878 return SDValue();
2879 }
2880
2881 unsigned IID =
2882 cast<ConstantSDNode>(N->getOperand(0).getNode())->getZExtValue();
2883 auto Opcode = [&]() {
2884 switch (IID) {
2885 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled:
2886 return NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_IS_CANCELED;
2887 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_x:
2888 return NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_X;
2889 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y:
2890 return NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Y;
2891 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z:
2892 return NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Z;
2893 default:
2894 llvm_unreachable("unsupported/unhandled intrinsic");
2895 }
2896 }();
2897
2898 SDLoc DL(N);
2899 SDValue TryCancelResponse = N->getOperand(1);
2900 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TryCancelResponse);
2901 SDValue TryCancelResponse0 =
2902 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
2903 DAG.getIntPtrConstant(0, DL));
2904 SDValue TryCancelResponse1 =
2905 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
2906 DAG.getIntPtrConstant(1, DL));
2907
2908 return DAG.getNode(Opcode, DL, N->getVTList(),
2909 {TryCancelResponse0, TryCancelResponse1});
2910}
2911
2912 static SDValue lowerCvtRSIntrinsics(SDValue Op, SelectionDAG &DAG) {
2913 SDNode *N = Op.getNode();
2914 SDLoc DL(N);
2915 SDValue F32Vec = N->getOperand(1);
2916 SDValue RBits = N->getOperand(2);
2917
2918 unsigned IntrinsicID = N->getConstantOperandVal(0);
2919
2920 // Extract the 4 float elements from the vector
2921 SmallVector<SDValue, 6> Ops;
2922 for (unsigned i = 0; i < 4; ++i)
2923 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, F32Vec,
2924 DAG.getIntPtrConstant(i, DL)));
2925
2927
2928 auto [OpCode, RetTy, CvtModeFlag] =
2929 [&]() -> std::tuple<NVPTXISD::NodeType, MVT::SimpleValueType, uint32_t> {
2930 switch (IntrinsicID) {
2931 case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_relu_satfinite:
2932 return {NVPTXISD::CVT_E4M3X4_F32X4_RS_SF, MVT::v4i8,
2933 CvtMode::RS | CvtMode::RELU_FLAG};
2934 case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_satfinite:
2935 return {NVPTXISD::CVT_E4M3X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS};
2936 case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_relu_satfinite:
2937 return {NVPTXISD::CVT_E5M2X4_F32X4_RS_SF, MVT::v4i8,
2938 CvtMode::RS | CvtMode::RELU_FLAG};
2939 case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_satfinite:
2940 return {NVPTXISD::CVT_E5M2X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS};
2941 case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_relu_satfinite:
2942 return {NVPTXISD::CVT_E2M3X4_F32X4_RS_SF, MVT::v4i8,
2943 CvtMode::RS | CvtMode::RELU_FLAG};
2944 case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_satfinite:
2945 return {NVPTXISD::CVT_E2M3X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS};
2946 case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_relu_satfinite:
2947 return {NVPTXISD::CVT_E3M2X4_F32X4_RS_SF, MVT::v4i8,
2948 CvtMode::RS | CvtMode::RELU_FLAG};
2949 case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_satfinite:
2950 return {NVPTXISD::CVT_E3M2X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS};
2951 case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_relu_satfinite:
2952 return {NVPTXISD::CVT_E2M1X4_F32X4_RS_SF, MVT::i16,
2953 CvtMode::RS | CvtMode::RELU_FLAG};
2954 case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_satfinite:
2955 return {NVPTXISD::CVT_E2M1X4_F32X4_RS_SF, MVT::i16, CvtMode::RS};
2956 default:
2957 llvm_unreachable("unsupported/unhandled intrinsic");
2958 }
2959 }();
2960
2961 Ops.push_back(RBits);
2962 Ops.push_back(DAG.getConstant(CvtModeFlag, DL, MVT::i32));
2963
2964 return DAG.getNode(OpCode, DL, RetTy, Ops);
2965}
2966
2967 static SDValue lowerPrmtIntrinsic(SDValue Op, SelectionDAG &DAG) {
2968 const unsigned Mode = [&]() {
2969 switch (Op->getConstantOperandVal(0)) {
2970 case Intrinsic::nvvm_prmt:
2971 return NVPTX::PTXPrmtMode::NONE;
2972 case Intrinsic::nvvm_prmt_b4e:
2973 return NVPTX::PTXPrmtMode::B4E;
2974 case Intrinsic::nvvm_prmt_ecl:
2975 return NVPTX::PTXPrmtMode::ECL;
2976 case Intrinsic::nvvm_prmt_ecr:
2977 return NVPTX::PTXPrmtMode::ECR;
2978 case Intrinsic::nvvm_prmt_f4e:
2979 return NVPTX::PTXPrmtMode::F4E;
2980 case Intrinsic::nvvm_prmt_rc16:
2981 return NVPTX::PTXPrmtMode::RC16;
2982 case Intrinsic::nvvm_prmt_rc8:
2983 return NVPTX::PTXPrmtMode::RC8;
2984 default:
2985 llvm_unreachable("unsupported/unhandled intrinsic");
2986 }
2987 }();
2988 SDLoc DL(Op);
2989 SDValue A = Op->getOperand(1);
2990 SDValue B = Op.getNumOperands() == 4 ? Op.getOperand(2)
2991 : DAG.getConstant(0, DL, MVT::i32);
2992 SDValue Selector = (Op->op_end() - 1)->get();
2993 return getPRMT(A, B, Selector, DL, DAG, Mode);
2994}
2995 }
2996 static SDValue lowerIntrinsicWChain(SDValue Op, SelectionDAG &DAG) {
2997 switch (Op->getConstantOperandVal(1)) {
2998 default:
2999 return Op;
3000
3001 // These tcgen05 intrinsics return a v2i32, which is legal, so we have to
3002 // lower them through LowerOperation() instead of ReplaceNodeResults().
3003 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
3004 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
3005 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
3006 if (auto Res = lowerTcgen05Ld(Op.getNode(), DAG))
3007 return DAG.getMergeValues({Res->first, Res->second}, SDLoc(Op));
3008 return SDValue();
3009
3010 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
3011 if (auto Res = lowerTcgen05Ld(Op.getNode(), DAG, /*HasOffset=*/true))
3012 return DAG.getMergeValues({Res->first, Res->second}, SDLoc(Op));
3013 return SDValue();
3014 }
3015}
3016
3017 static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG) {
3018 switch (Op->getConstantOperandVal(0)) {
3019 default:
3020 return Op;
3021 case Intrinsic::nvvm_prmt:
3022 case Intrinsic::nvvm_prmt_b4e:
3023 case Intrinsic::nvvm_prmt_ecl:
3024 case Intrinsic::nvvm_prmt_ecr:
3025 case Intrinsic::nvvm_prmt_f4e:
3026 case Intrinsic::nvvm_prmt_rc16:
3027 case Intrinsic::nvvm_prmt_rc8:
3028 return lowerPrmtIntrinsic(Op, DAG);
3029 case Intrinsic::nvvm_internal_addrspace_wrap:
3030 return Op.getOperand(1);
3031 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled:
3032 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_x:
3033 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y:
3034 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z:
3035 return lowerClusterLaunchControlQueryCancel(Op, DAG);
3036 case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_satfinite:
3037 case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_relu_satfinite:
3038 case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_satfinite:
3039 case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_relu_satfinite:
3040 case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_satfinite:
3041 case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_relu_satfinite:
3042 case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_satfinite:
3043 case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_relu_satfinite:
3044 case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_satfinite:
3045 case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_relu_satfinite:
3046 return lowerCvtRSIntrinsics(Op, DAG);
3047 }
3048}
3049
3050// In PTX 64-bit CTLZ and CTPOP are supported, but they return a 32-bit value.
3051// Lower these into a node returning the correct type which is zero-extended
3052// back to the correct size.
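// For reference (PTX ISA, illustrative): clz.b64 and popc.b64 both write a
// .u32 destination, hence the explicit zero-extension back to i64 below.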
3053 static SDValue lowerCTLZCTPOP(SDValue Op, SelectionDAG &DAG) {
3054 SDValue V = Op->getOperand(0);
3055 assert(V.getValueType() == MVT::i64 &&
3056 "Unexpected CTLZ/CTPOP type to legalize");
3057
3058 SDLoc DL(Op);
3059 SDValue CT = DAG.getNode(Op->getOpcode(), DL, MVT::i32, V);
3060 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CT, SDNodeFlags::NonNeg);
3061}
3062
3063 static SDValue expandFSH64(SDValue A, SDValue B, SDValue ShiftAmount, SDLoc DL,
3064 unsigned Opcode, SelectionDAG &DAG) {
3065 assert(A.getValueType() == MVT::i64 && B.getValueType() == MVT::i64);
3066
3067 const auto *AmtConst = dyn_cast<ConstantSDNode>(ShiftAmount);
3068 if (!AmtConst)
3069 return SDValue();
3070 const auto Amt = AmtConst->getZExtValue() & 63;
3071
3072 SDValue UnpackA =
3073 DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, A);
3074 SDValue UnpackB =
3075 DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, B);
3076
3077 // The arch is little endian: 0 = low bits, 1 = high bits.
3078 SDValue ALo = UnpackA.getValue(0);
3079 SDValue AHi = UnpackA.getValue(1);
3080 SDValue BLo = UnpackB.getValue(0);
3081 SDValue BHi = UnpackB.getValue(1);
3082
3083 // The bitfield consists of { AHi : ALo : BHi : BLo }
3084 //
3085 // * FSHL, Amt < 32 - The window will contain { AHi : ALo : BHi }
3086 // * FSHL, Amt >= 32 - The window will contain { ALo : BHi : BLo }
3087 // * FSHR, Amt < 32 - The window will contain { ALo : BHi : BLo }
3088 // * FSHR, Amt >= 32 - The window will contain { AHi : ALo : BHi }
3089 //
3090 // Note that Amt = 0 and Amt = 32 are special cases where 32-bit funnel shifts
3091 // are not needed at all. Amt = 0 is a no-op producing either A or B depending
3092 // on the direction. Amt = 32 can be implemented by a packing and unpacking
3093 // move to select and arrange the 32bit values. For simplicity, these cases
3094 // are not handled here explicitly and instead we rely on DAGCombiner to
3095 // remove the no-op funnel shifts we insert.
3096 auto [High, Mid, Low] = ((Opcode == ISD::FSHL) == (Amt < 32))
3097 ? std::make_tuple(AHi, ALo, BHi)
3098 : std::make_tuple(ALo, BHi, BLo);
3099
3100 SDValue NewAmt = DAG.getConstant(Amt & 31, DL, MVT::i32);
3101 SDValue RHi = DAG.getNode(Opcode, DL, MVT::i32, {High, Mid, NewAmt});
3102 SDValue RLo = DAG.getNode(Opcode, DL, MVT::i32, {Mid, Low, NewAmt});
3103
3104 return DAG.getNode(NVPTXISD::BUILD_VECTOR, DL, MVT::i64, {RLo, RHi});
3105}
3106
3107 static SDValue lowerFSH(SDValue Op, SelectionDAG &DAG) {
3108 return expandFSH64(Op->getOperand(0), Op->getOperand(1), Op->getOperand(2),
3109 SDLoc(Op), Op->getOpcode(), DAG);
3110}
3111
3112 static SDValue lowerROT(SDValue Op, SelectionDAG &DAG) {
3113 unsigned Opcode = Op->getOpcode() == ISD::ROTL ? ISD::FSHL : ISD::FSHR;
3114 return expandFSH64(Op->getOperand(0), Op->getOperand(0), Op->getOperand(1),
3115 SDLoc(Op), Opcode, DAG);
3116}
3117
3118 static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG) {
3119 // Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
3120 // i.e. "poor man's fmod()". When y is infinite, x is returned. This matches
3121 // the semantics of LLVM's frem.
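// Worked example (illustrative): frem(5.5, 2.0) = 5.5 - ftrunc(5.5 / 2.0) * 2.0
//                                               = 5.5 - 2.0 * 2.0 = 1.5.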
3122 SDLoc DL(Op);
3123 SDValue X = Op->getOperand(0);
3124 SDValue Y = Op->getOperand(1);
3125 EVT Ty = Op.getValueType();
3126 SDNodeFlags Flags = Op->getFlags();
3127
3128 SDValue Div = DAG.getNode(ISD::FDIV, DL, Ty, X, Y, Flags);
3129 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, Ty, Div, Flags);
3130 SDValue Mul = DAG.getNode(ISD::FMUL, DL, Ty, Trunc, Y,
3131 Flags | SDNodeFlags::AllowContract);
3132 SDValue Sub = DAG.getNode(ISD::FSUB, DL, Ty, X, Mul,
3133 Flags | SDNodeFlags::AllowContract);
3134
3135 if (Flags.hasNoInfs())
3136 return Sub;
3137
3138 // If Y is infinite, return X
3139 SDValue AbsY = DAG.getNode(ISD::FABS, DL, Ty, Y);
3140 SDValue Inf =
3141 DAG.getConstantFP(APFloat::getInf(Ty.getFltSemantics()), DL, Ty);
3142 SDValue IsInf = DAG.getSetCC(DL, MVT::i1, AbsY, Inf, ISD::SETEQ);
3143 return DAG.getSelect(DL, Ty, IsInf, X, Sub);
3144}
3145
3146 static SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) {
3147 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
3148
3149 SDValue Cond = Op->getOperand(0);
3150 SDValue TrueVal = Op->getOperand(1);
3151 SDValue FalseVal = Op->getOperand(2);
3152 SDLoc DL(Op);
3153
3154 // If both operands are truncated, we push the select through the truncates.
3155 if (TrueVal.getOpcode() == ISD::TRUNCATE &&
3156 FalseVal.getOpcode() == ISD::TRUNCATE) {
3157 TrueVal = TrueVal.getOperand(0);
3158 FalseVal = FalseVal.getOperand(0);
3159
3160 EVT VT = TrueVal.getSimpleValueType().bitsLE(FalseVal.getSimpleValueType())
3161 ? TrueVal.getValueType()
3162 : FalseVal.getValueType();
3163 TrueVal = DAG.getAnyExtOrTrunc(TrueVal, DL, VT);
3164 FalseVal = DAG.getAnyExtOrTrunc(FalseVal, DL, VT);
3165 SDValue Select = DAG.getSelect(DL, VT, Cond, TrueVal, FalseVal);
3166 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
3167 }
3168
3169 // Otherwise, expand the select into a series of logical operations. These
3170 // often can be folded into other operations either by us or ptxas.
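// That is, for i1 operands: select(c, t, f) == (c & t) | (~c & f), with t and
// f frozen first so no poison is propagated past the select.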
3171 TrueVal = DAG.getFreeze(TrueVal);
3172 FalseVal = DAG.getFreeze(FalseVal);
3173 SDValue And1 = DAG.getNode(ISD::AND, DL, MVT::i1, Cond, TrueVal);
3174 SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
3175 SDValue And2 = DAG.getNode(ISD::AND, DL, MVT::i1, NotCond, FalseVal);
3176 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i1, And1, And2);
3177 return Or;
3178}
3179
3180SDValue
3181 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
3182 switch (Op.getOpcode()) {
3183 case ISD::RETURNADDR:
3184 return SDValue();
3185 case ISD::FRAMEADDR:
3186 return SDValue();
3187 case ISD::ADDRSPACECAST:
3188 return LowerADDRSPACECAST(Op, DAG);
3189 case ISD::INTRINSIC_W_CHAIN:
3190 return lowerIntrinsicWChain(Op, DAG);
3191 case ISD::INTRINSIC_WO_CHAIN:
3192 return lowerIntrinsicWOChain(Op, DAG);
3193 case ISD::INTRINSIC_VOID:
3194 return lowerIntrinsicVoid(Op, DAG);
3195 case ISD::BUILD_VECTOR:
3196 return LowerBUILD_VECTOR(Op, DAG);
3197 case ISD::BITCAST:
3198 return LowerBITCAST(Op, DAG);
3199 case ISD::EXTRACT_SUBVECTOR:
3200 return Op;
3201 case ISD::EXTRACT_VECTOR_ELT:
3202 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
3203 case ISD::INSERT_VECTOR_ELT:
3204 return LowerINSERT_VECTOR_ELT(Op, DAG);
3205 case ISD::VECTOR_SHUFFLE:
3206 return LowerVECTOR_SHUFFLE(Op, DAG);
3207 case ISD::CONCAT_VECTORS:
3208 return LowerCONCAT_VECTORS(Op, DAG);
3209 case ISD::VECREDUCE_FMAX:
3210 case ISD::VECREDUCE_FMIN:
3211 case ISD::VECREDUCE_FMAXIMUM:
3212 case ISD::VECREDUCE_FMINIMUM:
3213 return LowerVECREDUCE(Op, DAG);
3214 case ISD::STORE:
3215 return LowerSTORE(Op, DAG);
3216 case ISD::LOAD:
3217 return LowerLOAD(Op, DAG);
3218 case ISD::SHL_PARTS:
3219 return LowerShiftLeftParts(Op, DAG);
3220 case ISD::SRA_PARTS:
3221 case ISD::SRL_PARTS:
3222 return LowerShiftRightParts(Op, DAG);
3223 case ISD::SELECT:
3224 return lowerSELECT(Op, DAG);
3225 case ISD::FROUND:
3226 return LowerFROUND(Op, DAG);
3227 case ISD::FCOPYSIGN:
3228 return LowerFCOPYSIGN(Op, DAG);
3229 case ISD::SINT_TO_FP:
3230 case ISD::UINT_TO_FP:
3231 return LowerINT_TO_FP(Op, DAG);
3232 case ISD::FP_TO_SINT:
3233 case ISD::FP_TO_UINT:
3234 return LowerFP_TO_INT(Op, DAG);
3235 case ISD::FP_ROUND:
3236 return LowerFP_ROUND(Op, DAG);
3237 case ISD::FP_EXTEND:
3238 return LowerFP_EXTEND(Op, DAG);
3239 case ISD::BR_JT:
3240 return LowerBR_JT(Op, DAG);
3241 case ISD::VAARG:
3242 return LowerVAARG(Op, DAG);
3243 case ISD::VASTART:
3244 return LowerVASTART(Op, DAG);
3245 case ISD::FSHL:
3246 case ISD::FSHR:
3247 return lowerFSH(Op, DAG);
3248 case ISD::ROTL:
3249 case ISD::ROTR:
3250 return lowerROT(Op, DAG);
3251 case ISD::ABS:
3252 case ISD::SMIN:
3253 case ISD::SMAX:
3254 case ISD::UMIN:
3255 case ISD::UMAX:
3256 case ISD::ADD:
3257 case ISD::SUB:
3258 case ISD::MUL:
3259 case ISD::SHL:
3260 case ISD::SREM:
3261 case ISD::UREM:
3262 return LowerVectorArith(Op, DAG);
3263 case ISD::DYNAMIC_STACKALLOC:
3264 return LowerDYNAMIC_STACKALLOC(Op, DAG);
3265 case ISD::STACKRESTORE:
3266 return LowerSTACKRESTORE(Op, DAG);
3267 case ISD::STACKSAVE:
3268 return LowerSTACKSAVE(Op, DAG);
3269 case ISD::CopyToReg:
3270 return LowerCopyToReg_128(Op, DAG);
3271 case ISD::FADD:
3272 case ISD::FSUB:
3273 case ISD::FMUL:
3274 // Used only for bf16 on SM80, where we select fma for non-ftz operation
3275 return PromoteBinOpIfF32FTZ(Op, DAG);
3276 case ISD::CTPOP:
3277 case ISD::CTLZ:
3278 return lowerCTLZCTPOP(Op, DAG);
3279 case ISD::FREM:
3280 return lowerFREM(Op, DAG);
3281
3282 default:
3283 llvm_unreachable("Custom lowering not defined for operation");
3284 }
3285}
3286
3287SDValue NVPTXTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
3288 SDLoc DL(Op);
3289 SDValue Chain = Op.getOperand(0);
3290 const auto *JT = cast<JumpTableSDNode>(Op.getOperand(1));
3291 SDValue Index = Op.getOperand(2);
3292
3293 unsigned JId = JT->getIndex();
3294 MachineJumpTableInfo *MJTI = DAG.getMachineFunction().getJumpTableInfo();
3295 ArrayRef<MachineBasicBlock *> MBBs = MJTI->getJumpTables()[JId].MBBs;
3296
3297 SDValue IdV = DAG.getConstant(JId, DL, MVT::i32);
3298
3299 // Generate BrxStart node
3300 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
3301 Chain = DAG.getNode(NVPTXISD::BrxStart, DL, VTs, Chain, IdV);
3302
3303 // Generate BrxItem nodes
3304 assert(!MBBs.empty());
3305 for (MachineBasicBlock *MBB : MBBs.drop_back())
3306 Chain = DAG.getNode(NVPTXISD::BrxItem, DL, VTs, Chain.getValue(0),
3307 DAG.getBasicBlock(MBB), Chain.getValue(1));
3308
3309 // Generate BrxEnd nodes
3310 SDValue EndOps[] = {Chain.getValue(0), DAG.getBasicBlock(MBBs.back()), Index,
3311 IdV, Chain.getValue(1)};
3312 SDValue BrxEnd = DAG.getNode(NVPTXISD::BrxEnd, DL, VTs, EndOps);
3313
3314 return BrxEnd;
3315}
3316
3317// This will prevent AsmPrinter from trying to print the jump tables itself.
3318 unsigned NVPTXTargetLowering::getJumpTableEncoding() const {
3319 return MachineJumpTableInfo::EK_Inline;
3320 }
3321
3322SDValue NVPTXTargetLowering::LowerADDRSPACECAST(SDValue Op,
3323 SelectionDAG &DAG) const {
3324 const AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
3325 unsigned SrcAS = N->getSrcAddressSpace();
3326 unsigned DestAS = N->getDestAddressSpace();
3327 if (SrcAS != llvm::ADDRESS_SPACE_GENERIC &&
3328 DestAS != llvm::ADDRESS_SPACE_GENERIC) {
3329 // Shared and SharedCluster can be converted to each other through generic
3330 // space
3331 if ((SrcAS == llvm::ADDRESS_SPACE_SHARED &&
3332 DestAS == llvm::ADDRESS_SPACE_SHARED_CLUSTER) ||
3333 (SrcAS == llvm::ADDRESS_SPACE_SHARED_CLUSTER &&
3334 DestAS == llvm::ADDRESS_SPACE_SHARED)) {
3335 SDLoc DL(Op.getNode());
3336 const MVT GenerictVT =
3337 getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_GENERIC);
3338 SDValue GenericConversion = DAG.getAddrSpaceCast(
3339 DL, GenerictVT, Op.getOperand(0), SrcAS, ADDRESS_SPACE_GENERIC);
3340 SDValue SharedClusterConversion =
3341 DAG.getAddrSpaceCast(DL, Op.getValueType(), GenericConversion,
3342 ADDRESS_SPACE_GENERIC, DestAS);
3343 return SharedClusterConversion;
3344 }
3345
3346 return DAG.getUNDEF(Op.getValueType());
3347 }
3348
3349 return Op;
3350}
3351
3352// This function is almost a copy of SelectionDAG::expandVAArg().
3353// The only diff is that this one produces loads from local address space.
3354SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
3355 const TargetLowering *TLI = STI.getTargetLowering();
3356 SDLoc DL(Op);
3357
3358 SDNode *Node = Op.getNode();
3359 const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
3360 EVT VT = Node->getValueType(0);
3361 auto *Ty = VT.getTypeForEVT(*DAG.getContext());
3362 SDValue Tmp1 = Node->getOperand(0);
3363 SDValue Tmp2 = Node->getOperand(1);
3364 const MaybeAlign MA(Node->getConstantOperandVal(3));
3365
3366 SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL,
3367 Tmp1, Tmp2, MachinePointerInfo(V));
3368 SDValue VAList = VAListLoad;
3369
3370 if (MA && *MA > TLI->getMinStackArgumentAlignment()) {
3371 VAList = DAG.getNode(
3372 ISD::ADD, DL, VAList.getValueType(), VAList,
3373 DAG.getConstant(MA->value() - 1, DL, VAList.getValueType()));
3374
3375 VAList = DAG.getNode(ISD::AND, DL, VAList.getValueType(), VAList,
3376 DAG.getSignedConstant(-(int64_t)MA->value(), DL,
3377 VAList.getValueType()));
3378 }
3379
3380 // Increment the pointer, VAList, to the next vaarg
3381 Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
3382 DAG.getConstant(DAG.getDataLayout().getTypeAllocSize(Ty),
3383 DL, VAList.getValueType()));
3384
3385 // Store the incremented VAList to the legalized pointer
3386 Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2,
3387 MachinePointerInfo(V));
3388
3389 const Value *SrcV = Constant::getNullValue(
3390 PointerType::get(*DAG.getContext(), ADDRESS_SPACE_LOCAL));
3391
3392 // Load the actual argument out of the pointer VAList
3393 return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV));
3394}
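// The ADD/AND pair above is the usual round-up-to-alignment idiom for a
// power-of-two alignment A:  P = (P + (A - 1)) & ~(A - 1). For example, with
// a va_list pointer of 13 and an argument alignment of 8, the aligned address
// is (13 + 7) & ~7 == 16; the pointer is then bumped past the argument and
// stored back before the argument itself is loaded.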
3395
3396SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
3397 const TargetLowering *TLI = STI.getTargetLowering();
3398 SDLoc DL(Op);
3399 EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout());
3400
3401 // Store the address of unsized array <function>_vararg[] in the ap object.
3402 SDValue VAReg = getParamSymbol(DAG, /* vararg */ -1, PtrVT);
3403
3404 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
3405 return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1),
3406 MachinePointerInfo(SV));
3407}
3408
3409/// replaceLoadVector - Convert vector loads into multi-output scalar loads.
3410static std::optional<std::pair<SDValue, SDValue>>
3411replaceLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI) {
3412 LoadSDNode *LD = cast<LoadSDNode>(N);
3413 const EVT ResVT = LD->getValueType(0);
3414 const EVT MemVT = LD->getMemoryVT();
3415
3416 // If we're doing sign/zero extension as part of the load, avoid lowering to
3417 // a LoadV node. TODO: consider relaxing this restriction.
3418 if (ResVT != MemVT)
3419 return std::nullopt;
3420
3421 const auto NumEltsAndEltVT =
3422 getVectorLoweringShape(ResVT, STI, LD->getAddressSpace());
3423 if (!NumEltsAndEltVT)
3424 return std::nullopt;
3425 const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
3426
3427 Align Alignment = LD->getAlign();
3428 const auto &TD = DAG.getDataLayout();
3429 Align PrefAlign = TD.getPrefTypeAlign(MemVT.getTypeForEVT(*DAG.getContext()));
3430 if (Alignment < PrefAlign) {
3431 // This load is not sufficiently aligned, so bail out and let this vector
3432 // load be scalarized. Note that we may still be able to emit smaller
3433 // vector loads. For example, if we are loading a <4 x float> with an
3434 // alignment of 8, this check will fail but the legalizer will try again
3435 // with 2 x <2 x float>, which will succeed with an alignment of 8.
3436 return std::nullopt;
3437 }
3438
3439 // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
3440 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
3441 // loaded type to i16 and propagate the "real" type as the memory type.
3442 const MVT LoadEltVT = (EltVT.getSizeInBits() < 16) ? MVT::i16 : EltVT;
3443
3444 unsigned Opcode;
3445 switch (NumElts) {
3446 default:
3447 return std::nullopt;
3448 case 2:
3449 Opcode = NVPTXISD::LoadV2;
3450 break;
3451 case 4:
3452 Opcode = NVPTXISD::LoadV4;
3453 break;
3454 case 8:
3455 Opcode = NVPTXISD::LoadV8;
3456 break;
3457 }
3458 auto ListVTs = SmallVector<EVT, 9>(NumElts, LoadEltVT);
3459 ListVTs.push_back(MVT::Other);
3460 SDVTList LdResVTs = DAG.getVTList(ListVTs);
3461
3462 SDLoc DL(LD);
3463
3464 // Copy regular operands
3465 SmallVector<SDValue, 8> OtherOps(LD->ops());
3466
3467 // The select routine does not have access to the LoadSDNode instance, so
3468 // pass along the extension information
3469 OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
3470
3471 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, MemVT,
3472 LD->getMemOperand());
3473
3474 SmallVector<SDValue> ScalarRes;
3475 if (EltVT.isVector()) {
3477 assert(NumElts * EltVT.getVectorNumElements() ==
3478 ResVT.getVectorNumElements());
3479 // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
3480 // into individual elements.
3481 for (const unsigned I : llvm::seq(NumElts)) {
3482 SDValue SubVector = NewLD.getValue(I);
3483 DAG.ExtractVectorElements(SubVector, ScalarRes);
3484 }
3485 } else {
3486 for (const unsigned I : llvm::seq(NumElts)) {
3487 SDValue Res = NewLD.getValue(I);
3488 if (LoadEltVT != EltVT)
3489 Res = DAG.getNode(ISD::TRUNCATE, DL, EltVT, Res);
3490 ScalarRes.push_back(Res);
3491 }
3492 }
3493
3494 SDValue LoadChain = NewLD.getValue(NumElts);
3495
3496 const MVT BuildVecVT =
3497 MVT::getVectorVT(EltVT.getScalarType(), ScalarRes.size());
3498 SDValue BuildVec = DAG.getBuildVector(BuildVecVT, DL, ScalarRes);
3499 SDValue LoadValue = DAG.getBitcast(ResVT, BuildVec);
3500
3501 return {{LoadValue, LoadChain}};
3502}
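// For example, when getVectorLoweringShape maps <4 x float> to four f32
// elements, a sufficiently aligned load becomes an NVPTXISD::LoadV4 with four
// f32 results plus a chain; the scalars are then reassembled with a
// BUILD_VECTOR and bitcast back to the original type. When the chosen element
// type is itself a small vector (e.g. v2f16), each multi-element result is
// re-split with ExtractVectorElements before the rebuild.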
3503
3506 const NVPTXSubtarget &STI) {
3507 if (auto Res = replaceLoadVector(N, DAG, STI))
3508 Results.append({Res->first, Res->second});
3509}
3510
3512 const NVPTXSubtarget &STI) {
3513 if (auto Res = replaceLoadVector(N, DAG, STI))
3514 return DAG.getMergeValues({Res->first, Res->second}, SDLoc(N));
3515 return SDValue();
3516}
3517
3518// v = ld i1* addr
3519// =>
3520// v1 = ld i8* addr (-> i16)
3521// v = trunc i16 to i1
3523 SDLoc dl(LD);
3524 assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
3525 assert(LD->getValueType(0) == MVT::i1 && "Custom lowering for i1 load only");
3526 SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(),
3527 LD->getBasePtr(), LD->getPointerInfo(),
3528 MVT::i8, LD->getAlign(),
3529 LD->getMemOperand()->getFlags());
3530 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
3531 // The legalizer (the caller) is expecting two values from the legalized
3532 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
3533 // in LegalizeDAG.cpp which also uses MergeValues.
3534 return DAG.getMergeValues({result, LD->getChain()}, dl);
3535}
3536
3537SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
3538 LoadSDNode *LD = cast<LoadSDNode>(Op);
3539
3540 if (Op.getValueType() == MVT::i1)
3541 return lowerLOADi1(LD, DAG);
3542
3543 // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
3544 // how they'll be lowered in ISel anyway, and by doing this a little earlier
3545 // we allow for more DAG combine opportunities.
3546 if (LD->getExtensionType() == ISD::EXTLOAD) {
3547 assert(LD->getValueType(0).isInteger() && LD->getMemoryVT().isInteger() &&
3548 "Unexpected fpext-load");
3549 return DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Op), Op.getValueType(),
3550 LD->getChain(), LD->getBasePtr(), LD->getMemoryVT(),
3551 LD->getMemOperand());
3552 }
3553
3554 llvm_unreachable("Unexpected custom lowering for load");
3555}
3556
3558 const NVPTXSubtarget &STI) {
3559 MemSDNode *N = cast<MemSDNode>(Op.getNode());
3560 SDValue Val = N->getOperand(1);
3561 SDLoc DL(N);
3562 const EVT ValVT = Val.getValueType();
3563 const EVT MemVT = N->getMemoryVT();
3564
3565 // If we're truncating as part of the store, avoid lowering to a StoreV node.
3566 // TODO: consider relaxing this restriction.
3567 if (ValVT != MemVT)
3568 return SDValue();
3569
3570 const auto NumEltsAndEltVT =
3571 getVectorLoweringShape(ValVT, STI, N->getAddressSpace());
3572 if (!NumEltsAndEltVT)
3573 return SDValue();
3574 const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
3575
3576 const DataLayout &TD = DAG.getDataLayout();
3577
3578 Align Alignment = N->getAlign();
3579 Align PrefAlign = TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
3580 if (Alignment < PrefAlign) {
3581 // This store is not sufficiently aligned, so bail out and let this vector
3582 // store be scalarized. Note that we may still be able to emit smaller
3583 // vector stores. For example, if we are storing a <4 x float> with an
3584 // alignment of 8, this check will fail but the legalizer will try again
3585 // with 2 x <2 x float>, which will succeed with an alignment of 8.
3586 return SDValue();
3587 }
3588
3589 unsigned Opcode;
3590 switch (NumElts) {
3591 default:
3592 return SDValue();
3593 case 2:
3594 Opcode = NVPTXISD::StoreV2;
3595 break;
3596 case 4:
3597 Opcode = NVPTXISD::StoreV4;
3598 break;
3599 case 8:
3600 Opcode = NVPTXISD::StoreV8;
3601 break;
3602 }
3603
3604 SmallVector<SDValue, 8> Ops;
3605
3606 // First is the chain
3607 Ops.push_back(N->getOperand(0));
3608
3609 // Then the split values
3610 if (EltVT.isVector()) {
3612 assert(NumElts * EltVT.getVectorNumElements() ==
3613 ValVT.getVectorNumElements());
3614 // Combine individual elements into v2[i,f,bf]16/v4i8 subvectors to be
3615 // stored as b32s
3616 const unsigned NumEltsPerSubVector = EltVT.getVectorNumElements();
3617 for (const unsigned I : llvm::seq(NumElts)) {
3618 SmallVector<SDValue, 4> SubVectorElts;
3619 DAG.ExtractVectorElements(Val, SubVectorElts, I * NumEltsPerSubVector,
3620 NumEltsPerSubVector);
3621 Ops.push_back(DAG.getBuildVector(EltVT, DL, SubVectorElts));
3622 }
3623 } else {
3624 SDValue V = DAG.getBitcast(MVT::getVectorVT(EltVT, NumElts), Val);
3625 for (const unsigned I : llvm::seq(NumElts)) {
3626 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, V,
3627 DAG.getIntPtrConstant(I, DL));
3628
3629 // Since StoreV2 is a target node, we cannot rely on DAG type
3630 // legalization. Therefore, we must ensure the type is legal. For i1 and
3631 // i8, we set the stored type to i16 and propagate the "real" type as the
3632 // memory type.
3633 if (EltVT.getSizeInBits() < 16)
3634 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
3635 Ops.push_back(ExtVal);
3636 }
3637 }
3638
3639 // Then any remaining arguments
3640 Ops.append(N->op_begin() + 2, N->op_end());
3641
3642 SDValue NewSt =
3643 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
3644 N->getMemoryVT(), N->getMemOperand());
3645
3646 // return DCI.CombineTo(N, NewSt, true);
3647 return NewSt;
3648}
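// This mirrors replaceLoadVector above: with the same shape choice, a
// sufficiently aligned store of <4 x float> becomes an NVPTXISD::StoreV4
// whose operands are the chain, the four extracted scalars, and the remaining
// store operands; elements narrower than 16 bits are widened to i16 with
// ANY_EXTEND while the original type is kept as the memory VT.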
3649
3650SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
3651 StoreSDNode *Store = cast<StoreSDNode>(Op);
3652 EVT VT = Store->getMemoryVT();
3653
3654 if (VT == MVT::i1)
3655 return LowerSTOREi1(Op, DAG);
3656
3657 // Lower store of any other vector type, including v2f32 as we want to break
3658 // it apart since this is not a widely-supported type.
3659 return lowerSTOREVector(Op, DAG, STI);
3660}
3661
3662// st i1 v, addr
3663// =>
3664// v1 = zxt v to i16
3665// st.u8 i16, addr
3666SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
3667 SDNode *Node = Op.getNode();
3668 SDLoc dl(Node);
3669 StoreSDNode *ST = cast<StoreSDNode>(Node);
3670 SDValue Tmp1 = ST->getChain();
3671 SDValue Tmp2 = ST->getBasePtr();
3672 SDValue Tmp3 = ST->getValue();
3673 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
3674 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
3675 SDValue Result =
3676 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
3677 ST->getAlign(), ST->getMemOperand()->getFlags());
3678 return Result;
3679}
3680
3681SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op,
3682 SelectionDAG &DAG) const {
3683 // Change the CopyToReg to take in two 64-bit operands instead of a 128-bit
3684 // operand so that it can pass the legalization.
3685
3686 assert(Op.getOperand(1).getValueType() == MVT::i128 &&
3687 "Custom lowering for 128-bit CopyToReg only");
3688
3689 SDNode *Node = Op.getNode();
3690 SDLoc DL(Node);
3691
3692 SDValue Cast = DAG.getBitcast(MVT::v2i64, Op->getOperand(2));
3693 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
3694 DAG.getIntPtrConstant(0, DL));
3695 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
3696 DAG.getIntPtrConstant(1, DL));
3697
3698 SmallVector<SDValue, 5> NewOps(Op->getNumOperands() + 1);
3699 SmallVector<EVT, 3> ResultsType(Node->values());
3700
3701 NewOps[0] = Op->getOperand(0); // Chain
3702 NewOps[1] = Op->getOperand(1); // Dst Reg
3703 NewOps[2] = Lo; // Lower 64-bit
3704 NewOps[3] = Hi; // Higher 64-bit
3705 if (Op.getNumOperands() == 4)
3706 NewOps[4] = Op->getOperand(3); // Glue if exists
3707
3708 return DAG.getNode(ISD::CopyToReg, DL, ResultsType, NewOps);
3709}
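// Example: CopyToReg of an i128 value V becomes
// CopyToReg(Chain, Reg, extract(bitcast V to v2i64, 0),
//           extract(bitcast V to v2i64, 1) [, Glue]),
// so each half fits in a legal 64-bit register operand.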
3710
3711unsigned NVPTXTargetLowering::getNumRegisters(
3712 LLVMContext &Context, EVT VT,
3713 std::optional<MVT> RegisterVT = std::nullopt) const {
3714 if (VT == MVT::i128 && RegisterVT == MVT::i128)
3715 return 1;
3716 return TargetLoweringBase::getNumRegisters(Context, VT, RegisterVT);
3717}
3718
3719bool NVPTXTargetLowering::splitValueIntoRegisterParts(
3720 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
3721 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
3722 if (Val.getValueType() == MVT::i128 && NumParts == 1) {
3723 Parts[0] = Val;
3724 return true;
3725 }
3726 return false;
3727}
3728
3729// This creates target external symbol for a function parameter.
3730// Name of the symbol is composed from its index and the function name.
3731// Negative index corresponds to special parameter (unsized array) used for
3732// passing variable arguments.
3733SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int I,
3734 EVT T) const {
3735 StringRef SavedStr = nvTM->getStrPool().save(
3737 return DAG.getExternalSymbol(SavedStr.data(), T);
3738}
3739
3740SDValue NVPTXTargetLowering::getCallParamSymbol(SelectionDAG &DAG, int I,
3741 EVT T) const {
3742 const StringRef SavedStr = nvTM->getStrPool().save("param" + Twine(I));
3743 return DAG.getExternalSymbol(SavedStr.data(), T);
3744}
3745
3746SDValue NVPTXTargetLowering::LowerFormalArguments(
3747 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3748 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3749 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3750 const DataLayout &DL = DAG.getDataLayout();
3751 LLVMContext &Ctx = *DAG.getContext();
3752 auto PtrVT = getPointerTy(DAG.getDataLayout());
3753
3754 const Function &F = DAG.getMachineFunction().getFunction();
3755
3756 SDValue Root = DAG.getRoot();
3757 SmallVector<SDValue, 16> OutChains;
3758
3759 // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
3760 // Ins.size() will be larger
3761 // * if there is an aggregate argument with multiple fields (each field
3762 // showing up separately in Ins)
3763 // * if there is a vector argument with more than typical vector-length
3764 // elements (generally if more than 4) where each vector element is
3765 // individually present in Ins.
3766 // So a different index should be used for indexing into Ins.
3767 // See similar issue in LowerCall.
3768
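// For example, an aggregate parameter {i32, float} shows up as two entries in
// Ins that share one OrigArgIndex, which is why the loop below gathers each
// argument's slice of Ins with take_while on OrigArgIndex.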
3769 auto AllIns = ArrayRef(Ins);
3770 for (const auto &Arg : F.args()) {
3771 const auto ArgIns = AllIns.take_while(
3772 [&](auto I) { return I.OrigArgIndex == Arg.getArgNo(); });
3773 AllIns = AllIns.drop_front(ArgIns.size());
3774
3775 Type *Ty = Arg.getType();
3776
3777 if (ArgIns.empty())
3778 report_fatal_error("Empty parameter types are not supported");
3779
3780 if (Arg.use_empty()) {
3781 // argument is dead
3782 for (const auto &In : ArgIns) {
3783 assert(!In.Used && "Arg.use_empty() is true but Arg is used?");
3784 InVals.push_back(DAG.getUNDEF(In.VT));
3785 }
3786 continue;
3787 }
3788
3789 SDValue ArgSymbol = getParamSymbol(DAG, Arg.getArgNo(), PtrVT);
3790
3791 // In the following cases, assign a node order of "i+1"
3792 // to newly created nodes. The SDNodes for params have to
3793 // appear in the same order as their order of appearance
3794 // in the original function. "i+1" holds that order.
3795 if (Arg.hasByValAttr()) {
3796 // Param has ByVal attribute
3797 // Return MoveParam(param symbol).
3798 // Ideally, the param symbol can be returned directly,
3799 // but when SDNode builder decides to use it in a CopyToReg(),
3800 // machine instruction fails because TargetExternalSymbol
3801 // (not lowered) is target dependent, and CopyToReg assumes
3802 // the source is lowered.
3803 assert(ArgIns.size() == 1 && "ByVal argument must be a pointer");
3804 const auto &ByvalIn = ArgIns[0];
3805 assert(getValueType(DL, Ty) == ByvalIn.VT &&
3806 "Ins type did not match function type");
3807 assert(ByvalIn.VT == PtrVT && "ByVal argument must be a pointer");
3808
3809 SDValue P;
3810 if (isKernelFunction(F)) {
3811 P = ArgSymbol;
3812 P.getNode()->setIROrder(Arg.getArgNo() + 1);
3813 } else {
3814 P = DAG.getNode(NVPTXISD::MoveParam, dl, ByvalIn.VT, ArgSymbol);
3815 P.getNode()->setIROrder(Arg.getArgNo() + 1);
3816 P = DAG.getAddrSpaceCast(dl, ByvalIn.VT, P, ADDRESS_SPACE_LOCAL,
3818 }
3819 InVals.push_back(P);
3820 } else {
3821 SmallVector<EVT, 16> VTs;
3822 SmallVector<uint64_t, 16> Offsets;
3823 ComputePTXValueVTs(*this, DL, Ctx, CallConv, Ty, VTs, Offsets);
3824 assert(VTs.size() == ArgIns.size() && "Size mismatch");
3825 assert(VTs.size() == Offsets.size() && "Size mismatch");
3826
3827 const Align ArgAlign = getFunctionArgumentAlignment(
3828 &F, Ty, Arg.getArgNo() + AttributeList::FirstArgIndex, DL);
3829
3830 unsigned I = 0;
3831 const auto VI = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
3832 for (const unsigned NumElts : VI) {
3833 // i1 is loaded/stored as i8
3834 const EVT LoadVT = VTs[I] == MVT::i1 ? MVT::i8 : VTs[I];
3835 const EVT VecVT = getVectorizedVT(LoadVT, NumElts, Ctx);
3836
3837 SDValue VecAddr = DAG.getObjectPtrOffset(
3838 dl, ArgSymbol, TypeSize::getFixed(Offsets[I]));
3839
3840 const Align PartAlign = commonAlignment(ArgAlign, Offsets[I]);
3841 SDValue P =
3842 DAG.getLoad(VecVT, dl, Root, VecAddr,
3846 P.getNode()->setIROrder(Arg.getArgNo() + 1);
3847 for (const unsigned J : llvm::seq(NumElts)) {
3848 SDValue Elt = getExtractVectorizedValue(P, J, LoadVT, dl, DAG);
3849
3850 Elt = correctParamType(Elt, ArgIns[I + J].VT, ArgIns[I + J].Flags,
3851 DAG, dl);
3852 InVals.push_back(Elt);
3853 }
3854 I += NumElts;
3855 }
3856 }
3857 }
3858
3859 if (!OutChains.empty())
3860 DAG.setRoot(DAG.getTokenFactor(dl, OutChains));
3861
3862 return Chain;
3863}
3864
3865SDValue
3866NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3867 bool isVarArg,
3868 const SmallVectorImpl<ISD::OutputArg> &Outs,
3869 const SmallVectorImpl<SDValue> &OutVals,
3870 const SDLoc &dl, SelectionDAG &DAG) const {
3871 const Function &F = DAG.getMachineFunction().getFunction();
3872 Type *RetTy = F.getReturnType();
3873
3874 if (RetTy->isVoidTy()) {
3875 assert(OutVals.empty() && Outs.empty() && "Return value expected for void");
3876 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
3877 }
3878
3879 const DataLayout &DL = DAG.getDataLayout();
3880 LLVMContext &Ctx = *DAG.getContext();
3881
3882 const SDValue RetSymbol = DAG.getExternalSymbol("func_retval0", MVT::i32);
3883 const auto RetAlign = getFunctionParamOptimizedAlign(&F, RetTy, DL);
3884
3885 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
3886 // 32-bits are sign extended or zero extended, depending on whether
3887 // they are signed or unsigned types.
3888 const bool ExtendIntegerRetVal =
3889 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
3890
3891 SmallVector<EVT, 16> VTs;
3892 SmallVector<uint64_t, 16> Offsets;
3893 ComputePTXValueVTs(*this, DL, Ctx, CallConv, RetTy, VTs, Offsets);
3894 assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
3895
3896 const auto GetRetVal = [&](unsigned I) -> SDValue {
3897 SDValue RetVal = OutVals[I];
3898 assert(promoteScalarIntegerPTX(RetVal.getValueType()) ==
3899 RetVal.getValueType() &&
3900 "OutVal type should always be legal");
3901
3902 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
3903 const EVT StoreVT =
3904 ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
3905 return correctParamType(RetVal, StoreVT, Outs[I].Flags, DAG, dl);
3906 };
3907
3908 unsigned I = 0;
3909 const auto VI = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
3910 for (const unsigned NumElts : VI) {
3911 const MaybeAlign CurrentAlign = ExtendIntegerRetVal
3912 ? MaybeAlign(std::nullopt)
3913 : commonAlignment(RetAlign, Offsets[I]);
3914
3916 NumElts, dl, DAG, [&](unsigned K) { return GetRetVal(I + K); });
3917
3918 SDValue Ptr =
3919 DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));
3920
3921 Chain = DAG.getStore(Chain, dl, Val, Ptr,
3922 MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign);
3923
3924 I += NumElts;
3925 }
3926
3927 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
3928}
3929
3930void NVPTXTargetLowering::LowerAsmOperandForConstraint(
3931 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
3932 SelectionDAG &DAG) const {
3933 if (Constraint.size() > 1)
3934 return;
3935 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
3936}
3937
3938// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
3939// TgtMemIntrinsic
3940// because we need the information that is only available in the "Value" type
3941// of destination
3942// pointer. In particular, the address space information.
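// For every intrinsic handled below, the hook fills in Info.opc, Info.memVT,
// Info.ptrVal, Info.offset, Info.flags and Info.align so that the DAG builder
// can attach an accurate MachineMemOperand to the resulting intrinsic node.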
3943bool NVPTXTargetLowering::getTgtMemIntrinsic(
3944 IntrinsicInfo &Info, const CallInst &I,
3945 MachineFunction &MF, unsigned Intrinsic) const {
3946 switch (Intrinsic) {
3947 default:
3948 return false;
3949 case Intrinsic::nvvm_match_all_sync_i32p:
3950 case Intrinsic::nvvm_match_all_sync_i64p:
3951 Info.opc = ISD::INTRINSIC_W_CHAIN;
3952 // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
3953 // in order to model data exchange with other threads, but perform no real
3954 // memory accesses.
3955 Info.memVT = MVT::i1;
3956
3957 // Our result depends on both our and other thread's arguments.
3958 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
3959 return true;
3960 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
3961 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
3962 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
3963 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
3964 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
3965 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
3966 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
3967 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
3968 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
3969 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
3970 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
3971 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
3972 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
3973 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
3974 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
3975 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
3976 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
3977 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
3978 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
3979 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
3980 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
3981 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
3982 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
3983 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
3984 Info.opc = ISD::INTRINSIC_W_CHAIN;
3985 Info.memVT = MVT::v8f16;
3986 Info.ptrVal = I.getArgOperand(0);
3987 Info.offset = 0;
3988 Info.flags = MachineMemOperand::MOLoad;
3989 Info.align = Align(16);
3990 return true;
3991 }
3992 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
3993 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
3994 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
3995 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
3996 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
3997 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
3998 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
3999 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
4000 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
4001 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
4002 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
4003 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
4004 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
4005 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
4006 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
4007 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
4008 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
4009 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
4010 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
4011 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
4012 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
4013 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
4014 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
4015 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
4016 Info.opc = ISD::INTRINSIC_W_CHAIN;
4017 Info.memVT = MVT::v2i32;
4018 Info.ptrVal = I.getArgOperand(0);
4019 Info.offset = 0;
4020 Info.flags = MachineMemOperand::MOLoad;
4021 Info.align = Align(8);
4022 return true;
4023 }
4024
4025 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
4026 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
4027 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
4028 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
4029 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
4030 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
4031 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
4032 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
4033 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
4034 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
4035 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
4036 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
4037 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
4038 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
4039 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
4040 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
4041
4042 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
4043 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
4044 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
4045 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
4046 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
4047 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
4048 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
4049 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
4050 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
4051 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
4052 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
4053 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
4054 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
4055 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
4056 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
4057 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
4058 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
4059 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16:
4060 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8:
4061 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8x16_b4x16_p64:
4062 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8x16_b6x16_p32:
4063 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x4_b8x16_b4x16_p64:
4064 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x4_b8x16_b6x16_p32: {
4065 Info.opc = ISD::INTRINSIC_W_CHAIN;
4066 Info.memVT = MVT::v4i32;
4067 Info.ptrVal = I.getArgOperand(0);
4068 Info.offset = 0;
4069 Info.flags = MachineMemOperand::MOLoad;
4070 Info.align = Align(16);
4071 return true;
4072 }
4073
4074 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
4075 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
4076 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
4077 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
4078 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
4079 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
4080 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
4081 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
4082
4083 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
4084 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
4085 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
4086 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
4087 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
4088 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
4089 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
4090 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
4091 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
4092 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
4093 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
4094 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
4095 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
4096 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
4097 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
4098 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
4099 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
4100 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
4101 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
4102 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
4103 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
4104 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16:
4105 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x1_b8x16_b4x16_p64:
4106 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x1_b8x16_b6x16_p32: {
4107 Info.opc = ISD::INTRINSIC_W_CHAIN;
4108 Info.memVT = MVT::i32;
4109 Info.ptrVal = I.getArgOperand(0);
4110 Info.offset = 0;
4111 Info.flags = MachineMemOperand::MOLoad;
4112 Info.align = Align(4);
4113 return true;
4114 }
4115
4116 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
4117 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
4118 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
4119 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
4120 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
4121 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
4122 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
4123 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
4124 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
4125 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
4126 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
4127 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
4128 Info.opc = ISD::INTRINSIC_W_CHAIN;
4129 Info.memVT = MVT::v4f16;
4130 Info.ptrVal = I.getArgOperand(0);
4131 Info.offset = 0;
4132 Info.flags = MachineMemOperand::MOLoad;
4133 Info.align = Align(16);
4134 return true;
4135 }
4136
4137 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
4138 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
4139 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
4140 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
4141 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
4142 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
4143 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
4144 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
4145 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
4146 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
4147 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
4148 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
4149 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
4150 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
4151 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
4152 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
4153 Info.opc = ISD::INTRINSIC_W_CHAIN;
4154 Info.memVT = MVT::v8f32;
4155 Info.ptrVal = I.getArgOperand(0);
4156 Info.offset = 0;
4157 Info.flags = MachineMemOperand::MOLoad;
4158 Info.align = Align(16);
4159 return true;
4160 }
4161
4162 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
4163 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
4164 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
4165 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
4166
4167 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
4168 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
4169 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
4170 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
4171
4172 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
4173 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
4174 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
4175 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
4176 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
4177 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
4178 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
4179 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
4180 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
4181 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
4182 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
4183 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
4184 Info.opc = ISD::INTRINSIC_W_CHAIN;
4185 Info.memVT = MVT::v8i32;
4186 Info.ptrVal = I.getArgOperand(0);
4187 Info.offset = 0;
4188 Info.flags = MachineMemOperand::MOLoad;
4189 Info.align = Align(16);
4190 return true;
4191 }
4192
4193 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
4194 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
4195 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
4196 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
4197 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
4198 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
4199 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
4200 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
4201 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
4202 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16:
4203 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8:
4204 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8x16_b4x16_p64:
4205 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8x16_b6x16_p32:
4206 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x2_b8x16_b4x16_p64:
4207 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x2_b8x16_b6x16_p32: {
4208 Info.opc = ISD::INTRINSIC_W_CHAIN;
4209 Info.memVT = MVT::v2i32;
4210 Info.ptrVal = I.getArgOperand(0);
4211 Info.offset = 0;
4212 Info.flags = MachineMemOperand::MOLoad;
4213 Info.align = Align(8);
4214 return true;
4215 }
4216
4217 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
4218 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
4219 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
4220 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
4221
4222 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
4223 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
4224 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
4225 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
4226 Info.opc = ISD::INTRINSIC_W_CHAIN;
4227 Info.memVT = MVT::f64;
4228 Info.ptrVal = I.getArgOperand(0);
4229 Info.offset = 0;
4230 Info.flags = MachineMemOperand::MOLoad;
4231 Info.align = Align(8);
4232 return true;
4233 }
4234
4235 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
4236 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
4237 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
4238 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
4239 Info.opc = ISD::INTRINSIC_W_CHAIN;
4240 Info.memVT = MVT::v2f64;
4241 Info.ptrVal = I.getArgOperand(0);
4242 Info.offset = 0;
4243 Info.flags = MachineMemOperand::MOLoad;
4244 Info.align = Align(16);
4245 return true;
4246 }
4247
4248 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
4249 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
4250 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
4251 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
4252 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
4253 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
4254 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
4255 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
4256 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
4257 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
4258 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
4259 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
4260 Info.opc = ISD::INTRINSIC_VOID;
4261 Info.memVT = MVT::v4f16;
4262 Info.ptrVal = I.getArgOperand(0);
4263 Info.offset = 0;
4264 Info.flags = MachineMemOperand::MOStore;
4265 Info.align = Align(16);
4266 return true;
4267 }
4268
4269 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
4270 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
4271 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
4272 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
4273 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
4274 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
4275 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
4276 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
4277 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
4278 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
4279 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
4280 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
4281 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
4282 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
4283 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
4284 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
4285 Info.opc = ISD::INTRINSIC_VOID;
4286 Info.memVT = MVT::v8f32;
4287 Info.ptrVal = I.getArgOperand(0);
4288 Info.offset = 0;
4289 Info.flags = MachineMemOperand::MOStore;
4290 Info.align = Align(16);
4291 return true;
4292 }
4293
4294 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
4295 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
4296 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
4297 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
4298 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
4299 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
4300 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
4301 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
4302 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
4303 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
4304 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
4305 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
4306 Info.opc = ISD::INTRINSIC_VOID;
4307 Info.memVT = MVT::v8i32;
4308 Info.ptrVal = I.getArgOperand(0);
4309 Info.offset = 0;
4310 Info.flags = MachineMemOperand::MOStore;
4311 Info.align = Align(16);
4312 return true;
4313 }
4314
4315 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
4316 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
4317 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
4318 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
4319 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
4320 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
4321 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
4322 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride:
4323 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x2_b16:
4324 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x2_trans_b16:
4325 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x2_trans_b8: {
4326 Info.opc = ISD::INTRINSIC_VOID;
4327 Info.memVT = MVT::v2i32;
4328 Info.ptrVal = I.getArgOperand(0);
4329 Info.offset = 0;
4330 Info.flags = MachineMemOperand::MOStore;
4331 Info.align = Align(8);
4332 return true;
4333 }
4334
4335 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
4336 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
4337 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
4338 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
4339 Info.opc = ISD::INTRINSIC_VOID;
4340 Info.memVT = MVT::v2f64;
4341 Info.ptrVal = I.getArgOperand(0);
4342 Info.offset = 0;
4343 Info.flags = MachineMemOperand::MOStore;
4344 Info.align = Align(16);
4345 return true;
4346 }
4347
4348 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x1_b16:
4349 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x1_trans_b16:
4350 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x1_trans_b8: {
4351 Info.opc = ISD::INTRINSIC_VOID;
4352 Info.memVT = MVT::i32;
4353 Info.ptrVal = I.getArgOperand(0);
4354 Info.offset = 0;
4355 Info.flags = MachineMemOperand::MOStore;
4356 Info.align = Align(4);
4357 return true;
4358 }
4359
4360 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x4_b16:
4361 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x4_trans_b16:
4362 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x4_trans_b8: {
4363 Info.opc = ISD::INTRINSIC_VOID;
4364 Info.memVT = MVT::v4i32;
4365 Info.ptrVal = I.getArgOperand(0);
4366 Info.offset = 0;
4367 Info.flags = MachineMemOperand::MOStore;
4368 Info.align = Align(16);
4369 return true;
4370 }
4371
4372 case Intrinsic::nvvm_atomic_add_gen_f_cta:
4373 case Intrinsic::nvvm_atomic_add_gen_f_sys:
4374 case Intrinsic::nvvm_atomic_add_gen_i_cta:
4375 case Intrinsic::nvvm_atomic_add_gen_i_sys:
4376 case Intrinsic::nvvm_atomic_and_gen_i_cta:
4377 case Intrinsic::nvvm_atomic_and_gen_i_sys:
4378 case Intrinsic::nvvm_atomic_cas_gen_i_cta:
4379 case Intrinsic::nvvm_atomic_cas_gen_i_sys:
4380 case Intrinsic::nvvm_atomic_dec_gen_i_cta:
4381 case Intrinsic::nvvm_atomic_dec_gen_i_sys:
4382 case Intrinsic::nvvm_atomic_inc_gen_i_cta:
4383 case Intrinsic::nvvm_atomic_inc_gen_i_sys:
4384 case Intrinsic::nvvm_atomic_max_gen_i_cta:
4385 case Intrinsic::nvvm_atomic_max_gen_i_sys:
4386 case Intrinsic::nvvm_atomic_min_gen_i_cta:
4387 case Intrinsic::nvvm_atomic_min_gen_i_sys:
4388 case Intrinsic::nvvm_atomic_or_gen_i_cta:
4389 case Intrinsic::nvvm_atomic_or_gen_i_sys:
4390 case Intrinsic::nvvm_atomic_exch_gen_i_cta:
4391 case Intrinsic::nvvm_atomic_exch_gen_i_sys:
4392 case Intrinsic::nvvm_atomic_xor_gen_i_cta:
4393 case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
4394 auto &DL = I.getDataLayout();
4395 Info.opc = ISD::INTRINSIC_W_CHAIN;
4396 Info.memVT = getValueType(DL, I.getType());
4397 Info.ptrVal = I.getArgOperand(0);
4398 Info.offset = 0;
4399 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
4400 Info.align.reset();
4401 return true;
4402 }
4403
4404 case Intrinsic::nvvm_prefetch_tensormap: {
4405 auto &DL = I.getDataLayout();
4406 Info.opc = ISD::INTRINSIC_VOID;
4407 Info.memVT = getPointerTy(DL);
4408 Info.ptrVal = I.getArgOperand(0);
4409 Info.offset = 0;
4410 Info.flags =
4412 Info.align.reset();
4413 return true;
4414 }
4415
4416 case Intrinsic::nvvm_ldu_global_i:
4417 case Intrinsic::nvvm_ldu_global_f:
4418 case Intrinsic::nvvm_ldu_global_p: {
4419 Info.opc = ISD::INTRINSIC_W_CHAIN;
4420 Info.memVT = getValueType(I.getDataLayout(), I.getType());
4421 Info.ptrVal = I.getArgOperand(0);
4422 Info.offset = 0;
4423 Info.flags = MachineMemOperand::MOLoad;
4424 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4425
4426 return true;
4427 }
4428 case Intrinsic::nvvm_tex_1d_v4f32_s32:
4429 case Intrinsic::nvvm_tex_1d_v4f32_f32:
4430 case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
4431 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
4432 case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
4433 case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
4434 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
4435 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
4436 case Intrinsic::nvvm_tex_2d_v4f32_s32:
4437 case Intrinsic::nvvm_tex_2d_v4f32_f32:
4438 case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
4439 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
4440 case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
4441 case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
4442 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
4443 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
4444 case Intrinsic::nvvm_tex_3d_v4f32_s32:
4445 case Intrinsic::nvvm_tex_3d_v4f32_f32:
4446 case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
4447 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
4448 case Intrinsic::nvvm_tex_cube_v4f32_f32:
4449 case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
4450 case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
4451 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
4452 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
4453 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
4454 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
4455 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
4456 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
4457 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
4458 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
4459 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
4460 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
4461 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
4462 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
4463 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
4464 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
4465 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
4466 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
4467 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
4468 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
4469 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
4470 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
4471 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
4472 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
4473 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
4474 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
4475 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
4476 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
4477 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
4478 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
4479 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
4480 case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
4481 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
4482 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
4483 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
4484 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
4485 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
4486 Info.opc = ISD::INTRINSIC_W_CHAIN;
4487 Info.memVT = MVT::v4f32;
4488 Info.ptrVal = nullptr;
4489 Info.offset = 0;
4490 Info.flags = MachineMemOperand::MOLoad;
4491 Info.align = Align(16);
4492 return true;
4493
4494 case Intrinsic::nvvm_tex_1d_v4s32_s32:
4495 case Intrinsic::nvvm_tex_1d_v4s32_f32:
4496 case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
4497 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
4498 case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
4499 case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
4500 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
4501 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
4502 case Intrinsic::nvvm_tex_2d_v4s32_s32:
4503 case Intrinsic::nvvm_tex_2d_v4s32_f32:
4504 case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
4505 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
4506 case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
4507 case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
4508 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
4509 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
4510 case Intrinsic::nvvm_tex_3d_v4s32_s32:
4511 case Intrinsic::nvvm_tex_3d_v4s32_f32:
4512 case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
4513 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
4514 case Intrinsic::nvvm_tex_cube_v4s32_f32:
4515 case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
4516 case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
4517 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
4518 case Intrinsic::nvvm_tex_cube_v4u32_f32:
4519 case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
4520 case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
4521 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
4522 case Intrinsic::nvvm_tex_1d_v4u32_s32:
4523 case Intrinsic::nvvm_tex_1d_v4u32_f32:
4524 case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
4525 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
4526 case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
4527 case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
4528 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
4529 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
4530 case Intrinsic::nvvm_tex_2d_v4u32_s32:
4531 case Intrinsic::nvvm_tex_2d_v4u32_f32:
4532 case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
4533 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
4534 case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
4535 case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
4536 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
4537 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
4538 case Intrinsic::nvvm_tex_3d_v4u32_s32:
4539 case Intrinsic::nvvm_tex_3d_v4u32_f32:
4540 case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
4541 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
4542 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
4543 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
4544 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
4545 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
4546 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
4547 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
4548 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
4549 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
4550 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
4551 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
4552 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
4553 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
4554 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
4555 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
4556 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
4557 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
4558 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
4559 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
4560 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
4561 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
4562 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
4563 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
4564 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
4565 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
4566 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
4567 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
4568 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
4569 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
4570 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
4571 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
4572 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
4573 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
4574 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
4575 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
4576 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
4577 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
4578 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
4579 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
4580 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
4581 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
4582 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
4583 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
4584 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
4585 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
4586 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
4587 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
4588 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
4589 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
4590 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
4591 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
4592 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
4593 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
4594 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
4595 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
4596 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
4597 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
4598 case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
4599 case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
4600 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
4601 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
4602 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
4603 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
4604 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
4605 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
4606 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
4607 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
4608 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
4609 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
4610 Info.opc = ISD::INTRINSIC_W_CHAIN;
4611 Info.memVT = MVT::v4i32;
4612 Info.ptrVal = nullptr;
4613 Info.offset = 0;
4614 Info.flags = MachineMemOperand::MOLoad;
4615 Info.align = Align(16);
4616 return true;
4617
4618 case Intrinsic::nvvm_suld_1d_i8_clamp:
4619 case Intrinsic::nvvm_suld_1d_v2i8_clamp:
4620 case Intrinsic::nvvm_suld_1d_v4i8_clamp:
4621 case Intrinsic::nvvm_suld_1d_array_i8_clamp:
4622 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
4623 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
4624 case Intrinsic::nvvm_suld_2d_i8_clamp:
4625 case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4626 case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4627 case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4628 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4629 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4630 case Intrinsic::nvvm_suld_3d_i8_clamp:
4631 case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4632 case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4633 case Intrinsic::nvvm_suld_1d_i8_trap:
4634 case Intrinsic::nvvm_suld_1d_v2i8_trap:
4635 case Intrinsic::nvvm_suld_1d_v4i8_trap:
4636 case Intrinsic::nvvm_suld_1d_array_i8_trap:
4637 case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4638 case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4639 case Intrinsic::nvvm_suld_2d_i8_trap:
4640 case Intrinsic::nvvm_suld_2d_v2i8_trap:
4641 case Intrinsic::nvvm_suld_2d_v4i8_trap:
4642 case Intrinsic::nvvm_suld_2d_array_i8_trap:
4643 case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4644 case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4645 case Intrinsic::nvvm_suld_3d_i8_trap:
4646 case Intrinsic::nvvm_suld_3d_v2i8_trap:
4647 case Intrinsic::nvvm_suld_3d_v4i8_trap:
4648 case Intrinsic::nvvm_suld_1d_i8_zero:
4649 case Intrinsic::nvvm_suld_1d_v2i8_zero:
4650 case Intrinsic::nvvm_suld_1d_v4i8_zero:
4651 case Intrinsic::nvvm_suld_1d_array_i8_zero:
4652 case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4653 case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4654 case Intrinsic::nvvm_suld_2d_i8_zero:
4655 case Intrinsic::nvvm_suld_2d_v2i8_zero:
4656 case Intrinsic::nvvm_suld_2d_v4i8_zero:
4657 case Intrinsic::nvvm_suld_2d_array_i8_zero:
4658 case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4659 case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4660 case Intrinsic::nvvm_suld_3d_i8_zero:
4661 case Intrinsic::nvvm_suld_3d_v2i8_zero:
4662 case Intrinsic::nvvm_suld_3d_v4i8_zero:
4663 Info.opc = ISD::INTRINSIC_W_CHAIN;
4664 Info.memVT = MVT::i8;
4665 Info.ptrVal = nullptr;
4666 Info.offset = 0;
4667 Info.flags = MachineMemOperand::MOLoad;
4668 Info.align = Align(16);
4669 return true;
4670
4671 case Intrinsic::nvvm_suld_1d_i16_clamp:
4672 case Intrinsic::nvvm_suld_1d_v2i16_clamp:
4673 case Intrinsic::nvvm_suld_1d_v4i16_clamp:
4674 case Intrinsic::nvvm_suld_1d_array_i16_clamp:
4675 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
4676 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
4677 case Intrinsic::nvvm_suld_2d_i16_clamp:
4678 case Intrinsic::nvvm_suld_2d_v2i16_clamp:
4679 case Intrinsic::nvvm_suld_2d_v4i16_clamp:
4680 case Intrinsic::nvvm_suld_2d_array_i16_clamp:
4681 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
4682 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
4683 case Intrinsic::nvvm_suld_3d_i16_clamp:
4684 case Intrinsic::nvvm_suld_3d_v2i16_clamp:
4685 case Intrinsic::nvvm_suld_3d_v4i16_clamp:
4686 case Intrinsic::nvvm_suld_1d_i16_trap:
4687 case Intrinsic::nvvm_suld_1d_v2i16_trap:
4688 case Intrinsic::nvvm_suld_1d_v4i16_trap:
4689 case Intrinsic::nvvm_suld_1d_array_i16_trap:
4690 case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
4691 case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
4692 case Intrinsic::nvvm_suld_2d_i16_trap:
4693 case Intrinsic::nvvm_suld_2d_v2i16_trap:
4694 case Intrinsic::nvvm_suld_2d_v4i16_trap:
4695 case Intrinsic::nvvm_suld_2d_array_i16_trap:
4696 case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4697 case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4698 case Intrinsic::nvvm_suld_3d_i16_trap:
4699 case Intrinsic::nvvm_suld_3d_v2i16_trap:
4700 case Intrinsic::nvvm_suld_3d_v4i16_trap:
4701 case Intrinsic::nvvm_suld_1d_i16_zero:
4702 case Intrinsic::nvvm_suld_1d_v2i16_zero:
4703 case Intrinsic::nvvm_suld_1d_v4i16_zero:
4704 case Intrinsic::nvvm_suld_1d_array_i16_zero:
4705 case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4706 case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4707 case Intrinsic::nvvm_suld_2d_i16_zero:
4708 case Intrinsic::nvvm_suld_2d_v2i16_zero:
4709 case Intrinsic::nvvm_suld_2d_v4i16_zero:
4710 case Intrinsic::nvvm_suld_2d_array_i16_zero:
4711 case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4712 case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4713 case Intrinsic::nvvm_suld_3d_i16_zero:
4714 case Intrinsic::nvvm_suld_3d_v2i16_zero:
4715 case Intrinsic::nvvm_suld_3d_v4i16_zero:
4716 Info.opc = ISD::INTRINSIC_W_CHAIN;
4717 Info.memVT = MVT::i16;
4718 Info.ptrVal = nullptr;
4719 Info.offset = 0;
4720 Info.flags = MachineMemOperand::MOLoad;
4721 Info.align = Align(16);
4722 return true;
4723
4724 case Intrinsic::nvvm_suld_1d_i32_clamp:
4725 case Intrinsic::nvvm_suld_1d_v2i32_clamp:
4726 case Intrinsic::nvvm_suld_1d_v4i32_clamp:
4727 case Intrinsic::nvvm_suld_1d_array_i32_clamp:
4728 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
4729 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
4730 case Intrinsic::nvvm_suld_2d_i32_clamp:
4731 case Intrinsic::nvvm_suld_2d_v2i32_clamp:
4732 case Intrinsic::nvvm_suld_2d_v4i32_clamp:
4733 case Intrinsic::nvvm_suld_2d_array_i32_clamp:
4734 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
4735 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
4736 case Intrinsic::nvvm_suld_3d_i32_clamp:
4737 case Intrinsic::nvvm_suld_3d_v2i32_clamp:
4738 case Intrinsic::nvvm_suld_3d_v4i32_clamp:
4739 case Intrinsic::nvvm_suld_1d_i32_trap:
4740 case Intrinsic::nvvm_suld_1d_v2i32_trap:
4741 case Intrinsic::nvvm_suld_1d_v4i32_trap:
4742 case Intrinsic::nvvm_suld_1d_array_i32_trap:
4743 case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
4744 case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
4745 case Intrinsic::nvvm_suld_2d_i32_trap:
4746 case Intrinsic::nvvm_suld_2d_v2i32_trap:
4747 case Intrinsic::nvvm_suld_2d_v4i32_trap:
4748 case Intrinsic::nvvm_suld_2d_array_i32_trap:
4749 case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
4750 case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
4751 case Intrinsic::nvvm_suld_3d_i32_trap:
4752 case Intrinsic::nvvm_suld_3d_v2i32_trap:
4753 case Intrinsic::nvvm_suld_3d_v4i32_trap:
4754 case Intrinsic::nvvm_suld_1d_i32_zero:
4755 case Intrinsic::nvvm_suld_1d_v2i32_zero:
4756 case Intrinsic::nvvm_suld_1d_v4i32_zero:
4757 case Intrinsic::nvvm_suld_1d_array_i32_zero:
4758 case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
4759 case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
4760 case Intrinsic::nvvm_suld_2d_i32_zero:
4761 case Intrinsic::nvvm_suld_2d_v2i32_zero:
4762 case Intrinsic::nvvm_suld_2d_v4i32_zero:
4763 case Intrinsic::nvvm_suld_2d_array_i32_zero:
4764 case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4765 case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4766 case Intrinsic::nvvm_suld_3d_i32_zero:
4767 case Intrinsic::nvvm_suld_3d_v2i32_zero:
4768 case Intrinsic::nvvm_suld_3d_v4i32_zero:
4769 Info.opc = ISD::INTRINSIC_W_CHAIN;
4770 Info.memVT = MVT::i32;
4771 Info.ptrVal = nullptr;
4772 Info.offset = 0;
4773 Info.flags = MachineMemOperand::MOLoad;
4774 Info.align = Align(16);
4775 return true;
4776
4777 case Intrinsic::nvvm_suld_1d_i64_clamp:
4778 case Intrinsic::nvvm_suld_1d_v2i64_clamp:
4779 case Intrinsic::nvvm_suld_1d_array_i64_clamp:
4780 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
4781 case Intrinsic::nvvm_suld_2d_i64_clamp:
4782 case Intrinsic::nvvm_suld_2d_v2i64_clamp:
4783 case Intrinsic::nvvm_suld_2d_array_i64_clamp:
4784 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
4785 case Intrinsic::nvvm_suld_3d_i64_clamp:
4786 case Intrinsic::nvvm_suld_3d_v2i64_clamp:
4787 case Intrinsic::nvvm_suld_1d_i64_trap:
4788 case Intrinsic::nvvm_suld_1d_v2i64_trap:
4789 case Intrinsic::nvvm_suld_1d_array_i64_trap:
4790 case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
4791 case Intrinsic::nvvm_suld_2d_i64_trap:
4792 case Intrinsic::nvvm_suld_2d_v2i64_trap:
4793 case Intrinsic::nvvm_suld_2d_array_i64_trap:
4794 case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
4795 case Intrinsic::nvvm_suld_3d_i64_trap:
4796 case Intrinsic::nvvm_suld_3d_v2i64_trap:
4797 case Intrinsic::nvvm_suld_1d_i64_zero:
4798 case Intrinsic::nvvm_suld_1d_v2i64_zero:
4799 case Intrinsic::nvvm_suld_1d_array_i64_zero:
4800 case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
4801 case Intrinsic::nvvm_suld_2d_i64_zero:
4802 case Intrinsic::nvvm_suld_2d_v2i64_zero:
4803 case Intrinsic::nvvm_suld_2d_array_i64_zero:
4804 case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
4805 case Intrinsic::nvvm_suld_3d_i64_zero:
4806 case Intrinsic::nvvm_suld_3d_v2i64_zero:
4807 Info.opc = ISD::INTRINSIC_W_CHAIN;
4808 Info.memVT = MVT::i64;
4809 Info.ptrVal = nullptr;
4810 Info.offset = 0;
4811 Info.flags = MachineMemOperand::MOLoad;
4812 Info.align = Align(16);
4813 return true;
4814
4815 case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
4816 case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
4817 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1: {
4818 Info.opc = ISD::INTRINSIC_W_CHAIN;
4819 Info.memVT = MVT::v1i32;
4820 Info.ptrVal = I.getArgOperand(0);
4821 Info.offset = 0;
4822 Info.flags = MachineMemOperand::MOLoad;
4823 Info.align.reset();
4824 return true;
4825 }
4826
4827 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
4828 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
4829 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
4830 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2: {
4831 Info.opc = ISD::INTRINSIC_W_CHAIN;
4832 Info.memVT = MVT::v2i32;
4833 Info.ptrVal = I.getArgOperand(0);
4834 Info.offset = 0;
4835 Info.flags = MachineMemOperand::MOLoad;
4836 Info.align.reset();
4837 return true;
4838 }
4839
4840 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
4841 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
4842 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
4843 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
4844 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4: {
4845 Info.opc = ISD::INTRINSIC_W_CHAIN;
4846 Info.memVT = MVT::v4i32;
4847 Info.ptrVal = I.getArgOperand(0);
4848 Info.offset = 0;
4849 Info.flags = MachineMemOperand::MOLoad;
4850 Info.align.reset();
4851 return true;
4852 }
4853
4854 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
4855 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
4856 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
4857 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
4858 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8: {
4859 Info.opc = ISD::INTRINSIC_W_CHAIN;
4860 Info.memVT = MVT::v8i32;
4861 Info.ptrVal = I.getArgOperand(0);
4862 Info.offset = 0;
4863 Info.flags = MachineMemOperand::MOLoad;
4864 Info.align.reset();
4865 return true;
4866 }
4867
4868 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
4869 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
4870 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
4871 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
4872 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16: {
4873 Info.opc = ISD::INTRINSIC_W_CHAIN;
4874 Info.memVT = MVT::v16i32;
4875 Info.ptrVal = I.getArgOperand(0);
4876 Info.offset = 0;
4877 Info.flags = MachineMemOperand::MOLoad;
4878 Info.align.reset();
4879 return true;
4880 }
4881
4882 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
4883 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
4884 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
4885 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
4886 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32: {
4887 Info.opc = ISD::INTRINSIC_W_CHAIN;
4888 Info.memVT = MVT::v32i32;
4889 Info.ptrVal = I.getArgOperand(0);
4890 Info.offset = 0;
4891 Info.flags = MachineMemOperand::MOLoad;
4892 Info.align.reset();
4893 return true;
4894 }
4895
4896 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
4897 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
4898 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
4899 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
4900 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64: {
4901 Info.opc = ISD::INTRINSIC_W_CHAIN;
4902 Info.memVT = MVT::v64i32;
4903 Info.ptrVal = I.getArgOperand(0);
4904 Info.offset = 0;
4905 Info.flags = MachineMemOperand::MOLoad;
4906 Info.align.reset();
4907 return true;
4908 }
4909
4910 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
4911 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
4912 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
4913 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
4914 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128: {
4915 Info.opc = ISD::INTRINSIC_W_CHAIN;
4916 Info.memVT = MVT::v128i32;
4917 Info.ptrVal = I.getArgOperand(0);
4918 Info.offset = 0;
4919 Info.flags = MachineMemOperand::MOLoad;
4920 Info.align.reset();
4921 return true;
4922 }
4923
4924 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
4925 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
4926 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1: {
4927 Info.opc = ISD::INTRINSIC_VOID;
4928 Info.memVT = MVT::i32;
4929 Info.ptrVal = I.getArgOperand(0);
4930 Info.offset = 0;
4931 Info.flags = MachineMemOperand::MOStore;
4932 Info.align.reset();
4933 return true;
4934 }
4935
4936 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
4937 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
4938 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
4939 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2: {
4940 Info.opc = ISD::INTRINSIC_VOID;
4941 Info.memVT = MVT::v2i32;
4942 Info.ptrVal = I.getArgOperand(0);
4943 Info.offset = 0;
4944 Info.flags = MachineMemOperand::MOStore;
4945 Info.align.reset();
4946 return true;
4947 }
4948
4949 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
4950 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
4951 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
4952 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
4953 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4: {
4954 Info.opc = ISD::INTRINSIC_VOID;
4955 Info.memVT = MVT::v4i32;
4956 Info.ptrVal = I.getArgOperand(0);
4957 Info.offset = 0;
4958 Info.flags = MachineMemOperand::MOStore;
4959 Info.align.reset();
4960 return true;
4961 }
4962
4963 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
4964 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
4965 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
4966 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
4967 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8: {
4968 Info.opc = ISD::INTRINSIC_VOID;
4969 Info.memVT = MVT::v8i32;
4970 Info.ptrVal = I.getArgOperand(0);
4971 Info.offset = 0;
4972 Info.flags = MachineMemOperand::MOStore;
4973 Info.align.reset();
4974 return true;
4975 }
4976
4977 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
4978 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
4979 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
4980 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
4981 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16: {
4982 Info.opc = ISD::INTRINSIC_VOID;
4983 Info.memVT = MVT::v16i32;
4984 Info.ptrVal = I.getArgOperand(0);
4985 Info.offset = 0;
4986 Info.flags = MachineMemOperand::MOStore;
4987 Info.align.reset();
4988 return true;
4989 }
4990
4991 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
4992 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
4993 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
4994 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
4995 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32: {
4996 Info.opc = ISD::INTRINSIC_VOID;
4997 Info.memVT = MVT::v32i32;
4998 Info.ptrVal = I.getArgOperand(0);
4999 Info.offset = 0;
5000 Info.flags = MachineMemOperand::MOStore;
5001 Info.align.reset();
5002 return true;
5003 }
5004
5005 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
5006 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
5007 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
5008 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
5009 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64: {
5010 Info.opc = ISD::INTRINSIC_VOID;
5011 Info.memVT = MVT::v64i32;
5012 Info.ptrVal = I.getArgOperand(0);
5013 Info.offset = 0;
5014 Info.flags = MachineMemOperand::MOStore;
5015 Info.align.reset();
5016 return true;
5017 }
5018
5019 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
5020 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
5021 case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
5022 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
5023 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128: {
5024 Info.opc = ISD::INTRINSIC_VOID;
5025 Info.memVT = MVT::v128i32;
5026 Info.ptrVal = I.getArgOperand(0);
5027 Info.offset = 0;
5028 Info.flags = MachineMemOperand::MOStore;
5029 Info.align.reset();
5030 return true;
5031 }
5032 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
5033 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1:
5034 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1:
5035 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1:
5036 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1:
5037 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1:
5038 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift:
5039 case Intrinsic::
5040 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift:
5041 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1:
5042 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1:
5043 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift:
5044 case Intrinsic::
5045 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift: {
5046 // We are reading and writing back to TMem
5047 Info.opc = ISD::INTRINSIC_VOID;
5048 Info.memVT = MVT::v4i32;
5049 Info.ptrVal = I.getArgOperand(0);
5050 Info.offset = 0;
5051 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
5052 Info.align = Align(16);
5053 return true;
5054 }
5055
5056 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2:
5057 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2:
5058 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2:
5059 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2:
5060 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2:
5061 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2:
5062 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2:
5063 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2:
5064 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift:
5065 case Intrinsic::
5066 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift:
5067 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift:
5068 case Intrinsic::
5069 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift: {
5070 // We are reading and writing back to TMem
5071 Info.opc = ISD::INTRINSIC_VOID;
5072 Info.memVT = MVT::v8i32;
5073 Info.ptrVal = I.getArgOperand(0);
5074 Info.offset = 0;
5075 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
5076 Info.align = Align(16);
5077 return true;
5078 }
5079 }
5080 return false;
5081}
5082
5083/// getFunctionParamOptimizedAlign - since function arguments are passed via
5084/// .param space, we may want to increase their alignment in a way that
5085/// ensures that we can effectively vectorize their loads & stores. We can
5086/// increase alignment only if the function has internal or private
5087/// linkage, as for other linkage types callers may already rely on the default
5088/// alignment. To allow using 128-bit vectorized loads/stores, this function
5089/// ensures that alignment is 16 or greater.
5090 Align NVPTXTargetLowering::getFunctionParamOptimizedAlign(
5091     const Function *F, Type *ArgTy, const DataLayout &DL) const {
5092 // Capping the alignment to 128 bytes as that is the maximum alignment
5093 // supported by PTX.
5094 const Align ABITypeAlign = std::min(Align(128), DL.getABITypeAlign(ArgTy));
5095
5096 // If a function has linkage different from internal or private, we
5097 // must use default ABI alignment as external users rely on it. Same
5098 // for a function that may be called from a function pointer.
5099 if (!F || !F->hasLocalLinkage() ||
5100 F->hasAddressTaken(/*Users=*/nullptr,
5101 /*IgnoreCallbackUses=*/false,
5102 /*IgnoreAssumeLikeCalls=*/true,
5103 /*IgnoreLLVMUsed=*/true))
5104 return ABITypeAlign;
5105
5106 assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage");
5107 return std::max(Align(16), ABITypeAlign);
5108}
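// Illustrative sketch (hypothetical helper, not part of this file): the
// effective .param alignment rule implemented above, restated over plain
// integers. Assumes abiAlign is a power of two.
//
//   #include <algorithm>
//   #include <cstdint>
//   static uint64_t exampleParamAlign(uint64_t abiAlign, bool localOnly) {
//     const uint64_t Capped = std::min<uint64_t>(128, abiAlign); // PTX caps alignment at 128
//     return localOnly ? std::max<uint64_t>(16, Capped)          // allow 128-bit vector ld/st
//                      : Capped;                                 // external callers expect ABI align
//   }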
5109
5110/// Helper for computing alignment of a device function byval parameter.
5111 Align NVPTXTargetLowering::getFunctionByValParamAlign(
5112     const Function *F, Type *ArgTy, Align InitialAlign,
5113 const DataLayout &DL) const {
5114 Align ArgAlign = InitialAlign;
5115 // Try to increase alignment to enhance vectorization options.
5116 if (F)
5117 ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(F, ArgTy, DL));
5118
5119 // Old ptxas versions have a bug. When PTX code takes the address of a
5120 // byval parameter with alignment < 4, ptxas generates code to
5121 // spill the argument into memory. Alas, on sm_50+ ptxas generates
5122 // SASS code that fails with a misaligned access. To work around
5123 // the problem, make sure that we align byval parameters by at
5124 // least 4. This bug appears to be fixed in ptxas versions
5125 // newer than 9.0.
5126 // TODO: remove this after verifying the bug is not reproduced
5127 // on non-deprecated ptxas versions.
5128 if (ForceMinByValParamAlign)
5129 ArgAlign = std::max(ArgAlign, Align(4));
5130
5131 return ArgAlign;
5132}
5133
5134// Helper for getting a function parameter name. Name is composed from
5135// its index and the function name. Negative index corresponds to special
5136// parameter (unsized array) used for passing variable arguments.
5137 std::string NVPTXTargetLowering::getParamName(const Function *F,
5138                                               int Idx) const {
5139 std::string ParamName;
5140 raw_string_ostream ParamStr(ParamName);
5141
5142 ParamStr << getTargetMachine().getSymbol(F)->getName();
5143 if (Idx < 0)
5144 ParamStr << "_vararg";
5145 else
5146 ParamStr << "_param_" << Idx;
5147
5148 return ParamName;
5149}
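// Illustrative sketch (hypothetical helper, not part of this file): the same
// naming scheme over plain strings, assuming the mangled symbol name is
// already known; "foo" is only an example symbol.
//
//   #include <string>
//   static std::string exampleParamName(const std::string &Sym, int Idx) {
//     return Idx < 0 ? Sym + "_vararg" : Sym + "_param_" + std::to_string(Idx);
//   }
//   // exampleParamName("foo", 0)  == "foo_param_0"
//   // exampleParamName("foo", -1) == "foo_vararg"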
5150
5151/// isLegalAddressingMode - Return true if the addressing mode represented
5152/// by AM is legal for this target, for a load/store of the specified type.
5153/// Used to guide target specific optimizations, like loop strength reduction
5154/// (LoopStrengthReduce.cpp) and memory optimization for address mode
5155/// (CodeGenPrepare.cpp)
5156 bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
5157                                                 const AddrMode &AM, Type *Ty,
5158 unsigned AS, Instruction *I) const {
5159 // AddrMode - This represents an addressing mode of:
5160 // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
5161 //
5162 // The legal address modes are
5163 // - [avar]
5164 // - [areg]
5165 // - [areg+immoff]
5166 // - [immAddr]
5167
5168 // immoff must fit in a signed 32-bit int
5169 if (!APInt(64, AM.BaseOffs).isSignedIntN(32))
5170 return false;
5171
5172 if (AM.BaseGV)
5173 return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
5174
5175 switch (AM.Scale) {
5176 case 0: // "r", "r+i" or "i" is allowed
5177 break;
5178 case 1:
5179 if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
5180 return false;
5181 // Otherwise we have r+i.
5182 break;
5183 default:
5184 // No scale > 1 is allowed
5185 return false;
5186 }
5187 return true;
5188}
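// Illustrative sketch (hypothetical names, not part of this file): the same
// legality check over a plain struct, mirroring the [avar], [areg],
// [areg+immoff] and [immAddr] forms accepted above.
//
//   #include <cstdint>
//   struct ExampleAddrMode {
//     bool HasBaseGV, HasBaseReg;
//     int64_t BaseOffs;
//     int64_t Scale;
//   };
//   static bool exampleIsLegalNVPTXAddr(const ExampleAddrMode &AM) {
//     if (AM.BaseOffs < INT32_MIN || AM.BaseOffs > INT32_MAX)
//       return false;                                        // immoff must fit in a signed i32
//     if (AM.HasBaseGV)
//       return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;   // [avar] only
//     if (AM.Scale == 0)
//       return true;                                          // [areg], [areg+immoff], [immAddr]
//     return AM.Scale == 1 && !AM.HasBaseReg;                 // the scaled register serves as the base
//   }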
5189
5190//===----------------------------------------------------------------------===//
5191// NVPTX Inline Assembly Support
5192//===----------------------------------------------------------------------===//
5193
5194/// getConstraintType - Given a constraint letter, return the type of
5195/// constraint it is for this target.
5196 NVPTXTargetLowering::ConstraintType
5197 NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
5198 if (Constraint.size() == 1) {
5199 switch (Constraint[0]) {
5200 default:
5201 break;
5202 case 'b':
5203 case 'r':
5204 case 'h':
5205 case 'c':
5206 case 'l':
5207 case 'f':
5208 case 'd':
5209 case 'q':
5210 case '0':
5211 case 'N':
5212 return C_RegisterClass;
5213 }
5214 }
5215 return TargetLowering::getConstraintType(Constraint);
5216}
5217
5218std::pair<unsigned, const TargetRegisterClass *>
5219 NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
5220                                                   StringRef Constraint,
5221 MVT VT) const {
5222 if (Constraint.size() == 1) {
5223 switch (Constraint[0]) {
5224 case 'b':
5225 return std::make_pair(0U, &NVPTX::B1RegClass);
5226 case 'c':
5227 case 'h':
5228 return std::make_pair(0U, &NVPTX::B16RegClass);
5229 case 'r':
5230 case 'f':
5231 return std::make_pair(0U, &NVPTX::B32RegClass);
5232 case 'l':
5233 case 'N':
5234 case 'd':
5235 return std::make_pair(0U, &NVPTX::B64RegClass);
5236 case 'q': {
5237 if (STI.getSmVersion() < 70)
5238 report_fatal_error("Inline asm with 128 bit operands is only "
5239 "supported for sm_70 and higher!");
5240 return std::make_pair(0U, &NVPTX::B128RegClass);
5241 }
5242 }
5243 }
5244 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
5245}
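// Illustrative sketch (not part of this file): how these constraint letters
// typically appear in source-level inline PTX when compiling for the NVPTX
// target; "r" binds a 32-bit register and "l" a 64-bit register. The function
// name is hypothetical.
//
//   static int exampleAddViaInlinePTX(int A, int B) {
//     int R;
//     asm("add.s32 %0, %1, %2;" : "=r"(R) : "r"(A), "r"(B));
//     return R;
//   }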
5246
5247//===----------------------------------------------------------------------===//
5248// NVPTX DAG Combining
5249//===----------------------------------------------------------------------===//
5250
5251 bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
5252                                    CodeGenOptLevel OptLevel) const {
5253 // Always honor command-line argument
5254 if (FMAContractLevelOpt.getNumOccurrences() > 0)
5255 return FMAContractLevelOpt > 0;
5256
5257 // Do not contract if we're not optimizing the code.
5258 if (OptLevel == CodeGenOptLevel::None)
5259 return false;
5260
5261 // Honor TargetOptions flags that explicitly say fusion is okay.
5262 if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
5263 return true;
5264
5265 return false;
5266}
5267
5268static bool isConstZero(const SDValue &Operand) {
5269 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
5270 return Const && Const->getZExtValue() == 0;
5271}
5272
5273/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
5274/// operands N0 and N1. This is a helper for PerformADDCombine that is
5275/// called with the default operands, and if that fails, with commuted
5276/// operands.
5277static SDValue
5278 PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
5279                               TargetLowering::DAGCombinerInfo &DCI) {
5280 EVT VT = N0.getValueType();
5281
5282 // Since integer multiply-add costs the same as integer multiply
5283 // but is more costly than integer add, do the fusion only when
5284 // the mul is only used in the add.
5285 // TODO: this may not be true for later architectures, consider relaxing this
5286 if (!N0.getNode()->hasOneUse())
5287 return SDValue();
5288
5289 // fold (add (select cond, 0, (mul a, b)), c)
5290 // -> (select cond, c, (add (mul a, b), c))
5291 //
5292 if (N0.getOpcode() == ISD::SELECT) {
5293 unsigned ZeroOpNum;
5294 if (isConstZero(N0->getOperand(1)))
5295 ZeroOpNum = 1;
5296 else if (isConstZero(N0->getOperand(2)))
5297 ZeroOpNum = 2;
5298 else
5299 return SDValue();
5300
5301 SDValue M = N0->getOperand((ZeroOpNum == 1) ? 2 : 1);
5302 if (M->getOpcode() != ISD::MUL || !M.getNode()->hasOneUse())
5303 return SDValue();
5304
5305 SDLoc DL(N);
5306 SDValue Mul =
5307 DCI.DAG.getNode(ISD::MUL, DL, VT, M->getOperand(0), M->getOperand(1));
5308 SDValue MAD = DCI.DAG.getNode(ISD::ADD, DL, VT, Mul, N1);
5309 return DCI.DAG.getSelect(SDLoc(N), VT, N0->getOperand(0),
5310 ((ZeroOpNum == 1) ? N1 : MAD),
5311 ((ZeroOpNum == 1) ? MAD : N1));
5312 }
5313
5314 return SDValue();
5315}
5316
5317static SDValue
5318 PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
5319                                TargetLowering::DAGCombinerInfo &DCI,
5320                                CodeGenOptLevel OptLevel) {
5321 EVT VT = N0.getValueType();
5322 if (N0.getOpcode() == ISD::FMUL) {
5323 const auto *TLI = static_cast<const NVPTXTargetLowering *>(
5324 &DCI.DAG.getTargetLoweringInfo());
5325 if (!(TLI->allowFMA(DCI.DAG.getMachineFunction(), OptLevel) ||
5326 (N->getFlags().hasAllowContract() &&
5327 N0->getFlags().hasAllowContract())))
5328 return SDValue();
5329
5330 // For floating point:
5331 // Do the fusion only when the mul has fewer than 5 uses and all
5332 // of them are adds.
5333 // The heuristic is that if a use is not an add, then that use
5334 // cannot be fused into an fma, so the mul is still needed anyway.
5335 // If there are more than 4 uses, even if they are all adds, fusing
5336 // them will increase register pressure.
5337 //
5338 int numUses = 0;
5339 int nonAddCount = 0;
5340 for (const SDNode *User : N0.getNode()->users()) {
5341 numUses++;
5342 if (User->getOpcode() != ISD::FADD)
5343 ++nonAddCount;
5344 if (numUses >= 5)
5345 return SDValue();
5346 }
5347 if (nonAddCount) {
5348 int orderNo = N->getIROrder();
5349 int orderNo2 = N0.getNode()->getIROrder();
5350 // Simple heuristic here for considering potential register
5351 // pressure: the logic is that the difference in IR ordering is used
5352 // to measure the distance between def and use; the longer the distance,
5353 // the more likely it is to cause register pressure.
5354 if (orderNo - orderNo2 < 500)
5355 return SDValue();
5356
5357 // Now, check if at least one of the FMUL's operands is live beyond the
5358 // node N, which guarantees that the FMA will not increase register
5359 // pressure at node N.
5360 bool opIsLive = false;
5361 const SDNode *left = N0.getOperand(0).getNode();
5362 const SDNode *right = N0.getOperand(1).getNode();
5363
5364 if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
5365 opIsLive = true;
5366
5367 if (!opIsLive)
5368 for (const SDNode *User : left->users()) {
5369 int orderNo3 = User->getIROrder();
5370 if (orderNo3 > orderNo) {
5371 opIsLive = true;
5372 break;
5373 }
5374 }
5375
5376 if (!opIsLive)
5377 for (const SDNode *User : right->users()) {
5378 int orderNo3 = User->getIROrder();
5379 if (orderNo3 > orderNo) {
5380 opIsLive = true;
5381 break;
5382 }
5383 }
5384
5385 if (!opIsLive)
5386 return SDValue();
5387 }
5388
5389 return DCI.DAG.getNode(ISD::FMA, SDLoc(N), VT, N0.getOperand(0),
5390 N0.getOperand(1), N1);
5391 }
5392
5393 return SDValue();
5394}
5395
5396/// Fold unpacking movs into a load by increasing the number of return values.
5397///
5398/// ex:
5399/// L: v2f16,ch = load <p>
5400/// a: f16 = extractelt L:0, 0
5401/// b: f16 = extractelt L:0, 1
5402/// use(a, b)
5403///
5404/// ...is turned into...
5405///
5406/// L: f16,f16,ch = LoadV2 <p>
5407/// use(L:0, L:1)
5408static SDValue
5409 combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
5410 // Don't run this optimization before the legalizer
5411 if (!DCI.isAfterLegalizeDAG())
5412 return SDValue();
5413
5414 EVT ElementVT = N->getValueType(0);
5415 // Avoid non-packed types and v4i8
5416 if (!NVPTX::isPackedVectorTy(ElementVT) || ElementVT == MVT::v4i8)
5417 return SDValue();
5418
5419 SmallVector<SDNode *> DeadCopyToRegs;
5420
5421 // Check whether all outputs are either used by an extractelt or are
5422 // glue/chain nodes
5423 if (!all_of(N->uses(), [&](SDUse &U) {
5424 // Skip glue, chain nodes
5425 if (U.getValueType() == MVT::Glue || U.getValueType() == MVT::Other)
5426 return true;
5427 if (U.getUser()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
5428 if (N->getOpcode() != ISD::LOAD)
5429 return true;
5430 // Since this is an ISD::LOAD, check all extractelts are used. If
5431 // any are not used, we don't want to defeat another optimization that
5432 // will narrow the load.
5433 //
5434 // For example:
5435 //
5436 // L: v2f16,ch = load <p>
5437 // e0: f16 = extractelt L:0, 0
5438 // e1: f16 = extractelt L:0, 1 <-- unused
5439 // store e0
5440 //
5441 // Can be optimized by DAGCombiner to:
5442 //
5443 // L: f16,ch = load <p>
5444 // store L:0
5445 return !U.getUser()->use_empty();
5446 }
5447
5448 // Otherwise, this use prevents us from splitting a value.
5449 return false;
5450 }))
5451 return SDValue();
5452
5453 auto *LD = cast<MemSDNode>(N);
5454 SDLoc DL(LD);
5455
5456 // the new opcode after we double the number of operands
5457 NVPTXISD::NodeType Opcode;
5458 SmallVector<SDValue> Operands(LD->ops());
5459 unsigned OldNumOutputs; // non-glue, non-chain outputs
5460 switch (LD->getOpcode()) {
5461 case ISD::LOAD:
5462 OldNumOutputs = 1;
5463 // Any packed type is legal, so the legalizer will not have lowered
5464 // ISD::LOAD -> NVPTXISD::Load (unless it's under-aligned). We have to do it
5465 // here.
5466 Opcode = NVPTXISD::LoadV2;
5467 Operands.push_back(DCI.DAG.getIntPtrConstant(
5468 cast<LoadSDNode>(LD)->getExtensionType(), DL));
5469 break;
5470 case NVPTXISD::LoadV2:
5471 OldNumOutputs = 2;
5472 Opcode = NVPTXISD::LoadV4;
5473 break;
5474 case NVPTXISD::LoadV4:
5475 // V8 is only supported for f32. Don't forget, we're not changing the load
5476 // size here. This is already a 256-bit load.
5477 if (ElementVT != MVT::v2f32)
5478 return SDValue();
5479 OldNumOutputs = 4;
5480 Opcode = NVPTXISD::LoadV8;
5481 break;
5482 case NVPTXISD::LoadV8:
5483 // PTX doesn't support the next doubling of outputs
5484 return SDValue();
5485 }
5486
5487 // the non-glue, non-chain outputs in the new load
5488 const unsigned NewNumOutputs = OldNumOutputs * 2;
5489 SmallVector<EVT> NewVTs(NewNumOutputs, ElementVT.getVectorElementType());
5490 // add remaining chain and glue values
5491 NewVTs.append(LD->value_begin() + OldNumOutputs, LD->value_end());
5492
5493 // Create the new load
5494 SDValue NewLoad = DCI.DAG.getMemIntrinsicNode(
5495 Opcode, DL, DCI.DAG.getVTList(NewVTs), Operands, LD->getMemoryVT(),
5496 LD->getMemOperand());
5497
5498 // Now we use a combination of BUILD_VECTORs and a MERGE_VALUES node to keep
5499 // the outputs the same. These nodes will be optimized away in later
5500 // DAGCombiner iterations.
5501 SmallVector<SDValue> Results;
5502 for (unsigned I : seq(OldNumOutputs))
5503 Results.push_back(DCI.DAG.getBuildVector(
5504 ElementVT, DL, {NewLoad.getValue(I * 2), NewLoad.getValue(I * 2 + 1)}));
5505 // Add remaining chain and glue nodes
5506 for (unsigned I : seq(NewLoad->getNumValues() - NewNumOutputs))
5507 Results.push_back(NewLoad.getValue(NewNumOutputs + I));
5508
5509 return DCI.DAG.getMergeValues(Results, DL);
5510}
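// Illustrative sketch (hypothetical helper, not part of this file): at the
// register level, the "unpacking movs" this combine removes amount to
// splitting one packed 32-bit value into its two 16-bit lanes.
//
//   #include <cstdint>
//   static void exampleUnpackV2x16(uint32_t Packed, uint16_t &Lo, uint16_t &Hi) {
//     Lo = static_cast<uint16_t>(Packed & 0xFFFFu); // element 0
//     Hi = static_cast<uint16_t>(Packed >> 16);     // element 1
//   }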
5511
5512/// Fold packing movs into a store.
5513///
5514/// ex:
5515/// v1: v2f16 = BUILD_VECTOR a:f16, b:f16
5516/// v2: v2f16 = BUILD_VECTOR c:f16, d:f16
5517/// StoreV2 v1, v2
5518///
5519/// ...is turned into...
5520///
5521/// StoreV4 a, b, c, d
5522 static SDValue combinePackingMovIntoStore(SDNode *N,
5523                                           TargetLowering::DAGCombinerInfo &DCI,
5524                                           unsigned Front, unsigned Back) {
5525 // We want to run this as late as possible since other optimizations may
5526 // eliminate the BUILD_VECTORs.
5527 if (!DCI.isAfterLegalizeDAG())
5528 return SDValue();
5529
5530 // Get the type of the operands being stored.
5531 EVT ElementVT = N->getOperand(Front).getValueType();
5532
5533 // Avoid non-packed types and v4i8
5534 if (!NVPTX::isPackedVectorTy(ElementVT) || ElementVT == MVT::v4i8)
5535 return SDValue();
5536
5537 auto *ST = cast<MemSDNode>(N);
5538
5539 // The new opcode after we double the number of operands.
5540 NVPTXISD::NodeType Opcode;
5541 switch (N->getOpcode()) {
5542 case ISD::STORE:
5543 // Any packed type is legal, so the legalizer will not have lowered
5544 // ISD::STORE -> NVPTXISD::Store (unless it's under-aligned). We have to do
5545 // it here.
5546 Opcode = NVPTXISD::StoreV2;
5547 break;
5548 case NVPTXISD::StoreV2:
5549 Opcode = NVPTXISD::StoreV4;
5550 break;
5551 case NVPTXISD::StoreV4:
5552 // V8 is only supported for f32. Don't forget, we're not changing the store
5553 // size here. This is already a 256-bit store.
5554 if (ElementVT != MVT::v2f32)
5555 return SDValue();
5556 Opcode = NVPTXISD::StoreV8;
5557 break;
5558 case NVPTXISD::StoreV8:
5559 // PTX doesn't support the next doubling of operands
5560 return SDValue();
5561 default:
5562 llvm_unreachable("Unhandled store opcode");
5563 }
5564
5565 // Scan the operands and if they're all BUILD_VECTORs, we'll have gathered
5566 // their elements.
5567 SmallVector<SDValue, 4> Operands(N->ops().take_front(Front));
5568 for (SDValue BV : N->ops().drop_front(Front).drop_back(Back)) {
5569 if (BV.getOpcode() != ISD::BUILD_VECTOR)
5570 return SDValue();
5571
5572 // If the operand has multiple uses, this optimization can increase register
5573 // pressure.
5574 if (!BV.hasOneUse())
5575 return SDValue();
5576
5577 // DAGCombiner visits nodes bottom-up. Check the BUILD_VECTOR operands for
5578 // any signs they may be folded by some other pattern or rule.
5579 for (SDValue Op : BV->ops()) {
5580 // Peek through bitcasts
5581 if (Op.getOpcode() == ISD::BITCAST)
5582 Op = Op.getOperand(0);
5583
5584 // This may be folded into a PRMT.
5585 if (Op.getValueType() == MVT::i16 && Op.getOpcode() == ISD::TRUNCATE &&
5586 Op->getOperand(0).getValueType() == MVT::i32)
5587 return SDValue();
5588
5589 // This may be folded into cvt.bf16x2
5590 if (Op.getOpcode() == ISD::FP_ROUND)
5591 return SDValue();
5592 }
5593 Operands.append({BV.getOperand(0), BV.getOperand(1)});
5594 }
5595 Operands.append(N->op_end() - Back, N->op_end());
5596
5597 // Now we replace the store
5598 return DCI.DAG.getMemIntrinsicNode(Opcode, SDLoc(N), N->getVTList(), Operands,
5599 ST->getMemoryVT(), ST->getMemOperand());
5600}
5601
5602 static SDValue combineSTORE(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
5603                             const NVPTXSubtarget &STI) {
5604
5605 if (DCI.isBeforeLegalize() && N->getOpcode() == ISD::STORE) {
5606 // Here is our chance to custom lower a store with a non-simple type.
5607 // Unfortunately, we can't do this in the legalizer because there is no
5608 // way to setOperationAction for a non-simple type.
5609 auto *ST = cast<StoreSDNode>(N);
5610 if (!ST->getValue().getValueType().isSimple())
5611 return lowerSTOREVector(SDValue(ST, 0), DCI.DAG, STI);
5612 }
5613
5614 return combinePackingMovIntoStore(N, DCI, 1, 2);
5615}
5616
5617 static SDValue combineLOAD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
5618                            const NVPTXSubtarget &STI) {
5619 if (DCI.isBeforeLegalize() && N->getOpcode() == ISD::LOAD) {
5620 // Here is our chance to custom lower a load with a non-simple type.
5621 // Unfortunately, we can't do this in the legalizer because there is no
5622 // way to setOperationAction for a non-simple type.
5623 if (!N->getValueType(0).isSimple())
5624 return lowerLoadVector(N, DCI.DAG, STI);
5625 }
5626
5627 return combineUnpackingMovIntoLoad(N, DCI);
5628}
5629
5630/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
5631///
5632 static SDValue PerformADDCombine(SDNode *N,
5633                                  TargetLowering::DAGCombinerInfo &DCI,
5634                                  CodeGenOptLevel OptLevel) {
5635 if (OptLevel == CodeGenOptLevel::None)
5636 return SDValue();
5637
5638 SDValue N0 = N->getOperand(0);
5639 SDValue N1 = N->getOperand(1);
5640
5641 // Skip vector types and any scalar type other than i32
5642 EVT VT = N0.getValueType();
5643 if (VT.isVector() || VT != MVT::i32)
5644 return SDValue();
5645
5646 // First try with the default operand order.
5647 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI))
5648 return Result;
5649
5650 // If that didn't work, try again with the operands commuted.
5651 return PerformADDCombineWithOperands(N, N1, N0, DCI);
5652}
5653
5654/// PerformFADDCombine - Target-specific dag combine xforms for ISD::FADD.
5655///
5656 static SDValue PerformFADDCombine(SDNode *N,
5657                                   TargetLowering::DAGCombinerInfo &DCI,
5658                                   CodeGenOptLevel OptLevel) {
5659 SDValue N0 = N->getOperand(0);
5660 SDValue N1 = N->getOperand(1);
5661
5662 EVT VT = N0.getValueType();
5663 if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64))
5664 return SDValue();
5665
5666 // First try with the default operand order.
5667 if (SDValue Result = PerformFADDCombineWithOperands(N, N0, N1, DCI, OptLevel))
5668 return Result;
5669
5670 // If that didn't work, try again with the operands commuted.
5671 return PerformFADDCombineWithOperands(N, N1, N0, DCI, OptLevel);
5672}
5673
5674/// Get 3-input version of a 2-input min/max opcode
5675static NVPTXISD::NodeType getMinMax3Opcode(unsigned MinMax2Opcode) {
5676 switch (MinMax2Opcode) {
5677 case ISD::FMAXNUM:
5678 case ISD::FMAXIMUMNUM:
5679 return NVPTXISD::FMAXNUM3;
5680 case ISD::FMINNUM:
5681 case ISD::FMINIMUMNUM:
5682 return NVPTXISD::FMINNUM3;
5683 case ISD::FMAXIMUM:
5684 return NVPTXISD::FMAXIMUM3;
5685 case ISD::FMINIMUM:
5686 return NVPTXISD::FMINIMUM3;
5687 default:
5688 llvm_unreachable("Invalid 2-input min/max opcode");
5689 }
5690}
5691
5692/// PerformFMinMaxCombine - Combine (fmaxnum (fmaxnum a, b), c) into
5693/// (fmaxnum3 a, b, c). Also covers other llvm min/max intrinsics.
5694 static SDValue PerformFMinMaxCombine(SDNode *N,
5695                                      TargetLowering::DAGCombinerInfo &DCI,
5696                                      unsigned PTXVersion, unsigned SmVersion) {
5697
5698 // 3-input min/max requires PTX 8.8+ and SM_100+, and only supports f32s
5699 EVT VT = N->getValueType(0);
5700 if (VT != MVT::f32 || PTXVersion < 88 || SmVersion < 100)
5701 return SDValue();
5702
5703 SDValue Op0 = N->getOperand(0);
5704 SDValue Op1 = N->getOperand(1);
5705 unsigned MinMaxOp2 = N->getOpcode();
5706 NVPTXISD::NodeType MinMaxOp3 = getMinMax3Opcode(MinMaxOp2);
5707
5708 if (Op0.getOpcode() == MinMaxOp2 && Op0.hasOneUse()) {
5709 // (maxnum (maxnum a, b), c) -> (maxnum3 a, b, c)
5710 SDValue A = Op0.getOperand(0);
5711 SDValue B = Op0.getOperand(1);
5712 SDValue C = Op1;
5713 return DCI.DAG.getNode(MinMaxOp3, SDLoc(N), VT, A, B, C, N->getFlags());
5714 } else if (Op1.getOpcode() == MinMaxOp2 && Op1.hasOneUse()) {
5715 // (maxnum a, (maxnum b, c)) -> (maxnum3 a, b, c)
5716 SDValue A = Op0;
5717 SDValue B = Op1.getOperand(0);
5718 SDValue C = Op1.getOperand(1);
5719 return DCI.DAG.getNode(MinMaxOp3, SDLoc(N), VT, A, B, C, N->getFlags());
5720 }
5721 return SDValue();
5722}
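// Illustrative sketch (hypothetical helper, not part of this file): the
// reassociation performed above, restated with scalar math. A three-input
// maximum is just two nested two-input maxima, so the chain can be emitted as
// a single max3-style instruction; std::fmax is used here only as a stand-in
// for the fmaxnum-style NaN handling.
//
//   #include <cmath>
//   static float exampleFMax3(float A, float B, float C) {
//     return std::fmax(std::fmax(A, B), C);
//   }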
5723
5724 static SDValue PerformREMCombine(SDNode *N,
5725                                  TargetLowering::DAGCombinerInfo &DCI,
5726                                  CodeGenOptLevel OptLevel) {
5727 assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
5728
5729 // Don't do anything at less than -O2.
5730 if (OptLevel < CodeGenOptLevel::Default)
5731 return SDValue();
5732
5733 SelectionDAG &DAG = DCI.DAG;
5734 SDLoc DL(N);
5735 EVT VT = N->getValueType(0);
5736 bool IsSigned = N->getOpcode() == ISD::SREM;
5737 unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
5738
5739 const SDValue &Num = N->getOperand(0);
5740 const SDValue &Den = N->getOperand(1);
5741
5742 for (const SDNode *U : Num->users()) {
5743 if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
5744 U->getOperand(1) == Den) {
5745 // Num % Den -> Num - (Num / Den) * Den
5746 return DAG.getNode(ISD::SUB, DL, VT, Num,
5747 DAG.getNode(ISD::MUL, DL, VT,
5748 DAG.getNode(DivOpc, DL, VT, Num, Den),
5749 Den));
5750 }
5751 }
5752 return SDValue();
5753}
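// Illustrative sketch (hypothetical helper, not part of this file): the
// arithmetic identity used above. When the quotient is already computed for
// another use, the remainder can reuse it instead of issuing a second divide.
//
//   #include <cstdint>
//   static uint32_t exampleRemFromDiv(uint32_t Num, uint32_t Den) {
//     uint32_t Quot = Num / Den;  // assumed to exist already elsewhere
//     return Num - Quot * Den;    // Num % Den without a second division
//   }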
5754
5755// (sign_extend|zero_extend (mul|shl) x, y) -> (mul.wide x, y)
5756 static SDValue combineMulWide(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
5757                               CodeGenOptLevel OptLevel) {
5758 if (OptLevel == CodeGenOptLevel::None)
5759 return SDValue();
5760
5761 SDValue Op = N->getOperand(0);
5762 if (!Op.hasOneUse())
5763 return SDValue();
5764 EVT ToVT = N->getValueType(0);
5765 EVT FromVT = Op.getValueType();
5766 if (!((ToVT == MVT::i32 && FromVT == MVT::i16) ||
5767 (ToVT == MVT::i64 && FromVT == MVT::i32)))
5768 return SDValue();
5769 if (!(Op.getOpcode() == ISD::MUL ||
5770 (Op.getOpcode() == ISD::SHL && isa<ConstantSDNode>(Op.getOperand(1)))))
5771 return SDValue();
5772
5773 SDLoc DL(N);
5774 unsigned ExtOpcode = N->getOpcode();
5775 unsigned Opcode = 0;
5776 if (ExtOpcode == ISD::SIGN_EXTEND && Op->getFlags().hasNoSignedWrap())
5777 Opcode = NVPTXISD::MUL_WIDE_SIGNED;
5778 else if (ExtOpcode == ISD::ZERO_EXTEND && Op->getFlags().hasNoUnsignedWrap())
5779 Opcode = NVPTXISD::MUL_WIDE_UNSIGNED;
5780 else
5781 return SDValue();
5782 SDValue RHS = Op.getOperand(1);
5783 if (Op.getOpcode() == ISD::SHL) {
5784 const auto ShiftAmt = Op.getConstantOperandVal(1);
5785 const auto MulVal = APInt(ToVT.getSizeInBits(), 1) << ShiftAmt;
5786 RHS = DCI.DAG.getConstant(MulVal, DL, ToVT);
5787 }
5788 return DCI.DAG.getNode(Opcode, DL, ToVT, Op.getOperand(0), RHS);
5789}
5790
5791 enum OperandSignedness {
5792   Signed = 0,
5793   Unsigned,
5794   Unknown
5795 };
5796 
5797/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
5798/// that can be demoted to \p OptSize bits without loss of information. The
5799/// signedness of the operand, if determinable, is placed in \p S.
5800 static bool IsMulWideOperandDemotable(SDValue Op,
5801                                       unsigned OptSize,
5802 OperandSignedness &S) {
5803 S = Unknown;
5804
5805 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
5806 Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
5807 EVT OrigVT = Op.getOperand(0).getValueType();
5808 if (OrigVT.getFixedSizeInBits() <= OptSize) {
5809 S = Signed;
5810 return true;
5811 }
5812 } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
5813 EVT OrigVT = Op.getOperand(0).getValueType();
5814 if (OrigVT.getFixedSizeInBits() <= OptSize) {
5815 S = Unsigned;
5816 return true;
5817 }
5818 }
5819
5820 return false;
5821}
5822
5823/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
5824/// be demoted to \p OptSize bits without loss of information. If the operands
5825/// contain a constant, it should appear as the RHS operand. The signedness of
5826/// the operands is placed in \p IsSigned.
5827 static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
5828                                         unsigned OptSize,
5829 bool &IsSigned) {
5830 OperandSignedness LHSSign;
5831
5832 // The LHS operand must be a demotable op
5833 if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
5834 return false;
5835
5836 // We should have been able to determine the signedness from the LHS
5837 if (LHSSign == Unknown)
5838 return false;
5839
5840 IsSigned = (LHSSign == Signed);
5841
5842 // The RHS can be a demotable op or a constant
5843 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
5844 const APInt &Val = CI->getAPIntValue();
5845 if (LHSSign == Unsigned) {
5846 return Val.isIntN(OptSize);
5847 } else {
5848 return Val.isSignedIntN(OptSize);
5849 }
5850 } else {
5851 OperandSignedness RHSSign;
5852 if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
5853 return false;
5854
5855 return LHSSign == RHSSign;
5856 }
5857}
5858
5859/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
5860/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
5861/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
5862/// amount.
5863 static SDValue TryMULWIDECombine(SDNode *N,
5864                                  TargetLowering::DAGCombinerInfo &DCI) {
5865 EVT MulType = N->getValueType(0);
5866 if (MulType != MVT::i32 && MulType != MVT::i64) {
5867 return SDValue();
5868 }
5869
5870 SDLoc DL(N);
5871 unsigned OptSize = MulType.getSizeInBits() >> 1;
5872 SDValue LHS = N->getOperand(0);
5873 SDValue RHS = N->getOperand(1);
5874
5875 // Canonicalize the multiply so the constant (if any) is on the right
5876 if (N->getOpcode() == ISD::MUL) {
5877 if (isa<ConstantSDNode>(LHS)) {
5878 std::swap(LHS, RHS);
5879 }
5880 }
5881
5882 // If we have a SHL, determine the actual multiply amount
5883 if (N->getOpcode() == ISD::SHL) {
5884 ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
5885 if (!ShlRHS) {
5886 return SDValue();
5887 }
5888
5889 APInt ShiftAmt = ShlRHS->getAPIntValue();
5890 unsigned BitWidth = MulType.getSizeInBits();
5891 if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
5892 APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
5893 RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
5894 } else {
5895 return SDValue();
5896 }
5897 }
5898
5899 bool Signed;
5900 // Verify that our operands are demotable
5901 if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
5902 return SDValue();
5903 }
5904
5905 EVT DemotedVT;
5906 if (MulType == MVT::i32) {
5907 DemotedVT = MVT::i16;
5908 } else {
5909 DemotedVT = MVT::i32;
5910 }
5911
5912 // Truncate the operands to the correct size. Note that these are just for
5913 // type consistency and will (likely) be eliminated in later phases.
5914 SDValue TruncLHS =
5915 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
5916 SDValue TruncRHS =
5917 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
5918
5919 unsigned Opc;
5920 if (Signed) {
5921 Opc = NVPTXISD::MUL_WIDE_SIGNED;
5922 } else {
5923 Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
5924 }
5925
5926 return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
5927}
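// Illustrative sketch (hypothetical helper, not part of this file): the
// source-level pattern that maps onto mul.wide, i.e. an M/2-bit by M/2-bit
// multiply whose result is consumed at M bits.
//
//   #include <cstdint>
//   static int64_t exampleMulWideS32(int32_t A, int32_t B) {
//     return static_cast<int64_t>(A) * static_cast<int64_t>(B); // candidate for mul.wide.s32
//   }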
5928
5929static bool isConstOne(const SDValue &Operand) {
5930 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
5931 return Const && Const->getZExtValue() == 1;
5932}
5933
5934 static SDValue matchMADConstOnePattern(SDValue Add) {
5935 if (Add->getOpcode() != ISD::ADD)
5936 return SDValue();
5937
5938 if (isConstOne(Add->getOperand(0)))
5939 return Add->getOperand(1);
5940
5941 if (isConstOne(Add->getOperand(1)))
5942 return Add->getOperand(0);
5943
5944 return SDValue();
5945}
5946
5947 static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL,
5948                                   TargetLowering::DAGCombinerInfo &DCI) {
5949 
5950 if (SDValue Y = matchMADConstOnePattern(Add)) {
5951 SDValue Mul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
5952 return DCI.DAG.getNode(ISD::ADD, DL, VT, Mul, X);
5953 }
5954
5955 return SDValue();
5956}
5957
5958 static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT,
5959                                         SDLoc DL,
5960                                         TargetLowering::DAGCombinerInfo &DCI) {
5961 if (Select->getOpcode() != ISD::SELECT)
5962 return SDValue();
5963
5964 SDValue Cond = Select->getOperand(0);
5965
5966 unsigned ConstOpNo;
5967 if (isConstOne(Select->getOperand(1)))
5968 ConstOpNo = 1;
5969 else if (isConstOne(Select->getOperand(2)))
5970 ConstOpNo = 2;
5971 else
5972 return SDValue();
5973
5974 SDValue Y = Select->getOperand((ConstOpNo == 1) ? 2 : 1);
5975
5976 // Do not combine if the resulting sequence is not obviously profitable.
5977 if (!matchMADConstOnePattern(Y))
5978 return SDValue();
5979
5980 SDValue NewMul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
5981
5982 return DCI.DAG.getNode(ISD::SELECT, DL, VT, Cond,
5983 (ConstOpNo == 1) ? X : NewMul,
5984 (ConstOpNo == 1) ? NewMul : X);
5985}
5986
5987static SDValue
5988 PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
5989                               TargetLowering::DAGCombinerInfo &DCI) {
5990 
5991 EVT VT = N0.getValueType();
5992 if (VT.isVector())
5993 return SDValue();
5994
5995 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
5996 return SDValue();
5997
5998 SDLoc DL(N);
5999
6000 // (mul x, (add y, 1)) -> (add (mul x, y), x)
6001 if (SDValue Res = combineMADConstOne(N0, N1, VT, DL, DCI))
6002 return Res;
6003 if (SDValue Res = combineMADConstOne(N1, N0, VT, DL, DCI))
6004 return Res;
6005
6006 // (mul x, (select y, 1)) -> (select (mul x, y), x)
6007 if (SDValue Res = combineMulSelectConstOne(N0, N1, VT, DL, DCI))
6008 return Res;
6009 if (SDValue Res = combineMulSelectConstOne(N1, N0, VT, DL, DCI))
6010 return Res;
6011
6012 return SDValue();
6013}
6014
6015/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
6016 static SDValue PerformMULCombine(SDNode *N,
6017                                  TargetLowering::DAGCombinerInfo &DCI,
6018                                  CodeGenOptLevel OptLevel) {
6019 if (OptLevel == CodeGenOptLevel::None)
6020 return SDValue();
6021
6022 if (SDValue Ret = TryMULWIDECombine(N, DCI))
6023 return Ret;
6024
6025 SDValue N0 = N->getOperand(0);
6026 SDValue N1 = N->getOperand(1);
6027 return PerformMULCombineWithOperands(N, N0, N1, DCI);
6028}
6029
6030/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
6031 static SDValue PerformSHLCombine(SDNode *N,
6032                                  TargetLowering::DAGCombinerInfo &DCI,
6033                                  CodeGenOptLevel OptLevel) {
6034 if (OptLevel > CodeGenOptLevel::None) {
6035 // Try mul.wide combining at OptLevel > 0
6036 if (SDValue Ret = TryMULWIDECombine(N, DCI))
6037 return Ret;
6038 }
6039
6040 return SDValue();
6041}
6042
6043 static SDValue PerformSETCCCombine(SDNode *N,
6044                                    TargetLowering::DAGCombinerInfo &DCI,
6045                                    unsigned int SmVersion) {
6046 EVT CCType = N->getValueType(0);
6047 SDValue A = N->getOperand(0);
6048 SDValue B = N->getOperand(1);
6049
6050 EVT AType = A.getValueType();
6051 if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16)))
6052 return SDValue();
6053
6054 if (A.getValueType() == MVT::v2bf16 && SmVersion < 90)
6055 return SDValue();
6056
6057 SDLoc DL(N);
6058 // setp.f16x2 returns two scalar predicates, which we need to
6059 // convert back to v2i1. The returned result will be scalarized by
6060 // the legalizer, but the comparison will remain a single vector
6061 // instruction.
6062 SDValue CCNode = DCI.DAG.getNode(
6063 A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2
6064                                    : NVPTXISD::SETP_BF16X2,
6065 DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)});
6066 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
6067 CCNode.getValue(1));
6068}
6069
6070 static SDValue PerformEXTRACTCombine(SDNode *N,
6071                                      TargetLowering::DAGCombinerInfo &DCI) {
6072 SDValue Vector = N->getOperand(0);
6073 if (Vector->getOpcode() == ISD::FREEZE)
6074 Vector = Vector->getOperand(0);
6075 SDLoc DL(N);
6076 EVT VectorVT = Vector.getValueType();
6077 if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
6078 IsPTXVectorType(VectorVT.getSimpleVT()))
6079 return SDValue(); // Native vector loads already combine nicely w/
6080 // extract_vector_elt.
6081 // Don't mess with singletons or packed types (v2*32, v2*16, v4i8 and v8i8),
6082 // we already handle them OK.
6083 if (VectorVT.getVectorNumElements() == 1 ||
6084 NVPTX::isPackedVectorTy(VectorVT) || VectorVT == MVT::v8i8)
6085 return SDValue();
6086
6087 // Don't mess with undef values as sra may be simplified to 0, not undef.
6088 if (Vector->isUndef() || ISD::allOperandsUndef(Vector.getNode()))
6089 return SDValue();
6090
6091 uint64_t VectorBits = VectorVT.getSizeInBits();
6092 // We only handle the types we can extract in-register.
6093 if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))
6094 return SDValue();
6095
6096 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1));
6097 // Index == 0 is handled by generic DAG combiner.
6098 if (!Index || Index->getZExtValue() == 0)
6099 return SDValue();
6100
6101 MVT IVT = MVT::getIntegerVT(VectorBits);
6102 EVT EltVT = VectorVT.getVectorElementType();
6103 EVT EltIVT = EltVT.changeTypeToInteger();
6104 uint64_t EltBits = EltVT.getScalarSizeInBits();
6105
6106 SDValue Result = DCI.DAG.getNode(
6107 ISD::TRUNCATE, DL, EltIVT,
6108 DCI.DAG.getNode(
6109 ISD::SRA, DL, IVT, DCI.DAG.getNode(ISD::BITCAST, DL, IVT, Vector),
6110 DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT)));
6111
6112 // If element has non-integer type, bitcast it back to the expected type.
6113 if (EltVT != EltIVT)
6114 Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
6115 // Past legalizer, we may need to extend i8 -> i16 to match the register type.
6116 if (EltVT != N->getValueType(0))
6117 Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);
6118
6119 return Result;
6120}
6121
6122 static SDValue PerformVSELECTCombine(SDNode *N,
6123                                      TargetLowering::DAGCombinerInfo &DCI) {
6124 SDValue VA = N->getOperand(1);
6125 EVT VectorVT = VA.getValueType();
6126 if (VectorVT != MVT::v4i8)
6127 return SDValue();
6128
6129 // We need to split vselect into individual per-element operations because we
6130 // use BFE/BFI instruction for byte extraction/insertion, we do end up with
6131 // 32-bit values, so we may as well do comparison as i32 to avoid conversions
6132 // to/from i16 normally used for i8 values.
6133 SmallVector<SDValue, 4> E;
6134 SDLoc DL(N);
6135 SDValue VCond = N->getOperand(0);
6136 SDValue VB = N->getOperand(2);
6137 for (int I = 0; I < 4; ++I) {
6138 SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond,
6139 DCI.DAG.getConstant(I, DL, MVT::i32));
6140 SDValue EA = DCI.DAG.getAnyExtOrTrunc(
6141 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA,
6142 DCI.DAG.getConstant(I, DL, MVT::i32)),
6143 DL, MVT::i32);
6144 SDValue EB = DCI.DAG.getAnyExtOrTrunc(
6145 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB,
6146 DCI.DAG.getConstant(I, DL, MVT::i32)),
6147 DL, MVT::i32);
6148 E.push_back(DCI.DAG.getAnyExtOrTrunc(
6149 DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8));
6150 }
6151 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
6152}
6153
6154static SDValue
6155 PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
6156 auto VT = N->getValueType(0);
6157 if (!DCI.isAfterLegalizeDAG() ||
6158 // only process v2*16 types
6159 !(NVPTX::isPackedVectorTy(VT) && VT.is32BitVector() &&
6160 VT.getVectorNumElements() == 2))
6161 return SDValue();
6162
6163 auto Op0 = N->getOperand(0);
6164 auto Op1 = N->getOperand(1);
6165
6166 // Start out by assuming we want to take the lower 2 bytes of each i32
6167 // operand.
6168 uint64_t Op0Bytes = 0x10;
6169 uint64_t Op1Bytes = 0x54;
6170
6171 std::pair<SDValue *, uint64_t *> OpData[2] = {{&Op0, &Op0Bytes},
6172 {&Op1, &Op1Bytes}};
6173
6174 // Check that each operand is an i16, truncated from an i32 operand. We'll
6175 // select individual bytes from those original operands. Optionally, fold in a
6176 // shift right of that original operand.
6177 for (auto &[Op, OpBytes] : OpData) {
6178 // Eat up any bitcast
6179 if (Op->getOpcode() == ISD::BITCAST)
6180 *Op = Op->getOperand(0);
6181
6182 if (!(Op->getValueType() == MVT::i16 && Op->getOpcode() == ISD::TRUNCATE &&
6183 Op->getOperand(0).getValueType() == MVT::i32))
6184 return SDValue();
6185
6186 // If the truncate has multiple uses, this optimization can increase
6187 // register pressure
6188 if (!Op->hasOneUse())
6189 return SDValue();
6190
6191 *Op = Op->getOperand(0);
6192
6193 // Optionally, fold in a shift-right of the original operand and let permute
6194 // pick the two higher bytes of the original value directly.
6195 if (Op->getOpcode() == ISD::SRL && isa<ConstantSDNode>(Op->getOperand(1))) {
6196 if (cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue() == 16) {
6197 // Shift the PRMT byte selector to pick upper bytes from each respective
6198 // value, instead of the lower ones: 0x10 -> 0x32, 0x54 -> 0x76
6199 assert((*OpBytes == 0x10 || *OpBytes == 0x54) &&
6200 "PRMT selector values out of range");
6201 *OpBytes += 0x22;
6202 *Op = Op->getOperand(0);
6203 }
6204 }
6205 }
6206
6207 SDLoc DL(N);
6208 auto &DAG = DCI.DAG;
6209
6210 auto PRMT =
6211 getPRMT(DAG.getBitcast(MVT::i32, Op0), DAG.getBitcast(MVT::i32, Op1),
6212 (Op1Bytes << 8) | Op0Bytes, DL, DAG);
6213 return DAG.getBitcast(VT, PRMT);
6214}
6215
6216 static SDValue combineADDRSPACECAST(SDNode *N,
6217                                     TargetLowering::DAGCombinerInfo &DCI) {
6218 auto *ASCN1 = cast<AddrSpaceCastSDNode>(N);
6219
6220 if (auto *ASCN2 = dyn_cast<AddrSpaceCastSDNode>(ASCN1->getOperand(0))) {
6221 assert(ASCN2->getDestAddressSpace() == ASCN1->getSrcAddressSpace());
6222
6223 // Fold asc[B -> A](asc[A -> B](x)) -> x
6224 if (ASCN1->getDestAddressSpace() == ASCN2->getSrcAddressSpace())
6225 return ASCN2->getOperand(0);
6226 }
6227
6228 return SDValue();
6229}
6230
6231// Given a constant selector value and a prmt mode, return the selector value
6232// normalized to the generic prmt mode. See the PTX ISA documentation for more
6233// details:
6234// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt
6235static APInt getPRMTSelector(const APInt &Selector, unsigned Mode) {
6236 assert(Selector.getBitWidth() == 32 && "PRMT must have i32 operands");
6237
6238 if (Mode == NVPTX::PTXPrmtMode::NONE)
6239 return Selector;
6240
6241 const unsigned V = Selector.trunc(2).getZExtValue();
6242
6243 const auto GetSelector = [](unsigned S0, unsigned S1, unsigned S2,
6244 unsigned S3) {
6245 return APInt(32, S0 | (S1 << 4) | (S2 << 8) | (S3 << 12));
6246 };
6247
6248 switch (Mode) {
6249 case NVPTX::PTXPrmtMode::F4E:
6250 return GetSelector(V, V + 1, V + 2, V + 3);
6251 case NVPTX::PTXPrmtMode::B4E:
6252 return GetSelector(V, (V - 1) & 7, (V - 2) & 7, (V - 3) & 7);
6253 case NVPTX::PTXPrmtMode::RC8:
6254 return GetSelector(V, V, V, V);
6255 case NVPTX::PTXPrmtMode::ECL:
6256 return GetSelector(V, std::max(V, 1U), std::max(V, 2U), 3U);
6257 case NVPTX::PTXPrmtMode::ECR:
6258 return GetSelector(0, std::min(V, 1U), std::min(V, 2U), V);
6259 case NVPTX::PTXPrmtMode::RC16: {
6260 unsigned V1 = (V & 1) << 1;
6261 return GetSelector(V1, V1 + 1, V1, V1 + 1);
6262 }
6263 default:
6264 llvm_unreachable("Invalid PRMT mode");
6265 }
6266}
6267
6268static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode) {
6269 assert(A.getBitWidth() == 32 && B.getBitWidth() == 32 &&
6270 Selector.getBitWidth() == 32 && "PRMT must have i32 operands");
6271 // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
6272 APInt BitField = B.concat(A);
6273 APInt SelectorVal = getPRMTSelector(Selector, Mode);
6274 APInt Result(32, 0);
6275 for (unsigned I : llvm::seq(4U)) {
6276 APInt Sel = SelectorVal.extractBits(4, I * 4);
6277 unsigned Idx = Sel.getLoBits(3).getZExtValue();
6278 unsigned Sign = Sel.getHiBits(1).getZExtValue();
6279 APInt Byte = BitField.extractBits(8, Idx * 8);
6280 if (Sign)
6281 Byte = Byte.ashr(8);
6282 Result.insertBits(Byte, I * 8);
6283 }
6284 return Result;
6285}
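// Illustrative sketch (hypothetical helper, not part of this file): PRMT byte
// selection in the generic mode over plain integers, mirroring computePRMT
// above. Each selector nibble picks one of the eight bytes of {B,A}; if bit 3
// of the nibble is set, the selected byte's sign bit is replicated instead.
//
//   #include <cstdint>
//   static uint32_t examplePrmtGeneric(uint32_t A, uint32_t B, uint32_t Selector) {
//     const uint64_t Bytes = (static_cast<uint64_t>(B) << 32) | A; // {b7..b4, b3..b0}
//     uint32_t Result = 0;
//     for (unsigned I = 0; I < 4; ++I) {
//       const uint32_t Sel = (Selector >> (I * 4)) & 0xF;
//       uint8_t Byte = static_cast<uint8_t>(Bytes >> ((Sel & 0x7) * 8));
//       if (Sel & 0x8)                          // sign-replicate mode
//         Byte = (Byte & 0x80) ? 0xFF : 0x00;
//       Result |= static_cast<uint32_t>(Byte) << (I * 8);
//     }
//     return Result;
//   }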
6286
6287 static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
6288                            CodeGenOptLevel OptLevel) {
6289 if (OptLevel == CodeGenOptLevel::None)
6290 return SDValue();
6291
6292 // Constant fold PRMT
6293 if (isa<ConstantSDNode>(N->getOperand(0)) &&
6294 isa<ConstantSDNode>(N->getOperand(1)) &&
6295 isa<ConstantSDNode>(N->getOperand(2)))
6296 return DCI.DAG.getConstant(computePRMT(N->getConstantOperandAPInt(0),
6297 N->getConstantOperandAPInt(1),
6298 N->getConstantOperandAPInt(2),
6299 N->getConstantOperandVal(3)),
6300 SDLoc(N), N->getValueType(0));
6301 return SDValue();
6302}
6303
6304// During call lowering we wrap the return values in a ProxyReg node which
6305// depend on the chain value produced by the completed call. This ensures that
6306// the full call is emitted in cases where libcalls are used to legalize
6307// operations. To improve the functioning of other DAG combines we pull all
6308// operations we can through one of these nodes, ensuring that the ProxyReg
6309// directly wraps a load. That is:
6310//
6311// (ProxyReg (zext (load retval0))) => (zext (ProxyReg (load retval0)))
6312//
6313 static SDValue sinkProxyReg(SDValue R, SDValue Chain,
6314                             TargetLowering::DAGCombinerInfo &DCI) {
6315 switch (R.getOpcode()) {
6316 case ISD::TRUNCATE:
6317 case ISD::ANY_EXTEND:
6318 case ISD::SIGN_EXTEND:
6319 case ISD::ZERO_EXTEND:
6320 case ISD::BITCAST: {
6321 if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI))
6322 return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), V);
6323 return SDValue();
6324 }
6325 case ISD::SHL:
6326 case ISD::SRL:
6327 case ISD::SRA:
6328 case ISD::OR: {
6329 if (SDValue A = sinkProxyReg(R.getOperand(0), Chain, DCI))
6330 if (SDValue B = sinkProxyReg(R.getOperand(1), Chain, DCI))
6331 return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), A, B);
6332 return SDValue();
6333 }
6334 case ISD::Constant:
6335 return R;
6336 case ISD::LOAD:
6337 case NVPTXISD::LoadV2:
6338 case NVPTXISD::LoadV4: {
6339 return DCI.DAG.getNode(NVPTXISD::ProxyReg, SDLoc(R), R.getValueType(),
6340 {Chain, R});
6341 }
6342 case ISD::BUILD_VECTOR: {
6343 if (DCI.isBeforeLegalize())
6344 return SDValue();
6345
6346 SmallVector<SDValue> Ops;
6347 for (auto &Op : R->ops()) {
6348 SDValue V = sinkProxyReg(Op, Chain, DCI);
6349 if (!V)
6350 return SDValue();
6351 Ops.push_back(V);
6352 }
6353 return DCI.DAG.getNode(ISD::BUILD_VECTOR, SDLoc(R), R.getValueType(), Ops);
6354 }
6355 case ISD::EXTRACT_VECTOR_ELT: {
6356 if (DCI.isBeforeLegalize())
6357 return SDValue();
6358
6359 if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI))
6360 return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(R),
6361                        R.getValueType(), V, R.getOperand(1));
6362 return SDValue();
6363 }
6364 default:
6365 return SDValue();
6366 }
6367}
6368
6369 static SDValue combineProxyReg(SDNode *N,
6370                                TargetLowering::DAGCombinerInfo &DCI) {
6371 
6372 SDValue Chain = N->getOperand(0);
6373 SDValue Reg = N->getOperand(1);
6374
6375 // If the ProxyReg is not wrapping a load, try to pull the operations through
6376 // the ProxyReg.
6377 if (Reg.getOpcode() != ISD::LOAD) {
6378 if (SDValue V = sinkProxyReg(Reg, Chain, DCI))
6379 return V;
6380 }
6381
6382 return SDValue();
6383}
6384
6385SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
6386 DAGCombinerInfo &DCI) const {
6387 CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
6388 switch (N->getOpcode()) {
6389 default:
6390 break;
6391 case ISD::ADD:
6392 return PerformADDCombine(N, DCI, OptLevel);
6393 case ISD::ADDRSPACECAST:
6394 return combineADDRSPACECAST(N, DCI);
6395 case ISD::SIGN_EXTEND:
6396 case ISD::ZERO_EXTEND:
6397 return combineMulWide(N, DCI, OptLevel);
6398 case ISD::BUILD_VECTOR:
6399 return PerformBUILD_VECTORCombine(N, DCI);
6400 case ISD::EXTRACT_VECTOR_ELT:
6401 return PerformEXTRACTCombine(N, DCI);
6402 case ISD::FADD:
6403 return PerformFADDCombine(N, DCI, OptLevel);
6404 case ISD::FMAXNUM:
6405 case ISD::FMINNUM:
6406 case ISD::FMAXIMUM:
6407 case ISD::FMINIMUM:
6408 case ISD::FMAXIMUMNUM:
6409 case ISD::FMINIMUMNUM:
6410 return PerformFMinMaxCombine(N, DCI, STI.getPTXVersion(),
6411 STI.getSmVersion());
6412 case ISD::LOAD:
6413 case NVPTXISD::LoadV2:
6414 case NVPTXISD::LoadV4:
6415 return combineLOAD(N, DCI, STI);
6416 case ISD::MUL:
6417 return PerformMULCombine(N, DCI, OptLevel);
6418 case NVPTXISD::PRMT:
6419 return combinePRMT(N, DCI, OptLevel);
6420 case NVPTXISD::ProxyReg:
6421 return combineProxyReg(N, DCI);
6422 case ISD::SETCC:
6423 return PerformSETCCCombine(N, DCI, STI.getSmVersion());
6424 case ISD::SHL:
6425 return PerformSHLCombine(N, DCI, OptLevel);
6426 case ISD::SREM:
6427 case ISD::UREM:
6428 return PerformREMCombine(N, DCI, OptLevel);
6429 case ISD::STORE:
6430 case NVPTXISD::StoreV2:
6431 case NVPTXISD::StoreV4:
6432 return combineSTORE(N, DCI, STI);
6433 case ISD::VSELECT:
6434 return PerformVSELECTCombine(N, DCI);
6435 }
6436 return SDValue();
6437}
6438
6439static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG,
6440                           SmallVectorImpl<SDValue> &Results) {
6441  // Handle bitcasting to v2i8 without hitting the default promotion
6442 // strategy which goes through stack memory.
6443 SDValue Op(Node, 0);
6444 EVT ToVT = Op->getValueType(0);
6445 if (ToVT != MVT::v2i8) {
6446 return;
6447 }
6448
6449 // Bitcast to i16 and unpack elements into a vector
6450 SDLoc DL(Node);
6451 SDValue AsInt = DAG.getBitcast(MVT::i16, Op->getOperand(0));
6452 SDValue Vec0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, AsInt);
6453 SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
6454 SDValue Vec1 =
6455 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
6456 DAG.getNode(ISD::SRL, DL, MVT::i16, {AsInt, Const8}));
6457 Results.push_back(
6458 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i8, {Vec0, Vec1}));
6459}
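// Editorial illustration (not part of the original source): for an i16 input
// with value 0xBBAA, the lowering above yields AsInt = 0xBBAA,
// Vec0 = trunc(AsInt) = 0xAA and Vec1 = trunc(srl(AsInt, 8)) = 0xBB, so the
// result (BUILD_VECTOR 0xAA, 0xBB) : v2i8 keeps element 0 in the low byte and
// avoids the default promotion through stack memory.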
6460
6461static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
6462                                     SmallVectorImpl<SDValue> &Results) {
6463  SDValue Chain = N->getOperand(0);
6464 SDValue Intrin = N->getOperand(1);
6465 SDLoc DL(N);
6466
6467 // Get the intrinsic ID
6468 unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
6469 switch (IntrinNo) {
6470 default:
6471 return;
6472 case Intrinsic::nvvm_ldu_global_i:
6473 case Intrinsic::nvvm_ldu_global_f:
6474 case Intrinsic::nvvm_ldu_global_p: {
6475 EVT ResVT = N->getValueType(0);
6476
6477 if (ResVT.isVector()) {
6478 // Vector LDG/LDU
6479
6480 unsigned NumElts = ResVT.getVectorNumElements();
6481 EVT EltVT = ResVT.getVectorElementType();
6482
6483 // Since LDU/LDG are target nodes, we cannot rely on DAG type
6484 // legalization.
6485 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
6486 // loaded type to i16 and propagate the "real" type as the memory type.
6487 bool NeedTrunc = false;
6488 if (EltVT.getSizeInBits() < 16) {
6489 EltVT = MVT::i16;
6490 NeedTrunc = true;
6491 }
6492
6493 unsigned Opcode = 0;
6494 SDVTList LdResVTs;
6495
6496 switch (NumElts) {
6497 default:
6498 return;
6499 case 2:
6500 Opcode = NVPTXISD::LDUV2;
6501 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
6502 break;
6503 case 4: {
6504 Opcode = NVPTXISD::LDUV4;
6505 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
6506 LdResVTs = DAG.getVTList(ListVTs);
6507 break;
6508 }
6509 }
6510
6511 SmallVector<SDValue, 8> OtherOps;
6512
6513 // Copy regular operands
6514
6515 OtherOps.push_back(Chain); // Chain
6516 // Skip operand 1 (intrinsic ID)
6517 // Others
6518 OtherOps.append(N->op_begin() + 2, N->op_end());
6519
6519
6520      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
6521
6522 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
6523 MemSD->getMemoryVT(),
6524 MemSD->getMemOperand());
6525
6526 SmallVector<SDValue, 4> ScalarRes;
6527
6528 for (unsigned i = 0; i < NumElts; ++i) {
6529 SDValue Res = NewLD.getValue(i);
6530 if (NeedTrunc)
6531 Res =
6532 DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
6533 ScalarRes.push_back(Res);
6534 }
6535
6536 SDValue LoadChain = NewLD.getValue(NumElts);
6537
6538 SDValue BuildVec =
6539 DAG.getBuildVector(ResVT, DL, ScalarRes);
6540
6541 Results.push_back(BuildVec);
6542 Results.push_back(LoadChain);
6543 } else {
6544 // i8 LDG/LDU
6545 assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
6546 "Custom handling of non-i8 ldu/ldg?");
6547
6548 // Just copy all operands as-is
6549      SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
6550
6551 // Force output to i16
6552 SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
6553
6554      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
6555
6556 // We make sure the memory type is i8, which will be used during isel
6557 // to select the proper instruction.
6558      SDValue NewLD =
6559          DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
6560                                  MVT::i8, MemSD->getMemOperand());
6561
6562 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
6563 NewLD.getValue(0)));
6564 Results.push_back(NewLD.getValue(1));
6565 }
6566 return;
6567 }
6568
6569 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
6570 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
6571 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
6572 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
6573 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
6574 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
6575 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
6576 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
6577 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
6578 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
6579 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
6580 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
6581 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
6582 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
6583 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
6584 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
6585 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
6586 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
6587 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
6588 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
6589 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
6590 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
6591 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
6592 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
6593 if (auto Res = lowerTcgen05Ld(N, DAG)) {
6594 Results.push_back(Res->first);
6595 Results.push_back(Res->second);
6596 }
6597 return;
6598
6599 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
6600 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
6601 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
6602 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
6603 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
6604 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
6605 if (auto Res = lowerTcgen05Ld(N, DAG, /*HasOffset=*/true)) {
6606 Results.push_back(Res->first);
6607 Results.push_back(Res->second);
6608 }
6609 return;
6610 }
6611}
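// Editorial note (not part of the original source): for example, a
// four-element ldu.global result with sub-16-bit elements is replaced above by
// an LDUV4 node producing four i16 values while i8 stays the memory type; each
// value is then truncated back to the original element type and the pieces are
// reassembled with BUILD_VECTOR, with the load's chain returned alongside.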
6612
6613static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG,
6614                                   SmallVectorImpl<SDValue> &Results) {
6615  // Change the CopyFromReg to output 2 64-bit results instead of a 128-bit
6616 // result so that it can pass the legalization
6617 SDLoc DL(N);
6618 SDValue Chain = N->getOperand(0);
6619 SDValue Reg = N->getOperand(1);
6620 SDValue Glue = N->getOperand(2);
6621
6622 assert(Reg.getValueType() == MVT::i128 &&
6623 "Custom lowering for CopyFromReg with 128-bit reg only");
6624 SmallVector<EVT, 4> ResultsType = {MVT::i64, MVT::i64, N->getValueType(1),
6625 N->getValueType(2)};
6626 SmallVector<SDValue, 3> NewOps = {Chain, Reg, Glue};
6627
6628 SDValue NewValue = DAG.getNode(ISD::CopyFromReg, DL, ResultsType, NewOps);
6629 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
6630 {NewValue.getValue(0), NewValue.getValue(1)});
6631
6632 Results.push_back(Pair);
6633 Results.push_back(NewValue.getValue(2));
6634 Results.push_back(NewValue.getValue(3));
6635}
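// Editorial illustration (not part of the original source): an
//   (CopyFromReg Chain, %vreg, Glue) : i128
// node is re-emitted above as a two-result copy yielding i64, i64 plus the
// copied chain and glue, and the original i128 value is rebuilt with
// BUILD_PAIR, so only legal 64-bit register types survive type legalization.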
6636
6637static void replaceProxyReg(SDNode *N, SelectionDAG &DAG,
6638                            const TargetLowering &TLI,
6639                            SmallVectorImpl<SDValue> &Results) {
6640  SDValue Chain = N->getOperand(0);
6641 SDValue Reg = N->getOperand(1);
6642
6643 MVT VT = TLI.getRegisterType(*DAG.getContext(), Reg.getValueType());
6644
6645 SDValue NewReg = DAG.getAnyExtOrTrunc(Reg, SDLoc(N), VT);
6646 SDValue NewProxy =
6647 DAG.getNode(NVPTXISD::ProxyReg, SDLoc(N), VT, {Chain, NewReg});
6648 SDValue Res = DAG.getAnyExtOrTrunc(NewProxy, SDLoc(N), N->getValueType(0));
6649
6650 Results.push_back(Res);
6651}
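// Editorial note (not part of the original source): the register type chosen
// above is the legal NVPTX register type for the wrapped value, so e.g. a
// ProxyReg producing a type narrower than 16 bits would typically be widened
// (any-extended) to that register type, re-wrapped in a ProxyReg, and then
// truncated back to the node's original result type for its users.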
6652
6653static void replaceAtomicSwap128(SDNode *N, SelectionDAG &DAG,
6654                                 const NVPTXSubtarget &STI,
6655                                 SmallVectorImpl<SDValue> &Results) {
6656  assert(N->getValueType(0) == MVT::i128 &&
6657 "Custom lowering for atomic128 only supports i128");
6658
6659  AtomicSDNode *AN = cast<AtomicSDNode>(N);
6660  SDLoc dl(N);
6661
6662 if (!STI.hasAtomSwap128()) {
6663    DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
6664        DAG.getMachineFunction().getFunction(),
6665        "Support for b128 atomics introduced in PTX ISA version 8.3 and "
6666 "requires target sm_90.",
6667 dl.getDebugLoc()));
6668
6669 Results.push_back(DAG.getUNDEF(MVT::i128));
6670 Results.push_back(AN->getOperand(0)); // Chain
6671 return;
6672 }
6673
6674  SmallVector<SDValue> Ops;
6675  Ops.push_back(AN->getOperand(0)); // Chain
6676 Ops.push_back(AN->getOperand(1)); // Ptr
6677 for (const auto &Op : AN->ops().drop_front(2)) {
6678 // Low part
6679 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op,
6680 DAG.getIntPtrConstant(0, dl)));
6681 // High part
6682 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op,
6683 DAG.getIntPtrConstant(1, dl)));
6684 }
6685 unsigned Opcode = N->getOpcode() == ISD::ATOMIC_SWAP
6688 SDVTList Tys = DAG.getVTList(MVT::i64, MVT::i64, MVT::Other);
6689 SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, MVT::i128,
6690 AN->getMemOperand());
6691 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i128,
6692 {Result.getValue(0), Result.getValue(1)}));
6693 Results.push_back(Result.getValue(2));
6694}
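// Editorial illustration (not part of the original source): each i128 data
// operand of the 128-bit swap/cmpxchg is split above with EXTRACT_ELEMENT into
// its low and high i64 halves, the target atomic node returns two i64 results
// plus a chain, and the i128 value expected by the rest of the DAG is
// recreated with BUILD_PAIR.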
6695
6696void NVPTXTargetLowering::ReplaceNodeResults(
6697    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
6698  switch (N->getOpcode()) {
6699 default:
6700 report_fatal_error("Unhandled custom legalization");
6701 case ISD::BITCAST:
6702 ReplaceBITCAST(N, DAG, Results);
6703 return;
6704 case ISD::LOAD:
6705 replaceLoadVector(N, DAG, Results, STI);
6706 return;
6707  case ISD::INTRINSIC_W_CHAIN:
6708    ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
6709    return;
6710 case ISD::CopyFromReg:
6711    ReplaceCopyFromReg_128(N, DAG, Results);
6712    return;
6713 case NVPTXISD::ProxyReg:
6714 replaceProxyReg(N, DAG, *this, Results);
6715 return;
6716 case ISD::ATOMIC_CMP_SWAP:
6717 case ISD::ATOMIC_SWAP:
6718 replaceAtomicSwap128(N, DAG, STI, Results);
6719 return;
6720 }
6721}
6722
6723NVPTXTargetLowering::AtomicExpansionKind
6724NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
6725  Type *Ty = AI->getValOperand()->getType();
6726
6727  if (AI->isFloatingPointOperation()) {
6728    if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
6729      if (Ty->isHalfTy() && STI.getSmVersion() >= 70 &&
6730          STI.getPTXVersion() >= 63)
6731        return AtomicExpansionKind::None;
6732      if (Ty->isBFloatTy() && STI.getSmVersion() >= 90 &&
6733          STI.getPTXVersion() >= 78)
6734        return AtomicExpansionKind::None;
6735      if (Ty->isFloatTy())
6736        return AtomicExpansionKind::None;
6737      if (Ty->isDoubleTy() && STI.hasAtomAddF64())
6738        return AtomicExpansionKind::None;
6739    }
6740    return AtomicExpansionKind::CmpXChg;
6741  }
6742
6743 assert(Ty->isIntegerTy() && "Ty should be integer at this point");
6744 const unsigned BitWidth = cast<IntegerType>(Ty)->getBitWidth();
6745
6746 switch (AI->getOperation()) {
6747 default:
6750 if (BitWidth == 128)
6756 switch (BitWidth) {
6757 case 8:
6758 case 16:
6760 case 32:
6762 case 64:
6763 if (STI.hasAtomBitwise64())
6766 case 128:
6768 default:
6769 llvm_unreachable("unsupported width encountered");
6770 }
6777 switch (BitWidth) {
6778 case 8:
6779 case 16:
6781 case 32:
6783 case 64:
6784 if (STI.hasAtomMinMax64())
6787 case 128:
6789 default:
6790 llvm_unreachable("unsupported width encountered");
6791 }
6794 switch (BitWidth) {
6795 case 32:
6797 case 8:
6798 case 16:
6799 case 64:
6800 case 128:
6802 default:
6803 llvm_unreachable("unsupported width encountered");
6804 }
6805 }
6806
6808}
6809
6810bool NVPTXTargetLowering::shouldInsertFencesForAtomic(
6811    const Instruction *I) const {
6812 auto *CI = dyn_cast<AtomicCmpXchgInst>(I);
6813 // When CAS bitwidth is not supported on the hardware, the CAS is emulated
6814 // using a retry loop that uses a higher-bitwidth monotonic CAS. We enforce
6815 // the memory order using explicit fences around the retry loop.
6816 // The memory order of natively supported CAS operations can be enforced
6817 // by lowering to an atom.cas with the right memory synchronizing effect.
6818 // However, atom.cas only supports relaxed, acquire, release and acq_rel.
6819 // So we also use explicit fences for enforcing memory order for
6820  // seq_cst CAS with natively-supported bitwidths.
6821 return CI &&
6822 (cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() <
6823 STI.getMinCmpXchgSizeInBits() ||
6824 CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent);
6825}
6826
6827AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit(
6828    const Instruction *I) const {
6829 auto *CI = dyn_cast<AtomicCmpXchgInst>(I);
6830 bool BitwidthSupportedAndIsSeqCst =
6831 CI && CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent &&
6832 cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() >=
6833 STI.getMinCmpXchgSizeInBits();
6834  return BitwidthSupportedAndIsSeqCst ? AtomicOrdering::Acquire
6835                                      : AtomicOrdering::Monotonic;
6836}
6837
6838Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
6839                                                   Instruction *Inst,
6840 AtomicOrdering Ord) const {
6841 if (!isa<AtomicCmpXchgInst>(Inst))
6842 return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);
6843
6844 // Specialize for cmpxchg
6845 // Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated
6846 SyncScope::ID SSID = cast<AtomicCmpXchgInst>(Inst)->getSyncScopeID();
6847 if (isReleaseOrStronger(Ord))
6848 return Builder.CreateFence(Ord == AtomicOrdering::SequentiallyConsistent
6849                                   ? Ord
6850                                   : AtomicOrdering::Release,
6851                               SSID);
6852
6853 return nullptr;
6854}
6855
6856Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
6857                                                    Instruction *Inst,
6858 AtomicOrdering Ord) const {
6859 // Specialize for cmpxchg
6860 if (!isa<AtomicCmpXchgInst>(Inst))
6861 return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);
6862
6863 auto *CI = cast<AtomicCmpXchgInst>(Inst);
6864 auto CASWidth =
6865 cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth();
6866 SyncScope::ID SSID = CI->getSyncScopeID();
6867 // Do not emit a trailing fence for cmpxchg seq_cst which are not emulated
6868  if (isAcquireOrStronger(Ord) &&
6869      (Ord != AtomicOrdering::SequentiallyConsistent ||
6870       CASWidth < STI.getMinCmpXchgSizeInBits()))
6871 return Builder.CreateFence(AtomicOrdering::Acquire, SSID);
6872
6873 return nullptr;
6874}
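// Editorial example (not part of the original source): taken together, the two
// hooks above give a seq_cst cmpxchg of a natively supported width a leading
// seq_cst fence and no trailing fence, while an emulated (narrower than
// getMinCmpXchgSizeInBits) acquire-or-stronger cmpxchg gets a trailing acquire
// fence around its CAS retry loop.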
6875
6876// Rather than default to SINT when both UINT and SINT are custom, we only
6877// change the opcode when UINT is not legal and SINT is. UINT is preferred when
6878// both are custom since unsigned CVT instructions can lead to slightly better
6879// SASS code with fewer instructions.
6880unsigned NVPTXTargetLowering::getPreferredFPToIntOpcode(unsigned Op, EVT FromVT,
6881                                                        EVT ToVT) const {
6882 if (isOperationLegal(Op, ToVT))
6883 return Op;
6884 switch (Op) {
6885  case ISD::FP_TO_UINT:
6886    if (isOperationLegal(ISD::FP_TO_SINT, ToVT))
6887      return ISD::FP_TO_SINT;
6888    break;
6889  case ISD::STRICT_FP_TO_UINT:
6890    if (isOperationLegal(ISD::STRICT_FP_TO_SINT, ToVT))
6891      return ISD::STRICT_FP_TO_SINT;
6892    break;
6893 case ISD::VP_FP_TO_UINT:
6894 if (isOperationLegal(ISD::VP_FP_TO_SINT, ToVT))
6895 return ISD::VP_FP_TO_SINT;
6896 break;
6897 default:
6898 break;
6899 }
6900 return Op;
6901}
6902
6903// Pin NVPTXTargetObjectFile's vtables to this file.
6904NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;
6905
6906MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
6907    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
6908  return getDataSection();
6909}
6910
6911static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known,
6912                                    const SelectionDAG &DAG, unsigned Depth) {
6913 SDValue A = Op.getOperand(0);
6914 SDValue B = Op.getOperand(1);
6915 ConstantSDNode *Selector = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6916 unsigned Mode = Op.getConstantOperandVal(3);
6917
6918 if (!Selector)
6919 return;
6920
6921 KnownBits AKnown = DAG.computeKnownBits(A, Depth);
6922 KnownBits BKnown = DAG.computeKnownBits(B, Depth);
6923
6924 // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
6925 assert(AKnown.getBitWidth() == 32 && BKnown.getBitWidth() == 32 &&
6926 "PRMT must have i32 operands");
6927 assert(Known.getBitWidth() == 32 && "PRMT must have i32 result");
6928 KnownBits BitField = BKnown.concat(AKnown);
6929
6930 APInt SelectorVal = getPRMTSelector(Selector->getAPIntValue(), Mode);
6931 for (unsigned I : llvm::seq(4)) {
6932 APInt Sel = SelectorVal.extractBits(4, I * 4);
6933 unsigned Idx = Sel.getLoBits(3).getZExtValue();
6934 unsigned Sign = Sel.getHiBits(1).getZExtValue();
6935 KnownBits Byte = BitField.extractBits(8, Idx * 8);
6936 if (Sign)
6937 Byte = KnownBits::ashr(Byte, 8);
6938 Known.insertBits(Byte, I * 8);
6939 }
6940}
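// Editorial sketch (not part of the original source): a plain-integer model of
// the byte selection that computeKnownBitsForPRMT reasons about. The helper
// name prmtReference is made up for illustration, it relies only on <cstdint>
// (already included by this file), and it assumes the selector has been
// normalized to the generic form (as getPRMTSelector does for the other PRMT
// modes): each 4-bit nibble is a sign-replicate bit plus a 3-bit byte index
// into the 64-bit pool {b, a}.
uint32_t prmtReference(uint32_t A, uint32_t B, uint16_t Selector) {
  const uint64_t Pool = (uint64_t(B) << 32) | A; // bytes b7..b4 : b3..b0
  uint32_t Result = 0;
  for (unsigned I = 0; I < 4; ++I) {
    const unsigned Nibble = (Selector >> (I * 4)) & 0xF;
    uint8_t Byte = (Pool >> ((Nibble & 0x7) * 8)) & 0xFF;
    if (Nibble & 0x8) // replicate the selected byte's sign bit
      Byte = (Byte & 0x80) ? 0xFF : 0x00;
    Result |= uint32_t(Byte) << (I * 8);
  }
  return Result;
}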
6941
6942static void computeKnownBitsForLoadV(const SDValue Op, KnownBits &Known) {
6943  auto *LD = cast<MemSDNode>(Op);
6944
6945 // We can't do anything without knowing the sign bit.
6946 auto ExtType = LD->getConstantOperandVal(LD->getNumOperands() - 1);
6947 if (ExtType == ISD::SEXTLOAD)
6948 return;
6949
6950 // ExtLoading to vector types is weird and may not work well with known bits.
6951 auto DestVT = LD->getValueType(0);
6952 if (DestVT.isVector())
6953 return;
6954
6955 assert(Known.getBitWidth() == DestVT.getSizeInBits());
6956 auto ElementBitWidth = NVPTXDAGToDAGISel::getFromTypeWidthForLoad(LD);
6957 Known.Zero.setHighBits(Known.getBitWidth() - ElementBitWidth);
6958}
6959
6960void NVPTXTargetLowering::computeKnownBitsForTargetNode(
6961    const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
6962 const SelectionDAG &DAG, unsigned Depth) const {
6963 Known.resetAll();
6964
6965 switch (Op.getOpcode()) {
6966 case NVPTXISD::PRMT:
6967 computeKnownBitsForPRMT(Op, Known, DAG, Depth);
6968 break;
6969 case NVPTXISD::LoadV2:
6970 case NVPTXISD::LoadV4:
6971 case NVPTXISD::LoadV8:
6972    computeKnownBitsForLoadV(Op, Known);
6973    break;
6974 default:
6975 break;
6976 }
6977}
6978
6979static std::pair<APInt, APInt> getPRMTDemandedBits(const APInt &SelectorVal,
6980 const APInt &DemandedBits) {
6981 APInt DemandedLHS = APInt(32, 0);
6982 APInt DemandedRHS = APInt(32, 0);
6983
6984 for (unsigned I : llvm::seq(4)) {
6985 if (DemandedBits.extractBits(8, I * 8).isZero())
6986 continue;
6987
6988 APInt Sel = SelectorVal.extractBits(4, I * 4);
6989 unsigned Idx = Sel.getLoBits(3).getZExtValue();
6990 unsigned Sign = Sel.getHiBits(1).getZExtValue();
6991
6992 APInt &Src = Idx < 4 ? DemandedLHS : DemandedRHS;
6993 unsigned ByteStart = (Idx % 4) * 8;
6994 if (Sign)
6995 Src.setBit(ByteStart + 7);
6996 else
6997 Src.setBits(ByteStart, ByteStart + 8);
6998 }
6999
7000 return {DemandedLHS, DemandedRHS};
7001}
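// Editorial example (not part of the original source): with DemandedBits =
// 0x0000FF00 and a selector whose second nibble is 0x5, only result byte 1 is
// live; Idx = 5 refers to byte 1 of the second input, so DemandedRHS becomes
// 0x0000FF00 and DemandedLHS stays 0, letting the caller shrink or bypass the
// unused operand.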
7002
7003// Replace undef with 0 as this is easier for other optimizations such as
7004// known bits.
7005static SDValue canonicalizePRMTInput(SDValue Op, SelectionDAG &DAG) {
7006  if (!Op)
7007 return SDValue();
7008 if (Op.isUndef())
7009 return DAG.getConstant(0, SDLoc(), MVT::i32);
7010 return Op;
7011}
7012
7013static SDValue simplifyDemandedBitsForPRMT(SDValue PRMT,
7014                                           const APInt &DemandedBits,
7015 SelectionDAG &DAG,
7016 const TargetLowering &TLI,
7017 unsigned Depth) {
7018 assert(PRMT.getOpcode() == NVPTXISD::PRMT);
7019 SDValue Op0 = PRMT.getOperand(0);
7020 SDValue Op1 = PRMT.getOperand(1);
7021 auto *SelectorConst = dyn_cast<ConstantSDNode>(PRMT.getOperand(2));
7022 if (!SelectorConst)
7023 return SDValue();
7024
7025 unsigned Mode = PRMT.getConstantOperandVal(3);
7026 const APInt Selector = getPRMTSelector(SelectorConst->getAPIntValue(), Mode);
7027
7028 // Try to simplify the PRMT to one of the inputs if the used bytes are all
7029 // from the same input in the correct order.
7030 const unsigned LeadingBytes = DemandedBits.countLeadingZeros() / 8;
7031 const unsigned SelBits = (4 - LeadingBytes) * 4;
7032 if (Selector.getLoBits(SelBits) == APInt(32, 0x3210).getLoBits(SelBits))
7033 return Op0;
7034 if (Selector.getLoBits(SelBits) == APInt(32, 0x7654).getLoBits(SelBits))
7035 return Op1;
7036
7037 auto [DemandedLHS, DemandedRHS] = getPRMTDemandedBits(Selector, DemandedBits);
7038
7039 // Attempt to avoid multi-use ops if we don't need anything from them.
7040 SDValue DemandedOp0 =
7041 TLI.SimplifyMultipleUseDemandedBits(Op0, DemandedLHS, DAG, Depth + 1);
7042 SDValue DemandedOp1 =
7043 TLI.SimplifyMultipleUseDemandedBits(Op1, DemandedRHS, DAG, Depth + 1);
7044
7045 DemandedOp0 = canonicalizePRMTInput(DemandedOp0, DAG);
7046 DemandedOp1 = canonicalizePRMTInput(DemandedOp1, DAG);
7047 if ((DemandedOp0 && DemandedOp0 != Op0) ||
7048 (DemandedOp1 && DemandedOp1 != Op1)) {
7049 Op0 = DemandedOp0 ? DemandedOp0 : Op0;
7050 Op1 = DemandedOp1 ? DemandedOp1 : Op1;
7051 return getPRMT(Op0, Op1, Selector.getZExtValue(), SDLoc(PRMT), DAG);
7052 }
7053
7054 return SDValue();
7055}
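// Editorial example (not part of the original source): under the checks above,
// (PRMT x, y, 0x3210) in the generic (NONE) mode with all 32 result bits
// demanded simplifies to x, and (PRMT x, y, 0x0010) with only the low 16 bits
// demanded also simplifies to x, because the live selector nibbles match the
// identity pattern 0x...3210.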
7056
7057bool NVPTXTargetLowering::SimplifyDemandedBitsForTargetNode(
7058    SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
7059 KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const {
7060 Known.resetAll();
7061
7062 switch (Op.getOpcode()) {
7063 case NVPTXISD::PRMT:
7064    if (SDValue Result = simplifyDemandedBitsForPRMT(Op, DemandedBits, TLO.DAG,
7065                                                     *this, Depth)) {
7066 TLO.CombineTo(Op, Result);
7067 return true;
7068 }
7069 break;
7070 default:
7071 break;
7072 }
7073
7074 computeKnownBitsForTargetNode(Op, Known, DemandedElts, TLO.DAG, Depth);
7075 return false;
7076}
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
constexpr LLT S1
constexpr LLT F32
AMDGPU Register Bank Select
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition Compiler.h:404
This file contains the declarations for the subclasses of Constant, which represent the different fla...
This file contains the declarations of entities that describe floating point environment and related ...
Module.h This file contains the declarations for the Module class.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
mir Rename Register Operands
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
#define MAKE_CASE(V)
Register Reg
Register const TargetRegisterInfo * TRI
#define T
NVPTX address space definition.
static bool shouldConvertToIndirectCall(const CallBase *CB, const GlobalAddressSDNode *Func)
static SDValue combineADDRSPACECAST(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static cl::opt< bool > sched4reg("nvptx-sched4reg", cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false))
static SDValue lowerTcgen05St(SDValue Op, SelectionDAG &DAG)
static SDValue PerformEXTRACTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static cl::opt< NVPTX::DivPrecisionLevel > UsePrecDivF32("nvptx-prec-divf32", cl::Hidden, cl::desc("NVPTX Specific: Override the precision of the lowering for f32 fdiv"), cl::values(clEnumValN(NVPTX::DivPrecisionLevel::Approx, "0", "Use div.approx"), clEnumValN(NVPTX::DivPrecisionLevel::Full, "1", "Use div.full"), clEnumValN(NVPTX::DivPrecisionLevel::IEEE754, "2", "Use IEEE Compliant F32 div.rnd if available (default)"), clEnumValN(NVPTX::DivPrecisionLevel::IEEE754_NoFTZ, "3", "Use IEEE Compliant F32 div.rnd if available, no FTZ")), cl::init(NVPTX::DivPrecisionLevel::IEEE754))
static bool isConstOne(const SDValue &Operand)
static cl::opt< unsigned > FMAContractLevelOpt("nvptx-fma-level", cl::Hidden, cl::desc("NVPTX Specific: FMA contraction (0: don't do it" " 1: do it 2: do it aggressively"), cl::init(2))
static bool IsPTXVectorType(MVT VT)
static SDValue lowerLOADi1(LoadSDNode *LD, SelectionDAG &DAG)
static SDValue lowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG)
static MachinePointerInfo refinePtrAS(SDValue &Ptr, SelectionDAG &DAG, const DataLayout &DL, const TargetLowering &TL)
static SDValue lowerROT(SDValue Op, SelectionDAG &DAG)
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, LLVMContext &Ctx, CallingConv::ID CallConv, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< uint64_t > &Offsets, uint64_t StartingOffset=0)
ComputePTXValueVTs - For the given Type Ty, returns the set of primitive legal-ish MVTs that compose ...
static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static void replaceAtomicSwap128(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI, SmallVectorImpl< SDValue > &Results)
static SDValue lowerSTOREVector(SDValue Op, SelectionDAG &DAG, const NVPTXSubtarget &STI)
static SDValue lowerLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI)
static void replaceProxyReg(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI, SmallVectorImpl< SDValue > &Results)
static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static SDValue lowerCTLZCTPOP(SDValue Op, SelectionDAG &DAG)
static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue combinePackingMovIntoStore(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned Front, unsigned Back)
Fold packing movs into a store.
static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static SDValue getBuildVectorizedValue(unsigned N, const SDLoc &dl, SelectionDAG &DAG, T GetElement)
static SDValue getExtractVectorizedValue(SDValue V, unsigned I, EVT VT, const SDLoc &dl, SelectionDAG &DAG)
static NVPTXISD::NodeType getMinMax3Opcode(unsigned MinMax2Opcode)
Get 3-input version of a 2-input min/max opcode.
static unsigned canMergeParamLoadStoresStartingAt(unsigned Idx, uint32_t AccessSize, const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< T > &Offsets, Align ParamAlignment)
static EVT getVectorizedVT(EVT VT, unsigned N, LLVMContext &C)
static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG)
static SDValue PerformFMinMaxCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned PTXVersion, unsigned SmVersion)
PerformFMinMaxCombine - Combine (fmaxnum (fmaxnum a, b), c) into (fmaxnum3 a, b, c).
static SDValue combineMulWide(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue lowerIntrinsicWChain(SDValue Op, SelectionDAG &DAG)
static bool isConstZero(const SDValue &Operand)
static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG)
static SDValue LowerTcgen05MMADisableOutputLane(SDValue Op, SelectionDAG &DAG)
static bool IsMulWideOperandDemotable(SDValue Op, unsigned OptSize, OperandSignedness &S)
IsMulWideOperandDemotable - Checks if the provided DAG node is an operand that can be demoted to OptS...
static unsigned getTcgen05MMADisableOutputLane(unsigned IID)
static std::pair< APInt, APInt > getPRMTDemandedBits(const APInt &SelectorVal, const APInt &DemandedBits)
static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode)
static ISD::NodeType getScalarOpcodeForReduction(unsigned ReductionOpcode)
static SDValue PerformREMCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI)
static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known, const SelectionDAG &DAG, unsigned Depth)
static SDValue combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Fold unpacking movs into a load by increasing the number of return values.
static SDValue LowerClusterLaunchControlQueryCancel(SDValue Op, SelectionDAG &DAG)
static std::optional< std::pair< SDValue, SDValue > > lowerTcgen05Ld(SDNode *N, SelectionDAG &DAG, bool HasOffset=false)
static SDValue lowerCvtRSIntrinsics(SDValue Op, SelectionDAG &DAG)
static std::optional< NVPTXISD::NodeType > getScalar3OpcodeForReduction(unsigned ReductionOpcode)
Get 3-input scalar reduction opcode.
static std::optional< std::pair< SDValue, SDValue > > replaceLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI)
replaceLoadVector - Convert vector loads into multi-output scalar loads.
static SDValue expandFSH64(SDValue A, SDValue B, SDValue ShiftAmount, SDLoc DL, unsigned Opcode, SelectionDAG &DAG)
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, unsigned OptSize, bool &IsSigned)
AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can be demoted to OptSize bits...
static SDValue TryMULWIDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply of M/2 bits that produces...
static SDValue lowerPrmtIntrinsic(SDValue Op, SelectionDAG &DAG)
static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static SDValue buildTreeReduction(const SmallVector< SDValue > &Elements, EVT EltTy, ArrayRef< std::pair< unsigned, unsigned > > Ops, const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG)
Reduces the elements using the scalar operations provided.
static SDValue combineProxyReg(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SmallVector< unsigned, 16 > VectorizePTXValueVTs(const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< T > &Offsets, Align ParamAlignment, bool IsVAArg=false)
static SDValue getPRMT(SDValue A, SDValue B, SDValue Selector, SDLoc DL, SelectionDAG &DAG, unsigned Mode=NVPTX::PTXPrmtMode::NONE)
static SDValue matchMADConstOnePattern(SDValue Add)
static SDValue correctParamType(SDValue V, EVT ExpectedVT, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, SDLoc dl)
static ISD::NodeType getExtOpcode(const ISD::ArgFlagsTy &Flags)
static cl::opt< bool > UsePrecSqrtF32("nvptx-prec-sqrtf32", cl::Hidden, cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), cl::init(true))
static void computeKnownBitsForLoadV(const SDValue Op, KnownBits &Known)
static APInt getPRMTSelector(const APInt &Selector, unsigned Mode)
static EVT promoteScalarIntegerPTX(const EVT VT)
PromoteScalarIntegerPTX Used to make sure the arguments/returns are suitable for passing and promote ...
static SDValue simplifyDemandedBitsForPRMT(SDValue PRMT, const APInt &DemandedBits, SelectionDAG &DAG, const TargetLowering &TLI, unsigned Depth)
static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG)
static SDValue canonicalizePRMTInput(SDValue Op, SelectionDAG &DAG)
static SDValue sinkProxyReg(SDValue R, SDValue Chain, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerFSH(SDValue Op, SelectionDAG &DAG)
static SDValue PromoteBinOpToF32(SDNode *N, SelectionDAG &DAG)
static SDValue PerformSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned int SmVersion)
static std::optional< std::pair< unsigned int, MVT > > getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI, unsigned AddressSpace)
static cl::opt< bool > ForceMinByValParamAlign("nvptx-force-min-byval-param-align", cl::Hidden, cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval" " params of device functions."), cl::init(false))
static cl::opt< bool > UseApproxLog2F32("nvptx-approx-log2f32", cl::desc("NVPTX Specific: whether to use lg2.approx for log2"), cl::init(false))
Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it does NOT use lg2....
static SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG)
static SDValue combineLOAD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const NVPTXSubtarget &STI)
static SDValue combineSTORE(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const NVPTXSubtarget &STI)
static SDValue PerformSHLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
MachineInstr unsigned OpIdx
uint64_t High
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
This file describes how to lower LLVM code to machine code.
Value * RHS
Value * LHS
BinaryOperator * Mul
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1098
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt getLoBits(unsigned numBits) const
Compute an APInt containing numBits lowbits from this APInt.
Definition APInt.cpp:644
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
LLVM_ABI APInt getHiBits(unsigned numBits) const
Compute an APInt containing numBits highbits from this APInt.
Definition APInt.cpp:639
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1330
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition APInt.h:435
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition APInt.h:1130
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:482
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition APInt.h:432
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1237
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
const T & back() const
back - Get the last element.
Definition ArrayRef.h:156
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition ArrayRef.h:206
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:191
an instruction that atomically reads a memory location, combines it with another value,...
@ Add
*p = old + v
@ FAdd
*p = old + v
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ UMax
*p = old >unsigned v ? old : v
@ UDecWrap
Decrement one until a minimum value or zero.
bool isFloatingPointOperation() const
BinOp getOperation() const
This is an SDNode representing atomic operations.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
FunctionType * getFunctionType() const
This class represents a function call, abstracting a target machine's calling convention.
const APInt & getAPIntValue() const
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Diagnostic information for unsupported feature in backend.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:637
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
MCSection * getDataSection() const
Instances of this class represent a uniqued identifier for a section in the current translation unit.
Definition MCSection.h:521
StringRef getName() const
getName - Get the symbol name.
Definition MCSymbol.h:188
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
static auto fp_fixedlen_vector_valuetypes()
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineJumpTableInfo * getJumpTableInfo() const
getJumpTableInfo - Return the jump table info object for the current function.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
const std::vector< MachineJumpTableEntry > & getJumpTables() const
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
EVT getMemoryVT() const
Return the type of the in-memory value.
static unsigned getFromTypeWidthForLoad(const MemSDNode *Mem)
bool hasAtomSwap128() const
bool hasF32x2Instructions() const
bool has256BitVectorLoadStore(unsigned AS) const
AtomicOrdering atomicOperationOrderAfterFenceSplit(const Instruction *I) const override
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
const NVPTXTargetMachine * nvTM
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
NVPTXTargetLowering(const NVPTXTargetMachine &TM, const NVPTXSubtarget &STI)
std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &, const SmallVectorImpl< ISD::OutputArg > &, std::optional< unsigned > FirstVAArg, const CallBase &CB, unsigned UniqueCallSite) const
unsigned getPreferredFPToIntOpcode(unsigned Op, EVT FromVT, EVT ToVT) const override
bool useF32FTZ(const MachineFunction &MF) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
Align getFunctionArgumentAlignment(const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps, bool &UseOneConst, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &dl, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const
Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
std::string getParamName(const Function *F, int Idx) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
NVPTX::DivPrecisionLevel getDivF32Level(const MachineFunction &MF, const SDNode &N) const
bool shouldInsertFencesForAtomic(const Instruction *) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
Align getFunctionParamOptimizedAlign(const Function *F, Type *ArgTy, const DataLayout &DL) const
getFunctionParamOptimizedAlign - since function arguments are passed via .param space,...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, EVT VT) const override
Return the ValueType of the result of SETCC operations.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
Inserts in the IR a target-specific intrinsic specifying a fence.
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Align getFunctionByValParamAlign(const Function *F, Type *ArgTy, Align InitialAlign, const DataLayout &DL) const
Helper for computing alignment of a device function byval parameter.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const
bool usePrecSqrtF32(const SDNode *N=nullptr) const
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
MCSection * SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const override
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
bool hasOneUse() const
Return true if there is exactly one use of this node.
unsigned getIROrder() const
Return the node ordering.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
const SDValue & getOperand(unsigned i) const
unsigned getOpcode() const
SectionKind - This is a simple POD value that classifies the properties of a section.
Definition SectionKind.h:22
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const TargetSubtargetInfo & getSubtarget() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getSymbolFunctionGlobalAddress(SDValue Op, Function **TargetFunction=nullptr)
Return a GlobalAddress of the function from the current module with name matching the given ExternalS...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI Align getEVTAlign(EVT MemoryVT) const
Compute the default alignment value for the given type.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI SDNode * MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef< SDValue > Ops)
This mutates the specified node to have the specified return type, opcode, and operands.
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI SDValue getBasicBlock(MachineBasicBlock *MBB)
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
MachineFunction & getMachineFunction() const
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
ArrayRef< int > getMask() const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class is used to represent ISD::STORE nodes.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition StringRef.h:140
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/fp until it can find one that works.
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to do about it.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom DAG combiner for by implementing the PerformDAGCombine virtual method.
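A hedged sketch of the combine hookup: the constructor registers the nodes of interest and the generic DAG combiner then calls PerformDAGCombine for them. The class and the node list are illustrative:

```cpp
// Sketch only: hypothetical lowering class; node choices are illustrative.
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"

namespace {
struct MyTargetLowering : public llvm::TargetLowering {
  explicit MyTargetLowering(const llvm::TargetMachine &TM)
      : llvm::TargetLowering(TM) {
    // Ask the generic DAG combiner to hand these nodes to PerformDAGCombine().
    setTargetDAGCombine({llvm::ISD::ADD, llvm::ISD::FADD, llvm::ISD::MUL});
  }

  llvm::SDValue PerformDAGCombine(llvm::SDNode *N,
                                  DAGCombinerInfo &DCI) const override {
    // A real target would pattern-match N here and return a replacement
    // value; returning an empty SDValue means "no combine performed".
    return llvm::SDValue();
  }
};
} // namespace
```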
Align getMinStackArgumentAlignment() const
Return the minimum stack alignment of an argument.
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate what to do about it.
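A hedged sketch showing this hook together with setTruncStoreAction (listed above); the class and the type choices are illustrative:

```cpp
// Sketch only: hypothetical constructor body; type choices are illustrative.
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/TargetLowering.h"

namespace {
struct MyTargetLowering : public llvm::TargetLowering {
  explicit MyTargetLowering(const llvm::TargetMachine &TM)
      : llvm::TargetLowering(TM) {
    using namespace llvm;
    // No native f64 -> f32 truncating store: split into FP_ROUND + store.
    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
    // No sign-extending i1 loads for any integer result type: expand to a
    // plain load followed by an explicit extension.
    for (MVT VT : MVT::integer_valuetypes())
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
  }
};
} // namespace
```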
std::vector< ArgListEntry > ArgListTy
virtual Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const
virtual Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const
Inserts in the IR a target-specific intrinsic specifying a fence.
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
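These expansion limits (together with the memset/memmove counterparts and their OptSize variants listed earlier) are plain members assigned in the constructor. A sketch with invented values:

```cpp
// Sketch only: hypothetical constructor body; the limits are illustrative.
#include "llvm/CodeGen/TargetLowering.h"

namespace {
struct MyTargetLowering : public llvm::TargetLowering {
  explicit MyTargetLowering(const llvm::TargetMachine &TM)
      : llvm::TargetLowering(TM) {
    // Inline small memcpy/memset/memmove as straight-line stores; anything
    // larger becomes a library call. The *OptSize limits apply under -Os/-Oz.
    MaxStoresPerMemcpy = 4;
    MaxStoresPerMemcpyOptSize = 2;
    MaxStoresPerMemset = 8;
    MaxStoresPerMemsetOptSize = 4;
    MaxStoresPerMemmove = 4;
    MaxStoresPerMemmoveOptSize = 2;
  }
};
} // namespace
```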
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequences that increase the amount of flow control.
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger size, needs to be expanded to some other code sequence, or the target has a custom expander for it.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the target instruction selector can accept natively.
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g. {edx}), return the register number and the register class for the register.
TargetLowering(const TargetLowering &)=delete
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
SDValue expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const
Expand round(fp) to fp conversion.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
TargetOptions Options
MCSymbol * getSymbol(const GlobalValue *GV) const
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetFrameLowering * getFrameLowering() const
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:344
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
A raw_ostream that writes to an std::string.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt pow(const APInt &X, int64_t N)
Compute X^N for N>=0.
Definition APInt.cpp:3155
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2*N], and return the full value as two results, each of type iN.
Definition ISDOpcodes.h:270
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:289
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter) to floating point.
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defined outside of the scope of this SelectionDAG.
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially variable) element number IDX.
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ SSHLSAT
RESULT = [US]SHLSAT(LHS, RHS) - Perform saturation left shift.
Definition ISDOpcodes.h:379
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:471
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:470
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:299
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
Definition ISDOpcodes.h:236
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:333
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
LLVM_ABI bool allOperandsUndef(const SDNode *N)
Return true if the node has at least one operand and all operands of the specified node are ISD::UNDEF.
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
@ Bitcast
Perform the operation on a different, but equivalently sized type.
@ TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2
@ TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT
@ TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1
@ CLUSTERLAUNCHCONTROL_QUERY_CANCEL_IS_CANCELED
@ TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2
@ CALL
This node represents a PTX call instruction.
@ TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT
@ TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2
@ TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG2
@ TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2
@ TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2
@ CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_X
@ TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT
@ TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT
@ UNPACK_VECTOR
This node is the inverse of NVPTX::BUILD_VECTOR.
@ CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Y
@ TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT
@ TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT
@ DeclareScalarParam
These nodes represent a parameter declaration.
@ CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Z
@ ATOMIC_CMP_SWAP_B128
These nodes are used to lower atomic instructions with i128 type.
@ BUILD_VECTOR
This node is similar to ISD::BUILD_VECTOR except that the output may be implicitly bitcast to a scalar.
@ TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG2
@ TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT
@ TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT
@ TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2
bool isPackedVectorTy(EVT VT)
DivPrecisionLevel
Definition NVPTX.h:252
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
This is an optimization pass for GlobalISel generic memory operations.
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:831
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool shouldEmitPTXNoReturn(const Value *V, const TargetMachine &TM)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1705
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1657
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2452
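A small standalone sketch of the range helpers listed here (all_of, enumerate, zip); the data is made up:

```cpp
// Sketch only: illustrates the ADT range helpers named in the entries above.
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  llvm::SmallVector<int, 4> A{1, 2, 3};
  llvm::SmallVector<char, 4> B{'x', 'y', 'z'};

  // all_of: range-based wrapper around std::all_of.
  bool AllPositive = llvm::all_of(A, [](int V) { return V > 0; });

  // enumerate: pairs each element with its 0-based index.
  for (auto [Idx, V] : llvm::enumerate(A))
    llvm::outs() << Idx << " -> " << V << "\n";

  // zip: walk two ranges in lock-step.
  for (auto [IntVal, CharVal] : llvm::zip(A, B))
    llvm::outs() << IntVal << CharVal << " ";
  llvm::outs() << "\nall positive: " << AllPositive << "\n";
  return 0;
}
```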
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:644
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:396
bool isReleaseOrStronger(AtomicOrdering AO)
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:1948
unsigned promoteScalarArgumentSize(unsigned size)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
bool shouldPassAsArray(Type *Ty)
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
@ Default
-O2, -Os
Definition CodeGen.h:85
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:548
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual underlying non-aggregate types that comprise it.
Definition Analysis.cpp:119
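A hedged sketch of how an aggregate IR type decomposes through this function; it assumes an available TargetLowering instance and a typical DataLayout (the exact EVTs and offsets depend on both):

```cpp
// Sketch only: the struct type and the expected decomposition in the comment
// are illustrative.
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"

void decompose(const llvm::TargetLowering &TLI, llvm::LLVMContext &Ctx,
               const llvm::DataLayout &DL) {
  // { i32, float, <2 x i16> } typically yields {i32, f32, v2i16} with byte
  // offsets {0, 4, 8}, though the DataLayout has the final say.
  llvm::Type *Struct = llvm::StructType::get(
      Ctx, {llvm::Type::getInt32Ty(Ctx), llvm::Type::getFloatTy(Ctx),
            llvm::FixedVectorType::get(llvm::Type::getInt16Ty(Ctx), 2)});
  llvm::SmallVector<llvm::EVT, 4> ValueVTs;
  llvm::SmallVector<llvm::TypeSize, 4> Offsets;
  llvm::ComputeValueVTs(TLI, DL, Struct, ValueVTs, /*MemVTs=*/nullptr,
                        &Offsets);
}
```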
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
bool isKernelFunction(const Function &F)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:560
Function * getMaybeBitcastedCallee(const CallBase *CB)
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
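A small standalone sketch of the alignment/math helpers listed nearby (alignTo, commonAlignment, PowerOf2Ceil); the numeric cases are illustrative:

```cpp
// Sketch only: simple numeric examples of the helpers named above.
#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  using namespace llvm;
  // alignTo: round a byte size up to a multiple of the alignment.
  assert(alignTo(13, Align(8)) == 16);
  // commonAlignment: alignment guaranteed at Base + Offset when Base is
  // A-aligned; a 16-byte aligned base plus 4 is only 4-byte aligned.
  assert(commonAlignment(Align(16), 4).value() == 4);
  // PowerOf2Ceil: smallest power of two >= the input.
  assert(PowerOf2Ceil(40) == 64);
  return 0;
}
```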
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
static LLVM_ABI const fltSemantics & IEEEsingle() LLVM_READNONE
Definition APFloat.cpp:266
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environment.
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
bool is32BitVector() const
Return true if this is a 32-bit vector type.
Definition ValueTypes.h:197
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
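A small standalone sketch touring the EVT queries listed above; the vector type chosen is arbitrary:

```cpp
// Sketch only: exercises a few of the EVT helpers documented above.
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

int main() {
  llvm::LLVMContext Ctx;
  llvm::EVT V4F32 = llvm::EVT::getVectorVT(Ctx, llvm::MVT::f32, 4);

  assert(V4F32.isVector() && V4F32.getVectorNumElements() == 4);
  assert(V4F32.getScalarType() == llvm::MVT::f32);
  assert(V4F32.getFixedSizeInBits() == 128);
  // changeTypeToInteger keeps the shape but swaps in integer elements.
  assert(V4F32.changeTypeToInteger() ==
         llvm::EVT::getVectorVT(Ctx, llvm::MVT::i32, 4));
  // bitsGT/bitsLT compare total widths.
  assert(V4F32.bitsGT(llvm::MVT::f64));
  return 0;
}
```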
static LLVM_ABI KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
KnownBits concat(const KnownBits &Lo) const
Concatenate the bits from Lo onto the bottom of *this.
Definition KnownBits.h:233
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
void insertBits(const KnownBits &SubBits, unsigned BitPosition)
Insert the bits from a smaller known bits starting at bitPosition.
Definition KnownBits.h:219
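A small standalone sketch of the KnownBits operations listed above (concat, insertBits, resetAll); the bit patterns are invented:

```cpp
// Sketch only: builds a 16-bit KnownBits value out of two 8-bit halves.
#include "llvm/Support/KnownBits.h"
#include <cassert>

int main() {
  using namespace llvm;
  KnownBits Lo(8), Hi(8);
  Lo.setAllOnes();                // low byte: every bit known to be one
  KnownBits Full = Hi.concat(Lo); // Hi:Lo -> 16 bits, only the low 8 known
  assert(Full.getBitWidth() == 16);

  // insertBits overwrites a sub-range with the bits of a smaller KnownBits.
  KnownBits Zero8(8);
  Zero8.setAllZero();
  Full.insertBits(Zero8, /*BitPosition=*/8); // high byte now known zero
  assert(Full.One.getZExtValue() == 0x00FF);

  // resetAll: forget everything that was known.
  Full.resetAll();
  assert(!Full.isConstant());
  return 0;
}
```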
This class contains a discriminated union of information about pointers in memory operands,...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
Type * RetTy
Same as OrigRetTy, or partially legalized for soft float libcalls.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...