LLVM 22.0.0git
NVPTXISelLowering.cpp
1//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that NVPTX uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "NVPTXISelLowering.h"
16#include "NVPTX.h"
17#include "NVPTXISelDAGToDAG.h"
18#include "NVPTXSubtarget.h"
19#include "NVPTXTargetMachine.h"
21#include "NVPTXUtilities.h"
22#include "llvm/ADT/APFloat.h"
23#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/STLExtras.h"
26#include "llvm/ADT/StringRef.h"
38#include "llvm/IR/Argument.h"
39#include "llvm/IR/Attributes.h"
40#include "llvm/IR/Constants.h"
41#include "llvm/IR/DataLayout.h"
44#include "llvm/IR/FPEnv.h"
45#include "llvm/IR/Function.h"
46#include "llvm/IR/GlobalValue.h"
47#include "llvm/IR/IRBuilder.h"
48#include "llvm/IR/Instruction.h"
50#include "llvm/IR/IntrinsicsNVPTX.h"
51#include "llvm/IR/Module.h"
52#include "llvm/IR/Type.h"
53#include "llvm/IR/Value.h"
65#include <algorithm>
66#include <cassert>
67#include <cmath>
68#include <cstdint>
69#include <iterator>
70#include <optional>
71#include <string>
72#include <tuple>
73#include <utility>
74#include <vector>
75
76#define DEBUG_TYPE "nvptx-lower"
77
78using namespace llvm;
79
81 "nvptx-sched4reg",
82 cl::desc("NVPTX Specific: schedule for register pressure"), cl::init(false));
83
85 "nvptx-fma-level", cl::Hidden,
86 cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
87 " 1: do it, 2: do it aggressively)"),
88 cl::init(2));
89
91 "nvptx-prec-divf32", cl::Hidden,
93 "NVPTX Specific: Override the precision of the lowering for f32 fdiv"),
95 clEnumValN(NVPTX::DivPrecisionLevel::Approx, "0", "Use div.approx"),
96 clEnumValN(NVPTX::DivPrecisionLevel::Full, "1", "Use div.full"),
97 clEnumValN(NVPTX::DivPrecisionLevel::IEEE754, "2",
98 "Use IEEE Compliant F32 div.rnd if available (default)"),
99 clEnumValN(NVPTX::DivPrecisionLevel::IEEE754_NoFTZ, "3",
100 "Use IEEE Compliant F32 div.rnd if available, no FTZ")),
101 cl::init(NVPTX::DivPrecisionLevel::IEEE754));
102
104 "nvptx-prec-sqrtf32", cl::Hidden,
105 cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
106 cl::init(true));
107
108/// Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it
109/// does NOT use lg2.approx for log2, so this is disabled by default.
111 "nvptx-approx-log2f32",
112 cl::desc("NVPTX Specific: whether to use lg2.approx for log2"),
113 cl::init(false));
114
116 "nvptx-force-min-byval-param-align", cl::Hidden,
117 cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
118 " params of device functions."),
119 cl::init(false));
120
123 const SDNode &N) const {
124 // If nvptx-prec-divf32=N is used on the command-line, always honor it
125 if (UsePrecDivF32.getNumOccurrences() > 0)
126 return UsePrecDivF32;
127
128 const SDNodeFlags Flags = N.getFlags();
129 if (Flags.hasApproximateFuncs())
131
133}
134
136 // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
138 return UsePrecSqrtF32;
139
140 if (N) {
141 const SDNodeFlags Flags = N->getFlags();
142 if (Flags.hasApproximateFuncs())
143 return false;
144 }
145
146 return true;
147}
148
152}
153
154static bool IsPTXVectorType(MVT VT) {
155 switch (VT.SimpleTy) {
156 default:
157 return false;
158 case MVT::v2i1:
159 case MVT::v4i1:
160 case MVT::v2i8:
161 case MVT::v4i8:
162 case MVT::v8i8: // <2 x i8x4>
163 case MVT::v16i8: // <4 x i8x4>
164 case MVT::v2i16:
165 case MVT::v4i16:
166 case MVT::v8i16: // <4 x i16x2>
167 case MVT::v2i32:
168 case MVT::v4i32:
169 case MVT::v2i64:
170 case MVT::v2f16:
171 case MVT::v4f16:
172 case MVT::v8f16: // <4 x f16x2>
173 case MVT::v2bf16:
174 case MVT::v4bf16:
175 case MVT::v8bf16: // <4 x bf16x2>
176 case MVT::v2f32:
177 case MVT::v4f32:
178 case MVT::v2f64:
179 case MVT::v4i64:
180 case MVT::v4f64:
181 case MVT::v8i32:
182 case MVT::v8f32:
183 case MVT::v16f16: // <8 x f16x2>
184 case MVT::v16bf16: // <8 x bf16x2>
185 case MVT::v16i16: // <8 x i16x2>
186 case MVT::v32i8: // <8 x i8x4>
187 return true;
188 }
189}
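// Illustrative note (editorial, not in the original source): per the switch
// above, IsPTXVectorType(MVT::v4f32) and IsPTXVectorType(MVT::v2f16) return
// true, while an unlisted type such as MVT::v3i32 falls through to the default
// case and returns false.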
190
191// When legalizing vector loads/stores, this function is called, and it does
192// two things:
193// 1. Determines whether the vector is something we want to custom lower;
194//    std::nullopt is returned if we do not want to custom lower it.
195// 2. If we do want to handle it, returns two parameters:
196// - unsigned int NumElts - The number of elements in the final vector
197// - EVT EltVT - The type of the elements in the final vector
198static std::optional<std::pair<unsigned int, MVT>>
200 unsigned AddressSpace) {
201 const bool CanLowerTo256Bit = STI.has256BitVectorLoadStore(AddressSpace);
202
203 if (CanLowerTo256Bit && VectorEVT.isScalarInteger() &&
204 VectorEVT.getSizeInBits() == 256)
205 return {{4, MVT::i64}};
206
207 if (!VectorEVT.isSimple())
208 return std::nullopt;
209 const MVT VectorVT = VectorEVT.getSimpleVT();
210
211 if (!VectorVT.isVector()) {
212 if (VectorVT == MVT::i128 || VectorVT == MVT::f128)
213 return {{2, MVT::i64}};
214 return std::nullopt;
215 }
216
217 const MVT EltVT = VectorVT.getVectorElementType();
218 const unsigned NumElts = VectorVT.getVectorNumElements();
219
220 // The size of the PTX virtual register that holds a packed type.
221 unsigned PackRegSize;
222
223 // We only handle "native" vector sizes for now, e.g. <4 x double> is not
224 // legal. We can (and should) split that into 2 stores of <2 x double> here
225 // but I'm leaving that as a TODO for now.
226 switch (VectorVT.SimpleTy) {
227 default:
228 return std::nullopt;
229 case MVT::v4i64:
230 case MVT::v4f64:
231 case MVT::v8i32:
232 // This is a "native" vector type iff the address space is global
233 // and the target supports 256-bit loads/stores
234 if (!CanLowerTo256Bit)
235 return std::nullopt;
237 case MVT::v2i8:
238 case MVT::v2i32:
239 case MVT::v2i64:
240 case MVT::v2f64:
241 case MVT::v4i32:
242 // This is a "native" vector type
243 return std::pair(NumElts, EltVT);
244 case MVT::v16f16: // <8 x f16x2>
245 case MVT::v16bf16: // <8 x bf16x2>
246 case MVT::v16i16: // <8 x i16x2>
247 case MVT::v32i8: // <8 x i8x4>
248 // This can be upsized into a "native" vector type iff the address space is
249 // global and the target supports 256-bit loads/stores.
250 if (!CanLowerTo256Bit)
251 return std::nullopt;
253 case MVT::v2i16: // <1 x i16x2>
254 case MVT::v2f16: // <1 x f16x2>
255 case MVT::v2bf16: // <1 x bf16x2>
256 case MVT::v4i8: // <1 x i8x4>
257 case MVT::v4i16: // <2 x i16x2>
258 case MVT::v4f16: // <2 x f16x2>
259 case MVT::v4bf16: // <2 x bf16x2>
260 case MVT::v8i8: // <2 x i8x4>
261 case MVT::v8f16: // <4 x f16x2>
262 case MVT::v8bf16: // <4 x bf16x2>
263 case MVT::v8i16: // <4 x i16x2>
264 case MVT::v16i8: // <4 x i8x4>
265 PackRegSize = 32;
266 break;
267 case MVT::v8f32: // <4 x f32x2>
268 if (!CanLowerTo256Bit)
269 return std::nullopt;
271 case MVT::v2f32: // <1 x f32x2>
272 case MVT::v4f32: // <2 x f32x2>
273 if (!STI.hasF32x2Instructions())
274 return std::pair(NumElts, EltVT);
275 PackRegSize = 64;
276 break;
277 }
278
279 // If we reach here, then we can pack 2 or more elements into a single 32-bit
280 // or 64-bit PTX register and treat the vector as a new vector containing
281 // packed elements.
282
283 // Number of elements to pack in one word.
284 const unsigned NPerReg = PackRegSize / EltVT.getSizeInBits();
285
286 return std::pair(NumElts / NPerReg, MVT::getVectorVT(EltVT, NPerReg));
287}
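// Illustrative examples derived from the cases above (editorial, not in the
// original source):
//   getVectorLoweringShape(v8f16, ...) -> {4, v2f16}  (PackRegSize = 32)
//   getVectorLoweringShape(v8f32, ...) -> {4, v2f32}  when the address space
//       supports 256-bit accesses and the target has f32x2 instructions
//   getVectorLoweringShape(v3i32, ...) -> std::nullopt (not custom lowered)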
288
289/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
290/// legal-ish MVTs that compose it. Unlike ComputeValueVTs, this will legalize
291/// the types as required by the calling convention (with special handling for
292/// i8s).
293/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
294/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
295/// LowerCall, and LowerReturn.
296static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
297 LLVMContext &Ctx, CallingConv::ID CallConv,
298 Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
300 uint64_t StartingOffset = 0) {
301 SmallVector<EVT, 16> TempVTs;
302 SmallVector<uint64_t, 16> TempOffsets;
303 ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
304
305 for (const auto [VT, Off] : zip(TempVTs, TempOffsets)) {
306 MVT RegisterVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
307 unsigned NumRegs = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
308
309 // Since we actually can load/store b8, we need to ensure that we'll use
310 // the original sized type for any i8s or i8 vectors.
311 if (VT.getScalarType() == MVT::i8) {
312 if (RegisterVT == MVT::i16)
313 RegisterVT = MVT::i8;
314 else if (RegisterVT == MVT::v2i16)
315 RegisterVT = MVT::v2i8;
316 else
317 assert(RegisterVT == MVT::v4i8 &&
318 "Expected v4i8, v2i16, or i16 for i8 RegisterVT");
319 }
320
321 // TODO: This is horribly incorrect for cases where the vector elements are
322 // not a multiple of bytes (ex i1) and legal or i8. However, this problem
323 // has existed for as long as NVPTX has and no one has complained, so we'll
324 // leave it for now.
325 for (unsigned I : seq(NumRegs)) {
326 ValueVTs.push_back(RegisterVT);
327 Offsets.push_back(Off + I * RegisterVT.getStoreSize());
328 }
329 }
330}
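// Illustrative example (an assumption for exposition, using the usual NVPTX
// data layout; not in the original source): for Ty = {i8, i32} this produces
// ValueVTs = {i8, i32} and Offsets = {0, 4}; the i8 keeps its original width
// instead of the promoted i16 register type so the parameter space can be
// accessed with .b8 loads/stores.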
331
332// We return an EVT that can hold N VTs
333// If the VT is a vector, the resulting EVT is a flat vector with the same
334// element type as VT's element type.
335static EVT getVectorizedVT(EVT VT, unsigned N, LLVMContext &C) {
336 if (N == 1)
337 return VT;
338
339 return VT.isVector() ? EVT::getVectorVT(C, VT.getScalarType(),
340 VT.getVectorNumElements() * N)
341 : EVT::getVectorVT(C, VT, N);
342}
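// For example (editorial note): getVectorizedVT(MVT::f32, 4, Ctx) yields
// v4f32, and getVectorizedVT(MVT::v2f16, 2, Ctx) yields the flat vector v4f16
// rather than a vector of vectors.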
343
345 const SDLoc &dl, SelectionDAG &DAG) {
346 if (V.getValueType() == VT) {
347 assert(I == 0 && "Index must be 0 for scalar value");
348 return V;
349 }
350
351 if (!VT.isVector())
352 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, V,
353 DAG.getVectorIdxConstant(I, dl));
354
355 return DAG.getNode(
356 ISD::EXTRACT_SUBVECTOR, dl, VT, V,
358}
359
360template <typename T>
361static inline SDValue getBuildVectorizedValue(unsigned N, const SDLoc &dl,
362 SelectionDAG &DAG, T GetElement) {
363 if (N == 1)
364 return GetElement(0);
365
367 for (const unsigned I : llvm::seq(N)) {
368 SDValue Val = GetElement(I);
369 if (Val.getValueType().isVector())
370 DAG.ExtractVectorElements(Val, Values);
371 else
372 Values.push_back(Val);
373 }
374
375 EVT VT = EVT::getVectorVT(*DAG.getContext(), Values[0].getValueType(),
376 Values.size());
377 return DAG.getBuildVector(VT, dl, Values);
378}
379
380/// PromoteScalarIntegerPTX
381/// Used to make sure the arguments/returns are suitable for passing
382/// and promote them to a larger size if they're not.
383///
384/// Returns the promoted type; non-integer types are returned unchanged.
386 if (VT.isScalarInteger()) {
387 switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
388 default:
390 "Promotion is not suitable for scalars of size larger than 64-bits");
391 case 1:
392 return MVT::i1;
393 case 2:
394 case 4:
395 case 8:
396 return MVT::i8;
397 case 16:
398 return MVT::i16;
399 case 32:
400 return MVT::i32;
401 case 64:
402 return MVT::i64;
403 }
404 }
405 return VT;
406}
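// Illustrative mapping implied by the switch above (editorial note): i1 -> i1,
// i2..i8 -> i8, i9..i16 -> i16, i17..i32 -> i32, i33..i64 -> i64; non-integer
// types are returned unchanged.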
407
408// Check whether we can merge loads/stores of some of the pieces of a
409// flattened function parameter or return value into a single vector
410// load/store.
411//
412// The flattened parameter is represented as a list of EVTs and
413// offsets, and the whole structure is aligned to ParamAlignment. This
414// function determines whether we can load/store pieces of the
415// parameter starting at index Idx using a single vectorized op of
416// size AccessSize. If so, it returns the number of param pieces
417// covered by the vector op. Otherwise, it returns 1.
418template <typename T>
420 unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
421 const SmallVectorImpl<T> &Offsets, Align ParamAlignment) {
422
423 // Can't vectorize if param alignment is not sufficient.
424 if (ParamAlignment < AccessSize)
425 return 1;
426 // Can't vectorize if offset is not aligned.
427 if (Offsets[Idx] & (AccessSize - 1))
428 return 1;
429
430 EVT EltVT = ValueVTs[Idx];
431 unsigned EltSize = EltVT.getStoreSize();
432
433 // Element is too large to vectorize.
434 if (EltSize >= AccessSize)
435 return 1;
436
437 unsigned NumElts = AccessSize / EltSize;
438 // Can't vectorize if AccessSize is not a multiple of EltSize.
439 if (AccessSize != EltSize * NumElts)
440 return 1;
441
442 // We don't have enough elements to vectorize.
443 if (Idx + NumElts > ValueVTs.size())
444 return 1;
445
446 // PTX ISA can only deal with 2- and 4-element vector ops.
447 if (NumElts != 4 && NumElts != 2)
448 return 1;
449
450 for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
451 // Types do not match.
452 if (ValueVTs[j] != EltVT)
453 return 1;
454
455 // Elements are not contiguous.
456 if (Offsets[j] - Offsets[j - 1] != EltSize)
457 return 1;
458 }
459 // OK. We can vectorize ValueVTs[Idx..Idx+NumElts)
460 return NumElts;
461}
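// Worked example (editorial, not in the original source): for four f32 pieces
// at offsets {0, 4, 8, 12} with a 16-byte ParamAlignment, a call with Idx = 0
// and AccessSize = 16 returns 4, so the caller can emit one v4.f32 access.
// With only an 8-byte ParamAlignment the 16-byte access is rejected
// (alignment check) and the best achievable result is 2.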
462
463// Computes whether and how we can vectorize the loads/stores of a
464// flattened function parameter or return value.
465//
466// The flattened parameter is represented as the list of ValueVTs and
467// Offsets, and is aligned to ParamAlignment bytes. We return a vector of
468// chunk sizes, one entry per vectorized load/store, whose entries sum to
469// ValueVTs.size() and indicate how many consecutive pieces each access
470// covers (1 for a scalar access, 2 or 4 for a vector access).
471template <typename T>
474 const SmallVectorImpl<T> &Offsets, Align ParamAlignment,
475 bool IsVAArg = false) {
476 // Set vector size to match ValueVTs and mark all elements as
477 // scalars by default.
478
479 if (IsVAArg)
480 return SmallVector<unsigned>(ValueVTs.size(), 1);
481
482 SmallVector<unsigned, 16> VectorInfo;
483
484 const auto GetNumElts = [&](unsigned I) -> unsigned {
485 for (const unsigned AccessSize : {16, 8, 4, 2}) {
486 const unsigned NumElts = canMergeParamLoadStoresStartingAt(
487 I, AccessSize, ValueVTs, Offsets, ParamAlignment);
488 assert((NumElts == 1 || NumElts == 2 || NumElts == 4) &&
489 "Unexpected vectorization size");
490 if (NumElts != 1)
491 return NumElts;
492 }
493 return 1;
494 };
495
496 // Check what we can vectorize using 128/64/32-bit accesses.
497 for (unsigned I = 0, E = ValueVTs.size(); I != E;) {
498 const unsigned NumElts = GetNumElts(I);
499 VectorInfo.push_back(NumElts);
500 I += NumElts;
501 }
502 assert(std::accumulate(VectorInfo.begin(), VectorInfo.end(), 0u) ==
503 ValueVTs.size());
504 return VectorInfo;
505}
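// Illustrative result (editorial note): for the four contiguous f32 pieces
// above, a 16-byte-aligned parameter yields VectorInfo = {4} (a single v4
// access) while an 8-byte-aligned one yields VectorInfo = {2, 2}; in both
// cases the entries sum to ValueVTs.size(), as the assert checks.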
506
507// NVPTXTargetLowering Constructor.
509 const NVPTXSubtarget &STI)
510 : TargetLowering(TM), nvTM(&TM), STI(STI), GlobalUniqueCallSite(0) {
511 // Always lower memset, memcpy, and memmove intrinsics to load/store
512 // instructions, rather than generating calls to memset, memcpy, or
513 // memmove.
517
520
521 // Jump is Expensive. Don't create extra control flow for 'and', 'or'
522 // condition branches.
523 setJumpIsExpensive(true);
524
525 // Wide divides are _very_ slow. Try to reduce the width of the divide if
526 // possible.
527 addBypassSlowDiv(64, 32);
528
529 // By default, use the Source scheduling
530 if (sched4reg)
532 else
534
535 auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
536 LegalizeAction NoF16Action) {
537 bool IsOpSupported = STI.allowFP16Math();
538 switch (Op) {
539 // Several FP16 instructions are available on sm_80 only.
540 case ISD::FMINNUM:
541 case ISD::FMAXNUM:
544 case ISD::FMAXIMUM:
545 case ISD::FMINIMUM:
546 case ISD::FMAXIMUMNUM:
547 case ISD::FMINIMUMNUM:
548 IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
549 break;
550 case ISD::FEXP2:
551 IsOpSupported &= STI.getSmVersion() >= 75 && STI.getPTXVersion() >= 70;
552 break;
553 }
554 setOperationAction(Op, VT, IsOpSupported ? Action : NoF16Action);
555 };
556
557 auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
558 LegalizeAction NoBF16Action) {
559 bool IsOpSupported = STI.hasNativeBF16Support(Op);
561 Op, VT, IsOpSupported ? Action : NoBF16Action);
562 };
563
564 auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
565 LegalizeAction NoI16x2Action) {
566 bool IsOpSupported = false;
567 // instructions are available on sm_90 only
568 switch (Op) {
569 case ISD::ADD:
570 case ISD::SMAX:
571 case ISD::SMIN:
572 case ISD::UMIN:
573 case ISD::UMAX:
574 IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;
575 break;
576 }
577 setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action);
578 };
579
580 addRegisterClass(MVT::i1, &NVPTX::B1RegClass);
581 addRegisterClass(MVT::i16, &NVPTX::B16RegClass);
582 addRegisterClass(MVT::v2i16, &NVPTX::B32RegClass);
583 addRegisterClass(MVT::v4i8, &NVPTX::B32RegClass);
584 addRegisterClass(MVT::i32, &NVPTX::B32RegClass);
585 addRegisterClass(MVT::i64, &NVPTX::B64RegClass);
586 addRegisterClass(MVT::f32, &NVPTX::B32RegClass);
587 addRegisterClass(MVT::f64, &NVPTX::B64RegClass);
588 addRegisterClass(MVT::f16, &NVPTX::B16RegClass);
589 addRegisterClass(MVT::v2f16, &NVPTX::B32RegClass);
590 addRegisterClass(MVT::bf16, &NVPTX::B16RegClass);
591 addRegisterClass(MVT::v2bf16, &NVPTX::B32RegClass);
592
593 if (STI.hasF32x2Instructions())
594 addRegisterClass(MVT::v2f32, &NVPTX::B64RegClass);
595
596 // Conversion to/from FP16/FP16x2 is always legal.
601
603 if (STI.getSmVersion() >= 30 && STI.getPTXVersion() > 31)
605
606 setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
607 setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
608
609 // Conversion to/from BF16/BF16x2 is always legal.
614
615 setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);
616 setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);
617 if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote)
618 AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32);
619
620 // Conversion to/from i16/i16x2 is always legal.
625
630
631 // No support for these operations with v2f32.
634 // Need custom lowering in case the index is dynamic.
635 if (STI.hasF32x2Instructions())
637
638 // Custom conversions to/from v2i8.
640
641 // Only logical ops can be done on v4i8 directly, others must be done
642 // elementwise.
659 MVT::v4i8, Expand);
660
661 // Operations not directly supported by NVPTX.
662 for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
663 MVT::v2f32, MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16,
664 MVT::v4i8, MVT::i32, MVT::i64}) {
667 }
668
669 // Not directly supported. TLI would attempt to expand operations like
670 // FMINIMUM(v2f32) using invalid SETCC and VSELECT nodes.
672
673 // Some SIGN_EXTEND_INREG can be done using cvt instruction.
674 // For others we will expand to a SHL/SRA pair.
681
688
691
693 {MVT::i8, MVT::i16, MVT::v2i16, MVT::i32, MVT::i64},
694 Expand);
695
696 if (STI.hasHWROT32()) {
699 Custom);
700 }
701
703
706
707 // We want to legalize constant-related memmove and memcpy
708 // intrinsics.
710
711 // FP extload/truncstore is not legal in PTX. We need to expand all these.
712 for (auto FloatVTs :
714 for (MVT ValVT : FloatVTs) {
715 for (MVT MemVT : FloatVTs) {
716 setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Expand);
717 setTruncStoreAction(ValVT, MemVT, Expand);
718 }
719 }
720 }
721
722 // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
723 // how they'll be lowered in ISel anyway, and by doing this a little earlier
724 // we allow for more DAG combine opportunities.
725 for (auto IntVTs :
727 for (MVT ValVT : IntVTs)
728 for (MVT MemVT : IntVTs)
729 if (isTypeLegal(ValVT))
730 setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Custom);
731
732 // PTX does not support load/store of predicate registers
734 for (MVT VT : MVT::integer_valuetypes()) {
736 Promote);
737 setTruncStoreAction(VT, MVT::i1, Expand);
738 }
739
740 // Disable generation of extload/truncstore for v2i16/v2i8. The generic
741 // expansion for these nodes when they are unaligned is incorrect if the
742 // type is a vector.
743 //
744 // TODO: Fix the generic expansion for these nodes found in
745 // TargetLowering::expandUnalignedLoad/Store.
747 MVT::v2i8, Expand);
748 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
749
750 // Register custom handling for illegal type loads/stores. We'll try to custom
751 // lower almost all illegal types and logic in the lowering will discard cases
752 // we can't handle.
753 setOperationAction({ISD::LOAD, ISD::STORE}, {MVT::i128, MVT::f128}, Custom);
755 if (!isTypeLegal(VT) && VT.getStoreSizeInBits() <= 256)
757
758 // Custom legalization for LDU intrinsics.
759 // TODO: The logic to lower these is not very robust and we should rewrite it.
760 // Perhaps LDU should not be represented as an intrinsic at all.
763 if (IsPTXVectorType(VT))
765
769 MVT::i1, Expand);
770
771 // This is legal in NVPTX
776
777 setOperationAction(ISD::DYNAMIC_STACKALLOC, {MVT::i32, MVT::i64}, Custom);
779
780 // TRAP can be lowered to PTX trap
781 setOperationAction(ISD::TRAP, MVT::Other, Legal);
782 // DEBUGTRAP can be lowered to PTX brkpt
784
785 // Support varargs.
790
792 {MVT::i16, MVT::i32, MVT::i64}, Legal);
793
795 Promote);
798
799 setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
800 setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
801 setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);
802 setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);
803 setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);
804 setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);
805 setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);
806
807 setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);
808 setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);
809 setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);
810 setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);
811 setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);
812 setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);
813
814 // Other arithmetic and logic ops are unsupported.
818 MVT::v2i16, Expand);
819
824 if (STI.getPTXVersion() >= 43) {
829 }
830
832 setOperationAction(ISD::CTTZ, MVT::v2i16, Expand);
835
836 // PTX does not directly support SELP of i1, so promote to i32 first
838
839 // PTX cannot multiply two i64s in a single instruction.
842
843 // We have some custom DAG combine patterns for these nodes
848
849 // setcc for f16x2 and bf16x2 needs special handling to prevent
850 // legalizer's attempt to scalarize it due to v2i1 not being legal.
851 if (STI.allowFP16Math() || STI.hasBF16Math())
853
854 // Vector reduction operations. These may be turned into shuffle or tree
855 // reductions depending on what instructions are available for each type.
857 MVT EltVT = VT.getVectorElementType();
858 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
861 VT, Custom);
862 }
863 }
864
865 // Promote fp16 arithmetic if fp16 hardware isn't available or the
866 // user passed --nvptx-no-fp16-math. The flag is useful because,
867 // although sm_53+ GPUs have some sort of FP16 support in
868 // hardware, only sm_53 and sm_60 have a full implementation. Others
869 // only have a token amount of hardware and are likely to run faster
870 // by using fp32 units instead.
871 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
872 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
873 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
874 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
875 // bf16 must be promoted to f32.
876 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
877 if (getOperationAction(Op, MVT::bf16) == Promote)
878 AddPromotedToType(Op, MVT::bf16, MVT::f32);
879 setOperationAction(Op, MVT::v2f32,
881 }
882
883 // On SM80, we select add/mul/sub as fma to avoid promotion to float
884 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB}) {
885 for (const auto &VT : {MVT::bf16, MVT::v2bf16}) {
888 }
889 }
890 }
891
892 // f16/f16x2 neg was introduced in PTX 60, SM_53.
893 const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
894 STI.getPTXVersion() >= 60 &&
895 STI.allowFP16Math();
896 for (const auto &VT : {MVT::f16, MVT::v2f16})
898 IsFP16FP16x2NegAvailable ? Legal : Expand);
899
900 setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);
901 setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);
902 setOperationAction(ISD::FNEG, MVT::v2f32, Expand);
903 // (would be) Library functions.
904
905 // These map to conversion instructions for scalar FP types.
906 for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
908 setOperationAction(Op, MVT::f16, Legal);
909 setOperationAction(Op, MVT::f32, Legal);
910 setOperationAction(Op, MVT::f64, Legal);
911 setOperationAction(Op, MVT::v2f16, Expand);
912 setOperationAction(Op, MVT::v2bf16, Expand);
913 setOperationAction(Op, MVT::v2f32, Expand);
914 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
915 if (getOperationAction(Op, MVT::bf16) == Promote)
916 AddPromotedToType(Op, MVT::bf16, MVT::f32);
917 }
918
919 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71) {
921 }
922 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
923 for (MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) {
926 }
927 }
928
929 // Expand v2f32 = fp_extend
931 // Expand v2[b]f16 = fp_round v2f32
932 setOperationAction(ISD::FP_ROUND, {MVT::v2bf16, MVT::v2f16}, Expand);
933
934 // sm_80 only has conversions between f32 and bf16. Custom lower all other
935 // bf16 conversions.
936 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
937 for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {
940 VT, Custom);
941 }
944 MVT::bf16, Custom);
945 }
946
953 AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32);
954
955 // 'Expand' implements FCOPYSIGN without calling an external library.
962
963 // These map to corresponding instructions for f32/f64. f16 must be
964 // promoted to f32. v2f16 is expanded to f16, which is then promoted
965 // to f32.
966 for (const auto &Op :
968 setOperationAction(Op, MVT::f16, Promote);
969 setOperationAction(Op, MVT::f32, Legal);
970 // only div/rem/sqrt are legal for f64
971 if (Op == ISD::FDIV || Op == ISD::FREM || Op == ISD::FSQRT) {
972 setOperationAction(Op, MVT::f64, Legal);
973 }
974 setOperationAction(Op, {MVT::v2f16, MVT::v2bf16, MVT::v2f32}, Expand);
975 setOperationAction(Op, MVT::bf16, Promote);
976 AddPromotedToType(Op, MVT::bf16, MVT::f32);
977 }
978 setOperationAction(ISD::FREM, {MVT::f32, MVT::f64}, Custom);
979
980 setOperationAction(ISD::FABS, {MVT::f32, MVT::f64}, Legal);
981 setOperationAction(ISD::FABS, MVT::v2f32, Expand);
982 if (STI.getPTXVersion() >= 65) {
983 setFP16OperationAction(ISD::FABS, MVT::f16, Legal, Promote);
984 setFP16OperationAction(ISD::FABS, MVT::v2f16, Legal, Expand);
985 } else {
987 setOperationAction(ISD::FABS, MVT::v2f16, Expand);
988 }
989 setBF16OperationAction(ISD::FABS, MVT::v2bf16, Legal, Expand);
990 setBF16OperationAction(ISD::FABS, MVT::bf16, Legal, Promote);
991 if (getOperationAction(ISD::FABS, MVT::bf16) == Promote)
992 AddPromotedToType(ISD::FABS, MVT::bf16, MVT::f32);
993
994 for (const auto &Op :
996 setOperationAction(Op, MVT::f32, Legal);
997 setOperationAction(Op, MVT::f64, Legal);
998 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
999 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
1000 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
1001 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
1002 if (getOperationAction(Op, MVT::bf16) == Promote)
1003 AddPromotedToType(Op, MVT::bf16, MVT::f32);
1004 setOperationAction(Op, MVT::v2f32, Expand);
1005 }
1006 bool SupportsF32MinMaxNaN =
1007 STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
1008 for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
1009 setOperationAction(Op, MVT::f32, SupportsF32MinMaxNaN ? Legal : Expand);
1010 setFP16OperationAction(Op, MVT::f16, Legal, Expand);
1011 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
1012 setBF16OperationAction(Op, MVT::bf16, Legal, Expand);
1013 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
1014 setOperationAction(Op, MVT::v2f32, Expand);
1015 }
1016
1017 // Custom lowering for inline asm with 128-bit operands
1020
1021 // FEXP2 support:
1022 // - f32
1023 // - f16/f16x2 (sm_70+, PTX 7.0+)
1024 // - bf16/bf16x2 (sm_90+, PTX 7.8+)
1025 // When f16/bf16 types aren't supported, they are promoted/expanded to f32.
1027 setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
1028 setFP16OperationAction(ISD::FEXP2, MVT::f16, Legal, Promote);
1029 setFP16OperationAction(ISD::FEXP2, MVT::v2f16, Legal, Expand);
1030 setBF16OperationAction(ISD::FEXP2, MVT::bf16, Legal, Promote);
1031 setBF16OperationAction(ISD::FEXP2, MVT::v2bf16, Legal, Expand);
1032
1033 // FLOG2 supports f32 only
1034 // f16/bf16 types aren't supported, but they are promoted/expanded to f32.
1035 if (UseApproxLog2F32) {
1037 setOperationPromotedToType(ISD::FLOG2, MVT::f16, MVT::f32);
1038 setOperationPromotedToType(ISD::FLOG2, MVT::bf16, MVT::f32);
1039 setOperationAction(ISD::FLOG2, {MVT::v2f16, MVT::v2bf16, MVT::v2f32},
1040 Expand);
1041 }
1042
1043 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
1044
1045 setOperationAction(ISD::ATOMIC_LOAD_SUB, {MVT::i32, MVT::i64}, Expand);
1046
1047 // atom.b128 is legal in PTX but since we don't represent i128 as a legal
1048 // type, we need to custom lower it.
1050 Custom);
1051
1052 // Now deduce the information based on the above mentioned
1053 // actions
1055
1056 // PTX support for 16-bit CAS is emulated. Only use 32+
1060
1061 // Custom lowering for tcgen05.ld vector operands
1063 {MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32,
1064 MVT::v32i32, MVT::v64i32, MVT::v128i32},
1065 Custom);
1066
1067 // Custom lowering for tcgen05.st vector operands
1069 {MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32,
1070 MVT::v32i32, MVT::v64i32, MVT::v128i32},
1071 Custom);
1072
1073 // Enable custom lowering for the following:
1074 // * MVT::i128 - clusterlaunchcontrol
1075 // * MVT::i32 - prmt
1076 // * MVT::Other - internal.addrspace.wrap
1077 setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::i32, MVT::i128, MVT::Other},
1078 Custom);
1079}
1080
1081const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
1082
1083#define MAKE_CASE(V) \
1084 case V: \
1085 return #V;
1086
1087 switch ((NVPTXISD::NodeType)Opcode) {
1089 break;
1090
1133 }
1134 return nullptr;
1135
1136#undef MAKE_CASE
1137}
1138
1141 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1142 VT.getScalarType() == MVT::i1)
1143 return TypeSplitVector;
1145}
1146
1148 int Enabled, int &ExtraSteps,
1149 bool &UseOneConst,
1150 bool Reciprocal) const {
1153 return SDValue();
1154
1155 if (ExtraSteps == ReciprocalEstimate::Unspecified)
1156 ExtraSteps = 0;
1157
1158 SDLoc DL(Operand);
1159 EVT VT = Operand.getValueType();
1160 bool Ftz = useF32FTZ(DAG.getMachineFunction());
1161
1162 auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1163 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1164 DAG.getConstant(IID, DL, MVT::i32), Operand);
1165 };
1166
1167 // The sqrt and rsqrt refinement processes assume we always start out with an
1168 // approximation of the rsqrt. Therefore, if we're going to do any refinement
1169 // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
1170 // any refinement, we must return a regular sqrt.
1171 if (Reciprocal || ExtraSteps > 0) {
1172 if (VT == MVT::f32)
1173 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1174 : Intrinsic::nvvm_rsqrt_approx_f);
1175 else if (VT == MVT::f64)
1176 return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1177 else
1178 return SDValue();
1179 } else {
1180 if (VT == MVT::f32)
1181 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1182 : Intrinsic::nvvm_sqrt_approx_f);
1183 else {
1184 // There's no sqrt.approx.f64 instruction, so we emit
1185 // reciprocal(rsqrt(x)). This is faster than
1186 // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
1187 // x * rsqrt(x).)
1188 return DAG.getNode(
1190 DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1191 MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1192 }
1193 }
1194}
1195
1197 const DataLayout &DL, Type *RetTy, const ArgListTy &Args,
1199 std::optional<unsigned> FirstVAArg, const CallBase &CB,
1200 unsigned UniqueCallSite) const {
1201 auto PtrVT = getPointerTy(DL);
1202
1203 std::string Prototype;
1204 raw_string_ostream O(Prototype);
1205 O << "prototype_" << UniqueCallSite << " : .callprototype ";
1206
1207 if (RetTy->isVoidTy()) {
1208 O << "()";
1209 } else {
1210 O << "(";
1211 if (shouldPassAsArray(RetTy)) {
1212 const Align RetAlign = getArgumentAlignment(&CB, RetTy, 0, DL);
1213 O << ".param .align " << RetAlign.value() << " .b8 _["
1214 << DL.getTypeAllocSize(RetTy) << "]";
1215 } else if (RetTy->isFloatingPointTy() || RetTy->isIntegerTy()) {
1216 unsigned size = 0;
1217 if (auto *ITy = dyn_cast<IntegerType>(RetTy)) {
1218 size = ITy->getBitWidth();
1219 } else {
1220 assert(RetTy->isFloatingPointTy() &&
1221 "Floating point type expected here");
1222 size = RetTy->getPrimitiveSizeInBits();
1223 }
1224 // PTX ABI requires all scalar return values to be at least 32
1225 // bits in size. fp16 normally uses .b16 as its storage type in
1226 // PTX, so its size must be adjusted here, too.
1228
1229 O << ".param .b" << size << " _";
1230 } else if (isa<PointerType>(RetTy)) {
1231 O << ".param .b" << PtrVT.getSizeInBits() << " _";
1232 } else {
1233 llvm_unreachable("Unknown return type");
1234 }
1235 O << ") ";
1236 }
1237 O << "_ (";
1238
1239 bool first = true;
1240
1241 const unsigned NumArgs = FirstVAArg.value_or(Args.size());
1242 auto AllOuts = ArrayRef(Outs);
1243 for (const unsigned I : llvm::seq(NumArgs)) {
1244 const auto ArgOuts =
1245 AllOuts.take_while([I](auto O) { return O.OrigArgIndex == I; });
1246 AllOuts = AllOuts.drop_front(ArgOuts.size());
1247
1248 Type *Ty = Args[I].Ty;
1249 if (!first) {
1250 O << ", ";
1251 }
1252 first = false;
1253
1254 if (ArgOuts[0].Flags.isByVal()) {
1255 // Indirect calls need strict ABI alignment so we disable optimizations by
1256 // not providing a function to optimize.
1257 Type *ETy = Args[I].IndirectType;
1258 Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1259 Align ParamByValAlign =
1260 getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL);
1261
1262 O << ".param .align " << ParamByValAlign.value() << " .b8 _["
1263 << ArgOuts[0].Flags.getByValSize() << "]";
1264 } else {
1265 if (shouldPassAsArray(Ty)) {
1266 Align ParamAlign =
1267 getArgumentAlignment(&CB, Ty, I + AttributeList::FirstArgIndex, DL);
1268 O << ".param .align " << ParamAlign.value() << " .b8 _["
1269 << DL.getTypeAllocSize(Ty) << "]";
1270 continue;
1271 }
1272 // i8 types in IR will be i16 types in SDAG
1273 assert((getValueType(DL, Ty) == ArgOuts[0].VT ||
1274 (getValueType(DL, Ty) == MVT::i8 && ArgOuts[0].VT == MVT::i16)) &&
1275 "type mismatch between callee prototype and arguments");
1276 // scalar type
1277 unsigned sz = 0;
1278 if (auto *ITy = dyn_cast<IntegerType>(Ty)) {
1279 sz = promoteScalarArgumentSize(ITy->getBitWidth());
1280 } else if (isa<PointerType>(Ty)) {
1281 sz = PtrVT.getSizeInBits();
1282 } else {
1283 sz = Ty->getPrimitiveSizeInBits();
1284 }
1285 O << ".param .b" << sz << " _";
1286 }
1287 }
1288
1289 if (FirstVAArg)
1290 O << (first ? "" : ",") << " .param .align "
1291 << STI.getMaxRequiredAlignment() << " .b8 _[]";
1292 O << ")";
1293 if (shouldEmitPTXNoReturn(&CB, *nvTM))
1294 O << " .noreturn";
1295 O << ";";
1296
1297 return Prototype;
1298}
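// Illustrative output (editorial sketch, assuming a callee with C prototype
// "float callee(int, double)" and UniqueCallSite == 0; the exact string is not
// spelled out elsewhere in this file):
//   prototype_0 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .b64 _);
// A byval or aggregate argument would instead appear as
// ".param .align <N> .b8 _[<size>]", and ".noreturn" is appended when
// shouldEmitPTXNoReturn is true.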
1299
1301 const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const {
1302 return getAlign(*F, Idx).value_or(getFunctionParamOptimizedAlign(F, Ty, DL));
1303}
1304
1305Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
1306 unsigned Idx,
1307 const DataLayout &DL) const {
1308 if (!CB) {
1309 // CallSite is null, fall back to ABI type alignment
1310 return DL.getABITypeAlign(Ty);
1311 }
1312
1313 const Function *DirectCallee = CB->getCalledFunction();
1314
1315 if (!DirectCallee) {
1316 // We don't have a direct function symbol, but that may be because of
1317 // constant cast instructions in the call.
1318
1319 // With bitcast'd call targets, the instruction will be the call
1320 if (const auto *CI = dyn_cast<CallInst>(CB)) {
1321 // Check if we have call alignment metadata
1322 if (MaybeAlign StackAlign = getAlign(*CI, Idx))
1323 return StackAlign.value();
1324 }
1325 DirectCallee = getMaybeBitcastedCallee(CB);
1326 }
1327
1328 // Check for function alignment information if we found that the
1329 // ultimate target is a Function
1330 if (DirectCallee)
1331 return getFunctionArgumentAlignment(DirectCallee, Ty, Idx, DL);
1332
1333 // Call is indirect, fall back to the ABI type alignment
1334 return DL.getABITypeAlign(Ty);
1335}
1336
1338 const GlobalAddressSDNode *Func) {
1339 if (!Func)
1340 return false;
1341 if (auto *CalleeFunc = dyn_cast<Function>(Func->getGlobal()))
1342 return CB->getFunctionType() != CalleeFunc->getFunctionType();
1343 return false;
1344}
1345
1347 const DataLayout &DL,
1348 const TargetLowering &TL) {
1349 if (Ptr->getOpcode() == ISD::FrameIndex) {
1350 auto Ty = TL.getPointerTy(DL, ADDRESS_SPACE_LOCAL);
1353
1355 }
1356
1357 // Peel off an addrspacecast to generic and load directly from the specific
1358 // address space.
1359 if (Ptr->getOpcode() == ISD::ADDRSPACECAST) {
1360 const auto *ASC = cast<AddrSpaceCastSDNode>(Ptr);
1361 if (ASC->getDestAddressSpace() == ADDRESS_SPACE_GENERIC) {
1362 Ptr = ASC->getOperand(0);
1363 return MachinePointerInfo(ASC->getSrcAddressSpace());
1364 }
1365 }
1366
1367 return MachinePointerInfo();
1368}
1369
1371 if (Flags.isSExt())
1372 return ISD::SIGN_EXTEND;
1373 if (Flags.isZExt())
1374 return ISD::ZERO_EXTEND;
1375 return ISD::ANY_EXTEND;
1376}
1377
1379 ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
1380 SDLoc dl) {
1381 const EVT ActualVT = V.getValueType();
1382 assert((ActualVT == ExpectedVT ||
1383 (ExpectedVT.isInteger() && ActualVT.isInteger())) &&
1384 "Non-integer argument type size mismatch");
1385 if (ExpectedVT.bitsGT(ActualVT))
1386 return DAG.getNode(getExtOpcode(Flags), dl, ExpectedVT, V);
1387 if (ExpectedVT.bitsLT(ActualVT))
1388 return DAG.getNode(ISD::TRUNCATE, dl, ExpectedVT, V);
1389
1390 return V;
1391}
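// For example (editorial note): widening an i16 value to an expected i32 uses
// SIGN_EXTEND, ZERO_EXTEND, or ANY_EXTEND depending on the sext/zext argument
// flags, while an over-wide value is truncated to the expected type.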
1392
1394 SmallVectorImpl<SDValue> &InVals) const {
1395
1396 if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30))
1398 "Support for variadic functions (unsized array parameter) introduced "
1399 "in PTX ISA version 6.0 and requires target sm_30.");
1400
1401 SelectionDAG &DAG = CLI.DAG;
1402 SDLoc dl = CLI.DL;
1403 const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1404 SDValue Callee = CLI.Callee;
1405 ArgListTy &Args = CLI.getArgs();
1406 Type *RetTy = CLI.RetTy;
1407 const CallBase *CB = CLI.CB;
1408 const DataLayout &DL = DAG.getDataLayout();
1409 LLVMContext &Ctx = *DAG.getContext();
1410
1411 const auto GetI32 = [&](const unsigned I) {
1412 return DAG.getConstant(I, dl, MVT::i32);
1413 };
1414
1415 const unsigned UniqueCallSite = GlobalUniqueCallSite++;
1416 const SDValue CallChain = CLI.Chain;
1417 const SDValue StartChain =
1418 DAG.getCALLSEQ_START(CallChain, UniqueCallSite, 0, dl);
1419 SDValue DeclareGlue = StartChain.getValue(1);
1420
1421 SmallVector<SDValue, 16> CallPrereqs{StartChain};
1422
1423 const auto MakeDeclareScalarParam = [&](SDValue Symbol, unsigned Size) {
1424 // PTX ABI requires integral types to be at least 32 bits in size. FP16 is
1425 // loaded/stored using i16, so it's handled here as well.
1426 const unsigned SizeBits = promoteScalarArgumentSize(Size * 8);
1427 SDValue Declare =
1428 DAG.getNode(NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue},
1429 {StartChain, Symbol, GetI32(SizeBits), DeclareGlue});
1430 CallPrereqs.push_back(Declare);
1431 DeclareGlue = Declare.getValue(1);
1432 return Declare;
1433 };
1434
1435 const auto MakeDeclareArrayParam = [&](SDValue Symbol, Align Align,
1436 unsigned Size) {
1437 SDValue Declare = DAG.getNode(
1438 NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue},
1439 {StartChain, Symbol, GetI32(Align.value()), GetI32(Size), DeclareGlue});
1440 CallPrereqs.push_back(Declare);
1441 DeclareGlue = Declare.getValue(1);
1442 return Declare;
1443 };
1444
1445 // Variadic arguments.
1446 //
1447 // Normally, for each argument, we declare a param scalar or a param
1448 // byte array in the .param space, and store the argument value to that
1449 // param scalar or array starting at offset 0.
1450 //
1451 // In the case of the first variadic argument, we declare a vararg byte array
1452 // with size 0. The exact size of this array isn't known at this point, so
1453 // it'll be patched later. All the variadic arguments will be stored to this
1454 // array at a certain offset (which gets tracked by 'VAOffset'). The offset is
1455 // initially set to 0, so it can be used for non-variadic arguments (which use
1456 // 0 offset) to simplify the code.
1457 //
1458 // After all varargs are processed, 'VAOffset' holds the size of the
1459 // vararg byte array.
1460 assert((CLI.IsVarArg || CLI.Args.size() == CLI.NumFixedArgs) &&
1461 "Non-VarArg function with extra arguments");
1462
1463 const unsigned FirstVAArg = CLI.NumFixedArgs; // position of first variadic
1464 unsigned VAOffset = 0; // current offset in the param array
1465
1466 const SDValue VADeclareParam =
1467 CLI.Args.size() > FirstVAArg
1468 ? MakeDeclareArrayParam(getCallParamSymbol(DAG, FirstVAArg, MVT::i32),
1470 : SDValue();
1471
1472 // Args.size() and Outs.size() need not match.
1473 // Outs.size() will be larger
1474 // * if there is an aggregate argument with multiple fields (each field
1475 // showing up separately in Outs)
1476 // * if there is a vector argument with more than typical vector-length
1477 // elements (generally if more than 4) where each vector element is
1478 // individually present in Outs.
1479 // So a different index should be used for indexing into Outs/OutVals.
1480 // See similar issue in LowerFormalArguments.
1481 auto AllOuts = ArrayRef(CLI.Outs);
1482 auto AllOutVals = ArrayRef(CLI.OutVals);
1483 assert(AllOuts.size() == AllOutVals.size() &&
1484 "Outs and OutVals must be the same size");
1485 // Declare the .param or .reg variables needed to pass values
1486 // to the function.
1487 for (const auto E : llvm::enumerate(Args)) {
1488 const auto ArgI = E.index();
1489 const auto Arg = E.value();
1490 const auto ArgOuts =
1491 AllOuts.take_while([&](auto O) { return O.OrigArgIndex == ArgI; });
1492 const auto ArgOutVals = AllOutVals.take_front(ArgOuts.size());
1493 AllOuts = AllOuts.drop_front(ArgOuts.size());
1494 AllOutVals = AllOutVals.drop_front(ArgOuts.size());
1495
1496 const bool IsVAArg = (ArgI >= FirstVAArg);
1497 const bool IsByVal = Arg.IsByVal;
1498
1499 const SDValue ParamSymbol =
1500 getCallParamSymbol(DAG, IsVAArg ? FirstVAArg : ArgI, MVT::i32);
1501
1502 assert((!IsByVal || Arg.IndirectType) &&
1503 "byval arg must have indirect type");
1504 Type *ETy = (IsByVal ? Arg.IndirectType : Arg.Ty);
1505
1506 const Align ArgAlign = [&]() {
1507 if (IsByVal) {
1508 // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1509 // so we don't need to worry whether it's naturally aligned or not.
1510 // See TargetLowering::LowerCallTo().
1511 const Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1513 InitialAlign, DL);
1514 }
1515 return getArgumentAlignment(CB, Arg.Ty, ArgI + 1, DL);
1516 }();
1517
1518 const unsigned TySize = DL.getTypeAllocSize(ETy);
1519 assert((!IsByVal || TySize == ArgOuts[0].Flags.getByValSize()) &&
1520 "type size mismatch");
1521
1522 const SDValue ArgDeclare = [&]() {
1523 if (IsVAArg)
1524 return VADeclareParam;
1525
1526 if (IsByVal || shouldPassAsArray(Arg.Ty))
1527 return MakeDeclareArrayParam(ParamSymbol, ArgAlign, TySize);
1528
1529 assert(ArgOuts.size() == 1 && "We must pass only one value as non-array");
1530 assert((ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint()) &&
1531 "Only int and float types are supported as non-array arguments");
1532
1533 return MakeDeclareScalarParam(ParamSymbol, TySize);
1534 }();
1535
1536 if (IsByVal) {
1537 assert(ArgOutVals.size() == 1 && "We must pass only one value as byval");
1538 SDValue SrcPtr = ArgOutVals[0];
1539 const auto PointerInfo = refinePtrAS(SrcPtr, DAG, DL, *this);
1540 const Align BaseSrcAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1541
1542 if (IsVAArg)
1543 VAOffset = alignTo(VAOffset, ArgAlign);
1544
1545 SmallVector<EVT, 4> ValueVTs, MemVTs;
1547 ComputeValueVTs(*this, DL, ETy, ValueVTs, &MemVTs, &Offsets);
1548
1549 unsigned J = 0;
1550 const auto VI = VectorizePTXValueVTs(MemVTs, Offsets, ArgAlign, IsVAArg);
1551 for (const unsigned NumElts : VI) {
1552 EVT LoadVT = getVectorizedVT(MemVTs[J], NumElts, Ctx);
1553 Align SrcAlign = commonAlignment(BaseSrcAlign, Offsets[J]);
1554 SDValue SrcAddr = DAG.getObjectPtrOffset(dl, SrcPtr, Offsets[J]);
1555 SDValue SrcLoad =
1556 DAG.getLoad(LoadVT, dl, CallChain, SrcAddr, PointerInfo, SrcAlign);
1557
1558 TypeSize ParamOffset = Offsets[J].getWithIncrement(VAOffset);
1559 Align ParamAlign = commonAlignment(ArgAlign, ParamOffset);
1560 SDValue ParamAddr =
1561 DAG.getObjectPtrOffset(dl, ParamSymbol, ParamOffset);
1562 SDValue StoreParam =
1563 DAG.getStore(ArgDeclare, dl, SrcLoad, ParamAddr,
1565 CallPrereqs.push_back(StoreParam);
1566
1567 J += NumElts;
1568 }
1569 if (IsVAArg)
1570 VAOffset += TySize;
1571 } else {
1574 ComputePTXValueVTs(*this, DL, Ctx, CLI.CallConv, Arg.Ty, VTs, Offsets,
1575 VAOffset);
1576 assert(VTs.size() == Offsets.size() && "Size mismatch");
1577 assert(VTs.size() == ArgOuts.size() && "Size mismatch");
1578
1579 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1580 // than 32-bits are sign extended or zero extended, depending on
1581 // whether they are signed or unsigned types. This case applies
1582 // only to scalar parameters and not to aggregate values.
1583 const bool ExtendIntegerParam =
1584 Arg.Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Arg.Ty) < 32;
1585
1586 const auto GetStoredValue = [&](const unsigned I) {
1587 SDValue StVal = ArgOutVals[I];
1589 StVal.getValueType() &&
1590 "OutVal type should always be legal");
1591
1592 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
1593 const EVT StoreVT =
1594 ExtendIntegerParam ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
1595
1596 return correctParamType(StVal, StoreVT, ArgOuts[I].Flags, DAG, dl);
1597 };
1598
1599 unsigned J = 0;
1600 const auto VI = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
1601 for (const unsigned NumElts : VI) {
1602 const EVT EltVT = promoteScalarIntegerPTX(VTs[J]);
1603
1604 unsigned Offset;
1605 if (IsVAArg) {
1606 // TODO: We may need to support vector types that can be passed
1607 // as scalars in variadic arguments.
1608 assert(NumElts == 1 &&
1609 "Vectorization should be disabled for vaargs.");
1610
1611 // Align each part of the variadic argument to its type.
1612 VAOffset = alignTo(VAOffset, DAG.getEVTAlign(EltVT));
1613 Offset = VAOffset;
1614
1615 const EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
1616 VAOffset += DL.getTypeAllocSize(TheStoreType.getTypeForEVT(Ctx));
1617 } else {
1618 assert(VAOffset == 0 && "VAOffset must be 0 for non-VA args");
1619 Offset = Offsets[J];
1620 }
1621
1622 SDValue Ptr =
1623 DAG.getObjectPtrOffset(dl, ParamSymbol, TypeSize::getFixed(Offset));
1624
1625 const MaybeAlign CurrentAlign = ExtendIntegerParam
1626 ? MaybeAlign(std::nullopt)
1627 : commonAlignment(ArgAlign, Offset);
1628
1629 SDValue Val =
1630 getBuildVectorizedValue(NumElts, dl, DAG, [&](unsigned K) {
1631 return GetStoredValue(J + K);
1632 });
1633
1634 SDValue StoreParam =
1635 DAG.getStore(ArgDeclare, dl, Val, Ptr,
1637 CallPrereqs.push_back(StoreParam);
1638
1639 J += NumElts;
1640 }
1641 }
1642 }
1643
1644 // Handle Result
1645 if (!Ins.empty()) {
1646 const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
1647 const unsigned ResultSize = DL.getTypeAllocSize(RetTy);
1648 if (shouldPassAsArray(RetTy)) {
1649 const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
1650 MakeDeclareArrayParam(RetSymbol, RetAlign, ResultSize);
1651 } else {
1652 MakeDeclareScalarParam(RetSymbol, ResultSize);
1653 }
1654 }
1655
1656 // Set the size of the vararg param byte array if the callee is a variadic
1657 // function and the variadic part is not empty.
1658 if (VADeclareParam) {
1659 SDValue DeclareParamOps[] = {VADeclareParam.getOperand(0),
1660 VADeclareParam.getOperand(1),
1661 VADeclareParam.getOperand(2), GetI32(VAOffset),
1662 VADeclareParam.getOperand(4)};
1663 DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),
1664 VADeclareParam->getVTList(), DeclareParamOps);
1665 }
1666
1667 const auto *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1668 // If the type of the callsite does not match that of the function, convert
1669 // the callsite to an indirect call.
1670 const bool ConvertToIndirectCall = shouldConvertToIndirectCall(CB, Func);
1671
1672 // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1673 // between them we must rely on the call site value which is valid for
1674 // indirect calls but is always null for libcalls.
1675 const bool IsIndirectCall = (!Func && CB) || ConvertToIndirectCall;
1676
1677 if (isa<ExternalSymbolSDNode>(Callee)) {
1678 Function* CalleeFunc = nullptr;
1679
1680 // Try to find the callee in the current module.
1681 Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
1682 assert(CalleeFunc != nullptr && "Libcall callee must be set.");
1683
1684 // Set the "libcall callee" attribute to indicate that the function
1685 // must always have a declaration.
1686 CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
1687 }
1688
1689 if (IsIndirectCall) {
1690 // This is the indirect function call case: PTX requires a prototype of the
1691 // form
1692 // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1693 // to be emitted, and the label has to be used as the last arg of the call
1694 // instruction.
1695 // The prototype is embedded in a string and put as the operand for a
1696 // CallPrototype SDNode which will print out to the value of the string.
1697 const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
1698 std::string Proto =
1699 getPrototype(DL, RetTy, Args, CLI.Outs,
1700 HasVAArgs ? std::optional(FirstVAArg) : std::nullopt, *CB,
1701 UniqueCallSite);
1702 const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
1703 const SDValue PrototypeDeclare = DAG.getNode(
1704 NVPTXISD::CallPrototype, dl, MVT::Other,
1705 {StartChain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32)});
1706 CallPrereqs.push_back(PrototypeDeclare);
1707 }
1708
1709 const unsigned Proto = IsIndirectCall ? UniqueCallSite : 0;
1710 const unsigned NumArgs =
1711 std::min<unsigned>(CLI.NumFixedArgs + 1, Args.size());
1712 /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns,
1713 /// NumParams, Callee, Proto)
1714 const SDValue CallToken = DAG.getTokenFactor(dl, CallPrereqs);
1715 const SDValue Call = DAG.getNode(
1716 NVPTXISD::CALL, dl, MVT::Other,
1717 {CallToken, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall),
1718 GetI32(Ins.empty() ? 0 : 1), GetI32(NumArgs), Callee, GetI32(Proto)});
1719
1720 SmallVector<SDValue, 16> LoadChains{Call};
1721 SmallVector<SDValue, 16> ProxyRegOps;
1722 if (!Ins.empty()) {
1725 ComputePTXValueVTs(*this, DL, Ctx, CLI.CallConv, RetTy, VTs, Offsets);
1726 assert(VTs.size() == Ins.size() && "Bad value decomposition");
1727
1728 const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
1729 const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
1730
1731 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
1732 // 32-bits are sign extended or zero extended, depending on whether
1733 // they are signed or unsigned types.
1734 const bool ExtendIntegerRetVal =
1735 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
1736
1737 unsigned I = 0;
1738 const auto VI = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
1739 for (const unsigned NumElts : VI) {
1740 const MaybeAlign CurrentAlign =
1741 ExtendIntegerRetVal ? MaybeAlign(std::nullopt)
1742 : commonAlignment(RetAlign, Offsets[I]);
1743
1744 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
1745 const EVT LoadVT =
1746 ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
1747 const EVT VecVT = getVectorizedVT(LoadVT, NumElts, Ctx);
1748 SDValue Ptr =
1749 DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));
1750
1751 SDValue R =
1752 DAG.getLoad(VecVT, dl, Call, Ptr,
1754
1755 LoadChains.push_back(R.getValue(1));
1756 for (const unsigned J : llvm::seq(NumElts))
1757 ProxyRegOps.push_back(getExtractVectorizedValue(R, J, LoadVT, dl, DAG));
1758 I += NumElts;
1759 }
1760 }
1761
1762 const SDValue EndToken = DAG.getTokenFactor(dl, LoadChains);
1763 const SDValue CallEnd = DAG.getCALLSEQ_END(EndToken, UniqueCallSite,
1764 UniqueCallSite + 1, SDValue(), dl);
1765
1766 // Append ProxyReg instructions to the chain to make sure that `callseq_end`
1767 // will not get lost. Otherwise, during libcalls expansion, the nodes can become
1768 // dangling.
1769 for (const auto [I, Reg] : llvm::enumerate(ProxyRegOps)) {
1770 SDValue Proxy =
1771 DAG.getNode(NVPTXISD::ProxyReg, dl, Reg.getValueType(), {CallEnd, Reg});
1772 SDValue Ret = correctParamType(Proxy, Ins[I].VT, Ins[I].Flags, DAG, dl);
1773 InVals.push_back(Ret);
1774 }
1775
1776 // set IsTailCall to false for now, until we figure out how to express
1777 // tail call optimization in PTX
1778 CLI.IsTailCall = false;
1779 return CallEnd;
1780}
1781
1783 SelectionDAG &DAG) const {
1784
1785 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1786 const Function &Fn = DAG.getMachineFunction().getFunction();
1787
1789 Fn,
1790 "Support for dynamic alloca introduced in PTX ISA version 7.3 and "
1791 "requires target sm_52.",
1792 SDLoc(Op).getDebugLoc()));
1793 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()),
1794 Op.getOperand(0)};
1795 return DAG.getMergeValues(Ops, SDLoc());
1796 }
1797
1798 SDLoc DL(Op.getNode());
1799 SDValue Chain = Op.getOperand(0);
1800 SDValue Size = Op.getOperand(1);
1801 uint64_t Align = Op.getConstantOperandVal(2);
1802
1803 // The alignment on a ISD::DYNAMIC_STACKALLOC node may be 0 to indicate that
1804 // the default stack alignment should be used.
1805 if (Align == 0)
1807
1808 // The size for ptx alloca instruction is 64-bit for m64 and 32-bit for m32.
1809 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1810
1811 SDValue Alloc =
1812 DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL, {LocalVT, MVT::Other},
1813 {Chain, DAG.getZExtOrTrunc(Size, DL, LocalVT),
1814 DAG.getTargetConstant(Align, DL, MVT::i32)});
1815
1816 SDValue ASC = DAG.getAddrSpaceCast(
1818
1819 return DAG.getMergeValues({ASC, SDValue(Alloc.getNode(), 1)}, DL);
1820}
1821
1823 SelectionDAG &DAG) const {
1824 SDLoc DL(Op.getNode());
1825 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1826 const Function &Fn = DAG.getMachineFunction().getFunction();
1827
1829 Fn,
1830 "Support for stackrestore requires PTX ISA version >= 7.3 and target "
1831 ">= sm_52.",
1832 DL.getDebugLoc()));
1833 return Op.getOperand(0);
1834 }
1835
1836 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1837 SDValue Chain = Op.getOperand(0);
1838 SDValue Ptr = Op.getOperand(1);
1841 return DAG.getNode(NVPTXISD::STACKRESTORE, DL, MVT::Other, {Chain, ASC});
1842}
1843
1845 SelectionDAG &DAG) const {
1846 SDLoc DL(Op.getNode());
1847 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1848 const Function &Fn = DAG.getMachineFunction().getFunction();
1849
1851 Fn,
1852 "Support for stacksave requires PTX ISA version >= 7.3 and target >= "
1853 "sm_52.",
1854 DL.getDebugLoc()));
1855 auto Ops = {DAG.getConstant(0, DL, Op.getValueType()), Op.getOperand(0)};
1856 return DAG.getMergeValues(Ops, DL);
1857 }
1858
1859 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1860 SDValue Chain = Op.getOperand(0);
1861 SDValue SS =
1862 DAG.getNode(NVPTXISD::STACKSAVE, DL, {LocalVT, MVT::Other}, Chain);
1863 SDValue ASC = DAG.getAddrSpaceCast(
1864 DL, Op.getValueType(), SS, ADDRESS_SPACE_LOCAL, ADDRESS_SPACE_GENERIC);
1865 return DAG.getMergeValues({ASC, SDValue(SS.getNode(), 1)}, DL);
1866}
1867
1868// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
1869// (see LegalizeDAG.cpp). This is slow and uses local memory.
1870// We use extract/insert/build-vector operations instead, just as LegalizeOp() did in LLVM 2.5.
1871SDValue
1872NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
1873 SDNode *Node = Op.getNode();
1874 SDLoc dl(Node);
1876 unsigned NumOperands = Node->getNumOperands();
1877 for (unsigned i = 0; i < NumOperands; ++i) {
1878 SDValue SubOp = Node->getOperand(i);
1879 EVT VVT = SubOp.getNode()->getValueType(0);
1880 EVT EltVT = VVT.getVectorElementType();
1881 unsigned NumSubElem = VVT.getVectorNumElements();
1882 for (unsigned j = 0; j < NumSubElem; ++j) {
1883 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
1884 DAG.getIntPtrConstant(j, dl)));
1885 }
1886 }
1887 return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
1888}
1889
1891 SelectionDAG &DAG,
1892 unsigned Mode = NVPTX::PTXPrmtMode::NONE) {
1893 assert(A.getValueType() == MVT::i32 && B.getValueType() == MVT::i32 &&
1894 Selector.getValueType() == MVT::i32 && "PRMT must have i32 operands");
1895 return DAG.getNode(NVPTXISD::PRMT, DL, MVT::i32,
1896 {A, B, Selector, DAG.getConstant(Mode, DL, MVT::i32)});
1897}
1898
1900 SelectionDAG &DAG,
1901 unsigned Mode = NVPTX::PTXPrmtMode::NONE) {
1902 return getPRMT(A, B, DAG.getConstant(Selector, DL, MVT::i32), DL, DAG, Mode);
1903}
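// Illustration (our sketch of the PTX prmt.b32 semantics): A and B form an
// 8-byte source { B3 B2 B1 B0 A3 A2 A1 A0 }, and each nibble of Selector
// picks one source byte for the corresponding result byte (low nibble first).
//   getPRMT(A, B, 0x3210, ...) -> A unchanged
//   getPRMT(A, B, 0x5410, ...) -> low 16 bits of A in the low half, low 16
//                                 bits of B in the high half
// A nibble with bit 3 set replicates the sign bit of the selected byte.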
1904
1905/// Reduces the elements using the scalar operations provided. The operations
1906/// are sorted in descending order of the number of inputs they take. The flags
1907/// on the original reduction operation are propagated to each scalar operation.
1908/// Nearby elements are grouped in a tree reduction, unlike the shuffle
1909/// reduction used in ExpandReductions and SelectionDAG.
1911 const SmallVector<SDValue> &Elements, EVT EltTy,
1912 ArrayRef<std::pair<unsigned /*NodeType*/, unsigned /*NumInputs*/>> Ops,
1913 const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG) {
1914 // Build the reduction tree at each level, starting with all the elements.
1915 SmallVector<SDValue> Level = Elements;
1916
1917 unsigned OpIdx = 0;
1918 while (Level.size() > 1) {
1919 // Try to reduce this level using the current operator.
1920 const auto [Op, NumInputs] = Ops[OpIdx];
1921
1922 // Build the next level by partially reducing all elements.
1923 SmallVector<SDValue> ReducedLevel;
1924 unsigned I = 0, E = Level.size();
1925 for (; I + NumInputs <= E; I += NumInputs) {
1926 // Reduce elements in groups of [NumInputs], as much as possible.
1927 ReducedLevel.push_back(DAG.getNode(
1928 Op, DL, EltTy, ArrayRef<SDValue>(Level).slice(I, NumInputs), Flags));
1929 }
1930
1931 if (I < E) {
1932 // Handle leftover elements.
1933
1934 if (ReducedLevel.empty()) {
1935 // We didn't reduce anything at this level. We need to pick a smaller
1936 // operator.
1937 ++OpIdx;
1938 assert(OpIdx < Ops.size() && "no smaller operators for reduction");
1939 continue;
1940 }
1941
1942 // We reduced some things but there's still more left, meaning the
1943 // operator's number of inputs doesn't evenly divide this level size. Move
1944 // these elements to the next level.
1945 for (; I < E; ++I)
1946 ReducedLevel.push_back(Level[I]);
1947 }
1948
1949 // Process the next level.
1950 Level = ReducedLevel;
1951 }
1952
1953 return *Level.begin();
1954}
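// Illustration (a sketch, not from the source): reducing 5 elements with
// Ops = {(FMAXIMUM3, 3), (FMAXIMUM, 2)} proceeds level by level:
//   [e0, e1, e2, e3, e4]
//     -> [t0 = fmax3(e0, e1, e2), e3, e4]  // one full group, leftovers carried
//     -> fmax3(t0, e3, e4)                 // final 3-input reduction
// With 4 elements the last level would be [t0, e3], where no 3-input group
// fits, so OpIdx advances and the 2-input FMAXIMUM finishes the reduction.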
1955
1956// Get scalar reduction opcode
1957static ISD::NodeType getScalarOpcodeForReduction(unsigned ReductionOpcode) {
1958 switch (ReductionOpcode) {
1960 return ISD::FMAXNUM;
1962 return ISD::FMINNUM;
1964 return ISD::FMAXIMUM;
1966 return ISD::FMINIMUM;
1967 default:
1968 llvm_unreachable("unhandled reduction opcode");
1969 }
1970}
1971
1972/// Get 3-input scalar reduction opcode
1973static std::optional<NVPTXISD::NodeType>
1974getScalar3OpcodeForReduction(unsigned ReductionOpcode) {
1975 switch (ReductionOpcode) {
1977 return NVPTXISD::FMAXNUM3;
1979 return NVPTXISD::FMINNUM3;
1981 return NVPTXISD::FMAXIMUM3;
1983 return NVPTXISD::FMINIMUM3;
1984 default:
1985 return std::nullopt;
1986 }
1987}
1988
1989/// Lower reductions to either a sequence of operations or a tree if
1990/// reassociations are allowed. This method will use larger operations like
1991/// max3/min3 when the target supports them.
1992SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
1993 SelectionDAG &DAG) const {
1994 SDLoc DL(Op);
1995 const SDNodeFlags Flags = Op->getFlags();
1996 SDValue Vector = Op.getOperand(0);
1997
1998 const unsigned Opcode = Op->getOpcode();
1999 const EVT EltTy = Vector.getValueType().getVectorElementType();
2000
2001 // Whether we can use 3-input min/max when expanding the reduction.
2002 const bool CanUseMinMax3 =
2003 EltTy == MVT::f32 && STI.getSmVersion() >= 100 &&
2004 STI.getPTXVersion() >= 88 &&
2005 (Opcode == ISD::VECREDUCE_FMAX || Opcode == ISD::VECREDUCE_FMIN ||
2006 Opcode == ISD::VECREDUCE_FMAXIMUM || Opcode == ISD::VECREDUCE_FMINIMUM);
2007
2008 // A list of SDNode opcodes with equivalent semantics, sorted descending by
2009 // number of inputs they take.
2010 SmallVector<std::pair<unsigned /*Op*/, unsigned /*NumIn*/>, 2> ScalarOps;
2011
2012 if (auto Opcode3Elem = getScalar3OpcodeForReduction(Opcode);
2013 CanUseMinMax3 && Opcode3Elem)
2014 ScalarOps.push_back({*Opcode3Elem, 3});
2015 ScalarOps.push_back({getScalarOpcodeForReduction(Opcode), 2});
2016
2018 DAG.ExtractVectorElements(Vector, Elements);
2019
2020 return buildTreeReduction(Elements, EltTy, ScalarOps, DL, Flags, DAG);
2021}
2022
2023SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
2024 // Handle bitcasting from v2i8 without hitting the default promotion
2025 // strategy which goes through stack memory.
2026 EVT FromVT = Op->getOperand(0)->getValueType(0);
2027 if (FromVT != MVT::v2i8) {
2028 return Op;
2029 }
2030
2031 // Pack vector elements into i16 and bitcast to final type
2032 SDLoc DL(Op);
2033 SDValue Vec0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
2034 Op->getOperand(0), DAG.getIntPtrConstant(0, DL));
2035 SDValue Vec1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
2036 Op->getOperand(0), DAG.getIntPtrConstant(1, DL));
2037 SDValue Extend0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec0);
2038 SDValue Extend1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec1);
2039 SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
2040 SDValue AsInt = DAG.getNode(
2041 ISD::OR, DL, MVT::i16,
2042 {Extend0, DAG.getNode(ISD::SHL, DL, MVT::i16, {Extend1, Const8})});
2043 EVT ToVT = Op->getValueType(0);
2044 return DAG.getBitcast(ToVT, AsInt);
2045}
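// Worked example (our illustration): bitcast <2 x i8> <i8 1, i8 2> to i16.
// Extend0 = 0x0001 and Extend1 = 0x0002, so AsInt = 0x0001 | (0x0002 << 8) =
// 0x0201, i.e. element 0 ends up in the low byte, matching the little-endian
// in-memory layout the bitcast must preserve.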
2046
2047// We can initialize a constant f16x2/v2i16/v4i8 with a single .b32 move.
2048// Normally it would get lowered as two constant loads and a vector-packing
2049// move. Instead we want just a constant move:
2050// mov.b32 %r2, 0x40003C00
2051SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
2052 SelectionDAG &DAG) const {
2053 EVT VT = Op->getValueType(0);
2054 if (!(NVPTX::isPackedVectorTy(VT) && VT.is32BitVector()))
2055 return Op;
2056 SDLoc DL(Op);
2057
2058 if (!llvm::all_of(Op->ops(), [](SDValue Operand) {
2059 return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
2060 isa<ConstantFPSDNode>(Operand);
2061 })) {
2062 if (VT != MVT::v4i8)
2063 return Op;
2064 // Lower a non-constant v4i8 vector as a byte-wise constructed i32, which
2065 // allows us to optimize the calculation of its constant parts.
2066 auto GetPRMT = [&](const SDValue Left, const SDValue Right, bool Cast,
2067 uint64_t SelectionValue) -> SDValue {
2068 SDValue L = Left;
2069 SDValue R = Right;
2070 if (Cast) {
2071 L = DAG.getAnyExtOrTrunc(L, DL, MVT::i32);
2072 R = DAG.getAnyExtOrTrunc(R, DL, MVT::i32);
2073 }
2074 return getPRMT(L, R, SelectionValue, DL, DAG);
2075 };
2076 auto PRMT__10 = GetPRMT(Op->getOperand(0), Op->getOperand(1), true, 0x3340);
2077 auto PRMT__32 = GetPRMT(Op->getOperand(2), Op->getOperand(3), true, 0x3340);
2078 auto PRMT3210 = GetPRMT(PRMT__10, PRMT__32, false, 0x5410);
2079 return DAG.getBitcast(VT, PRMT3210);
2080 }
2081
2082 // Get value or the Nth operand as an APInt(32). Undef values treated as 0.
2083 auto GetOperand = [](SDValue Op, int N) -> APInt {
2084 const SDValue &Operand = Op->getOperand(N);
2085 EVT VT = Op->getValueType(0);
2086 if (Operand->isUndef())
2087 return APInt(32, 0);
2088 APInt Value;
2089 if (VT == MVT::v2f16 || VT == MVT::v2bf16)
2090 Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();
2091 else if (VT == MVT::v2i16 || VT == MVT::v4i8)
2092 Value = Operand->getAsAPIntVal();
2093 else
2094 llvm_unreachable("Unsupported type");
2095 // i8 values are carried around as i16, so we need to zero out the upper
2096 // bits so that they do not get in the way of combining individual byte values.
2097 if (VT == MVT::v4i8)
2098 Value = Value.trunc(8);
2099 return Value.zext(32);
2100 };
2101
2102 // Construct a 32-bit constant by shifting into place smaller values
2103 // (elements of the vector type VT).
2104 // For example, if VT has 2 elements, then N == 2:
2105 // ShiftAmount = 32 / N = 16
2106 // Value |= Op0 (b16) << 0
2107 // Value |= Op1 (b16) << 16
2108 // If N == 4:
2109 // ShiftAmount = 32 / N = 8
2110 // Value |= Op0 (b8) << 0
2111 // Value |= Op1 (b8) << 8
2112 // Value |= Op2 (b8) << 16
2113 // Value |= Op3 (b8) << 24
2114 // ...etc
2115 APInt Value(32, 0);
2116 const unsigned NumElements = VT.getVectorNumElements();
2117 assert(32 % NumElements == 0 && "must evenly divide bit length");
2118 const unsigned ShiftAmount = 32 / NumElements;
2119 for (unsigned ElementNo : seq(NumElements))
2120 Value |= GetOperand(Op, ElementNo).shl(ElementNo * ShiftAmount);
2121 SDValue Const = DAG.getConstant(Value, DL, MVT::i32);
2122 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), Const);
2123}
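// Worked example (our illustration): BUILD_VECTOR <2 x half> <1.0, 2.0>.
// half(1.0) = 0x3C00 and half(2.0) = 0x4000, ShiftAmount = 16, so
// Value = 0x3C00 | (0x4000 << 16) = 0x40003C00, which is exactly the single
// mov.b32 constant shown in the comment above LowerBUILD_VECTOR.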
2124
2125SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
2126 SelectionDAG &DAG) const {
2127 SDValue Index = Op->getOperand(1);
2128 SDValue Vector = Op->getOperand(0);
2129 SDLoc DL(Op);
2130 EVT VectorVT = Vector.getValueType();
2131
2132 if (VectorVT == MVT::v4i8) {
2133 SDValue Selector = DAG.getNode(ISD::OR, DL, MVT::i32,
2134 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2135 DAG.getConstant(0x7770, DL, MVT::i32));
2136 SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, Vector),
2137 DAG.getConstant(0, DL, MVT::i32), Selector, DL, DAG);
2138 SDValue Ext = DAG.getAnyExtOrTrunc(PRMT, DL, Op->getValueType(0));
2140 Flags.setNoSignedWrap(Ext.getScalarValueSizeInBits() > 8);
2141 Flags.setNoUnsignedWrap(Ext.getScalarValueSizeInBits() >= 8);
2142 Ext->setFlags(Flags);
2143 return Ext;
2144 }
2145
2146 // A constant index will be matched by TableGen.
2147 if (isa<ConstantSDNode>(Index.getNode()))
2148 return Op;
2149
2150 // Extract individual elements and select one of them.
2151 assert(NVPTX::isPackedVectorTy(VectorVT) &&
2152 VectorVT.getVectorNumElements() == 2 && "Unexpected vector type.");
2153 EVT EltVT = VectorVT.getVectorElementType();
2154
2155 SDLoc dl(Op.getNode());
2156 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2157 DAG.getIntPtrConstant(0, dl));
2158 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2159 DAG.getIntPtrConstant(1, dl));
2160 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
2162}
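// Illustration (our sketch): extracting element 2 of a v4i8 yields
// Selector = 2 | 0x7770 = 0x7772, so the PRMT places byte 2 of the vector in
// the low result byte while the upper result bytes are taken from the all-zero
// second operand (nibble 7 selects a byte of B), producing a zero-extended i32.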
2163
2164SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
2165 SelectionDAG &DAG) const {
2166 SDValue Vector = Op->getOperand(0);
2167 EVT VectorVT = Vector.getValueType();
2168
2169 if (VectorVT != MVT::v4i8)
2170 return Op;
2171 SDLoc DL(Op);
2172 SDValue Value = Op->getOperand(1);
2173 if (Value->isUndef())
2174 return Vector;
2175
2176 SDValue Index = Op->getOperand(2);
2177
2178 SDValue BFI =
2179 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2180 {DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector,
2181 DAG.getNode(ISD::MUL, DL, MVT::i32,
2182 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2183 DAG.getConstant(8, DL, MVT::i32)),
2184 DAG.getConstant(8, DL, MVT::i32)});
2185 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI);
2186}
2187
2188SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
2189 SelectionDAG &DAG) const {
2190 SDValue V1 = Op.getOperand(0);
2191 EVT VectorVT = V1.getValueType();
2192 if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8)
2193 return Op;
2194
2195 // Lower shuffle to PRMT instruction.
2196 const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
2197 SDValue V2 = Op.getOperand(1);
2198 uint32_t Selector = 0;
2199 for (auto I : llvm::enumerate(SVN->getMask())) {
2200 if (I.value() != -1) // -1 is a placeholder for undef.
2201 Selector |= (I.value() << (I.index() * 4));
2202 }
2203
2204 SDLoc DL(Op);
2205 SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, V1),
2206 DAG.getBitcast(MVT::i32, V2), Selector, DL, DAG);
2207 return DAG.getBitcast(Op.getValueType(), PRMT);
2208}
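// Illustration (our sketch): a v4i8 shuffle with mask <3, 0, 1, 2> builds
// Selector = 3 | (0 << 4) | (1 << 8) | (2 << 12) = 0x2103, and the PRMT
// returns the vector <V1[3], V1[0], V1[1], V1[2]>, i.e. result element i is
// source element mask[i]. Mask values 4-7 would select bytes of V2, which is
// exactly how PRMT treats its second operand.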
2209/// LowerShiftRightParts - Lower SRL_PARTS and SRA_PARTS, which
2210/// 1) return two i32 values and take a 2 x i32 value to shift plus a shift
2211/// amount, or
2212/// 2) return two i64 values and take a 2 x i64 value to shift plus a shift
2213/// amount.
2214SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
2215 SelectionDAG &DAG) const {
2216 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2217 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
2218
2219 EVT VT = Op.getValueType();
2220 unsigned VTBits = VT.getSizeInBits();
2221 SDLoc dl(Op);
2222 SDValue ShOpLo = Op.getOperand(0);
2223 SDValue ShOpHi = Op.getOperand(1);
2224 SDValue ShAmt = Op.getOperand(2);
2225 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
2226
2227 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2228 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2229 // {dHi, dLo} = {aHi, aLo} >> Amt
2230 // dHi = aHi >> Amt
2231 // dLo = shf.r.clamp aLo, aHi, Amt
2232
2233 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2234 SDValue Lo =
2235 DAG.getNode(NVPTXISD::FSHR_CLAMP, dl, VT, ShOpHi, ShOpLo, ShAmt);
2236
2237 SDValue Ops[2] = { Lo, Hi };
2238 return DAG.getMergeValues(Ops, dl);
2239 }
2240 else {
2241 // {dHi, dLo} = {aHi, aLo} >> Amt
2242 // - if (Amt>=size) then
2243 // dLo = aHi >> (Amt-size)
2244 // dHi = aHi >> Amt (this is either all 0 or all 1)
2245 // else
2246 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
2247 // dHi = aHi >> Amt
2248
2249 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2250 DAG.getConstant(VTBits, dl, MVT::i32),
2251 ShAmt);
2252 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
2253 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2254 DAG.getConstant(VTBits, dl, MVT::i32));
2255 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
2256 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2257 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
2258
2259 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2260 DAG.getConstant(VTBits, dl, MVT::i32),
2261 ISD::SETGE);
2262 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2263 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2264
2265 SDValue Ops[2] = { Lo, Hi };
2266 return DAG.getMergeValues(Ops, dl);
2267 }
2268}
2269
2270/// LowerShiftLeftParts - Lower SHL_PARTS, which
2271/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
2272/// amount, or
2273/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
2274/// amount.
2275SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2276 SelectionDAG &DAG) const {
2277 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2278 assert(Op.getOpcode() == ISD::SHL_PARTS);
2279
2280 EVT VT = Op.getValueType();
2281 unsigned VTBits = VT.getSizeInBits();
2282 SDLoc dl(Op);
2283 SDValue ShOpLo = Op.getOperand(0);
2284 SDValue ShOpHi = Op.getOperand(1);
2285 SDValue ShAmt = Op.getOperand(2);
2286
2287 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2288 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2289 // {dHi, dLo} = {aHi, aLo} << Amt
2290 // dHi = shf.l.clamp aLo, aHi, Amt
2291 // dLo = aLo << Amt
2292
2293 SDValue Hi =
2294 DAG.getNode(NVPTXISD::FSHL_CLAMP, dl, VT, ShOpHi, ShOpLo, ShAmt);
2295 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2296
2297 SDValue Ops[2] = { Lo, Hi };
2298 return DAG.getMergeValues(Ops, dl);
2299 }
2300 else {
2301 // {dHi, dLo} = {aHi, aLo} << Amt
2302 // - if (Amt>=size) then
2303 // dLo = aLo << Amt (all 0)
2304 //      dHi = aLo << (Amt-size)
2305 // else
2306 // dLo = aLo << Amt
2307 // dHi = (aHi << Amt) | (aLo >> (size-Amt))
2308
2309 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2310 DAG.getConstant(VTBits, dl, MVT::i32),
2311 ShAmt);
2312 SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2313 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2314 DAG.getConstant(VTBits, dl, MVT::i32));
2315 SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2316 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2317 SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2318
2319 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2320 DAG.getConstant(VTBits, dl, MVT::i32),
2321 ISD::SETGE);
2322 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2323 SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2324
2325 SDValue Ops[2] = { Lo, Hi };
2326 return DAG.getMergeValues(Ops, dl);
2327 }
2328}
2329
2330/// If the types match, convert the generic copysign to the NVPTXISD version,
2331/// otherwise bail, ensuring that mismatched cases are properly expanded.
2332SDValue NVPTXTargetLowering::LowerFCOPYSIGN(SDValue Op,
2333 SelectionDAG &DAG) const {
2334 EVT VT = Op.getValueType();
2335 SDLoc DL(Op);
2336
2337 SDValue In1 = Op.getOperand(0);
2338 SDValue In2 = Op.getOperand(1);
2339 EVT SrcVT = In2.getValueType();
2340
2341 if (!SrcVT.bitsEq(VT))
2342 return SDValue();
2343
2344 return DAG.getNode(NVPTXISD::FCOPYSIGN, DL, VT, In1, In2);
2345}
2346
2347SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2348 EVT VT = Op.getValueType();
2349
2350 if (VT == MVT::f32)
2351 return LowerFROUND32(Op, DAG);
2352
2353 if (VT == MVT::f64)
2354 return LowerFROUND64(Op, DAG);
2355
2356 llvm_unreachable("unhandled type");
2357}
2358
2359// This is the rounding method used in CUDA libdevice, expressed in C-like code:
2360// float roundf(float A)
2361// {
2362// float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2363// RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2364// return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2365// }
2366SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2367 SelectionDAG &DAG) const {
2368 SDLoc SL(Op);
2369 SDValue A = Op.getOperand(0);
2370 EVT VT = Op.getValueType();
2371
2372 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2373
2374 // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2375 SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2376 const unsigned SignBitMask = 0x80000000;
2377 SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2378 DAG.getConstant(SignBitMask, SL, MVT::i32));
2379 const unsigned PointFiveInBits = 0x3F000000;
2380 SDValue PointFiveWithSignRaw =
2381 DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2382 DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2383 SDValue PointFiveWithSign =
2384 DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2385 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2386 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2387
2388 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2389 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2390 SDValue IsLarge =
2391 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2392 ISD::SETOGT);
2393 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2394
2395 // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2396 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2397 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2398 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
2399 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
2400}
2401
2402// The implementation of round(double) is similar to that of round(float) in
2403// that they both separate the value range into three regions and use a method
2404// specific to the region to round the values. However, round(double) first
2405// rounds the absolute value and then restores the sign, while round(float)
2406// rounds the signed value directly.
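// A C-like sketch of the double path (our paraphrase, mirroring the code
// below):
// double round(double A)
// {
//   double RoundedA = trunc(fabs(A) + 0.5);
//   RoundedA = fabs(A) < 0.5 ? 0.0 : RoundedA;
//   RoundedA = copysign(RoundedA, A);
//   return fabs(A) > 0x1.0p52 ? A : RoundedA;
// }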
2407SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
2408 SelectionDAG &DAG) const {
2409 SDLoc SL(Op);
2410 SDValue A = Op.getOperand(0);
2411 EVT VT = Op.getValueType();
2412
2413 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2414
2415 // double RoundedA = (double) (int) (abs(A) + 0.5f);
2416 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
2417 DAG.getConstantFP(0.5, SL, VT));
2418 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2419
2420 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2421 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2422 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2423 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2424 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
2425 DAG.getConstantFP(0, SL, VT),
2426 RoundedA);
2427
2428 // Add sign to rounded_A
2429 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
2430 DAG.getNode(ISD::FTRUNC, SL, VT, A);
2431
2432 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2433 SDValue IsLarge =
2434 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
2435 ISD::SETOGT);
2436 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2437}
2438
2440 EVT VT = N->getValueType(0);
2441 EVT NVT = MVT::f32;
2442 if (VT.isVector()) {
2443 NVT = EVT::getVectorVT(*DAG.getContext(), NVT, VT.getVectorElementCount());
2444 }
2445 SDLoc DL(N);
2446 SDValue Tmp0 = DAG.getFPExtendOrRound(N->getOperand(0), DL, NVT);
2447 SDValue Tmp1 = DAG.getFPExtendOrRound(N->getOperand(1), DL, NVT);
2448 SDValue Res = DAG.getNode(N->getOpcode(), DL, NVT, Tmp0, Tmp1, N->getFlags());
2449 return DAG.getFPExtendOrRound(Res, DL, VT);
2450}
2451
2452SDValue NVPTXTargetLowering::PromoteBinOpIfF32FTZ(SDValue Op,
2453 SelectionDAG &DAG) const {
2454 if (useF32FTZ(DAG.getMachineFunction())) {
2455 return PromoteBinOpToF32(Op.getNode(), DAG);
2456 }
2457 return Op;
2458}
2459
2460SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op,
2461 SelectionDAG &DAG) const {
2462 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2463
2464 if (Op.getValueType() == MVT::bf16) {
2465 SDLoc Loc(Op);
2466 return DAG.getNode(
2467 ISD::FP_ROUND, Loc, MVT::bf16,
2468 DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)),
2469 DAG.getIntPtrConstant(0, Loc, /*isTarget=*/true));
2470 }
2471
2472 // Everything else is considered legal.
2473 return Op;
2474}
2475
2476SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op,
2477 SelectionDAG &DAG) const {
2478 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2479
2480 if (Op.getOperand(0).getValueType() == MVT::bf16) {
2481 SDLoc Loc(Op);
2482 return DAG.getNode(
2483 Op.getOpcode(), Loc, Op.getValueType(),
2484 DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0)));
2485 }
2486
2487 // Everything else is considered legal.
2488 return Op;
2489}
2490
2491SDValue NVPTXTargetLowering::LowerFP_ROUND(SDValue Op,
2492 SelectionDAG &DAG) const {
2493 EVT NarrowVT = Op.getValueType();
2494 SDValue Wide = Op.getOperand(0);
2495 EVT WideVT = Wide.getValueType();
2496 if (NarrowVT.getScalarType() == MVT::bf16) {
2497 const TargetLowering *TLI = STI.getTargetLowering();
2498 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 70) {
2499 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2500 }
2501 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
2502 // This combination was the first to support f32 -> bf16.
2503 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70) {
2504 if (WideVT.getScalarType() == MVT::f32) {
2505 return Op;
2506 }
2507 if (WideVT.getScalarType() == MVT::f64) {
2508 SDLoc Loc(Op);
2509 // Round-inexact-to-odd f64 to f32, then do the final rounding using
2510 // the hardware f32 -> bf16 instruction.
2512 WideVT.isVector() ? WideVT.changeVectorElementType(MVT::f32)
2513 : MVT::f32,
2514 Wide, Loc, DAG);
2515 return DAG.getFPExtendOrRound(rod, Loc, NarrowVT);
2516 }
2517 }
2518 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2519 }
2520 }
2521
2522 // Everything else is considered legal.
2523 return Op;
2524}
2525
2526SDValue NVPTXTargetLowering::LowerFP_EXTEND(SDValue Op,
2527 SelectionDAG &DAG) const {
2528 SDValue Narrow = Op.getOperand(0);
2529 EVT NarrowVT = Narrow.getValueType();
2530 EVT WideVT = Op.getValueType();
2531 if (NarrowVT.getScalarType() == MVT::bf16) {
2532 if (WideVT.getScalarType() == MVT::f32 &&
2533 (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71)) {
2534 SDLoc Loc(Op);
2535 return DAG.getNode(ISD::BF16_TO_FP, Loc, WideVT, Narrow);
2536 }
2537 if (WideVT.getScalarType() == MVT::f64 &&
2538 (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) {
2539 EVT F32 = NarrowVT.isVector() ? NarrowVT.changeVectorElementType(MVT::f32)
2540 : MVT::f32;
2541 SDLoc Loc(Op);
2542 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 71) {
2543 Op = DAG.getNode(ISD::FP_EXTEND, Loc, F32, Narrow);
2544 } else {
2545 Op = DAG.getNode(ISD::BF16_TO_FP, Loc, F32, Narrow);
2546 }
2547 return DAG.getNode(ISD::FP_EXTEND, Loc, WideVT, Op);
2548 }
2549 }
2550
2551 // Everything else is considered legal.
2552 return Op;
2553}
2554
2556 SDLoc DL(Op);
2557 if (Op.getValueType() != MVT::v2i16)
2558 return Op;
2559 EVT EltVT = Op.getValueType().getVectorElementType();
2560 SmallVector<SDValue> VecElements;
2561 for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) {
2562 SmallVector<SDValue> ScalarArgs;
2563 llvm::transform(Op->ops(), std::back_inserter(ScalarArgs),
2564 [&](const SDUse &O) {
2565 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
2566 O.get(), DAG.getIntPtrConstant(I, DL));
2567 });
2568 VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs));
2569 }
2570 SDValue V =
2571 DAG.getNode(ISD::BUILD_VECTOR, DL, Op.getValueType(), VecElements);
2572 return V;
2573}
2574
2576 SDNode *N = Op.getNode();
2577 SDLoc DL(N);
2579
2580 // Split any vector operands into their scalar elements.
2581 for (size_t I = 0; I < N->getNumOperands(); I++) {
2582 SDValue Val = N->getOperand(I);
2583 EVT ValVT = Val.getValueType();
2584 if (ValVT.isVector()) {
2585 EVT EltVT = ValVT.getVectorElementType();
2586 for (unsigned J = 0, NElts = ValVT.getVectorNumElements(); J < NElts; J++)
2587 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2588 DAG.getIntPtrConstant(J, DL)));
2589 } else
2590 Ops.push_back(Val);
2591 }
2592
2593 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
2594 SDValue Tcgen05StNode =
2595 DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, N->getVTList(), Ops,
2596 MemSD->getMemoryVT(), MemSD->getMemOperand());
2597
2598 return Tcgen05StNode;
2599}
2600
2602 SDNode *N = Op.getNode();
2603 SDValue Intrin = N->getOperand(1);
2604
2605 // Get the intrinsic ID
2606 unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
2607 switch (IntrinNo) {
2608 default:
2609 break;
2610 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
2611 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
2612 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
2613 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
2614 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
2615 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
2616 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
2617 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
2618 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
2619 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
2620 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
2621 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
2622 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
2623 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
2624 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
2625 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
2626 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
2627 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
2628 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
2629 case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
2630 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
2631 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
2632 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
2633 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
2634 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
2635 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
2636 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
2637 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128:
2638 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
2639 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
2640 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
2641 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
2642 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
2643 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
2644 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
2645 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
2646 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
2647 return LowerTcgen05St(Op, DAG);
2648 }
2649 return Op;
2650}
2651
2653 SelectionDAG &DAG) {
2654
2655 SDNode *N = Op.getNode();
2656 if (N->getOperand(1).getValueType() != MVT::i128) {
2657 // Return if the operand has already been lowered.
2658 return SDValue();
2659 }
2660
2661 unsigned IID =
2662 cast<ConstantSDNode>(N->getOperand(0).getNode())->getZExtValue();
2663 auto Opcode = [&]() {
2664 switch (IID) {
2665 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled:
2667 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_x:
2669 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y:
2671 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z:
2673 default:
2674 llvm_unreachable("unsupported/unhandled intrinsic");
2675 }
2676 }();
2677
2678 SDLoc DL(N);
2679 SDValue TryCancelResponse = N->getOperand(1);
2680 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TryCancelResponse);
2681 SDValue TryCancelResponse0 =
2682 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
2683 DAG.getIntPtrConstant(0, DL));
2684 SDValue TryCancelResponse1 =
2685 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
2686 DAG.getIntPtrConstant(1, DL));
2687
2688 return DAG.getNode(Opcode, DL, N->getVTList(),
2689 {TryCancelResponse0, TryCancelResponse1});
2690}
2691
2693 const unsigned Mode = [&]() {
2694 switch (Op->getConstantOperandVal(0)) {
2695 case Intrinsic::nvvm_prmt:
2697 case Intrinsic::nvvm_prmt_b4e:
2699 case Intrinsic::nvvm_prmt_ecl:
2701 case Intrinsic::nvvm_prmt_ecr:
2703 case Intrinsic::nvvm_prmt_f4e:
2705 case Intrinsic::nvvm_prmt_rc16:
2707 case Intrinsic::nvvm_prmt_rc8:
2709 default:
2710 llvm_unreachable("unsupported/unhandled intrinsic");
2711 }
2712 }();
2713 SDLoc DL(Op);
2714 SDValue A = Op->getOperand(1);
2715 SDValue B = Op.getNumOperands() == 4 ? Op.getOperand(2)
2716 : DAG.getConstant(0, DL, MVT::i32);
2717 SDValue Selector = (Op->op_end() - 1)->get();
2718 return getPRMT(A, B, Selector, DL, DAG, Mode);
2719}
2721 switch (Op->getConstantOperandVal(0)) {
2722 default:
2723 return Op;
2724 case Intrinsic::nvvm_prmt:
2725 case Intrinsic::nvvm_prmt_b4e:
2726 case Intrinsic::nvvm_prmt_ecl:
2727 case Intrinsic::nvvm_prmt_ecr:
2728 case Intrinsic::nvvm_prmt_f4e:
2729 case Intrinsic::nvvm_prmt_rc16:
2730 case Intrinsic::nvvm_prmt_rc8:
2731 return lowerPrmtIntrinsic(Op, DAG);
2732 case Intrinsic::nvvm_internal_addrspace_wrap:
2733 return Op.getOperand(1);
2734 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled:
2735 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_x:
2736 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y:
2737 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z:
2739 }
2740}
2741
2742// In PTX 64-bit CTLZ and CTPOP are supported, but they return a 32-bit value.
2743// Lower these into a node that produces the 32-bit result and zero-extend it
2744// back to i64.
2746 SDValue V = Op->getOperand(0);
2747 assert(V.getValueType() == MVT::i64 &&
2748 "Unexpected CTLZ/CTPOP type to legalize");
2749
2750 SDLoc DL(Op);
2751 SDValue CT = DAG.getNode(Op->getOpcode(), DL, MVT::i32, V);
2752 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CT, SDNodeFlags::NonNeg);
2753}
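// Illustration (our sketch): for `ctpop i64 %x` this produces
//   %ct  : i32 = CTPOP %x         // selected to PTX popc.b64 (32-bit result)
//   %res : i64 = ZERO_EXTEND %ct  // nneg: the count always fits in 32 bits
// so the 64-bit result type is recovered without an extra truncation.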
2754
2756 unsigned Opcode, SelectionDAG &DAG) {
2757 assert(A.getValueType() == MVT::i64 && B.getValueType() == MVT::i64);
2758
2759 const auto *AmtConst = dyn_cast<ConstantSDNode>(ShiftAmount);
2760 if (!AmtConst)
2761 return SDValue();
2762 const auto Amt = AmtConst->getZExtValue() & 63;
2763
2764 SDValue UnpackA =
2765 DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, A);
2766 SDValue UnpackB =
2767 DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, B);
2768
2769 // The architecture is little-endian: 0 = low bits, 1 = high bits
2770 SDValue ALo = UnpackA.getValue(0);
2771 SDValue AHi = UnpackA.getValue(1);
2772 SDValue BLo = UnpackB.getValue(0);
2773 SDValue BHi = UnpackB.getValue(1);
2774
2775 // The bitfield consists of { AHi : ALo : BHi : BLo }
2776 //
2777 // * FSHL, Amt < 32 - The window will contain { AHi : ALo : BHi }
2778 // * FSHL, Amt >= 32 - The window will contain { ALo : BHi : BLo }
2779 // * FSHR, Amt < 32 - The window will contain { ALo : BHi : BLo }
2780 // * FSHR, Amt >= 32 - The window will contain { AHi : ALo : BHi }
2781 //
2782 // Note that Amt = 0 and Amt = 32 are special cases where 32-bit funnel shifts
2783 // are not needed at all. Amt = 0 is a no-op producing either A or B depending
2784 // on the direction. Amt = 32 can be implemented by a packing and unpacking
2785 // move to select and arrange the 32-bit values. For simplicity, these cases
2786 // are not handled here explicitly and instead we rely on DAGCombiner to
2787 // remove the no-op funnel shifts we insert.
2788 auto [High, Mid, Low] = ((Opcode == ISD::FSHL) == (Amt < 32))
2789 ? std::make_tuple(AHi, ALo, BHi)
2790 : std::make_tuple(ALo, BHi, BLo);
2791
2792 SDValue NewAmt = DAG.getConstant(Amt & 31, DL, MVT::i32);
2793 SDValue RHi = DAG.getNode(Opcode, DL, MVT::i32, {High, Mid, NewAmt});
2794 SDValue RLo = DAG.getNode(Opcode, DL, MVT::i32, {Mid, Low, NewAmt});
2795
2796 return DAG.getNode(NVPTXISD::BUILD_VECTOR, DL, MVT::i64, {RLo, RHi});
2797}
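// Worked example (our illustration): FSHL(A, B, 40) on i64 values. Since
// Amt >= 32, the 64-bit window is { ALo : BHi : BLo } and NewAmt = 40 & 31 = 8:
//   RHi = FSHL32(ALo, BHi, 8) = (ALo << 8) | (BHi >> 24)
//   RLo = FSHL32(BHi, BLo, 8) = (BHi << 8) | (BLo >> 24)
// which matches the direct computation (A << 40) | (B >> 24) split into its
// high and low 32-bit halves.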
2798
2800 return expandFSH64(Op->getOperand(0), Op->getOperand(1), Op->getOperand(2),
2801 SDLoc(Op), Op->getOpcode(), DAG);
2802}
2803
2805 unsigned Opcode = Op->getOpcode() == ISD::ROTL ? ISD::FSHL : ISD::FSHR;
2806 return expandFSH64(Op->getOperand(0), Op->getOperand(0), Op->getOperand(1),
2807 SDLoc(Op), Opcode, DAG);
2808}
2809
2811 // Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
2812 // i.e. "poor man's fmod()". When y is infinite, x is returned. This matches
2813 // the semantics of LLVM's frem.
2814 SDLoc DL(Op);
2815 SDValue X = Op->getOperand(0);
2816 SDValue Y = Op->getOperand(1);
2817 EVT Ty = Op.getValueType();
2818 SDNodeFlags Flags = Op->getFlags();
2819
2820 SDValue Div = DAG.getNode(ISD::FDIV, DL, Ty, X, Y, Flags);
2821 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, Ty, Div, Flags);
2822 SDValue Mul = DAG.getNode(ISD::FMUL, DL, Ty, Trunc, Y,
2824 SDValue Sub = DAG.getNode(ISD::FSUB, DL, Ty, X, Mul,
2826
2827 if (Flags.hasNoInfs())
2828 return Sub;
2829
2830 // If Y is infinite, return X
2831 SDValue AbsY = DAG.getNode(ISD::FABS, DL, Ty, Y);
2832 SDValue Inf =
2834 SDValue IsInf = DAG.getSetCC(DL, MVT::i1, AbsY, Inf, ISD::SETEQ);
2835 return DAG.getSelect(DL, Ty, IsInf, X, Sub);
2836}
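// Worked example (our illustration): frem(5.5, 2.0) lowers to
//   div = 5.5 / 2.0 = 2.75, trunc = 2.0, mul = 2.0 * 2.0 = 4.0,
//   sub = 5.5 - 4.0 = 1.5
// which matches fmod(5.5, 2.0). The sign of the result follows x, e.g.
// frem(-5.5, 2.0) = -5.5 - (-2.0 * 2.0) = -1.5.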
2837
2839 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
2840
2841 SDValue Cond = Op->getOperand(0);
2842 SDValue TrueVal = Op->getOperand(1);
2843 SDValue FalseVal = Op->getOperand(2);
2844 SDLoc DL(Op);
2845
2846 // If both operands are truncated, we push the select through the truncates.
2847 if (TrueVal.getOpcode() == ISD::TRUNCATE &&
2848 FalseVal.getOpcode() == ISD::TRUNCATE) {
2849 TrueVal = TrueVal.getOperand(0);
2850 FalseVal = FalseVal.getOperand(0);
2851
2852 EVT VT = TrueVal.getSimpleValueType().bitsLE(FalseVal.getSimpleValueType())
2853 ? TrueVal.getValueType()
2854 : FalseVal.getValueType();
2855 TrueVal = DAG.getAnyExtOrTrunc(TrueVal, DL, VT);
2856 FalseVal = DAG.getAnyExtOrTrunc(FalseVal, DL, VT);
2857 SDValue Select = DAG.getSelect(DL, VT, Cond, TrueVal, FalseVal);
2858 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
2859 }
2860
2861 // Otherwise, expand the select into a series of logical operations. These
2862 // can often be folded into other operations, either by us or by ptxas.
2863 TrueVal = DAG.getFreeze(TrueVal);
2864 FalseVal = DAG.getFreeze(FalseVal);
2865 SDValue And1 = DAG.getNode(ISD::AND, DL, MVT::i1, Cond, TrueVal);
2866 SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
2867 SDValue And2 = DAG.getNode(ISD::AND, DL, MVT::i1, NotCond, FalseVal);
2868 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i1, And1, And2);
2869 return Or;
2870}
2871
2872SDValue
2874 switch (Op.getOpcode()) {
2875 case ISD::RETURNADDR:
2876 return SDValue();
2877 case ISD::FRAMEADDR:
2878 return SDValue();
2879 case ISD::ADDRSPACECAST:
2880 return LowerADDRSPACECAST(Op, DAG);
2882 return Op;
2884 return lowerIntrinsicWOChain(Op, DAG);
2886 return LowerIntrinsicVoid(Op, DAG);
2887 case ISD::BUILD_VECTOR:
2888 return LowerBUILD_VECTOR(Op, DAG);
2889 case ISD::BITCAST:
2890 return LowerBITCAST(Op, DAG);
2892 return Op;
2894 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2896 return LowerINSERT_VECTOR_ELT(Op, DAG);
2898 return LowerVECTOR_SHUFFLE(Op, DAG);
2900 return LowerCONCAT_VECTORS(Op, DAG);
2905 return LowerVECREDUCE(Op, DAG);
2906 case ISD::STORE:
2907 return LowerSTORE(Op, DAG);
2908 case ISD::LOAD:
2909 return LowerLOAD(Op, DAG);
2910 case ISD::SHL_PARTS:
2911 return LowerShiftLeftParts(Op, DAG);
2912 case ISD::SRA_PARTS:
2913 case ISD::SRL_PARTS:
2914 return LowerShiftRightParts(Op, DAG);
2915 case ISD::SELECT:
2916 return lowerSELECT(Op, DAG);
2917 case ISD::FROUND:
2918 return LowerFROUND(Op, DAG);
2919 case ISD::FCOPYSIGN:
2920 return LowerFCOPYSIGN(Op, DAG);
2921 case ISD::SINT_TO_FP:
2922 case ISD::UINT_TO_FP:
2923 return LowerINT_TO_FP(Op, DAG);
2924 case ISD::FP_TO_SINT:
2925 case ISD::FP_TO_UINT:
2926 return LowerFP_TO_INT(Op, DAG);
2927 case ISD::FP_ROUND:
2928 return LowerFP_ROUND(Op, DAG);
2929 case ISD::FP_EXTEND:
2930 return LowerFP_EXTEND(Op, DAG);
2931 case ISD::BR_JT:
2932 return LowerBR_JT(Op, DAG);
2933 case ISD::VAARG:
2934 return LowerVAARG(Op, DAG);
2935 case ISD::VASTART:
2936 return LowerVASTART(Op, DAG);
2937 case ISD::FSHL:
2938 case ISD::FSHR:
2939 return lowerFSH(Op, DAG);
2940 case ISD::ROTL:
2941 case ISD::ROTR:
2942 return lowerROT(Op, DAG);
2943 case ISD::ABS:
2944 case ISD::SMIN:
2945 case ISD::SMAX:
2946 case ISD::UMIN:
2947 case ISD::UMAX:
2948 case ISD::ADD:
2949 case ISD::SUB:
2950 case ISD::MUL:
2951 case ISD::SHL:
2952 case ISD::SREM:
2953 case ISD::UREM:
2954 return LowerVectorArith(Op, DAG);
2956 return LowerDYNAMIC_STACKALLOC(Op, DAG);
2957 case ISD::STACKRESTORE:
2958 return LowerSTACKRESTORE(Op, DAG);
2959 case ISD::STACKSAVE:
2960 return LowerSTACKSAVE(Op, DAG);
2961 case ISD::CopyToReg:
2962 return LowerCopyToReg_128(Op, DAG);
2963 case ISD::FADD:
2964 case ISD::FSUB:
2965 case ISD::FMUL:
2966 // Used only for bf16 on SM80, where we select fma for the non-ftz operation.
2967 return PromoteBinOpIfF32FTZ(Op, DAG);
2968 case ISD::CTPOP:
2969 case ISD::CTLZ:
2970 return lowerCTLZCTPOP(Op, DAG);
2971 case ISD::FREM:
2972 return lowerFREM(Op, DAG);
2973
2974 default:
2975 llvm_unreachable("Custom lowering not defined for operation");
2976 }
2977}
2978
2979SDValue NVPTXTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
2980 SDLoc DL(Op);
2981 SDValue Chain = Op.getOperand(0);
2982 const auto *JT = cast<JumpTableSDNode>(Op.getOperand(1));
2983 SDValue Index = Op.getOperand(2);
2984
2985 unsigned JId = JT->getIndex();
2987 ArrayRef<MachineBasicBlock *> MBBs = MJTI->getJumpTables()[JId].MBBs;
2988
2989 SDValue IdV = DAG.getConstant(JId, DL, MVT::i32);
2990
2991 // Generate BrxStart node
2992 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2993 Chain = DAG.getNode(NVPTXISD::BrxStart, DL, VTs, Chain, IdV);
2994
2995 // Generate BrxItem nodes
2996 assert(!MBBs.empty());
2997 for (MachineBasicBlock *MBB : MBBs.drop_back())
2998 Chain = DAG.getNode(NVPTXISD::BrxItem, DL, VTs, Chain.getValue(0),
2999 DAG.getBasicBlock(MBB), Chain.getValue(1));
3000
3001 // Generate BrxEnd nodes
3002 SDValue EndOps[] = {Chain.getValue(0), DAG.getBasicBlock(MBBs.back()), Index,
3003 IdV, Chain.getValue(1)};
3004 SDValue BrxEnd = DAG.getNode(NVPTXISD::BrxEnd, DL, VTs, EndOps);
3005
3006 return BrxEnd;
3007}
3008
3009// This will prevent AsmPrinter from trying to print the jump tables itself.
3012}
3013
3014SDValue NVPTXTargetLowering::LowerADDRSPACECAST(SDValue Op,
3015 SelectionDAG &DAG) const {
3016 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
3017 unsigned SrcAS = N->getSrcAddressSpace();
3018 unsigned DestAS = N->getDestAddressSpace();
3019 if (SrcAS != llvm::ADDRESS_SPACE_GENERIC &&
3020 DestAS != llvm::ADDRESS_SPACE_GENERIC) {
3021 // Shared and SharedCluster can be converted to each other through generic
3022 // space
3023 if ((SrcAS == llvm::ADDRESS_SPACE_SHARED &&
3026 DestAS == llvm::ADDRESS_SPACE_SHARED)) {
3027 SDLoc DL(Op.getNode());
3028 const MVT GenerictVT =
3030 SDValue GenericConversion = DAG.getAddrSpaceCast(
3031 DL, GenerictVT, Op.getOperand(0), SrcAS, ADDRESS_SPACE_GENERIC);
3032 SDValue SharedClusterConversion =
3033 DAG.getAddrSpaceCast(DL, Op.getValueType(), GenericConversion,
3034 ADDRESS_SPACE_GENERIC, DestAS);
3035 return SharedClusterConversion;
3036 }
3037
3038 return DAG.getUNDEF(Op.getValueType());
3039 }
3040
3041 return Op;
3042}
3043
3044// This function is almost a copy of SelectionDAG::expandVAArg().
3045// The only diff is that this one produces loads from local address space.
3046SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
3047 const TargetLowering *TLI = STI.getTargetLowering();
3048 SDLoc DL(Op);
3049
3050 SDNode *Node = Op.getNode();
3051 const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
3052 EVT VT = Node->getValueType(0);
3053 auto *Ty = VT.getTypeForEVT(*DAG.getContext());
3054 SDValue Tmp1 = Node->getOperand(0);
3055 SDValue Tmp2 = Node->getOperand(1);
3056 const MaybeAlign MA(Node->getConstantOperandVal(3));
3057
3058 SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL,
3059 Tmp1, Tmp2, MachinePointerInfo(V));
3060 SDValue VAList = VAListLoad;
3061
3062 if (MA && *MA > TLI->getMinStackArgumentAlignment()) {
3063 VAList = DAG.getNode(
3064 ISD::ADD, DL, VAList.getValueType(), VAList,
3065 DAG.getConstant(MA->value() - 1, DL, VAList.getValueType()));
3066
3067 VAList = DAG.getNode(ISD::AND, DL, VAList.getValueType(), VAList,
3068 DAG.getSignedConstant(-(int64_t)MA->value(), DL,
3069 VAList.getValueType()));
3070 }
3071
3072 // Increment the pointer, VAList, to the next vaarg
3073 Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
3075 DL, VAList.getValueType()));
3076
3077 // Store the incremented VAList to the legalized pointer
3078 Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2,
3080
3081 const Value *SrcV = Constant::getNullValue(
3083
3084 // Load the actual argument out of the pointer VAList
3085 return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV));
3086}
3087
3088SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
3089 const TargetLowering *TLI = STI.getTargetLowering();
3090 SDLoc DL(Op);
3091 EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout());
3092
3093 // Store the address of unsized array <function>_vararg[] in the ap object.
3094 SDValue VAReg = getParamSymbol(DAG, /* vararg */ -1, PtrVT);
3095
3096 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
3097 return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1),
3098 MachinePointerInfo(SV));
3099}
3100
3101/// replaceLoadVector - Convert vector loads into multi-output scalar loads.
3102static std::optional<std::pair<SDValue, SDValue>>
3104 LoadSDNode *LD = cast<LoadSDNode>(N);
3105 const EVT ResVT = LD->getValueType(0);
3106 const EVT MemVT = LD->getMemoryVT();
3107
3108 // If we're doing sign/zero extension as part of the load, avoid lowering to
3109 // a LoadV node. TODO: consider relaxing this restriction.
3110 if (ResVT != MemVT)
3111 return std::nullopt;
3112
3113 const auto NumEltsAndEltVT =
3114 getVectorLoweringShape(ResVT, STI, LD->getAddressSpace());
3115 if (!NumEltsAndEltVT)
3116 return std::nullopt;
3117 const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
3118
3119 Align Alignment = LD->getAlign();
3120 const auto &TD = DAG.getDataLayout();
3121 Align PrefAlign = TD.getPrefTypeAlign(MemVT.getTypeForEVT(*DAG.getContext()));
3122 if (Alignment < PrefAlign) {
3123 // This load is not sufficiently aligned, so bail out and let this vector
3124 // load be scalarized. Note that we may still be able to emit smaller
3125 // vector loads. For example, if we are loading a <4 x float> with an
3126 // alignment of 8, this check will fail but the legalizer will try again
3127 // with 2 x <2 x float>, which will succeed with an alignment of 8.
3128 return std::nullopt;
3129 }
3130
3131 // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
3132 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
3133 // loaded type to i16 and propagate the "real" type as the memory type.
3134 const MVT LoadEltVT = (EltVT.getSizeInBits() < 16) ? MVT::i16 : EltVT;
3135
3136 unsigned Opcode;
3137 switch (NumElts) {
3138 default:
3139 return std::nullopt;
3140 case 2:
3141 Opcode = NVPTXISD::LoadV2;
3142 break;
3143 case 4:
3144 Opcode = NVPTXISD::LoadV4;
3145 break;
3146 case 8:
3147 Opcode = NVPTXISD::LoadV8;
3148 break;
3149 }
3150 auto ListVTs = SmallVector<EVT, 9>(NumElts, LoadEltVT);
3151 ListVTs.push_back(MVT::Other);
3152 SDVTList LdResVTs = DAG.getVTList(ListVTs);
3153
3154 SDLoc DL(LD);
3155
3156 // Copy regular operands
3157 SmallVector<SDValue, 8> OtherOps(LD->ops());
3158
3159 // The select routine does not have access to the LoadSDNode instance, so
3160 // pass along the extension information
3161 OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
3162
3163 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, MemVT,
3164 LD->getMemOperand());
3165
3166 SmallVector<SDValue> ScalarRes;
3167 if (EltVT.isVector()) {
3169 assert(NumElts * EltVT.getVectorNumElements() ==
3170 ResVT.getVectorNumElements());
3171 // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
3172 // into individual elements.
3173 for (const unsigned I : llvm::seq(NumElts)) {
3174 SDValue SubVector = NewLD.getValue(I);
3175 DAG.ExtractVectorElements(SubVector, ScalarRes);
3176 }
3177 } else {
3178 for (const unsigned I : llvm::seq(NumElts)) {
3179 SDValue Res = NewLD.getValue(I);
3180 if (LoadEltVT != EltVT)
3181 Res = DAG.getNode(ISD::TRUNCATE, DL, EltVT, Res);
3182 ScalarRes.push_back(Res);
3183 }
3184 }
3185
3186 SDValue LoadChain = NewLD.getValue(NumElts);
3187
3188 const MVT BuildVecVT =
3189 MVT::getVectorVT(EltVT.getScalarType(), ScalarRes.size());
3190 SDValue BuildVec = DAG.getBuildVector(BuildVecVT, DL, ScalarRes);
3191 SDValue LoadValue = DAG.getBitcast(ResVT, BuildVec);
3192
3193 return {{LoadValue, LoadChain}};
3194}
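// Illustration (our sketch): a sufficiently aligned `load <4 x float>` becomes
// a single NVPTXISD::LoadV4 with four f32 results plus a chain (eventually a
// vectorized PTX load such as ld.global.v4.f32), and the results are rebuilt
// into the original vector with a BUILD_VECTOR. For <8 x half>, EltVT is
// v2f16, so a LoadV4 of four packed 32-bit subvectors is emitted and each
// subvector is split back into scalars before the final bitcast.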
3195
3198 const NVPTXSubtarget &STI) {
3199 if (auto Res = replaceLoadVector(N, DAG, STI))
3200 Results.append({Res->first, Res->second});
3201}
3202
3204 const NVPTXSubtarget &STI) {
3205 if (auto Res = replaceLoadVector(N, DAG, STI))
3206 return DAG.getMergeValues({Res->first, Res->second}, SDLoc(N));
3207 return SDValue();
3208}
3209
3210// v = ld i1* addr
3211// =>
3212// v1 = ld i8* addr (-> i16)
3213// v = trunc i16 to i1
3215 SDLoc dl(LD);
3216 assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
3217 assert(LD->getValueType(0) == MVT::i1 && "Custom lowering for i1 load only");
3218 SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(),
3219 LD->getBasePtr(), LD->getPointerInfo(),
3220 MVT::i8, LD->getAlign(),
3221 LD->getMemOperand()->getFlags());
3222 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
3223 // The legalizer (the caller) is expecting two values from the legalized
3224 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
3225 // in LegalizeDAG.cpp which also uses MergeValues.
3226 return DAG.getMergeValues({result, LD->getChain()}, dl);
3227}
3228
3229SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
3230 LoadSDNode *LD = cast<LoadSDNode>(Op);
3231
3232 if (Op.getValueType() == MVT::i1)
3233 return lowerLOADi1(LD, DAG);
3234
3235 // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
3236 // how they'll be lowered in ISel anyway, and by doing this a little earlier
3237 // we allow for more DAG combine opportunities.
3238 if (LD->getExtensionType() == ISD::EXTLOAD) {
3239 assert(LD->getValueType(0).isInteger() && LD->getMemoryVT().isInteger() &&
3240 "Unexpected fpext-load");
3241 return DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Op), Op.getValueType(),
3242 LD->getChain(), LD->getBasePtr(), LD->getMemoryVT(),
3243 LD->getMemOperand());
3244 }
3245
3246 llvm_unreachable("Unexpected custom lowering for load");
3247}
3248
3250 const NVPTXSubtarget &STI) {
3251 MemSDNode *N = cast<MemSDNode>(Op.getNode());
3252 SDValue Val = N->getOperand(1);
3253 SDLoc DL(N);
3254 const EVT ValVT = Val.getValueType();
3255 const EVT MemVT = N->getMemoryVT();
3256
3257 // If we're truncating as part of the store, avoid lowering to a StoreV node.
3258 // TODO: consider relaxing this restriction.
3259 if (ValVT != MemVT)
3260 return SDValue();
3261
3262 const auto NumEltsAndEltVT =
3263 getVectorLoweringShape(ValVT, STI, N->getAddressSpace());
3264 if (!NumEltsAndEltVT)
3265 return SDValue();
3266 const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
3267
3268 const DataLayout &TD = DAG.getDataLayout();
3269
3270 Align Alignment = N->getAlign();
3271 Align PrefAlign = TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
3272 if (Alignment < PrefAlign) {
3273 // This store is not sufficiently aligned, so bail out and let this vector
3274 // store be scalarized. Note that we may still be able to emit smaller
3275 // vector stores. For example, if we are storing a <4 x float> with an
3276 // alignment of 8, this check will fail but the legalizer will try again
3277 // with 2 x <2 x float>, which will succeed with an alignment of 8.
3278 return SDValue();
3279 }
3280
3281 unsigned Opcode;
3282 switch (NumElts) {
3283 default:
3284 return SDValue();
3285 case 2:
3286 Opcode = NVPTXISD::StoreV2;
3287 break;
3288 case 4:
3289 Opcode = NVPTXISD::StoreV4;
3290 break;
3291 case 8:
3292 Opcode = NVPTXISD::StoreV8;
3293 break;
3294 }
3295
3297
3298 // First is the chain
3299 Ops.push_back(N->getOperand(0));
3300
3301 // Then the split values
3302 if (EltVT.isVector()) {
3304 assert(NumElts * EltVT.getVectorNumElements() ==
3305 ValVT.getVectorNumElements());
3306 // Combine individual elements into v2[i,f,bf]16/v4i8 subvectors to be
3307 // stored as b32s
3308 const unsigned NumEltsPerSubVector = EltVT.getVectorNumElements();
3309 for (const unsigned I : llvm::seq(NumElts)) {
3310 SmallVector<SDValue, 4> SubVectorElts;
3311 DAG.ExtractVectorElements(Val, SubVectorElts, I * NumEltsPerSubVector,
3312 NumEltsPerSubVector);
3313 Ops.push_back(DAG.getBuildVector(EltVT, DL, SubVectorElts));
3314 }
3315 } else {
3316 SDValue V = DAG.getBitcast(MVT::getVectorVT(EltVT, NumElts), Val);
3317 for (const unsigned I : llvm::seq(NumElts)) {
3318 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, V,
3319 DAG.getIntPtrConstant(I, DL));
3320
3321 // Since StoreV2 is a target node, we cannot rely on DAG type
3322 // legalization. Therefore, we must ensure the type is legal. For i1 and
3323 // i8, we set the stored type to i16 and propagate the "real" type as the
3324 // memory type.
3325 if (EltVT.getSizeInBits() < 16)
3326 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
3327 Ops.push_back(ExtVal);
3328 }
3329 }
3330
3331 // Then any remaining arguments
3332 Ops.append(N->op_begin() + 2, N->op_end());
3333
3334 SDValue NewSt =
3335 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
3336 N->getMemoryVT(), N->getMemOperand());
3337
3338 // return DCI.CombineTo(N, NewSt, true);
3339 return NewSt;
3340}
3341
3342SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
3343 StoreSDNode *Store = cast<StoreSDNode>(Op);
3344 EVT VT = Store->getMemoryVT();
3345
3346 if (VT == MVT::i1)
3347 return LowerSTOREi1(Op, DAG);
3348
3349 // Lower stores of any other vector type, including v2f32, which we want to
3350 // break apart since it is not a widely supported type.
3351 return lowerSTOREVector(Op, DAG, STI);
3352}
3353
3354// st i1 v, addr
3355// =>
3356// v1 = zxt v to i16
3357// st.u8 i16, addr
3358SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
3359 SDNode *Node = Op.getNode();
3360 SDLoc dl(Node);
3361 StoreSDNode *ST = cast<StoreSDNode>(Node);
3362 SDValue Tmp1 = ST->getChain();
3363 SDValue Tmp2 = ST->getBasePtr();
3364 SDValue Tmp3 = ST->getValue();
3365 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
3366 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
3367 SDValue Result =
3368 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
3369 ST->getAlign(), ST->getMemOperand()->getFlags());
3370 return Result;
3371}
3372
3373SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op,
3374 SelectionDAG &DAG) const {
3375 // Change the CopyToReg to take in two 64-bit operands instead of a 128-bit
3376 // operand so that it can pass the legalization.
3377
3378 assert(Op.getOperand(1).getValueType() == MVT::i128 &&
3379 "Custom lowering for 128-bit CopyToReg only");
3380
3381 SDNode *Node = Op.getNode();
3382 SDLoc DL(Node);
3383
3384 SDValue Cast = DAG.getBitcast(MVT::v2i64, Op->getOperand(2));
3385 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
3386 DAG.getIntPtrConstant(0, DL));
3387 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
3388 DAG.getIntPtrConstant(1, DL));
3389
3391 SmallVector<EVT, 3> ResultsType(Node->values());
3392
3393 NewOps[0] = Op->getOperand(0); // Chain
3394 NewOps[1] = Op->getOperand(1); // Dst Reg
3395 NewOps[2] = Lo; // Lower 64-bit
3396 NewOps[3] = Hi; // Higher 64-bit
3397 if (Op.getNumOperands() == 4)
3398 NewOps[4] = Op->getOperand(3); // Glue if exists
3399
3400 return DAG.getNode(ISD::CopyToReg, DL, ResultsType, NewOps);
3401}
3402
3403unsigned NVPTXTargetLowering::getNumRegisters(
3404 LLVMContext &Context, EVT VT,
3405 std::optional<MVT> RegisterVT = std::nullopt) const {
3406 if (VT == MVT::i128 && RegisterVT == MVT::i128)
3407 return 1;
3408 return TargetLoweringBase::getNumRegisters(Context, VT, RegisterVT);
3409}
3410
3411bool NVPTXTargetLowering::splitValueIntoRegisterParts(
3412 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
3413 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
3414 if (Val.getValueType() == MVT::i128 && NumParts == 1) {
3415 Parts[0] = Val;
3416 return true;
3417 }
3418 return false;
3419}
3420
3421// This creates a target external symbol for a function parameter.
3422// The name of the symbol is composed from the parameter's index and the
3423// function name. A negative index corresponds to the special parameter
3424// (unsized array) used for passing variable arguments.
3425SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int I,
3426 EVT T) const {
3427 StringRef SavedStr = nvTM->getStrPool().save(
3429 return DAG.getExternalSymbol(SavedStr.data(), T);
3430}
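// Illustration (an assumption about the emitted names, not taken from this
// file): for a function `foo`, parameter 0 is typically referred to through a
// symbol like `foo_param_0`, matching how the parameter appears in the PTX
// signature (e.g. `.param .b32 foo_param_0`); the vararg pseudo-parameter uses
// the negative-index form described above.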
3431
3432SDValue NVPTXTargetLowering::getCallParamSymbol(SelectionDAG &DAG, int I,
3433 EVT T) const {
3434 const StringRef SavedStr = nvTM->getStrPool().save("param" + Twine(I));
3435 return DAG.getExternalSymbol(SavedStr.data(), T);
3436}
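// For illustration (the names shown are examples): for a function whose symbol
// is "foo", getParamSymbol() yields symbols such as "foo_param_0" and
// "foo_param_1" (see the parameter-name helper further below), while
// getCallParamSymbol() yields the call-site-local names "param0", "param1",
// and so on.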
3437
3439 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3440 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3441 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3442 const DataLayout &DL = DAG.getDataLayout();
3443 LLVMContext &Ctx = *DAG.getContext();
3444 auto PtrVT = getPointerTy(DAG.getDataLayout());
3445
3446 const Function &F = DAG.getMachineFunction().getFunction();
3447
3448 SDValue Root = DAG.getRoot();
3449 SmallVector<SDValue, 16> OutChains;
3450
3451 // The number of IR arguments (F.args()) and Ins.size() need not match.
3452 // Ins.size() will be larger
3453 // * if there is an aggregate argument with multiple fields (each field
3454 // showing up separately in Ins)
3455 // * if there is a vector argument with more elements than the typical
3456 // vector length (generally more than 4), where each vector element is
3457 // individually present in Ins.
3458 // So a different index should be used for indexing into Ins.
3459 // See the similar issue in LowerCall.
3460
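// For illustration, a single aggregate argument such as {i32, float} shows up
// as two entries in Ins (one per field), which is why each IR argument is
// matched against In.OrigArgIndex below instead of assuming a 1:1
// correspondence between F.args() and Ins.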
3461 auto AllIns = ArrayRef(Ins);
3462 for (const auto &Arg : F.args()) {
3463 const auto ArgIns = AllIns.take_while(
3464 [&](auto I) { return I.OrigArgIndex == Arg.getArgNo(); });
3465 AllIns = AllIns.drop_front(ArgIns.size());
3466
3467 Type *Ty = Arg.getType();
3468
3469 if (ArgIns.empty())
3470 report_fatal_error("Empty parameter types are not supported");
3471
3472 if (Arg.use_empty()) {
3473 // argument is dead
3474 for (const auto &In : ArgIns) {
3475 assert(!In.Used && "Arg.use_empty() is true but Arg is used?");
3476 InVals.push_back(DAG.getUNDEF(In.VT));
3477 }
3478 continue;
3479 }
3480
3481 SDValue ArgSymbol = getParamSymbol(DAG, Arg.getArgNo(), PtrVT);
3482
3483 // In the following cases, assign a node order of "ArgNo + 1"
3484 // to newly created nodes. The SDNodes for params have to
3485 // appear in the same order as they appear
3486 // in the original function; "ArgNo + 1" preserves that order.
3487 if (Arg.hasByValAttr()) {
3488 // Param has the ByVal attribute.
3489 // Return MoveParam(param symbol).
3490 // Ideally, the param symbol could be returned directly,
3491 // but when the SDNode builder decides to use it in a CopyToReg(),
3492 // the machine instruction fails because a TargetExternalSymbol
3493 // (not lowered) is target dependent, and CopyToReg assumes
3494 // the source is lowered.
3495 assert(ArgIns.size() == 1 && "ByVal argument must be a pointer");
3496 const auto &ByvalIn = ArgIns[0];
3497 assert(getValueType(DL, Ty) == ByvalIn.VT &&
3498 "Ins type did not match function type");
3499 assert(ByvalIn.VT == PtrVT && "ByVal argument must be a pointer");
3500
3501 SDValue P;
3502 if (isKernelFunction(F)) {
3503 P = ArgSymbol;
3504 P.getNode()->setIROrder(Arg.getArgNo() + 1);
3505 } else {
3506 P = DAG.getNode(NVPTXISD::MoveParam, dl, ByvalIn.VT, ArgSymbol);
3507 P.getNode()->setIROrder(Arg.getArgNo() + 1);
3508 P = DAG.getAddrSpaceCast(dl, ByvalIn.VT, P, ADDRESS_SPACE_LOCAL,
3510 }
3511 InVals.push_back(P);
3512 } else {
3515 ComputePTXValueVTs(*this, DL, Ctx, CallConv, Ty, VTs, Offsets);
3516 assert(VTs.size() == ArgIns.size() && "Size mismatch");
3517 assert(VTs.size() == Offsets.size() && "Size mismatch");
3518
3519 const Align ArgAlign = getFunctionArgumentAlignment(
3520 &F, Ty, Arg.getArgNo() + AttributeList::FirstArgIndex, DL);
3521
3522 unsigned I = 0;
3523 const auto VI = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
3524 for (const unsigned NumElts : VI) {
3525 // i1 is loaded/stored as i8
3526 const EVT LoadVT = VTs[I] == MVT::i1 ? MVT::i8 : VTs[I];
3527 const EVT VecVT = getVectorizedVT(LoadVT, NumElts, Ctx);
3528
3529 SDValue VecAddr = DAG.getObjectPtrOffset(
3530 dl, ArgSymbol, TypeSize::getFixed(Offsets[I]));
3531
3532 const Align PartAlign = commonAlignment(ArgAlign, Offsets[I]);
3533 SDValue P =
3534 DAG.getLoad(VecVT, dl, Root, VecAddr,
3538 P.getNode()->setIROrder(Arg.getArgNo() + 1);
3539 for (const unsigned J : llvm::seq(NumElts)) {
3540 SDValue Elt = getExtractVectorizedValue(P, J, LoadVT, dl, DAG);
3541
3542 Elt = correctParamType(Elt, ArgIns[I + J].VT, ArgIns[I + J].Flags,
3543 DAG, dl);
3544 InVals.push_back(Elt);
3545 }
3546 I += NumElts;
3547 }
3548 }
3549 }
3550
3551 if (!OutChains.empty())
3552 DAG.setRoot(DAG.getTokenFactor(dl, OutChains));
3553
3554 return Chain;
3555}
3556
3557SDValue
3559 bool isVarArg,
3561 const SmallVectorImpl<SDValue> &OutVals,
3562 const SDLoc &dl, SelectionDAG &DAG) const {
3563 const Function &F = DAG.getMachineFunction().getFunction();
3564 Type *RetTy = F.getReturnType();
3565
3566 if (RetTy->isVoidTy()) {
3567 assert(OutVals.empty() && Outs.empty() && "Return value expected for void");
3568 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
3569 }
3570
3571 const DataLayout &DL = DAG.getDataLayout();
3572 LLVMContext &Ctx = *DAG.getContext();
3573
3574 const SDValue RetSymbol = DAG.getExternalSymbol("func_retval0", MVT::i32);
3575 const auto RetAlign = getFunctionParamOptimizedAlign(&F, RetTy, DL);
3576
3577 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
3578 // 32-bits are sign extended or zero extended, depending on whether
3579 // they are signed or unsigned types.
3580 const bool ExtendIntegerRetVal =
3581 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
3582
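// For illustration: under the rule above, an i8 or i16 return value is
// widened and stored into the return parameter as a 32-bit value (StoreVT
// becomes MVT::i32 in GetRetVal below).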
3585 ComputePTXValueVTs(*this, DL, Ctx, CallConv, RetTy, VTs, Offsets);
3586 assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
3587
3588 const auto GetRetVal = [&](unsigned I) -> SDValue {
3589 SDValue RetVal = OutVals[I];
3591 RetVal.getValueType() &&
3592 "OutVal type should always be legal");
3593
3594 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
3595 const EVT StoreVT =
3596 ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
3597 return correctParamType(RetVal, StoreVT, Outs[I].Flags, DAG, dl);
3598 };
3599
3600 unsigned I = 0;
3601 const auto VI = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
3602 for (const unsigned NumElts : VI) {
3603 const MaybeAlign CurrentAlign = ExtendIntegerRetVal
3604 ? MaybeAlign(std::nullopt)
3605 : commonAlignment(RetAlign, Offsets[I]);
3606
3608 NumElts, dl, DAG, [&](unsigned K) { return GetRetVal(I + K); });
3609
3610 SDValue Ptr =
3611 DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));
3612
3613 Chain = DAG.getStore(Chain, dl, Val, Ptr,
3615
3616 I += NumElts;
3617 }
3618
3619 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
3620}
3621
3623 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
3624 SelectionDAG &DAG) const {
3625 if (Constraint.size() > 1)
3626 return;
3627 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
3628}
3629
3630 // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
3631 // TgtMemIntrinsic because we need the information that is only available in
3632 // the "Value" type of the destination pointer, in particular the address
3633 // space information.
3636 IntrinsicInfo &Info, const CallInst &I,
3637 MachineFunction &MF, unsigned Intrinsic) const {
3638 switch (Intrinsic) {
3639 default:
3640 return false;
3641 case Intrinsic::nvvm_match_all_sync_i32p:
3642 case Intrinsic::nvvm_match_all_sync_i64p:
3644 // memVT is bogus. These intrinsics have the IntrInaccessibleMemOnly attribute
3645 // in order to model data exchange with other threads, but perform no real
3646 // memory accesses.
3647 Info.memVT = MVT::i1;
3648
3649 // Our result depends on both our own and other threads' arguments.
3651 return true;
3652 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
3653 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
3654 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
3655 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
3656 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
3657 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
3658 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
3659 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
3660 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
3661 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
3662 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
3663 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
3664 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
3665 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
3666 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
3667 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
3668 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
3669 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
3670 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
3671 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
3672 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
3673 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
3674 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
3675 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
3677 Info.memVT = MVT::v8f16;
3678 Info.ptrVal = I.getArgOperand(0);
3679 Info.offset = 0;
3681 Info.align = Align(16);
3682 return true;
3683 }
3684 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
3685 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
3686 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
3687 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
3688 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
3689 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
3690 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
3691 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
3692 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
3693 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
3694 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
3695 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
3696 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
3697 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
3698 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
3699 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
3700 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
3701 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
3702 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
3703 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
3704 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
3705 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
3706 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
3707 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
3709 Info.memVT = MVT::v2i32;
3710 Info.ptrVal = I.getArgOperand(0);
3711 Info.offset = 0;
3713 Info.align = Align(8);
3714 return true;
3715 }
3716
3717 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
3718 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
3719 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
3720 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
3721 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
3722 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
3723 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
3724 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
3725 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
3726 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
3727 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
3728 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
3729 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
3730 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
3731 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
3732 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
3733
3734 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
3735 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
3736 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
3737 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
3738 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
3739 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
3740 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
3741 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
3742 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
3743 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
3744 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
3745 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
3746 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
3747 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
3748 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
3749 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
3750 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
3751 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16:
3752 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8:
3753 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8x16_b4x16_p64:
3754 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8x16_b6x16_p32:
3755 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x4_b8x16_b4x16_p64:
3756 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x4_b8x16_b6x16_p32: {
3758 Info.memVT = MVT::v4i32;
3759 Info.ptrVal = I.getArgOperand(0);
3760 Info.offset = 0;
3762 Info.align = Align(16);
3763 return true;
3764 }
3765
3766 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
3767 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
3768 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
3769 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
3770 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
3771 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
3772 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
3773 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
3774
3775 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
3776 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
3777 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
3778 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
3779 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
3780 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
3781 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
3782 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
3783 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
3784 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
3785 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
3786 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
3787 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
3788 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
3789 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
3790 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
3791 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
3792 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
3793 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
3794 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
3795 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
3796 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16:
3797 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x1_b8x16_b4x16_p64:
3798 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x1_b8x16_b6x16_p32: {
3800 Info.memVT = MVT::i32;
3801 Info.ptrVal = I.getArgOperand(0);
3802 Info.offset = 0;
3804 Info.align = Align(4);
3805 return true;
3806 }
3807
3808 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
3809 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
3810 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
3811 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
3812 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
3813 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
3814 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
3815 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
3816 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
3817 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
3818 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
3819 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
3821 Info.memVT = MVT::v4f16;
3822 Info.ptrVal = I.getArgOperand(0);
3823 Info.offset = 0;
3825 Info.align = Align(16);
3826 return true;
3827 }
3828
3829 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
3830 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
3831 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
3832 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
3833 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
3834 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
3835 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
3836 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
3837 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
3838 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
3839 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
3840 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
3841 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
3842 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
3843 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
3844 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
3846 Info.memVT = MVT::v8f32;
3847 Info.ptrVal = I.getArgOperand(0);
3848 Info.offset = 0;
3850 Info.align = Align(16);
3851 return true;
3852 }
3853
3854 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
3855 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
3856 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
3857 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
3858
3859 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
3860 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
3861 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
3862 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
3863
3864 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
3865 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
3866 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
3867 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
3868 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
3869 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
3870 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
3871 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
3872 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
3873 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
3874 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
3875 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
3877 Info.memVT = MVT::v8i32;
3878 Info.ptrVal = I.getArgOperand(0);
3879 Info.offset = 0;
3881 Info.align = Align(16);
3882 return true;
3883 }
3884
3885 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
3886 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
3887 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
3888 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
3889 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
3890 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
3891 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
3892 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
3893 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
3894 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16:
3895 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8:
3896 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8x16_b4x16_p64:
3897 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8x16_b6x16_p32:
3898 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x2_b8x16_b4x16_p64:
3899 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x2_b8x16_b6x16_p32: {
3901 Info.memVT = MVT::v2i32;
3902 Info.ptrVal = I.getArgOperand(0);
3903 Info.offset = 0;
3905 Info.align = Align(8);
3906 return true;
3907 }
3908
3909 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
3910 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
3911 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
3912 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
3913
3914 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
3915 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
3916 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
3917 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
3919 Info.memVT = MVT::f64;
3920 Info.ptrVal = I.getArgOperand(0);
3921 Info.offset = 0;
3923 Info.align = Align(8);
3924 return true;
3925 }
3926
3927 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
3928 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
3929 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
3930 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
3932 Info.memVT = MVT::v2f64;
3933 Info.ptrVal = I.getArgOperand(0);
3934 Info.offset = 0;
3936 Info.align = Align(16);
3937 return true;
3938 }
3939
3940 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
3941 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
3942 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
3943 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
3944 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
3945 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
3946 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
3947 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
3948 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
3949 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
3950 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
3951 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
3953 Info.memVT = MVT::v4f16;
3954 Info.ptrVal = I.getArgOperand(0);
3955 Info.offset = 0;
3957 Info.align = Align(16);
3958 return true;
3959 }
3960
3961 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
3962 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
3963 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
3964 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
3965 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
3966 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
3967 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
3968 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
3969 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
3970 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
3971 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
3972 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
3973 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
3974 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
3975 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
3976 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
3978 Info.memVT = MVT::v8f32;
3979 Info.ptrVal = I.getArgOperand(0);
3980 Info.offset = 0;
3982 Info.align = Align(16);
3983 return true;
3984 }
3985
3986 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
3987 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
3988 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
3989 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
3990 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
3991 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
3992 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
3993 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
3994 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
3995 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
3996 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
3997 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
3999 Info.memVT = MVT::v8i32;
4000 Info.ptrVal = I.getArgOperand(0);
4001 Info.offset = 0;
4003 Info.align = Align(16);
4004 return true;
4005 }
4006
4007 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
4008 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
4009 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
4010 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
4011 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
4012 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
4013 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
4014 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride:
4015 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x2_b16:
4016 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x2_trans_b16:
4017 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x2_trans_b8: {
4019 Info.memVT = MVT::v2i32;
4020 Info.ptrVal = I.getArgOperand(0);
4021 Info.offset = 0;
4023 Info.align = Align(8);
4024 return true;
4025 }
4026
4027 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
4028 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
4029 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
4030 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
4032 Info.memVT = MVT::v2f64;
4033 Info.ptrVal = I.getArgOperand(0);
4034 Info.offset = 0;
4036 Info.align = Align(16);
4037 return true;
4038 }
4039
4040 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x1_b16:
4041 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x1_trans_b16:
4042 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x1_trans_b8: {
4044 Info.memVT = MVT::i32;
4045 Info.ptrVal = I.getArgOperand(0);
4046 Info.offset = 0;
4048 Info.align = Align(4);
4049 return true;
4050 }
4051
4052 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x4_b16:
4053 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x4_trans_b16:
4054 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x4_trans_b8: {
4056 Info.memVT = MVT::v4i32;
4057 Info.ptrVal = I.getArgOperand(0);
4058 Info.offset = 0;
4060 Info.align = Align(16);
4061 return true;
4062 }
4063
4064 case Intrinsic::nvvm_atomic_add_gen_f_cta:
4065 case Intrinsic::nvvm_atomic_add_gen_f_sys:
4066 case Intrinsic::nvvm_atomic_add_gen_i_cta:
4067 case Intrinsic::nvvm_atomic_add_gen_i_sys:
4068 case Intrinsic::nvvm_atomic_and_gen_i_cta:
4069 case Intrinsic::nvvm_atomic_and_gen_i_sys:
4070 case Intrinsic::nvvm_atomic_cas_gen_i_cta:
4071 case Intrinsic::nvvm_atomic_cas_gen_i_sys:
4072 case Intrinsic::nvvm_atomic_dec_gen_i_cta:
4073 case Intrinsic::nvvm_atomic_dec_gen_i_sys:
4074 case Intrinsic::nvvm_atomic_inc_gen_i_cta:
4075 case Intrinsic::nvvm_atomic_inc_gen_i_sys:
4076 case Intrinsic::nvvm_atomic_max_gen_i_cta:
4077 case Intrinsic::nvvm_atomic_max_gen_i_sys:
4078 case Intrinsic::nvvm_atomic_min_gen_i_cta:
4079 case Intrinsic::nvvm_atomic_min_gen_i_sys:
4080 case Intrinsic::nvvm_atomic_or_gen_i_cta:
4081 case Intrinsic::nvvm_atomic_or_gen_i_sys:
4082 case Intrinsic::nvvm_atomic_exch_gen_i_cta:
4083 case Intrinsic::nvvm_atomic_exch_gen_i_sys:
4084 case Intrinsic::nvvm_atomic_xor_gen_i_cta:
4085 case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
4086 auto &DL = I.getDataLayout();
4088 Info.memVT = getValueType(DL, I.getType());
4089 Info.ptrVal = I.getArgOperand(0);
4090 Info.offset = 0;
4092 Info.align.reset();
4093 return true;
4094 }
4095
4096 case Intrinsic::nvvm_prefetch_tensormap: {
4097 auto &DL = I.getDataLayout();
4099 Info.memVT = getPointerTy(DL);
4100 Info.ptrVal = I.getArgOperand(0);
4101 Info.offset = 0;
4102 Info.flags =
4104 Info.align.reset();
4105 return true;
4106 }
4107
4108 case Intrinsic::nvvm_ldu_global_i:
4109 case Intrinsic::nvvm_ldu_global_f:
4110 case Intrinsic::nvvm_ldu_global_p: {
4112 Info.memVT = getValueType(I.getDataLayout(), I.getType());
4113 Info.ptrVal = I.getArgOperand(0);
4114 Info.offset = 0;
4116 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4117
4118 return true;
4119 }
4120 case Intrinsic::nvvm_tex_1d_v4f32_s32:
4121 case Intrinsic::nvvm_tex_1d_v4f32_f32:
4122 case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
4123 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
4124 case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
4125 case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
4126 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
4127 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
4128 case Intrinsic::nvvm_tex_2d_v4f32_s32:
4129 case Intrinsic::nvvm_tex_2d_v4f32_f32:
4130 case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
4131 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
4132 case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
4133 case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
4134 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
4135 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
4136 case Intrinsic::nvvm_tex_3d_v4f32_s32:
4137 case Intrinsic::nvvm_tex_3d_v4f32_f32:
4138 case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
4139 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
4140 case Intrinsic::nvvm_tex_cube_v4f32_f32:
4141 case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
4142 case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
4143 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
4144 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
4145 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
4146 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
4147 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
4148 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
4149 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
4150 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
4151 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
4152 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
4153 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
4154 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
4155 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
4156 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
4157 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
4158 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
4159 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
4160 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
4161 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
4162 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
4163 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
4164 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
4165 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
4166 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
4167 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
4168 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
4169 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
4170 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
4171 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
4172 case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
4173 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
4174 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
4175 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
4176 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
4177 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
4179 Info.memVT = MVT::v4f32;
4180 Info.ptrVal = nullptr;
4181 Info.offset = 0;
4183 Info.align = Align(16);
4184 return true;
4185
4186 case Intrinsic::nvvm_tex_1d_v4s32_s32:
4187 case Intrinsic::nvvm_tex_1d_v4s32_f32:
4188 case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
4189 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
4190 case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
4191 case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
4192 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
4193 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
4194 case Intrinsic::nvvm_tex_2d_v4s32_s32:
4195 case Intrinsic::nvvm_tex_2d_v4s32_f32:
4196 case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
4197 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
4198 case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
4199 case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
4200 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
4201 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
4202 case Intrinsic::nvvm_tex_3d_v4s32_s32:
4203 case Intrinsic::nvvm_tex_3d_v4s32_f32:
4204 case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
4205 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
4206 case Intrinsic::nvvm_tex_cube_v4s32_f32:
4207 case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
4208 case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
4209 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
4210 case Intrinsic::nvvm_tex_cube_v4u32_f32:
4211 case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
4212 case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
4213 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
4214 case Intrinsic::nvvm_tex_1d_v4u32_s32:
4215 case Intrinsic::nvvm_tex_1d_v4u32_f32:
4216 case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
4217 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
4218 case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
4219 case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
4220 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
4221 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
4222 case Intrinsic::nvvm_tex_2d_v4u32_s32:
4223 case Intrinsic::nvvm_tex_2d_v4u32_f32:
4224 case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
4225 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
4226 case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
4227 case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
4228 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
4229 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
4230 case Intrinsic::nvvm_tex_3d_v4u32_s32:
4231 case Intrinsic::nvvm_tex_3d_v4u32_f32:
4232 case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
4233 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
4234 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
4235 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
4236 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
4237 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
4238 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
4239 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
4240 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
4241 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
4242 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
4243 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
4244 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
4245 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
4246 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
4247 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
4248 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
4249 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
4250 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
4251 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
4252 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
4253 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
4254 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
4255 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
4256 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
4257 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
4258 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
4259 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
4260 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
4261 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
4262 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
4263 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
4264 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
4265 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
4266 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
4267 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
4268 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
4269 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
4270 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
4271 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
4272 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
4273 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
4274 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
4275 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
4276 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
4277 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
4278 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
4279 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
4280 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
4281 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
4282 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
4283 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
4284 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
4285 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
4286 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
4287 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
4288 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
4289 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
4290 case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
4291 case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
4292 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
4293 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
4294 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
4295 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
4296 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
4297 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
4298 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
4299 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
4300 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
4301 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
4303 Info.memVT = MVT::v4i32;
4304 Info.ptrVal = nullptr;
4305 Info.offset = 0;
4307 Info.align = Align(16);
4308 return true;
4309
4310 case Intrinsic::nvvm_suld_1d_i8_clamp:
4311 case Intrinsic::nvvm_suld_1d_v2i8_clamp:
4312 case Intrinsic::nvvm_suld_1d_v4i8_clamp:
4313 case Intrinsic::nvvm_suld_1d_array_i8_clamp:
4314 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
4315 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
4316 case Intrinsic::nvvm_suld_2d_i8_clamp:
4317 case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4318 case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4319 case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4320 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4321 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4322 case Intrinsic::nvvm_suld_3d_i8_clamp:
4323 case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4324 case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4325 case Intrinsic::nvvm_suld_1d_i8_trap:
4326 case Intrinsic::nvvm_suld_1d_v2i8_trap:
4327 case Intrinsic::nvvm_suld_1d_v4i8_trap:
4328 case Intrinsic::nvvm_suld_1d_array_i8_trap:
4329 case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4330 case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4331 case Intrinsic::nvvm_suld_2d_i8_trap:
4332 case Intrinsic::nvvm_suld_2d_v2i8_trap:
4333 case Intrinsic::nvvm_suld_2d_v4i8_trap:
4334 case Intrinsic::nvvm_suld_2d_array_i8_trap:
4335 case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4336 case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4337 case Intrinsic::nvvm_suld_3d_i8_trap:
4338 case Intrinsic::nvvm_suld_3d_v2i8_trap:
4339 case Intrinsic::nvvm_suld_3d_v4i8_trap:
4340 case Intrinsic::nvvm_suld_1d_i8_zero:
4341 case Intrinsic::nvvm_suld_1d_v2i8_zero:
4342 case Intrinsic::nvvm_suld_1d_v4i8_zero:
4343 case Intrinsic::nvvm_suld_1d_array_i8_zero:
4344 case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4345 case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4346 case Intrinsic::nvvm_suld_2d_i8_zero:
4347 case Intrinsic::nvvm_suld_2d_v2i8_zero:
4348 case Intrinsic::nvvm_suld_2d_v4i8_zero:
4349 case Intrinsic::nvvm_suld_2d_array_i8_zero:
4350 case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4351 case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4352 case Intrinsic::nvvm_suld_3d_i8_zero:
4353 case Intrinsic::nvvm_suld_3d_v2i8_zero:
4354 case Intrinsic::nvvm_suld_3d_v4i8_zero:
4356 Info.memVT = MVT::i8;
4357 Info.ptrVal = nullptr;
4358 Info.offset = 0;
4360 Info.align = Align(16);
4361 return true;
4362
4363 case Intrinsic::nvvm_suld_1d_i16_clamp:
4364 case Intrinsic::nvvm_suld_1d_v2i16_clamp:
4365 case Intrinsic::nvvm_suld_1d_v4i16_clamp:
4366 case Intrinsic::nvvm_suld_1d_array_i16_clamp:
4367 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
4368 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
4369 case Intrinsic::nvvm_suld_2d_i16_clamp:
4370 case Intrinsic::nvvm_suld_2d_v2i16_clamp:
4371 case Intrinsic::nvvm_suld_2d_v4i16_clamp:
4372 case Intrinsic::nvvm_suld_2d_array_i16_clamp:
4373 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
4374 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
4375 case Intrinsic::nvvm_suld_3d_i16_clamp:
4376 case Intrinsic::nvvm_suld_3d_v2i16_clamp:
4377 case Intrinsic::nvvm_suld_3d_v4i16_clamp:
4378 case Intrinsic::nvvm_suld_1d_i16_trap:
4379 case Intrinsic::nvvm_suld_1d_v2i16_trap:
4380 case Intrinsic::nvvm_suld_1d_v4i16_trap:
4381 case Intrinsic::nvvm_suld_1d_array_i16_trap:
4382 case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
4383 case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
4384 case Intrinsic::nvvm_suld_2d_i16_trap:
4385 case Intrinsic::nvvm_suld_2d_v2i16_trap:
4386 case Intrinsic::nvvm_suld_2d_v4i16_trap:
4387 case Intrinsic::nvvm_suld_2d_array_i16_trap:
4388 case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4389 case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4390 case Intrinsic::nvvm_suld_3d_i16_trap:
4391 case Intrinsic::nvvm_suld_3d_v2i16_trap:
4392 case Intrinsic::nvvm_suld_3d_v4i16_trap:
4393 case Intrinsic::nvvm_suld_1d_i16_zero:
4394 case Intrinsic::nvvm_suld_1d_v2i16_zero:
4395 case Intrinsic::nvvm_suld_1d_v4i16_zero:
4396 case Intrinsic::nvvm_suld_1d_array_i16_zero:
4397 case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4398 case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4399 case Intrinsic::nvvm_suld_2d_i16_zero:
4400 case Intrinsic::nvvm_suld_2d_v2i16_zero:
4401 case Intrinsic::nvvm_suld_2d_v4i16_zero:
4402 case Intrinsic::nvvm_suld_2d_array_i16_zero:
4403 case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4404 case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4405 case Intrinsic::nvvm_suld_3d_i16_zero:
4406 case Intrinsic::nvvm_suld_3d_v2i16_zero:
4407 case Intrinsic::nvvm_suld_3d_v4i16_zero:
4409 Info.memVT = MVT::i16;
4410 Info.ptrVal = nullptr;
4411 Info.offset = 0;
4413 Info.align = Align(16);
4414 return true;
4415
4416 case Intrinsic::nvvm_suld_1d_i32_clamp:
4417 case Intrinsic::nvvm_suld_1d_v2i32_clamp:
4418 case Intrinsic::nvvm_suld_1d_v4i32_clamp:
4419 case Intrinsic::nvvm_suld_1d_array_i32_clamp:
4420 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
4421 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
4422 case Intrinsic::nvvm_suld_2d_i32_clamp:
4423 case Intrinsic::nvvm_suld_2d_v2i32_clamp:
4424 case Intrinsic::nvvm_suld_2d_v4i32_clamp:
4425 case Intrinsic::nvvm_suld_2d_array_i32_clamp:
4426 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
4427 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
4428 case Intrinsic::nvvm_suld_3d_i32_clamp:
4429 case Intrinsic::nvvm_suld_3d_v2i32_clamp:
4430 case Intrinsic::nvvm_suld_3d_v4i32_clamp:
4431 case Intrinsic::nvvm_suld_1d_i32_trap:
4432 case Intrinsic::nvvm_suld_1d_v2i32_trap:
4433 case Intrinsic::nvvm_suld_1d_v4i32_trap:
4434 case Intrinsic::nvvm_suld_1d_array_i32_trap:
4435 case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
4436 case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
4437 case Intrinsic::nvvm_suld_2d_i32_trap:
4438 case Intrinsic::nvvm_suld_2d_v2i32_trap:
4439 case Intrinsic::nvvm_suld_2d_v4i32_trap:
4440 case Intrinsic::nvvm_suld_2d_array_i32_trap:
4441 case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
4442 case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
4443 case Intrinsic::nvvm_suld_3d_i32_trap:
4444 case Intrinsic::nvvm_suld_3d_v2i32_trap:
4445 case Intrinsic::nvvm_suld_3d_v4i32_trap:
4446 case Intrinsic::nvvm_suld_1d_i32_zero:
4447 case Intrinsic::nvvm_suld_1d_v2i32_zero:
4448 case Intrinsic::nvvm_suld_1d_v4i32_zero:
4449 case Intrinsic::nvvm_suld_1d_array_i32_zero:
4450 case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
4451 case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
4452 case Intrinsic::nvvm_suld_2d_i32_zero:
4453 case Intrinsic::nvvm_suld_2d_v2i32_zero:
4454 case Intrinsic::nvvm_suld_2d_v4i32_zero:
4455 case Intrinsic::nvvm_suld_2d_array_i32_zero:
4456 case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4457 case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4458 case Intrinsic::nvvm_suld_3d_i32_zero:
4459 case Intrinsic::nvvm_suld_3d_v2i32_zero:
4460 case Intrinsic::nvvm_suld_3d_v4i32_zero:
4462 Info.memVT = MVT::i32;
4463 Info.ptrVal = nullptr;
4464 Info.offset = 0;
4466 Info.align = Align(16);
4467 return true;
4468
4469 case Intrinsic::nvvm_suld_1d_i64_clamp:
4470 case Intrinsic::nvvm_suld_1d_v2i64_clamp:
4471 case Intrinsic::nvvm_suld_1d_array_i64_clamp:
4472 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
4473 case Intrinsic::nvvm_suld_2d_i64_clamp:
4474 case Intrinsic::nvvm_suld_2d_v2i64_clamp:
4475 case Intrinsic::nvvm_suld_2d_array_i64_clamp:
4476 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
4477 case Intrinsic::nvvm_suld_3d_i64_clamp:
4478 case Intrinsic::nvvm_suld_3d_v2i64_clamp:
4479 case Intrinsic::nvvm_suld_1d_i64_trap:
4480 case Intrinsic::nvvm_suld_1d_v2i64_trap:
4481 case Intrinsic::nvvm_suld_1d_array_i64_trap:
4482 case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
4483 case Intrinsic::nvvm_suld_2d_i64_trap:
4484 case Intrinsic::nvvm_suld_2d_v2i64_trap:
4485 case Intrinsic::nvvm_suld_2d_array_i64_trap:
4486 case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
4487 case Intrinsic::nvvm_suld_3d_i64_trap:
4488 case Intrinsic::nvvm_suld_3d_v2i64_trap:
4489 case Intrinsic::nvvm_suld_1d_i64_zero:
4490 case Intrinsic::nvvm_suld_1d_v2i64_zero:
4491 case Intrinsic::nvvm_suld_1d_array_i64_zero:
4492 case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
4493 case Intrinsic::nvvm_suld_2d_i64_zero:
4494 case Intrinsic::nvvm_suld_2d_v2i64_zero:
4495 case Intrinsic::nvvm_suld_2d_array_i64_zero:
4496 case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
4497 case Intrinsic::nvvm_suld_3d_i64_zero:
4498 case Intrinsic::nvvm_suld_3d_v2i64_zero:
4500 Info.memVT = MVT::i64;
4501 Info.ptrVal = nullptr;
4502 Info.offset = 0;
4504 Info.align = Align(16);
4505 return true;
4506
4507 case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
4508 case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
4509 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1: {
4511 Info.memVT = MVT::v1i32;
4512 Info.ptrVal = I.getArgOperand(0);
4513 Info.offset = 0;
4515 Info.align.reset();
4516 return true;
4517 }
4518
4519 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
4520 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
4521 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
4522 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2: {
4524 Info.memVT = MVT::v2i32;
4525 Info.ptrVal = I.getArgOperand(0);
4526 Info.offset = 0;
4528 Info.align.reset();
4529 return true;
4530 }
4531
4532 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
4533 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
4534 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
4535 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
4536 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4: {
4538 Info.memVT = MVT::v4i32;
4539 Info.ptrVal = I.getArgOperand(0);
4540 Info.offset = 0;
4542 Info.align.reset();
4543 return true;
4544 }
4545
4546 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
4547 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
4548 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
4549 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
4550 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8: {
4552 Info.memVT = MVT::v8i32;
4553 Info.ptrVal = I.getArgOperand(0);
4554 Info.offset = 0;
4556 Info.align.reset();
4557 return true;
4558 }
4559
4560 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
4561 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
4562 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
4563 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
4564 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16: {
4566 Info.memVT = MVT::v16i32;
4567 Info.ptrVal = I.getArgOperand(0);
4568 Info.offset = 0;
4570 Info.align.reset();
4571 return true;
4572 }
4573
4574 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
4575 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
4576 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
4577 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
4578 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32: {
4580 Info.memVT = MVT::v32i32;
4581 Info.ptrVal = I.getArgOperand(0);
4582 Info.offset = 0;
4584 Info.align.reset();
4585 return true;
4586 }
4587
4588 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
4589 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
4590 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
4591 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
4592 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64: {
4594 Info.memVT = MVT::v64i32;
4595 Info.ptrVal = I.getArgOperand(0);
4596 Info.offset = 0;
4598 Info.align.reset();
4599 return true;
4600 }
4601
4602 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
4603 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
4604 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
4605 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
4606 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128: {
4608 Info.memVT = MVT::v128i32;
4609 Info.ptrVal = I.getArgOperand(0);
4610 Info.offset = 0;
4612 Info.align.reset();
4613 return true;
4614 }
4615
4616 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
4617 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
4618 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1: {
4620 Info.memVT = MVT::i32;
4621 Info.ptrVal = I.getArgOperand(0);
4622 Info.offset = 0;
4624 Info.align.reset();
4625 return true;
4626 }
4627
4628 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
4629 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
4630 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
4631 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2: {
4633 Info.memVT = MVT::v2i32;
4634 Info.ptrVal = I.getArgOperand(0);
4635 Info.offset = 0;
4637 Info.align.reset();
4638 return true;
4639 }
4640
4641 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
4642 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
4643 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
4644 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
4645 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4: {
4647 Info.memVT = MVT::v4i32;
4648 Info.ptrVal = I.getArgOperand(0);
4649 Info.offset = 0;
4651 Info.align.reset();
4652 return true;
4653 }
4654
4655 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
4656 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
4657 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
4658 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
4659 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8: {
4661 Info.memVT = MVT::v8i32;
4662 Info.ptrVal = I.getArgOperand(0);
4663 Info.offset = 0;
4665 Info.align.reset();
4666 return true;
4667 }
4668
4669 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
4670 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
4671 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
4672 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
4673 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16: {
4675 Info.memVT = MVT::v16i32;
4676 Info.ptrVal = I.getArgOperand(0);
4677 Info.offset = 0;
4679 Info.align.reset();
4680 return true;
4681 }
4682
4683 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
4684 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
4685 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
4686 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
4687 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32: {
4689 Info.memVT = MVT::v32i32;
4690 Info.ptrVal = I.getArgOperand(0);
4691 Info.offset = 0;
4693 Info.align.reset();
4694 return true;
4695 }
4696
4697 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
4698 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
4699 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
4700 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
4701 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64: {
4703 Info.memVT = MVT::v64i32;
4704 Info.ptrVal = I.getArgOperand(0);
4705 Info.offset = 0;
4707 Info.align.reset();
4708 return true;
4709 }
4710
4711 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
4712 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
4713 case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
4714 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
4715 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128: {
4717 Info.memVT = MVT::v128i32;
4718 Info.ptrVal = I.getArgOperand(0);
4719 Info.offset = 0;
4721 Info.align.reset();
4722 return true;
4723 }
4724 }
4725 return false;
4726}
4727
4728 /// getFunctionParamOptimizedAlign - since function arguments are passed via
4729 /// .param space, we may want to increase their alignment in a way that
4730 /// ensures that we can effectively vectorize their loads & stores. We can
4731 /// increase alignment only if the function has internal or private
4732 /// linkage, as for other linkage types callers may already rely on the
4733 /// default alignment. To allow using 128-bit vectorized loads/stores, this
4734 /// function ensures that alignment is 16 or greater.
4736 const Function *F, Type *ArgTy, const DataLayout &DL) const {
4737 // Capping the alignment to 128 bytes as that is the maximum alignment
4738 // supported by PTX.
4739 const Align ABITypeAlign = std::min(Align(128), DL.getABITypeAlign(ArgTy));
4740
4741 // If a function has linkage different from internal or private, we
4742 // must use the default ABI alignment, as external users rely on it. The same
4743 // applies to a function that may be called through a function pointer.
4744 if (!F || !F->hasLocalLinkage() ||
4745 F->hasAddressTaken(/*Users=*/nullptr,
4746 /*IgnoreCallbackUses=*/false,
4747 /*IgnoreAssumeLikeCalls=*/true,
4748 /*IgnoreLLVMUsed=*/true))
4749 return ABITypeAlign;
4750
4751 assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage");
4752 return std::max(Align(16), ABITypeAlign);
4753}
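// For illustration: for a function with local linkage whose address is not
// taken, a parameter with an ABI alignment of only 4 is reported as align 16
// by the function above, which is what permits 128-bit vectorized parameter
// loads/stores.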
4754
4755/// Helper for computing alignment of a device function byval parameter.
4757 const Function *F, Type *ArgTy, Align InitialAlign,
4758 const DataLayout &DL) const {
4759 Align ArgAlign = InitialAlign;
4760 // Try to increase alignment to enhance vectorization options.
4761 if (F)
4762 ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(F, ArgTy, DL));
4763
4764 // Old ptx versions have a bug. When PTX code takes the address of a
4765 // byval parameter with alignment < 4, ptxas generates code to
4766 // spill the argument into memory. Alas, on sm_50+ ptxas generates
4767 // SASS code that fails with a misaligned access. To work around
4768 // the problem, make sure that we align byval parameters by at
4769 // least 4. This bug seems to be fixed at least starting from
4770 // ptxas > 9.0.
4771 // TODO: remove this after verifying the bug is not reproduced
4772 // on non-deprecated ptxas versions.
4774 ArgAlign = std::max(ArgAlign, Align(4));
4775
4776 return ArgAlign;
4777}
4778
4779 // Helper for getting a function parameter name. The name is composed from
4780 // its index and the function name. A negative index corresponds to the
4781 // special parameter (unsized array) used for passing variable arguments.
4783 int Idx) const {
4784 std::string ParamName;
4785 raw_string_ostream ParamStr(ParamName);
4786
4787 ParamStr << getTargetMachine().getSymbol(F)->getName();
4788 if (Idx < 0)
4789 ParamStr << "_vararg";
4790 else
4791 ParamStr << "_param_" << Idx;
4792
4793 return ParamName;
4794}
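// For illustration, for a function whose symbol is "foo" (an example name),
// the helper above produces "foo_param_0", "foo_param_1", ... for ordinary
// parameters and "foo_vararg" for the variadic (negative index) parameter.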
4795
4796/// isLegalAddressingMode - Return true if the addressing mode represented
4797/// by AM is legal for this target, for a load/store of the specified type.
4798/// Used to guide target specific optimizations, like loop strength reduction
4799/// (LoopStrengthReduce.cpp) and memory optimization for address mode
4800/// (CodeGenPrepare.cpp)
4802 const AddrMode &AM, Type *Ty,
4803 unsigned AS, Instruction *I) const {
4804 // AddrMode - This represents an addressing mode of:
4805 // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
4806 //
4807 // The legal address modes are
4808 // - [avar]
4809 // - [areg]
4810 // - [areg+immoff]
4811 // - [immAddr]
4812
4813 // immoff must fit in a signed 32-bit int
4814 if (!APInt(64, AM.BaseOffs).isSignedIntN(32))
4815 return false;
4816
4817 if (AM.BaseGV)
4818 return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
4819
4820 switch (AM.Scale) {
4821 case 0: // "r", "r+i" or "i" is allowed
4822 break;
4823 case 1:
4824 if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
4825 return false;
4826 // Otherwise we have r+i.
4827 break;
4828 default:
4829 // No scale > 1 is allowed
4830 return false;
4831 }
4832 return true;
4833}
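// For illustration, under the rules above an address such as [areg+16]
// (base register plus an immediate that fits in a signed 32-bit int) is
// accepted, while reg+reg forms, any scale greater than 1, and a global
// combined with an extra base register or offset are all rejected.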
4834
4835//===----------------------------------------------------------------------===//
4836// NVPTX Inline Assembly Support
4837//===----------------------------------------------------------------------===//
4838
4839/// getConstraintType - Given a constraint letter, return the type of
4840/// constraint it is for this target.
4843 if (Constraint.size() == 1) {
4844 switch (Constraint[0]) {
4845 default:
4846 break;
4847 case 'b':
4848 case 'r':
4849 case 'h':
4850 case 'c':
4851 case 'l':
4852 case 'f':
4853 case 'd':
4854 case 'q':
4855 case '0':
4856 case 'N':
4857 return C_RegisterClass;
4858 }
4859 }
4860 return TargetLowering::getConstraintType(Constraint);
4861}
4862
4863std::pair<unsigned, const TargetRegisterClass *>
4865 StringRef Constraint,
4866 MVT VT) const {
4867 if (Constraint.size() == 1) {
4868 switch (Constraint[0]) {
4869 case 'b':
4870 return std::make_pair(0U, &NVPTX::B1RegClass);
4871 case 'c':
4872 case 'h':
4873 return std::make_pair(0U, &NVPTX::B16RegClass);
4874 case 'r':
4875 case 'f':
4876 return std::make_pair(0U, &NVPTX::B32RegClass);
4877 case 'l':
4878 case 'N':
4879 case 'd':
4880 return std::make_pair(0U, &NVPTX::B64RegClass);
4881 case 'q': {
4882 if (STI.getSmVersion() < 70)
4883 report_fatal_error("Inline asm with 128 bit operands is only "
4884 "supported for sm_70 and higher!");
4885 return std::make_pair(0U, &NVPTX::B128RegClass);
4886 }
4887 }
4888 }
4889 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
4890}
4891
4892//===----------------------------------------------------------------------===//
4893// NVPTX DAG Combining
4894//===----------------------------------------------------------------------===//
4895
4897 CodeGenOptLevel OptLevel) const {
4898 // Always honor command-line argument
4899 if (FMAContractLevelOpt.getNumOccurrences() > 0)
4900 return FMAContractLevelOpt > 0;
4901
4902 // Do not contract if we're not optimizing the code.
4903 if (OptLevel == CodeGenOptLevel::None)
4904 return false;
4905
4906 // Honor TargetOptions flags that explicitly say fusion is okay.
4908 return true;
4909
4910 return false;
4911}
4912
4913static bool isConstZero(const SDValue &Operand) {
4914 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
4915 return Const && Const->getZExtValue() == 0;
4916}
4917
4918/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
4919/// operands N0 and N1. This is a helper for PerformADDCombine that is
4920/// called with the default operands, and if that fails, with commuted
4921/// operands.
4922static SDValue
4925 EVT VT = N0.getValueType();
4926
4927 // Since integer multiply-add costs the same as integer multiply
4928 // but is more costly than integer add, do the fusion only when
4929 // the mul is only used in the add.
4930 // TODO: this may not be true for later architectures, consider relaxing this
4931 if (!N0.getNode()->hasOneUse())
4932 return SDValue();
4933
4934 // fold (add (select cond, 0, (mul a, b)), c)
4935 // -> (select cond, c, (add (mul a, b), c))
4936 //
4937 if (N0.getOpcode() == ISD::SELECT) {
4938 unsigned ZeroOpNum;
4939 if (isConstZero(N0->getOperand(1)))
4940 ZeroOpNum = 1;
4941 else if (isConstZero(N0->getOperand(2)))
4942 ZeroOpNum = 2;
4943 else
4944 return SDValue();
4945
4946 SDValue M = N0->getOperand((ZeroOpNum == 1) ? 2 : 1);
4947 if (M->getOpcode() != ISD::MUL || !M.getNode()->hasOneUse())
4948 return SDValue();
4949
4950 SDLoc DL(N);
4951 SDValue Mul =
4952 DCI.DAG.getNode(ISD::MUL, DL, VT, M->getOperand(0), M->getOperand(1));
4953 SDValue MAD = DCI.DAG.getNode(ISD::ADD, DL, VT, Mul, N1);
4954 return DCI.DAG.getSelect(SDLoc(N), VT, N0->getOperand(0),
4955 ((ZeroOpNum == 1) ? N1 : MAD),
4956 ((ZeroOpNum == 1) ? MAD : N1));
4957 }
4958
4959 return SDValue();
4960}
4961
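/// PerformFADDCombineWithOperands - Try to contract (fadd (fmul a, b), c) into
/// (fma a, b, c) when FMA contraction is allowed. Roughly: the fmul must have
/// few uses, and when some of those uses are not adds, the distance/liveness
/// checks below guard against increasing register pressure.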
4962static SDValue
4965 CodeGenOptLevel OptLevel) {
4966 EVT VT = N0.getValueType();
4967 if (N0.getOpcode() == ISD::FMUL) {
4968 const auto *TLI = static_cast<const NVPTXTargetLowering *>(
4969 &DCI.DAG.getTargetLoweringInfo());
4970 if (!(TLI->allowFMA(DCI.DAG.getMachineFunction(), OptLevel) ||
4971 (N->getFlags().hasAllowContract() &&
4972 N0->getFlags().hasAllowContract())))
4973 return SDValue();
4974
4975 // For floating point:
4976 // Do the fusion only when the mul has fewer than 5 uses and all
4977 // of them are adds.
4978 // The heuristic is that if a use is not an add, then that use
4979 // cannot be fused into an fma, so the mul is still needed anyway.
4980 // If there are more than 4 uses, even if they are all adds, fusing
4981 // them will increase register pressure.
4982 //
4983 int numUses = 0;
4984 int nonAddCount = 0;
4985 for (const SDNode *User : N0.getNode()->users()) {
4986 numUses++;
4987 if (User->getOpcode() != ISD::FADD)
4988 ++nonAddCount;
4989 if (numUses >= 5)
4990 return SDValue();
4991 }
4992 if (nonAddCount) {
4993 int orderNo = N->getIROrder();
4994 int orderNo2 = N0.getNode()->getIROrder();
4995 // Simple heuristic here for considering potential register
4996 // pressure: the difference is used to measure the distance between
4997 // def and use; the longer the distance, the more likely it is to
4998 // cause register pressure.
4999 if (orderNo - orderNo2 < 500)
5000 return SDValue();
5001
5002 // Now, check if at least one of the FMUL's operands is live beyond the
5003 // node N, which guarantees that the FMA will not increase register
5004 // pressure at node N.
5005 bool opIsLive = false;
5006 const SDNode *left = N0.getOperand(0).getNode();
5007 const SDNode *right = N0.getOperand(1).getNode();
5008
5009 if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
5010 opIsLive = true;
5011
5012 if (!opIsLive)
5013 for (const SDNode *User : left->users()) {
5014 int orderNo3 = User->getIROrder();
5015 if (orderNo3 > orderNo) {
5016 opIsLive = true;
5017 break;
5018 }
5019 }
5020
5021 if (!opIsLive)
5022 for (const SDNode *User : right->users()) {
5023 int orderNo3 = User->getIROrder();
5024 if (orderNo3 > orderNo) {
5025 opIsLive = true;
5026 break;
5027 }
5028 }
5029
5030 if (!opIsLive)
5031 return SDValue();
5032 }
5033
5034 return DCI.DAG.getNode(ISD::FMA, SDLoc(N), VT, N0.getOperand(0),
5035 N0.getOperand(1), N1);
5036 }
5037
5038 return SDValue();
5039}
5040
5041/// Fold unpacking movs into a load by increasing the number of return values.
5042///
5043/// ex:
5044/// L: v2f16,ch = load <p>
5045/// a: f16 = extractelt L:0, 0
5046/// b: f16 = extractelt L:0, 1
5047/// use(a, b)
5048///
5049/// ...is turned into...
5050///
5051/// L: f16,f16,ch = LoadV2 <p>
5052/// use(L:0, L:1)
5053static SDValue
5055 // Don't run this optimization before the legalizer
5056 if (!DCI.isAfterLegalizeDAG())
5057 return SDValue();
5058
5059 EVT ElementVT = N->getValueType(0);
5060 // Avoid non-packed types and v4i8
5061 if (!NVPTX::isPackedVectorTy(ElementVT) || ElementVT == MVT::v4i8)
5062 return SDValue();
5063
5064 SmallVector<SDNode *> DeadCopyToRegs;
5065
5066 // Check whether all outputs are either used by an extractelt or are
5067 // glue/chain nodes
5068 if (!all_of(N->uses(), [&](SDUse &U) {
5069 // Skip glue, chain nodes
5070 if (U.getValueType() == MVT::Glue || U.getValueType() == MVT::Other)
5071 return true;
5072 if (U.getUser()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
5073 if (N->getOpcode() != ISD::LOAD)
5074 return true;
5075 // Since this is an ISD::LOAD, check all extractelts are used. If
5076 // any are not used, we don't want to defeat another optimization that
5077 // will narrow the load.
5078 //
5079 // For example:
5080 //
5081 // L: v2f16,ch = load <p>
5082 // e0: f16 = extractelt L:0, 0
5083 // e1: f16 = extractelt L:0, 1 <-- unused
5084 // store e0
5085 //
5086 // Can be optimized by DAGCombiner to:
5087 //
5088 // L: f16,ch = load <p>
5089 // store L:0
5090 return !U.getUser()->use_empty();
5091 }
5092
5093 // Otherwise, this use prevents us from splitting a value.
5094 return false;
5095 }))
5096 return SDValue();
5097
5098 auto *LD = cast<MemSDNode>(N);
5099 SDLoc DL(LD);
5100
5101 // The new opcode after we double the number of operands.
5102 NVPTXISD::NodeType Opcode;
5104 unsigned OldNumOutputs; // non-glue, non-chain outputs
5105 switch (LD->getOpcode()) {
5106 case ISD::LOAD:
5107 OldNumOutputs = 1;
5108 // Any packed type is legal, so the legalizer will not have lowered
5109 // ISD::LOAD -> NVPTXISD::Load (unless it's under-aligned). We have to do it
5110 // here.
5111 Opcode = NVPTXISD::LoadV2;
5112 Operands.push_back(DCI.DAG.getIntPtrConstant(
5113 cast<LoadSDNode>(LD)->getExtensionType(), DL));
5114 break;
5115 case NVPTXISD::LoadV2:
5116 OldNumOutputs = 2;
5117 Opcode = NVPTXISD::LoadV4;
5118 break;
5119 case NVPTXISD::LoadV4:
5120 // V8 is only supported for f32. Don't forget, we're not changing the load
5121 // size here. This is already a 256-bit load.
5122 if (ElementVT != MVT::v2f32)
5123 return SDValue();
5124 OldNumOutputs = 4;
5125 Opcode = NVPTXISD::LoadV8;
5126 break;
5127 case NVPTXISD::LoadV8:
5128 // PTX doesn't support the next doubling of outputs
5129 return SDValue();
5130 }
5131
5132 // the non-glue, non-chain outputs in the new load
5133 const unsigned NewNumOutputs = OldNumOutputs * 2;
5134 SmallVector<EVT> NewVTs(NewNumOutputs, ElementVT.getVectorElementType());
5135 // add remaining chain and glue values
5136 NewVTs.append(LD->value_begin() + OldNumOutputs, LD->value_end());
5137
5138 // Create the new load
5139 SDValue NewLoad = DCI.DAG.getMemIntrinsicNode(
5140 Opcode, DL, DCI.DAG.getVTList(NewVTs), Operands, LD->getMemoryVT(),
5141 LD->getMemOperand());
5142
5143 // Now we use a combination of BUILD_VECTORs and a MERGE_VALUES node to keep
5144 // the outputs the same. These nodes will be optimized away in later
5145 // DAGCombiner iterations.
5147 for (unsigned I : seq(OldNumOutputs))
5148 Results.push_back(DCI.DAG.getBuildVector(
5149 ElementVT, DL, {NewLoad.getValue(I * 2), NewLoad.getValue(I * 2 + 1)}));
5150 // Add remaining chain and glue nodes
5151 for (unsigned I : seq(NewLoad->getNumValues() - NewNumOutputs))
5152 Results.push_back(NewLoad.getValue(NewNumOutputs + I));
5153
5154 return DCI.DAG.getMergeValues(Results, DL);
5155}
5156
5157/// Fold packing movs into a store.
5158///
5159/// ex:
5160/// v1: v2f16 = BUILD_VECTOR a:f16, b:f16
5161/// v2: v2f16 = BUILD_VECTOR c:f16, d:f16
5162/// StoreV2 v1, v2
5163///
5164/// ...is turned into...
5165///
5166/// StoreV4 a, b, c, d
5169 unsigned Front, unsigned Back) {
5170 // We want to run this as late as possible since other optimizations may
5171 // eliminate the BUILD_VECTORs.
5172 if (!DCI.isAfterLegalizeDAG())
5173 return SDValue();
5174
5175 // Get the type of the operands being stored.
5176 EVT ElementVT = N->getOperand(Front).getValueType();
5177
5178 // Avoid non-packed types and v4i8
5179 if (!NVPTX::isPackedVectorTy(ElementVT) || ElementVT == MVT::v4i8)
5180 return SDValue();
5181
5182 auto *ST = cast<MemSDNode>(N);
5183
5184 // The new opcode after we double the number of operands.
5185 NVPTXISD::NodeType Opcode;
5186 switch (N->getOpcode()) {
5187 case ISD::STORE:
5188 // Any packed type is legal, so the legalizer will not have lowered
5189 // ISD::STORE -> NVPTXISD::Store (unless it's under-aligned). We have to do
5190 // it here.
5191 Opcode = NVPTXISD::StoreV2;
5192 break;
5193 case NVPTXISD::StoreV2:
5194 Opcode = NVPTXISD::StoreV4;
5195 break;
5196 case NVPTXISD::StoreV4:
5197 // V8 is only supported for f32. Don't forget, we're not changing the store
5198 // size here. This is already a 256-bit store.
5199 if (ElementVT != MVT::v2f32)
5200 return SDValue();
5201 Opcode = NVPTXISD::StoreV8;
5202 break;
5203 case NVPTXISD::StoreV8:
5204 // PTX doesn't support the next doubling of operands
5205 return SDValue();
5206 default:
5207 llvm_unreachable("Unhandled store opcode");
5208 }
5209
5210 // Scan the operands; if they're all BUILD_VECTORs, gather their
5211 // elements.
5212 SmallVector<SDValue, 4> Operands(N->ops().take_front(Front));
5213 for (SDValue BV : N->ops().drop_front(Front).drop_back(Back)) {
5214 if (BV.getOpcode() != ISD::BUILD_VECTOR)
5215 return SDValue();
5216
5217 // If the operand has multiple uses, this optimization can increase register
5218 // pressure.
5219 if (!BV.hasOneUse())
5220 return SDValue();
5221
5222 // DAGCombiner visits nodes bottom-up. Check the BUILD_VECTOR operands for
5223 // any signs they may be folded by some other pattern or rule.
5224 for (SDValue Op : BV->ops()) {
5225 // Peek through bitcasts
5226 if (Op.getOpcode() == ISD::BITCAST)
5227 Op = Op.getOperand(0);
5228
5229 // This may be folded into a PRMT.
5230 if (Op.getValueType() == MVT::i16 && Op.getOpcode() == ISD::TRUNCATE &&
5231 Op->getOperand(0).getValueType() == MVT::i32)
5232 return SDValue();
5233
5234 // This may be folded into cvt.bf16x2
5235 if (Op.getOpcode() == ISD::FP_ROUND)
5236 return SDValue();
5237 }
5238 Operands.append({BV.getOperand(0), BV.getOperand(1)});
5239 }
5240 Operands.append(N->op_end() - Back, N->op_end());
5241
5242 // Now we replace the store
5243 return DCI.DAG.getMemIntrinsicNode(Opcode, SDLoc(N), N->getVTList(), Operands,
5244 ST->getMemoryVT(), ST->getMemOperand());
5245}
5246
5248 const NVPTXSubtarget &STI) {
5249
5250 if (DCI.isBeforeLegalize() && N->getOpcode() == ISD::STORE) {
5251 // Here is our chance to custom lower a store with a non-simple type.
5252 // Unfortunately, we can't do this in the legalizer because there is no
5253 // way to setOperationAction for an non-simple type.
5254 StoreSDNode *ST = cast<StoreSDNode>(N);
5255 if (!ST->getValue().getValueType().isSimple())
5256 return lowerSTOREVector(SDValue(ST, 0), DCI.DAG, STI);
5257 }
5258
5259 return combinePackingMovIntoStore(N, DCI, 1, 2);
5260}
5261
5263 const NVPTXSubtarget &STI) {
5264 if (DCI.isBeforeLegalize() && N->getOpcode() == ISD::LOAD) {
5265 // Here is our chance to custom lower a load with a non-simple type.
5266 // Unfortunately, we can't do this in the legalizer because there is no
5267 // way to setOperationAction for an non-simple type.
5268 if (!N->getValueType(0).isSimple())
5269 return lowerLoadVector(N, DCI.DAG, STI);
5270 }
5271
5272 return combineUnpackingMovIntoLoad(N, DCI);
5273}
5274
5275/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
5276///
5279 CodeGenOptLevel OptLevel) {
5280 if (OptLevel == CodeGenOptLevel::None)
5281 return SDValue();
5282
5283 SDValue N0 = N->getOperand(0);
5284 SDValue N1 = N->getOperand(1);
5285
5286 // Skip non-integer, non-scalar case
5287 EVT VT = N0.getValueType();
5288 if (VT.isVector() || VT != MVT::i32)
5289 return SDValue();
5290
5291 // First try with the default operand order.
5292 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI))
5293 return Result;
5294
5295 // If that didn't work, try again with the operands commuted.
5296 return PerformADDCombineWithOperands(N, N1, N0, DCI);
5297}
5298
5299/// PerformFADDCombine - Target-specific dag combine xforms for ISD::FADD.
5300///
5303 CodeGenOptLevel OptLevel) {
5304 SDValue N0 = N->getOperand(0);
5305 SDValue N1 = N->getOperand(1);
5306
5307 EVT VT = N0.getValueType();
5308 if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64))
5309 return SDValue();
5310
5311 // First try with the default operand order.
5312 if (SDValue Result = PerformFADDCombineWithOperands(N, N0, N1, DCI, OptLevel))
5313 return Result;
5314
5315 // If that didn't work, try again with the operands commuted.
5316 return PerformFADDCombineWithOperands(N, N1, N0, DCI, OptLevel);
5317}
5318
5321 CodeGenOptLevel OptLevel) {
5322 assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
5323
5324 // Don't do anything at less than -O2.
5325 if (OptLevel < CodeGenOptLevel::Default)
5326 return SDValue();
5327
5328 SelectionDAG &DAG = DCI.DAG;
5329 SDLoc DL(N);
5330 EVT VT = N->getValueType(0);
5331 bool IsSigned = N->getOpcode() == ISD::SREM;
5332 unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
5333
5334 const SDValue &Num = N->getOperand(0);
5335 const SDValue &Den = N->getOperand(1);
5336
5337 for (const SDNode *U : Num->users()) {
5338 if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
5339 U->getOperand(1) == Den) {
5340 // Num % Den -> Num - (Num / Den) * Den
5341 return DAG.getNode(ISD::SUB, DL, VT, Num,
5342 DAG.getNode(ISD::MUL, DL, VT,
5343 DAG.getNode(DivOpc, DL, VT, Num, Den),
5344 Den));
5345 }
5346 }
5347 return SDValue();
5348}
5349
5350// (sign_extend|zero_extend (mul|shl) x, y) -> (mul.wide x, y)
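// For example (illustrative), when the shl carries the nuw flag:
//   i32 (zero_extend (shl nuw i16 %x, 3))  ->  mul.wide.u16 %x, 8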
5352 CodeGenOptLevel OptLevel) {
5353 if (OptLevel == CodeGenOptLevel::None)
5354 return SDValue();
5355
5356 SDValue Op = N->getOperand(0);
5357 if (!Op.hasOneUse())
5358 return SDValue();
5359 EVT ToVT = N->getValueType(0);
5360 EVT FromVT = Op.getValueType();
5361 if (!((ToVT == MVT::i32 && FromVT == MVT::i16) ||
5362 (ToVT == MVT::i64 && FromVT == MVT::i32)))
5363 return SDValue();
5364 if (!(Op.getOpcode() == ISD::MUL ||
5365 (Op.getOpcode() == ISD::SHL && isa<ConstantSDNode>(Op.getOperand(1)))))
5366 return SDValue();
5367
5368 SDLoc DL(N);
5369 unsigned ExtOpcode = N->getOpcode();
5370 unsigned Opcode = 0;
5371 if (ExtOpcode == ISD::SIGN_EXTEND && Op->getFlags().hasNoSignedWrap())
5373 else if (ExtOpcode == ISD::ZERO_EXTEND && Op->getFlags().hasNoUnsignedWrap())
5375 else
5376 return SDValue();
5377 SDValue RHS = Op.getOperand(1);
5378 if (Op.getOpcode() == ISD::SHL) {
5379 const auto ShiftAmt = Op.getConstantOperandVal(1);
5380 const auto MulVal = APInt(ToVT.getSizeInBits(), 1) << ShiftAmt;
5381 RHS = DCI.DAG.getConstant(MulVal, DL, ToVT);
5382 }
5383 return DCI.DAG.getNode(Opcode, DL, ToVT, Op.getOperand(0), RHS);
5384}
5385
5389 Unknown
5391
5392/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
5393/// that can be demoted to \p OptSize bits without loss of information. The
5394/// signedness of the operand, if determinable, is placed in \p S.
5396 unsigned OptSize,
5397 OperandSignedness &S) {
5398 S = Unknown;
5399
5400 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
5401 Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
5402 EVT OrigVT = Op.getOperand(0).getValueType();
5403 if (OrigVT.getFixedSizeInBits() <= OptSize) {
5404 S = Signed;
5405 return true;
5406 }
5407 } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
5408 EVT OrigVT = Op.getOperand(0).getValueType();
5409 if (OrigVT.getFixedSizeInBits() <= OptSize) {
5410 S = Unsigned;
5411 return true;
5412 }
5413 }
5414
5415 return false;
5416}
5417
5418/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
5419/// be demoted to \p OptSize bits without loss of information. If the operands
5420/// contain a constant, it should appear as the RHS operand. The signedness of
5421/// the operands is placed in \p IsSigned.
5423 unsigned OptSize,
5424 bool &IsSigned) {
5425 OperandSignedness LHSSign;
5426
5427 // The LHS operand must be a demotable op
5428 if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
5429 return false;
5430
5431 // We should have been able to determine the signedness from the LHS
5432 if (LHSSign == Unknown)
5433 return false;
5434
5435 IsSigned = (LHSSign == Signed);
5436
5437 // The RHS can be a demotable op or a constant
5438 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
5439 const APInt &Val = CI->getAPIntValue();
5440 if (LHSSign == Unsigned) {
5441 return Val.isIntN(OptSize);
5442 } else {
5443 return Val.isSignedIntN(OptSize);
5444 }
5445 } else {
5446 OperandSignedness RHSSign;
5447 if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
5448 return false;
5449
5450 return LHSSign == RHSSign;
5451 }
5452}
5453
5454/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
5455/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
5456/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
5457/// amount.
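/// For example (illustrative), (mul i64 (sext i32 %a), (sext i32 %b)) can be
/// selected as a signed 32-bit wide multiply producing an i64 result, and
/// (shl i32 (zext i16 %x), 4) becomes an unsigned 16-bit wide multiply by 16.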
5460 EVT MulType = N->getValueType(0);
5461 if (MulType != MVT::i32 && MulType != MVT::i64) {
5462 return SDValue();
5463 }
5464
5465 SDLoc DL(N);
5466 unsigned OptSize = MulType.getSizeInBits() >> 1;
5467 SDValue LHS = N->getOperand(0);
5468 SDValue RHS = N->getOperand(1);
5469
5470 // Canonicalize the multiply so the constant (if any) is on the right
5471 if (N->getOpcode() == ISD::MUL) {
5472 if (isa<ConstantSDNode>(LHS)) {
5473 std::swap(LHS, RHS);
5474 }
5475 }
5476
5477 // If we have a SHL, determine the actual multiply amount
5478 if (N->getOpcode() == ISD::SHL) {
5479 ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
5480 if (!ShlRHS) {
5481 return SDValue();
5482 }
5483
5484 APInt ShiftAmt = ShlRHS->getAPIntValue();
5485 unsigned BitWidth = MulType.getSizeInBits();
5486 if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
5487 APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
5488 RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
5489 } else {
5490 return SDValue();
5491 }
5492 }
5493
5494 bool Signed;
5495 // Verify that our operands are demotable
5496 if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
5497 return SDValue();
5498 }
5499
5500 EVT DemotedVT;
5501 if (MulType == MVT::i32) {
5502 DemotedVT = MVT::i16;
5503 } else {
5504 DemotedVT = MVT::i32;
5505 }
5506
5507 // Truncate the operands to the correct size. Note that these are just for
5508 // type consistency and will (likely) be eliminated in later phases.
5509 SDValue TruncLHS =
5510 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
5511 SDValue TruncRHS =
5512 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
5513
5514 unsigned Opc;
5515 if (Signed) {
5517 } else {
5519 }
5520
5521 return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
5522}
5523
5524static bool isConstOne(const SDValue &Operand) {
5525 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
5526 return Const && Const->getZExtValue() == 1;
5527}
5528
5530 if (Add->getOpcode() != ISD::ADD)
5531 return SDValue();
5532
5533 if (isConstOne(Add->getOperand(0)))
5534 return Add->getOperand(1);
5535
5536 if (isConstOne(Add->getOperand(1)))
5537 return Add->getOperand(0);
5538
5539 return SDValue();
5540}
5541
5544
5546 SDValue Mul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
5547 return DCI.DAG.getNode(ISD::ADD, DL, VT, Mul, X);
5548 }
5549
5550 return SDValue();
5551}
5552
5554 SDLoc DL,
5556 if (Select->getOpcode() != ISD::SELECT)
5557 return SDValue();
5558
5559 SDValue Cond = Select->getOperand(0);
5560
5561 unsigned ConstOpNo;
5562 if (isConstOne(Select->getOperand(1)))
5563 ConstOpNo = 1;
5564 else if (isConstOne(Select->getOperand(2)))
5565 ConstOpNo = 2;
5566 else
5567 return SDValue();
5568
5569 SDValue Y = Select->getOperand((ConstOpNo == 1) ? 2 : 1);
5570
5571 // Do not combine if the resulting sequence is not obviously profitable.
5573 return SDValue();
5574
5575 SDValue NewMul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
5576
5577 return DCI.DAG.getNode(ISD::SELECT, DL, VT, Cond,
5578 (ConstOpNo == 1) ? X : NewMul,
5579 (ConstOpNo == 1) ? NewMul : X);
5580}
5581
5582static SDValue
5585
5586 EVT VT = N0.getValueType();
5587 if (VT.isVector())
5588 return SDValue();
5589
5590 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
5591 return SDValue();
5592
5593 SDLoc DL(N);
5594
5595 // (mul x, (add y, 1)) -> (add (mul x, y), x)
5596 if (SDValue Res = combineMADConstOne(N0, N1, VT, DL, DCI))
5597 return Res;
5598 if (SDValue Res = combineMADConstOne(N1, N0, VT, DL, DCI))
5599 return Res;
5600
5601 // (mul x, (select cond, 1, y)) -> (select cond, x, (mul x, y))
5602 if (SDValue Res = combineMulSelectConstOne(N0, N1, VT, DL, DCI))
5603 return Res;
5604 if (SDValue Res = combineMulSelectConstOne(N1, N0, VT, DL, DCI))
5605 return Res;
5606
5607 return SDValue();
5608}
5609
5610/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
5613 CodeGenOptLevel OptLevel) {
5614 if (OptLevel == CodeGenOptLevel::None)
5615 return SDValue();
5616
5617 if (SDValue Ret = TryMULWIDECombine(N, DCI))
5618 return Ret;
5619
5620 SDValue N0 = N->getOperand(0);
5621 SDValue N1 = N->getOperand(1);
5622 return PerformMULCombineWithOperands(N, N0, N1, DCI);
5623}
5624
5625/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
5628 CodeGenOptLevel OptLevel) {
5629 if (OptLevel > CodeGenOptLevel::None) {
5630 // Try mul.wide combining at OptLevel > 0
5631 if (SDValue Ret = TryMULWIDECombine(N, DCI))
5632 return Ret;
5633 }
5634
5635 return SDValue();
5636}
5637
5640 unsigned int SmVersion) {
5641 EVT CCType = N->getValueType(0);
5642 SDValue A = N->getOperand(0);
5643 SDValue B = N->getOperand(1);
5644
5645 EVT AType = A.getValueType();
5646 if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16)))
5647 return SDValue();
5648
5649 if (A.getValueType() == MVT::v2bf16 && SmVersion < 90)
5650 return SDValue();
5651
5652 SDLoc DL(N);
5653 // setp.f16x2 returns two scalar predicates, which we need to
5654 // convert back to v2i1. The returned result will be scalarized by
5655 // the legalizer, but the comparison will remain a single vector
5656 // instruction.
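  // For example (roughly), a v2f16 setcc producing v2i1 becomes a single
  // SETP_F16X2 node with two i1 results, repackaged as
  // (build_vector SETP:0, SETP:1).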
5657 SDValue CCNode = DCI.DAG.getNode(
5658 A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2
5660 DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)});
5661 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
5662 CCNode.getValue(1));
5663}
5664
5667 SDValue Vector = N->getOperand(0);
5668 if (Vector->getOpcode() == ISD::FREEZE)
5669 Vector = Vector->getOperand(0);
5670 SDLoc DL(N);
5671 EVT VectorVT = Vector.getValueType();
5672 if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
5673 IsPTXVectorType(VectorVT.getSimpleVT()))
5674 return SDValue(); // Native vector loads already combine nicely w/
5675 // extract_vector_elt.
5676 // Don't mess with singletons or packed types (v2f32, v2*16, v4i8, and v8i8);
5677 // we already handle them OK.
5678 if (VectorVT.getVectorNumElements() == 1 ||
5679 NVPTX::isPackedVectorTy(VectorVT) || VectorVT == MVT::v8i8)
5680 return SDValue();
5681
5682 // Don't mess with undef values as sra may be simplified to 0, not undef.
5683 if (Vector->isUndef() || ISD::allOperandsUndef(Vector.getNode()))
5684 return SDValue();
5685
5686 uint64_t VectorBits = VectorVT.getSizeInBits();
5687 // We only handle the types we can extract in-register.
5688 if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))
5689 return SDValue();
5690
5691 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1));
5692 // Index == 0 is handled by generic DAG combiner.
5693 if (!Index || Index->getZExtValue() == 0)
5694 return SDValue();
5695
5696 MVT IVT = MVT::getIntegerVT(VectorBits);
5697 EVT EltVT = VectorVT.getVectorElementType();
5698 EVT EltIVT = EltVT.changeTypeToInteger();
5699 uint64_t EltBits = EltVT.getScalarSizeInBits();
5700
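  // For example, extracting element 2 of a v4i16 vector V becomes
  // (trunc i16 (sra i64 (bitcast i64 V), 32)).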
5701 SDValue Result = DCI.DAG.getNode(
5702 ISD::TRUNCATE, DL, EltIVT,
5703 DCI.DAG.getNode(
5704 ISD::SRA, DL, IVT, DCI.DAG.getNode(ISD::BITCAST, DL, IVT, Vector),
5705 DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT)));
5706
5707 // If element has non-integer type, bitcast it back to the expected type.
5708 if (EltVT != EltIVT)
5709 Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
5710 // Past the legalizer, we may need to extend i8 -> i16 to match the register type.
5711 if (EltVT != N->getValueType(0))
5712 Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);
5713
5714 return Result;
5715}
5716
5719 SDValue VA = N->getOperand(1);
5720 EVT VectorVT = VA.getValueType();
5721 if (VectorVT != MVT::v4i8)
5722 return SDValue();
5723
5724 // We need to split the vselect into individual per-element operations. Because
5725 // we use BFE/BFI instructions for byte extraction/insertion, we end up with
5726 // 32-bit values, so we may as well do the comparison as i32 to avoid the
5727 // conversions to/from i16 normally used for i8 values.
5729 SDLoc DL(N);
5730 SDValue VCond = N->getOperand(0);
5731 SDValue VB = N->getOperand(2);
5732 for (int I = 0; I < 4; ++I) {
5733 SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond,
5734 DCI.DAG.getConstant(I, DL, MVT::i32));
5735 SDValue EA = DCI.DAG.getAnyExtOrTrunc(
5736 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA,
5737 DCI.DAG.getConstant(I, DL, MVT::i32)),
5738 DL, MVT::i32);
5739 SDValue EB = DCI.DAG.getAnyExtOrTrunc(
5740 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB,
5741 DCI.DAG.getConstant(I, DL, MVT::i32)),
5742 DL, MVT::i32);
5744 DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8));
5745 }
5746 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
5747}
5748
5749static SDValue
5751 auto VT = N->getValueType(0);
5752 if (!DCI.isAfterLegalizeDAG() ||
5753 // only process v2*16 types
5754 !(NVPTX::isPackedVectorTy(VT) && VT.is32BitVector() &&
5755 VT.getVectorNumElements() == 2))
5756 return SDValue();
5757
5758 auto Op0 = N->getOperand(0);
5759 auto Op1 = N->getOperand(1);
5760
5761 // Start out by assuming we want to take the lower 2 bytes of each i32
5762 // operand.
5763 uint64_t Op0Bytes = 0x10;
5764 uint64_t Op1Bytes = 0x54;
5765
5766 std::pair<SDValue *, uint64_t *> OpData[2] = {{&Op0, &Op0Bytes},
5767 {&Op1, &Op1Bytes}};
5768
5769 // Check that each operand is an i16, truncated from an i32 operand. We'll
5770 // select individual bytes from those original operands. Optionally, fold in a
5771 // shift right of that original operand.
5772 for (auto &[Op, OpBytes] : OpData) {
5773 // Eat up any bitcast
5774 if (Op->getOpcode() == ISD::BITCAST)
5775 *Op = Op->getOperand(0);
5776
5777 if (!(Op->getValueType() == MVT::i16 && Op->getOpcode() == ISD::TRUNCATE &&
5778 Op->getOperand(0).getValueType() == MVT::i32))
5779 return SDValue();
5780
5781 // If the truncate has multiple uses, this optimization can increase
5782 // register pressure
5783 if (!Op->hasOneUse())
5784 return SDValue();
5785
5786 *Op = Op->getOperand(0);
5787
5788 // Optionally, fold in a shift-right of the original operand and let permute
5789 // pick the two higher bytes of the original value directly.
5790 if (Op->getOpcode() == ISD::SRL && isa<ConstantSDNode>(Op->getOperand(1))) {
5791 if (cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue() == 16) {
5792 // Shift the PRMT byte selector to pick upper bytes from each respective
5793 // value, instead of the lower ones: 0x10 -> 0x32, 0x54 -> 0x76
5794 assert((*OpBytes == 0x10 || *OpBytes == 0x54) &&
5795 "PRMT selector values out of range");
5796 *OpBytes += 0x22;
5797 *Op = Op->getOperand(0);
5798 }
5799 }
5800 }
5801
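  // For example, building a v2i16 from (trunc i16 %a:i32) and
  // (trunc i16 (srl %b:i32, 16)) yields PRMT(%a, %b, 0x7610), which selects
  // bytes {1,0} of %a and bytes {3,2} of %b.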
5802 SDLoc DL(N);
5803 auto &DAG = DCI.DAG;
5804
5805 auto PRMT =
5806 getPRMT(DAG.getBitcast(MVT::i32, Op0), DAG.getBitcast(MVT::i32, Op1),
5807 (Op1Bytes << 8) | Op0Bytes, DL, DAG);
5808 return DAG.getBitcast(VT, PRMT);
5809}
5810
5813 auto *ASCN1 = cast<AddrSpaceCastSDNode>(N);
5814
5815 if (auto *ASCN2 = dyn_cast<AddrSpaceCastSDNode>(ASCN1->getOperand(0))) {
5816 assert(ASCN2->getDestAddressSpace() == ASCN1->getSrcAddressSpace());
5817
5818 // Fold asc[B -> A](asc[A -> B](x)) -> x
5819 if (ASCN1->getDestAddressSpace() == ASCN2->getSrcAddressSpace())
5820 return ASCN2->getOperand(0);
5821 }
5822
5823 return SDValue();
5824}
5825
5826// Given a constant selector value and a prmt mode, return the selector value
5827// normalized to the generic prmt mode. See the PTX ISA documentation for more
5828// details:
5829// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt
5830static APInt getPRMTSelector(const APInt &Selector, unsigned Mode) {
5831 assert(Selector.getBitWidth() == 32 && "PRMT must have i32 operands");
5832
5833 if (Mode == NVPTX::PTXPrmtMode::NONE)
5834 return Selector;
5835
5836 const unsigned V = Selector.trunc(2).getZExtValue();
5837
5838 const auto GetSelector = [](unsigned S0, unsigned S1, unsigned S2,
5839 unsigned S3) {
5840 return APInt(32, S0 | (S1 << 4) | (S2 << 8) | (S3 << 12));
5841 };
5842
5843 switch (Mode) {
5845 return GetSelector(V, V + 1, V + 2, V + 3);
5847 return GetSelector(V, (V - 1) & 7, (V - 2) & 7, (V - 3) & 7);
5849 return GetSelector(V, V, V, V);
5851 return GetSelector(V, std::max(V, 1U), std::max(V, 2U), 3U);
5853 return GetSelector(0, std::min(V, 1U), std::min(V, 2U), V);
5855 unsigned V1 = (V & 1) << 1;
5856 return GetSelector(V1, V1 + 1, V1, V1 + 1);
5857 }
5858 default:
5859 llvm_unreachable("Invalid PRMT mode");
5860 }
5861}
5862
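// Constant-fold a PRMT node. For example, in the generic mode with
// a = 0x00112233 and b = 0x44556677, selector 0x3210 yields 0x00112233 (all
// bytes taken from a) and selector 0x7654 yields 0x44556677; a selector nibble
// with its high bit set replicates the selected byte's sign bit instead.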
5863static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode) {
5864 assert(A.getBitWidth() == 32 && B.getBitWidth() == 32 &&
5865 Selector.getBitWidth() == 32 && "PRMT must have i32 operands");
5866 // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
5867 APInt BitField = B.concat(A);
5868 APInt SelectorVal = getPRMTSelector(Selector, Mode);
5869 APInt Result(32, 0);
5870 for (unsigned I : llvm::seq(4U)) {
5871 APInt Sel = SelectorVal.extractBits(4, I * 4);
5872 unsigned Idx = Sel.getLoBits(3).getZExtValue();
5873 unsigned Sign = Sel.getHiBits(1).getZExtValue();
5874 APInt Byte = BitField.extractBits(8, Idx * 8);
5875 if (Sign)
5876 Byte = Byte.ashr(8);
5877 Result.insertBits(Byte, I * 8);
5878 }
5879 return Result;
5880}
5881
5883 CodeGenOptLevel OptLevel) {
5884 if (OptLevel == CodeGenOptLevel::None)
5885 return SDValue();
5886
5887 // Constant fold PRMT
5888 if (isa<ConstantSDNode>(N->getOperand(0)) &&
5889 isa<ConstantSDNode>(N->getOperand(1)) &&
5890 isa<ConstantSDNode>(N->getOperand(2)))
5891 return DCI.DAG.getConstant(computePRMT(N->getConstantOperandAPInt(0),
5892 N->getConstantOperandAPInt(1),
5893 N->getConstantOperandAPInt(2),
5894 N->getConstantOperandVal(3)),
5895 SDLoc(N), N->getValueType(0));
5896 return SDValue();
5897}
5898
5899// During call lowering we wrap the return values in a ProxyReg node which
5900// depends on the chain value produced by the completed call. This ensures that
5901// the full call is emitted in cases where libcalls are used to legalize
5902// operations. To improve the effectiveness of other DAG combines, we pull all
5903// operations we can through one of these nodes, ensuring that the ProxyReg
5904// directly wraps a load. That is:
5905//
5906// (ProxyReg (zext (load retval0))) => (zext (ProxyReg (load retval0)))
5907//
5910 switch (R.getOpcode()) {
5911 case ISD::TRUNCATE:
5912 case ISD::ANY_EXTEND:
5913 case ISD::SIGN_EXTEND:
5914 case ISD::ZERO_EXTEND:
5915 case ISD::BITCAST: {
5916 if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI))
5917 return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), V);
5918 return SDValue();
5919 }
5920 case ISD::SHL:
5921 case ISD::SRL:
5922 case ISD::SRA:
5923 case ISD::OR: {
5924 if (SDValue A = sinkProxyReg(R.getOperand(0), Chain, DCI))
5925 if (SDValue B = sinkProxyReg(R.getOperand(1), Chain, DCI))
5926 return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), A, B);
5927 return SDValue();
5928 }
5929 case ISD::Constant:
5930 return R;
5931 case ISD::LOAD:
5932 case NVPTXISD::LoadV2:
5933 case NVPTXISD::LoadV4: {
5934 return DCI.DAG.getNode(NVPTXISD::ProxyReg, SDLoc(R), R.getValueType(),
5935 {Chain, R});
5936 }
5937 case ISD::BUILD_VECTOR: {
5938 if (DCI.isBeforeLegalize())
5939 return SDValue();
5940
5942 for (auto &Op : R->ops()) {
5943 SDValue V = sinkProxyReg(Op, Chain, DCI);
5944 if (!V)
5945 return SDValue();
5946 Ops.push_back(V);
5947 }
5948 return DCI.DAG.getNode(ISD::BUILD_VECTOR, SDLoc(R), R.getValueType(), Ops);
5949 }
5951 if (DCI.isBeforeLegalize())
5952 return SDValue();
5953
5954 if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI))
5956 R.getValueType(), V, R.getOperand(1));
5957 return SDValue();
5958 }
5959 default:
5960 return SDValue();
5961 }
5962}
5963
5966
5967 SDValue Chain = N->getOperand(0);
5968 SDValue Reg = N->getOperand(1);
5969
5970 // If the ProxyReg is not wrapping a load, try to pull the operations through
5971 // the ProxyReg.
5972 if (Reg.getOpcode() != ISD::LOAD) {
5973 if (SDValue V = sinkProxyReg(Reg, Chain, DCI))
5974 return V;
5975 }
5976
5977 return SDValue();
5978}
5979
5980SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
5981 DAGCombinerInfo &DCI) const {
5983 switch (N->getOpcode()) {
5984 default:
5985 break;
5986 case ISD::ADD:
5987 return PerformADDCombine(N, DCI, OptLevel);
5988 case ISD::ADDRSPACECAST:
5989 return combineADDRSPACECAST(N, DCI);
5990 case ISD::SIGN_EXTEND:
5991 case ISD::ZERO_EXTEND:
5992 return combineMulWide(N, DCI, OptLevel);
5993 case ISD::BUILD_VECTOR:
5994 return PerformBUILD_VECTORCombine(N, DCI);
5996 return PerformEXTRACTCombine(N, DCI);
5997 case ISD::FADD:
5998 return PerformFADDCombine(N, DCI, OptLevel);
5999 case ISD::LOAD:
6000 case NVPTXISD::LoadV2:
6001 case NVPTXISD::LoadV4:
6002 return combineLOAD(N, DCI, STI);
6003 case ISD::MUL:
6004 return PerformMULCombine(N, DCI, OptLevel);
6005 case NVPTXISD::PRMT:
6006 return combinePRMT(N, DCI, OptLevel);
6007 case NVPTXISD::ProxyReg:
6008 return combineProxyReg(N, DCI);
6009 case ISD::SETCC:
6010 return PerformSETCCCombine(N, DCI, STI.getSmVersion());
6011 case ISD::SHL:
6012 return PerformSHLCombine(N, DCI, OptLevel);
6013 case ISD::SREM:
6014 case ISD::UREM:
6015 return PerformREMCombine(N, DCI, OptLevel);
6016 case ISD::STORE:
6017 case NVPTXISD::StoreV2:
6018 case NVPTXISD::StoreV4:
6019 return combineSTORE(N, DCI, STI);
6020 case ISD::VSELECT:
6021 return PerformVSELECTCombine(N, DCI);
6022 }
6023 return SDValue();
6024}
6025
6028 // Handle bitcasting to v2i8 without hitting the default promotion
6029 // strategy which goes through stack memory.
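  // For example (illustrative), (v2i8 (bitcast (i16 %x))) is expanded to
  // (build_vector (trunc i8 %x), (trunc i8 (srl %x, 8))).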
6030 SDValue Op(Node, 0);
6031 EVT ToVT = Op->getValueType(0);
6032 if (ToVT != MVT::v2i8) {
6033 return;
6034 }
6035
6036 // Bitcast to i16 and unpack elements into a vector
6037 SDLoc DL(Node);
6038 SDValue AsInt = DAG.getBitcast(MVT::i16, Op->getOperand(0));
6039 SDValue Vec0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, AsInt);
6040 SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
6041 SDValue Vec1 =
6042 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
6043 DAG.getNode(ISD::SRL, DL, MVT::i16, {AsInt, Const8}));
6044 Results.push_back(
6045 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i8, {Vec0, Vec1}));
6046}
6047
6048// Lower vector return type of tcgen05.ld intrinsics
6051 bool hasOffset = false) {
6052 SDLoc DL(N);
6053 EVT ResVT = N->getValueType(0);
6054 if (!ResVT.isVector())
6055 return; // already legalized.
6056
6057 const unsigned NumElts = ResVT.getVectorNumElements();
6058
6059 // Create the return type of the instructions
6060 SmallVector<EVT, 5> ListVTs;
6061 for (unsigned i = 0; i < NumElts; ++i)
6062 ListVTs.push_back(MVT::i32);
6063
6064 ListVTs.push_back(N->getValueType(1)); // Chain
6065
6066 SDVTList ResVTs = DAG.getVTList(ListVTs);
6067
6068 SmallVector<SDValue, 8> Ops{N->getOperand(0), N->getOperand(1),
6069 N->getOperand(2)};
6070
6071 if (hasOffset) {
6072 Ops.push_back(N->getOperand(3)); // offset
6073 Ops.push_back(N->getOperand(4)); // Pack flag
6074 } else
6075 Ops.push_back(N->getOperand(3)); // Pack flag
6076
6077 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
6078 SDValue NewNode =
6080 MemSD->getMemoryVT(), MemSD->getMemOperand());
6081
6082 // split the vector result
6083 SmallVector<SDValue, 4> ScalarRes;
6084 for (unsigned i = 0; i < NumElts; ++i) {
6085 SDValue Res = NewNode.getValue(i);
6086 ScalarRes.push_back(Res);
6087 }
6088
6089 SDValue Chain = NewNode.getValue(NumElts);
6090 SDValue BuildVector = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
6091 Results.push_back(BuildVector); // Build Vector
6092 Results.push_back(Chain); // Chain
6093}
6094
6097 SDValue Chain = N->getOperand(0);
6098 SDValue Intrin = N->getOperand(1);
6099 SDLoc DL(N);
6100
6101 // Get the intrinsic ID
6102 unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
6103 switch (IntrinNo) {
6104 default:
6105 return;
6106 case Intrinsic::nvvm_ldu_global_i:
6107 case Intrinsic::nvvm_ldu_global_f:
6108 case Intrinsic::nvvm_ldu_global_p: {
6109 EVT ResVT = N->getValueType(0);
6110
6111 if (ResVT.isVector()) {
6112 // Vector LDG/LDU
6113
6114 unsigned NumElts = ResVT.getVectorNumElements();
6115 EVT EltVT = ResVT.getVectorElementType();
6116
6117 // Since LDU/LDG are target nodes, we cannot rely on DAG type
6118 // legalization.
6119 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
6120 // loaded type to i16 and propagate the "real" type as the memory type.
6121 bool NeedTrunc = false;
6122 if (EltVT.getSizeInBits() < 16) {
6123 EltVT = MVT::i16;
6124 NeedTrunc = true;
6125 }
6126
6127 unsigned Opcode = 0;
6128 SDVTList LdResVTs;
6129
6130 switch (NumElts) {
6131 default:
6132 return;
6133 case 2:
6134 Opcode = NVPTXISD::LDUV2;
6135 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
6136 break;
6137 case 4: {
6138 Opcode = NVPTXISD::LDUV4;
6139 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
6140 LdResVTs = DAG.getVTList(ListVTs);
6141 break;
6142 }
6143 }
6144
6145 SmallVector<SDValue, 8> OtherOps;
6146
6147 // Copy regular operands
6148
6149 OtherOps.push_back(Chain); // Chain
6150 // Skip operand 1 (intrinsic ID)
6151 // Others
6152 OtherOps.append(N->op_begin() + 2, N->op_end());
6153
6154 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
6155
6156 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
6157 MemSD->getMemoryVT(),
6158 MemSD->getMemOperand());
6159
6160 SmallVector<SDValue, 4> ScalarRes;
6161
6162 for (unsigned i = 0; i < NumElts; ++i) {
6163 SDValue Res = NewLD.getValue(i);
6164 if (NeedTrunc)
6165 Res =
6166 DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
6167 ScalarRes.push_back(Res);
6168 }
6169
6170 SDValue LoadChain = NewLD.getValue(NumElts);
6171
6172 SDValue BuildVec =
6173 DAG.getBuildVector(ResVT, DL, ScalarRes);
6174
6175 Results.push_back(BuildVec);
6176 Results.push_back(LoadChain);
6177 } else {
6178 // i8 LDG/LDU
6179 assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
6180 "Custom handling of non-i8 ldu/ldg?");
6181
6182 // Just copy all operands as-is
6183 SmallVector<SDValue, 4> Ops(N->ops());
6184
6185 // Force output to i16
6186 SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
6187
6188 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
6189
6190 // We make sure the memory type is i8, which will be used during isel
6191 // to select the proper instruction.
6192 SDValue NewLD =
6193 DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
6194 MVT::i8, MemSD->getMemOperand());
6195
6196 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
6197 NewLD.getValue(0)));
6198 Results.push_back(NewLD.getValue(1));
6199 }
6200 return;
6201 }
6202
6203 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
6204 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
6205 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
6206 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
6207 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
6208 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
6209 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
6210 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
6211 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
6212 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
6213 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
6214 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
6215 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
6216 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
6217 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
6218 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
6219 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
6220 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
6221 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
6222 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
6223 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
6224 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
6225 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
6226 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
6227 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
6228 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
6229 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
6230 return ReplaceTcgen05Ld(N, DAG, Results);
6231
6232 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
6233 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
6234 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
6235 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
6236 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
6237 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
6238 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
6239 return ReplaceTcgen05Ld(N, DAG, Results, /* Offset */ true);
6240 }
6241}
6242
6245 // Change the CopyFromReg to output 2 64-bit results instead of a 128-bit
6246 // result so that it can pass the legalization
6247 SDLoc DL(N);
6248 SDValue Chain = N->getOperand(0);
6249 SDValue Reg = N->getOperand(1);
6250 SDValue Glue = N->getOperand(2);
6251
6252 assert(Reg.getValueType() == MVT::i128 &&
6253 "Custom lowering for CopyFromReg with 128-bit reg only");
6254 SmallVector<EVT, 4> ResultsType = {MVT::i64, MVT::i64, N->getValueType(1),
6255 N->getValueType(2)};
6256 SmallVector<SDValue, 3> NewOps = {Chain, Reg, Glue};
6257
6258 SDValue NewValue = DAG.getNode(ISD::CopyFromReg, DL, ResultsType, NewOps);
6259 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
6260 {NewValue.getValue(0), NewValue.getValue(1)});
6261
6262 Results.push_back(Pair);
6263 Results.push_back(NewValue.getValue(2));
6264 Results.push_back(NewValue.getValue(3));
6265}
6266
6268 const TargetLowering &TLI,
6270 SDValue Chain = N->getOperand(0);
6271 SDValue Reg = N->getOperand(1);
6272
6273 MVT VT = TLI.getRegisterType(*DAG.getContext(), Reg.getValueType());
6274
6275 SDValue NewReg = DAG.getAnyExtOrTrunc(Reg, SDLoc(N), VT);
6276 SDValue NewProxy =
6277 DAG.getNode(NVPTXISD::ProxyReg, SDLoc(N), VT, {Chain, NewReg});
6278 SDValue Res = DAG.getAnyExtOrTrunc(NewProxy, SDLoc(N), N->getValueType(0));
6279
6280 Results.push_back(Res);
6281}
6282
6284 const NVPTXSubtarget &STI,
6286 assert(N->getValueType(0) == MVT::i128 &&
6287 "Custom lowering for atomic128 only supports i128");
6288
6289 AtomicSDNode *AN = cast<AtomicSDNode>(N);
6290 SDLoc dl(N);
6291
6292 if (!STI.hasAtomSwap128()) {
6295 "Support for b128 atomics introduced in PTX ISA version 8.3 and "
6296 "requires target sm_90.",
6297 dl.getDebugLoc()));
6298
6299 Results.push_back(DAG.getUNDEF(MVT::i128));
6300 Results.push_back(AN->getOperand(0)); // Chain
6301 return;
6302 }
6303
6305 Ops.push_back(AN->getOperand(0)); // Chain
6306 Ops.push_back(AN->getOperand(1)); // Ptr
6307 for (const auto &Op : AN->ops().drop_front(2)) {
6308 // Low part
6309 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op,
6310 DAG.getIntPtrConstant(0, dl)));
6311 // High part
6312 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op,
6313 DAG.getIntPtrConstant(1, dl)));
6314 }
6315 unsigned Opcode = N->getOpcode() == ISD::ATOMIC_SWAP
6318 SDVTList Tys = DAG.getVTList(MVT::i64, MVT::i64, MVT::Other);
6319 SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, MVT::i128,
6320 AN->getMemOperand());
6321 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i128,
6322 {Result.getValue(0), Result.getValue(1)}));
6323 Results.push_back(Result.getValue(2));
6324}
6325
6326void NVPTXTargetLowering::ReplaceNodeResults(
6328 switch (N->getOpcode()) {
6329 default:
6330 report_fatal_error("Unhandled custom legalization");
6331 case ISD::BITCAST:
6332 ReplaceBITCAST(N, DAG, Results);
6333 return;
6334 case ISD::LOAD:
6335 replaceLoadVector(N, DAG, Results, STI);
6336 return;
6339 return;
6340 case ISD::CopyFromReg:
6342 return;
6343 case NVPTXISD::ProxyReg:
6344 replaceProxyReg(N, DAG, *this, Results);
6345 return;
6347 case ISD::ATOMIC_SWAP:
6348 replaceAtomicSwap128(N, DAG, STI, Results);
6349 return;
6350 }
6351}
6352
6355 Type *Ty = AI->getValOperand()->getType();
6356
6357 if (AI->isFloatingPointOperation()) {
6359 if (Ty->isHalfTy() && STI.getSmVersion() >= 70 &&
6360 STI.getPTXVersion() >= 63)
6362 if (Ty->isBFloatTy() && STI.getSmVersion() >= 90 &&
6363 STI.getPTXVersion() >= 78)
6365 if (Ty->isFloatTy())
6367 if (Ty->isDoubleTy() && STI.hasAtomAddF64())
6369 }
6371 }
6372
6373 assert(Ty->isIntegerTy() && "Ty should be integer at this point");
6374 const unsigned BitWidth = cast<IntegerType>(Ty)->getBitWidth();
6375
6376 switch (AI->getOperation()) {
6377 default:
6380 if (BitWidth == 128)
6386 switch (BitWidth) {
6387 case 8:
6388 case 16:
6390 case 32:
6392 case 64:
6393 if (STI.hasAtomBitwise64())
6396 case 128:
6398 default:
6399 llvm_unreachable("unsupported width encountered");
6400 }
6407 switch (BitWidth) {
6408 case 8:
6409 case 16:
6411 case 32:
6413 case 64:
6414 if (STI.hasAtomMinMax64())
6417 case 128:
6419 default:
6420 llvm_unreachable("unsupported width encountered");
6421 }
6424 switch (BitWidth) {
6425 case 32:
6427 case 8:
6428 case 16:
6429 case 64:
6430 case 128:
6432 default:
6433 llvm_unreachable("unsupported width encountered");
6434 }
6435 }
6436
6438}
6439
6441 const Instruction *I) const {
6442 auto *CI = dyn_cast<AtomicCmpXchgInst>(I);
6443 // When CAS bitwidth is not supported on the hardware, the CAS is emulated
6444 // using a retry loop that uses a higher-bitwidth monotonic CAS. We enforce
6445 // the memory order using explicit fences around the retry loop.
6446 // The memory order of natively supported CAS operations can be enforced
6447 // by lowering to an atom.cas with the right memory synchronizing effect.
6448 // However, atom.cas only supports relaxed, acquire, release and acq_rel.
6449 // So we also use explicit fences for enforcing memory order for
6450 // seq_cst CAS with natively-supported bitwidths.
6451 return CI &&
6452 (cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() <
6454 CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent);
6455}
6456
6458 const Instruction *I) const {
6459 auto *CI = dyn_cast<AtomicCmpXchgInst>(I);
6460 bool BitwidthSupportedAndIsSeqCst =
6461 CI && CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent &&
6462 cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() >=
6464 return BitwidthSupportedAndIsSeqCst ? AtomicOrdering::Acquire
6466}
6467
6469 Instruction *Inst,
6470 AtomicOrdering Ord) const {
6471 if (!isa<AtomicCmpXchgInst>(Inst))
6472 return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);
6473
6474 // Specialize for cmpxchg
6475 // Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated
6476 SyncScope::ID SSID = cast<AtomicCmpXchgInst>(Inst)->getSyncScopeID();
6477 if (isReleaseOrStronger(Ord))
6479 ? Ord
6481 SSID);
6482
6483 return nullptr;
6484}
6485
6487 Instruction *Inst,
6488 AtomicOrdering Ord) const {
6489 // Specialize for cmpxchg
6490 if (!isa<AtomicCmpXchgInst>(Inst))
6491 return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);
6492
6493 auto *CI = cast<AtomicCmpXchgInst>(Inst);
6494 auto CASWidth =
6495 cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth();
6496 SyncScope::ID SSID = CI->getSyncScopeID();
6497 // Do not emit a trailing fence for cmpxchg seq_cst which are not emulated
6498 if (isAcquireOrStronger(Ord) &&
6500 CASWidth < STI.getMinCmpXchgSizeInBits()))
6501 return Builder.CreateFence(AtomicOrdering::Acquire, SSID);
6502
6503 return nullptr;
6504}
6505
6506// Rather than default to SINT when both UINT and SINT are custom, we only
6507// change the opcode when UINT is not legal and SINT is. UINT is preferred when
6508// both are custom since unsigned CVT instructions can lead to slightly better
6509// SASS code with fewer instructions.
6511 EVT ToVT) const {
6512 if (isOperationLegal(Op, ToVT))
6513 return Op;
6514 switch (Op) {
6515 case ISD::FP_TO_UINT:
6517 return ISD::FP_TO_SINT;
6518 break;
6522 break;
6523 case ISD::VP_FP_TO_UINT:
6524 if (isOperationLegal(ISD::VP_FP_TO_SINT, ToVT))
6525 return ISD::VP_FP_TO_SINT;
6526 break;
6527 default:
6528 break;
6529 }
6530 return Op;
6531}
6532
6533// Pin NVPTXTargetObjectFile's vtables to this file.
6535
6537 const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
6538 return getDataSection();
6539}
6540
6542 const SelectionDAG &DAG, unsigned Depth) {
6543 SDValue A = Op.getOperand(0);
6544 SDValue B = Op.getOperand(1);
6545 ConstantSDNode *Selector = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6546 unsigned Mode = Op.getConstantOperandVal(3);
6547
6548 if (!Selector)
6549 return;
6550
6551 KnownBits AKnown = DAG.computeKnownBits(A, Depth);
6552 KnownBits BKnown = DAG.computeKnownBits(B, Depth);
6553
6554 // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
6555 assert(AKnown.getBitWidth() == 32 && BKnown.getBitWidth() == 32 &&
6556 "PRMT must have i32 operands");
6557 assert(Known.getBitWidth() == 32 && "PRMT must have i32 result");
6558 KnownBits BitField = BKnown.concat(AKnown);
6559
6560 APInt SelectorVal = getPRMTSelector(Selector->getAPIntValue(), Mode);
6561 for (unsigned I : llvm::seq(4)) {
6562 APInt Sel = SelectorVal.extractBits(4, I * 4);
6563 unsigned Idx = Sel.getLoBits(3).getZExtValue();
6564 unsigned Sign = Sel.getHiBits(1).getZExtValue();
6565 KnownBits Byte = BitField.extractBits(8, Idx * 8);
6566 if (Sign)
6567 Byte = KnownBits::ashr(Byte, 8);
6568 Known.insertBits(Byte, I * 8);
6569 }
6570}
6571
6572static void computeKnownBitsForLoadV(const SDValue Op, KnownBits &Known) {
6573 MemSDNode *LD = cast<MemSDNode>(Op);
6574
6575 // We can't do anything without knowing the sign bit.
6576 auto ExtType = LD->getConstantOperandVal(LD->getNumOperands() - 1);
6577 if (ExtType == ISD::SEXTLOAD)
6578 return;
6579
6580 // ExtLoading to vector types is weird and may not work well with known bits.
6581 auto DestVT = LD->getValueType(0);
6582 if (DestVT.isVector())
6583 return;
6584
6585 assert(Known.getBitWidth() == DestVT.getSizeInBits());
6586 auto ElementBitWidth = NVPTXDAGToDAGISel::getFromTypeWidthForLoad(LD);
6587 Known.Zero.setHighBits(Known.getBitWidth() - ElementBitWidth);
6588}
6589
6591 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
6592 const SelectionDAG &DAG, unsigned Depth) const {
6593 Known.resetAll();
6594
6595 switch (Op.getOpcode()) {
6596 case NVPTXISD::PRMT:
6597 computeKnownBitsForPRMT(Op, Known, DAG, Depth);
6598 break;
6599 case NVPTXISD::LoadV2:
6600 case NVPTXISD::LoadV4:
6601 case NVPTXISD::LoadV8:
6603 break;
6604 default:
6605 break;
6606 }
6607}
6608
6609static std::pair<APInt, APInt> getPRMTDemandedBits(const APInt &SelectorVal,
6610 const APInt &DemandedBits) {
6611 APInt DemandedLHS = APInt(32, 0);
6612 APInt DemandedRHS = APInt(32, 0);
6613
6614 for (unsigned I : llvm::seq(4)) {
6615 if (DemandedBits.extractBits(8, I * 8).isZero())
6616 continue;
6617
6618 APInt Sel = SelectorVal.extractBits(4, I * 4);
6619 unsigned Idx = Sel.getLoBits(3).getZExtValue();
6620 unsigned Sign = Sel.getHiBits(1).getZExtValue();
6621
6622 APInt &Src = Idx < 4 ? DemandedLHS : DemandedRHS;
6623 unsigned ByteStart = (Idx % 4) * 8;
6624 if (Sign)
6625 Src.setBit(ByteStart + 7);
6626 else
6627 Src.setBits(ByteStart, ByteStart + 8);
6628 }
6629
6630 return {DemandedLHS, DemandedRHS};
6631}
6632
6633// Replace undef with 0 as this is easier for other optimizations such as
6634// known bits.
6636 if (!Op)
6637 return SDValue();
6638 if (Op.isUndef())
6639 return DAG.getConstant(0, SDLoc(), MVT::i32);
6640 return Op;
6641}
6642
6644 const APInt &DemandedBits,
6645 SelectionDAG &DAG,
6646 const TargetLowering &TLI,
6647 unsigned Depth) {
6648 assert(PRMT.getOpcode() == NVPTXISD::PRMT);
6649 SDValue Op0 = PRMT.getOperand(0);
6650 SDValue Op1 = PRMT.getOperand(1);
6651 auto *SelectorConst = dyn_cast<ConstantSDNode>(PRMT.getOperand(2));
6652 if (!SelectorConst)
6653 return SDValue();
6654
6655 unsigned Mode = PRMT.getConstantOperandVal(3);
6656 const APInt Selector = getPRMTSelector(SelectorConst->getAPIntValue(), Mode);
6657
6658 // Try to simplify the PRMT to one of the inputs if the used bytes are all
6659 // from the same input in the correct order.
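  // For example, if only the low 16 bits are demanded and the normalized
  // selector's low byte is 0x10, the PRMT reproduces the two low bytes of its
  // first operand and can be replaced by that operand.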
6660 const unsigned LeadingBytes = DemandedBits.countLeadingZeros() / 8;
6661 const unsigned SelBits = (4 - LeadingBytes) * 4;
6662 if (Selector.getLoBits(SelBits) == APInt(32, 0x3210).getLoBits(SelBits))
6663 return Op0;
6664 if (Selector.getLoBits(SelBits) == APInt(32, 0x7654).getLoBits(SelBits))
6665 return Op1;
6666
6667 auto [DemandedLHS, DemandedRHS] = getPRMTDemandedBits(Selector, DemandedBits);
6668
6669 // Attempt to avoid multi-use ops if we don't need anything from them.
6670 SDValue DemandedOp0 =
6671 TLI.SimplifyMultipleUseDemandedBits(Op0, DemandedLHS, DAG, Depth + 1);
6672 SDValue DemandedOp1 =
6673 TLI.SimplifyMultipleUseDemandedBits(Op1, DemandedRHS, DAG, Depth + 1);
6674
6675 DemandedOp0 = canonicalizePRMTInput(DemandedOp0, DAG);
6676 DemandedOp1 = canonicalizePRMTInput(DemandedOp1, DAG);
6677 if ((DemandedOp0 && DemandedOp0 != Op0) ||
6678 (DemandedOp1 && DemandedOp1 != Op1)) {
6679 Op0 = DemandedOp0 ? DemandedOp0 : Op0;
6680 Op1 = DemandedOp1 ? DemandedOp1 : Op1;
6681 return getPRMT(Op0, Op1, Selector.getZExtValue(), SDLoc(PRMT), DAG);
6682 }
6683
6684 return SDValue();
6685}
6686
6688 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
6689 KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const {
6690 Known.resetAll();
6691
6692 switch (Op.getOpcode()) {
6693 case NVPTXISD::PRMT:
6695 *this, Depth)) {
6696 TLO.CombineTo(Op, Result);
6697 return true;
6698 }
6699 break;
6700 default:
6701 break;
6702 }
6703
6704 computeKnownBitsForTargetNode(Op, Known, DemandedElts, TLO.DAG, Depth);
6705 return false;
6706}
ComputePTXValueVTs - For the given Type Ty, returns the set of primitive legal-ish MVTs that compose ...
static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static void replaceAtomicSwap128(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI, SmallVectorImpl< SDValue > &Results)
static SDValue lowerSTOREVector(SDValue Op, SelectionDAG &DAG, const NVPTXSubtarget &STI)
static SDValue lowerLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI)
static void replaceProxyReg(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI, SmallVectorImpl< SDValue > &Results)
static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static SDValue lowerCTLZCTPOP(SDValue Op, SelectionDAG &DAG)
static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue combinePackingMovIntoStore(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned Front, unsigned Back)
Fold packing movs into a store.
static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static SDValue getBuildVectorizedValue(unsigned N, const SDLoc &dl, SelectionDAG &DAG, T GetElement)
static SDValue getExtractVectorizedValue(SDValue V, unsigned I, EVT VT, const SDLoc &dl, SelectionDAG &DAG)
static unsigned canMergeParamLoadStoresStartingAt(unsigned Idx, uint32_t AccessSize, const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< T > &Offsets, Align ParamAlignment)
static EVT getVectorizedVT(EVT VT, unsigned N, LLVMContext &C)
static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG)
static SDValue combineMulWide(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static bool isConstZero(const SDValue &Operand)
static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG)
static bool IsMulWideOperandDemotable(SDValue Op, unsigned OptSize, OperandSignedness &S)
IsMulWideOperandDemotable - Checks if the provided DAG node is an operand that can be demoted to OptS...
static std::pair< APInt, APInt > getPRMTDemandedBits(const APInt &SelectorVal, const APInt &DemandedBits)
static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode)
static ISD::NodeType getScalarOpcodeForReduction(unsigned ReductionOpcode)
static SDValue PerformREMCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI)
static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known, const SelectionDAG &DAG, unsigned Depth)
static SDValue combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Fold unpacking movs into a load by increasing the number of return values.
static SDValue LowerClusterLaunchControlQueryCancel(SDValue Op, SelectionDAG &DAG)
static std::optional< NVPTXISD::NodeType > getScalar3OpcodeForReduction(unsigned ReductionOpcode)
Get 3-input scalar reduction opcode.
static std::optional< std::pair< SDValue, SDValue > > replaceLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI)
replaceLoadVector - Convert vector loads into multi-output scalar loads.
static SDValue expandFSH64(SDValue A, SDValue B, SDValue ShiftAmount, SDLoc DL, unsigned Opcode, SelectionDAG &DAG)
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, unsigned OptSize, bool &IsSigned)
AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can be demoted to OptSize bits...
static SDValue TryMULWIDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply of M/2 bits that produces...
static SDValue lowerPrmtIntrinsic(SDValue Op, SelectionDAG &DAG)
static SDValue LowerTcgen05St(SDValue Op, SelectionDAG &DAG)
static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static SDValue buildTreeReduction(const SmallVector< SDValue > &Elements, EVT EltTy, ArrayRef< std::pair< unsigned, unsigned > > Ops, const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG)
Reduces the elements using the scalar operations provided.
static SDValue combineProxyReg(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SmallVector< unsigned, 16 > VectorizePTXValueVTs(const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< T > &Offsets, Align ParamAlignment, bool IsVAArg=false)
static SDValue getPRMT(SDValue A, SDValue B, SDValue Selector, SDLoc DL, SelectionDAG &DAG, unsigned Mode=NVPTX::PTXPrmtMode::NONE)
static SDValue matchMADConstOnePattern(SDValue Add)
static SDValue LowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG)
static SDValue correctParamType(SDValue V, EVT ExpectedVT, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, SDLoc dl)
static ISD::NodeType getExtOpcode(const ISD::ArgFlagsTy &Flags)
static cl::opt< bool > UsePrecSqrtF32("nvptx-prec-sqrtf32", cl::Hidden, cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), cl::init(true))
static void computeKnownBitsForLoadV(const SDValue Op, KnownBits &Known)
static APInt getPRMTSelector(const APInt &Selector, unsigned Mode)
static EVT promoteScalarIntegerPTX(const EVT VT)
PromoteScalarIntegerPTX Used to make sure the arguments/returns are suitable for passing and promote ...
static SDValue simplifyDemandedBitsForPRMT(SDValue PRMT, const APInt &DemandedBits, SelectionDAG &DAG, const TargetLowering &TLI, unsigned Depth)
static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG)
static SDValue canonicalizePRMTInput(SDValue Op, SelectionDAG &DAG)
static SDValue sinkProxyReg(SDValue R, SDValue Chain, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerFSH(SDValue Op, SelectionDAG &DAG)
static SDValue PromoteBinOpToF32(SDNode *N, SelectionDAG &DAG)
OperandSignedness
static SDValue PerformSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned int SmVersion)
static std::optional< std::pair< unsigned int, MVT > > getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI, unsigned AddressSpace)
static cl::opt< bool > ForceMinByValParamAlign("nvptx-force-min-byval-param-align", cl::Hidden, cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval" " params of device functions."), cl::init(false))
static cl::opt< bool > UseApproxLog2F32("nvptx-approx-log2f32", cl::desc("NVPTX Specific: whether to use lg2.approx for log2"), cl::init(false))
Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it does NOT use lg2....
static SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG)
static SDValue combineLOAD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const NVPTXSubtarget &STI)
static SDValue combineSTORE(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const NVPTXSubtarget &STI)
static SDValue PerformSHLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
MachineInstr unsigned OpIdx
uint64_t High
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
static bool Enabled
Definition: Statistic.cpp:46
This file describes how to lower LLVM code to machine code.
Value * RHS
Value * LHS
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:1098
Class for arbitrary precision integers.
Definition: APInt.h:78
LLVM_ABI APInt getLoBits(unsigned numBits) const
Compute an APInt containing numBits lowbits from this APInt.
Definition: APInt.cpp:644
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1540
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1391
LLVM_ABI APInt getHiBits(unsigned numBits) const
Compute an APInt containing numBits highbits from this APInt.
Definition: APInt.cpp:639
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition: APInt.cpp:936
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1488
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition: APInt.h:435
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition: APInt.h:1130
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition: APInt.cpp:482
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition: APInt.h:432
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1237
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
const T & back() const
back - Get the last element.
Definition: ArrayRef.h:156
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:206
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:191
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:709
@ Add
*p = old + v
Definition: Instructions.h:725
@ FAdd
*p = old + v
Definition: Instructions.h:746
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:739
@ Or
*p = old | v
Definition: Instructions.h:733
@ Sub
*p = old - v
Definition: Instructions.h:727
@ And
*p = old & v
Definition: Instructions.h:729
@ Xor
*p = old ^ v
Definition: Instructions.h:735
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:769
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:737
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:743
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:741
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:773
bool isFloatingPointOperation() const
Definition: Instructions.h:898
BinOp getOperation() const
Definition: Instructions.h:819
Value * getValOperand()
Definition: Instructions.h:890
This is an SDNode representing atomic operations.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1116
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1348
FunctionType * getFunctionType() const
Definition: InstrTypes.h:1205
This class represents a function call, abstracting a target machine's calling convention.
const APInt & getAPIntValue() const
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:846
Diagnostic information for unsupported feature in backend.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition: Function.cpp:637
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:114
FenceInst * CreateFence(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System, const Twine &Name="")
Definition: IRBuilder.h:1891
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
MCSection * getDataSection() const
Instances of this class represent a uniqued identifier for a section in the current translation unit.
Definition: MCSection.h:496
StringRef getName() const
getName - Get the symbol name.
Definition: MCSymbol.h:188
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
static auto fp_fixedlen_vector_valuetypes()
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineJumpTableInfo * getJumpTableInfo() const
getJumpTableInfo - Return the jump table info object for the current function.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
const std::vector< MachineJumpTableEntry > & getJumpTables() const
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
EVT getMemoryVT() const
Return the type of the in-memory value.
static unsigned getFromTypeWidthForLoad(const MemSDNode *Mem)
unsigned getMaxRequiredAlignment() const
bool hasAtomMinMax64() const
bool hasAtomAddF64() const
bool hasHWROT32() const
const NVPTXTargetLowering * getTargetLowering() const override
unsigned getMinCmpXchgSizeInBits() const
unsigned getPTXVersion() const
bool hasNativeBF16Support(int Opcode) const
const NVPTXRegisterInfo * getRegisterInfo() const override
unsigned int getSmVersion() const
bool hasAtomBitwise64() const
bool hasBF16Math() const
bool hasAtomSwap128() const
bool hasF32x2Instructions() const
bool allowFP16Math() const
bool has256BitVectorLoadStore(unsigned AS) const
AtomicOrdering atomicOperationOrderAfterFenceSplit(const Instruction *I) const override
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
const NVPTXTargetMachine * nvTM
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
NVPTXTargetLowering(const NVPTXTargetMachine &TM, const NVPTXSubtarget &STI)
std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &, const SmallVectorImpl< ISD::OutputArg > &, std::optional< unsigned > FirstVAArg, const CallBase &CB, unsigned UniqueCallSite) const
unsigned getPreferredFPToIntOpcode(unsigned Op, EVT FromVT, EVT ToVT) const override
bool useF32FTZ(const MachineFunction &MF) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
Align getFunctionArgumentAlignment(const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps, bool &UseOneConst, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &dl, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const
Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
std::string getParamName(const Function *F, int Idx) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
NVPTX::DivPrecisionLevel getDivF32Level(const MachineFunction &MF, const SDNode &N) const
bool shouldInsertFencesForAtomic(const Instruction *) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
Align getFunctionParamOptimizedAlign(const Function *F, Type *ArgTy, const DataLayout &DL) const
getFunctionParamOptimizedAlign - since function arguments are passed via .param space,...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, EVT VT) const override
Return the ValueType of the result of SETCC operations.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
Inserts in the IR a target-specific intrinsic specifying a fence.
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Align getFunctionByValParamAlign(const Function *F, Type *ArgTy, Align InitialAlign, const DataLayout &DL) const
Helper for computing alignment of a device function byval parameter.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const
bool usePrecSqrtF32(const SDNode *N=nullptr) const
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
UniqueStringSaver & getStrPool() const
MCSection * SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const override
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
bool hasOneUse() const
Return true if there is exactly one use of this node.
unsigned getIROrder() const
Return the node ordering.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
const SDValue & getOperand(unsigned i) const
unsigned getOpcode() const
SectionKind - This is a simple POD value that classifies the properties of a section.
Definition: SectionKind.h:22
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:229
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:578
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:500
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getSymbolFunctionGlobalAddress(SDValue Op, Function **TargetFunction=nullptr)
Return a GlobalAddress of the function from the current module with name matching the given ExternalS...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI Align getEVTAlign(EVT MemoryVT) const
Compute the default alignment value for the given type.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:504
LLVM_ABI SDNode * MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef< SDValue > Ops)
This mutates the specified node to have the specified return type, opcode, and operands.
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:868
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:498
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI SDValue getBasicBlock(MachineBasicBlock *MBB)
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:707
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:493
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:511
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:587
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
ArrayRef< int > getMask() const
bool empty() const
Definition: SmallVector.h:82
size_t size() const
Definition: SmallVector.h:79
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:574
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:684
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
This class is used to represent ISD::STORE nodes.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:55
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:154
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition: StringRef.h:148
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
Align getMinStackArgumentAlignment() const
Return the minimum stack alignment of an argument.
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
std::vector< ArgListEntry > ArgListTy
virtual Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const
virtual Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const
Inserts in the IR a target-specific intrinsic specifying a fence.
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
SDValue expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const
Expand round(fp) to fp conversion.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:83
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
TargetOptions Options
MCSymbol * getSymbol(const GlobalValue *GV) const
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetFrameLowering * getFrameLowering() const
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:240
StringRef save(const char *S)
Definition: StringSaver.h:53
LLVM Value Representation.
Definition: Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
int getNumOccurrences() const
Definition: CommandLine.h:400
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:662
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt pow(const APInt &X, int64_t N)
Compute X^N for N>=0.
Definition: APInt.cpp:3155
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:801
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition: ISDOpcodes.h:1236
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1232
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:774
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:270
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:765
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1265
@ ConstantFP
Definition: ISDOpcodes.h:87
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:289
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:259
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1141
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:215
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:571
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
Definition: ISDOpcodes.h:1476
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:410
@ VECREDUCE_FMAXIMUM
FMINIMUM/FMAXIMUM nodes propatate NaNs and signed zeroes using the llvm.minimum and llvm....
Definition: ISDOpcodes.h:1480
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:738
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:275
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:975
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:249
@ FrameIndex
Definition: ISDOpcodes.h:90
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:826
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readfixedcounter intrinsic.
Definition: ISDOpcodes.h:1298
@ VECREDUCE_FMIN
Definition: ISDOpcodes.h:1477
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:1002
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1187
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:347
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1162
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1166
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:778
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:242
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1261
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:343
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:601
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1075
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:832
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1321
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:793
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1358
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
Definition: ISDOpcodes.h:1059
@ SSHLSAT
RESULT = [US]SHLSAT(LHS, RHS) - Perform saturation left shift.
Definition: ISDOpcodes.h:379
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:351
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1151
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:718
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:960
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:323
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:994
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:471
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:470
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1081
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1374
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:908
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1292
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:730
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1318
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:200
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:299
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
Definition: ISDOpcodes.h:236
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:552
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1372
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:941
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:979
@ VECREDUCE_FMINIMUM
Definition: ISDOpcodes.h:1481
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:838
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1256
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:815
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:360
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
Definition: ISDOpcodes.h:1086
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:333
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:543
LLVM_ABI bool allOperandsUndef(const SDNode *N)
Return true if the node has at least one operand and all operands of the specified node are ISD::UNDE...
@ Bitcast
Perform the operation on a different, but equivalently sized type.
@ ADDRESS_SPACE_SHARED_CLUSTER
@ CLUSTERLAUNCHCONTROL_QUERY_CANCEL_IS_CANCELED
@ CALL
This node represents a PTX call instruction.
@ CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_X
@ UNPACK_VECTOR
This node is the inverse of NVPTX::BUILD_VECTOR.
@ CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Y
@ DeclareScalarParam
These nodes represent a parameter declaration.
@ CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Z
@ ATOMIC_CMP_SWAP_B128
These nodes are used to lower atomic instructions with i128 type.
@ BUILD_VECTOR
This node is similar to ISD::BUILD_VECTOR except that the output may be implicitly bitcast to a scala...
bool isPackedVectorTy(EVT VT)
DivPrecisionLevel
Definition: NVPTX.h:251
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:712
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
@ FalseVal
Definition: TGLexer.h:59
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:860
bool shouldEmitPTXNoReturn(const Value *V, const TargetMachine &TM)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1744
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1702
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2491
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:390
bool isReleaseOrStronger(AtomicOrdering AO)
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1987
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
unsigned promoteScalarArgumentSize(unsigned size)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition: Error.cpp:167
bool shouldPassAsArray(Type *Ty)
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:82
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:119
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:223
bool isKernelFunction(const Function &F)
Function * getMaybeBitcastedCallee(const CallBase *CB)
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
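An illustrative check (values arbitrary):

#include "llvm/Support/Alignment.h"
#include <cassert>

// An access at byte offset 12 from a 16-byte-aligned base is only
// guaranteed 4-byte alignment.
static void commonAlignmentExample() {
  assert(llvm::commonAlignment(llvm::Align(16), 12) == llvm::Align(4));
}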
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition: Sequence.h:305
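Sketch (sumBelowTen is a made-up helper), assuming llvm/ADT/Sequence.h:

#include "llvm/ADT/Sequence.h"

// Sum 0 + 1 + ... + 9; the End bound is exclusive.
static int sumBelowTen() {
  int Sum = 0;
  for (int I : llvm::seq(0, 10))
    Sum += I;
  return Sum;
}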
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:858
#define N
static LLVM_ABI const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:266
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environme...
Extended Value Type.
Definition: ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:390
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector of NumElements elements, where each element is of type VT.
Definition: ValueTypes.h:74
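Sketch (makeV2F16 is illustrative): building the 32-bit packed type v2f16 from its f16 element type:

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"

// v2f16 is a simple type, so this does not allocate an extended EVT.
static llvm::EVT makeV2F16(llvm::LLVMContext &Ctx) {
  return llvm::EVT::getVectorVT(Ctx, llvm::MVT::f16, 2);
}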
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:121
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:279
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:295
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:345
bool is32BitVector() const
Return true if this is a 32-bit vector type.
Definition: ValueTypes.h:197
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:376
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:251
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:216
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:330
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
static LLVM_ABI KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
Definition: KnownBits.cpp:427
KnownBits concat(const KnownBits &Lo) const
Concatenate the bits from Lo onto the bottom of *this.
Definition: KnownBits.h:226
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:44
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:74
void insertBits(const KnownBits &SubBits, unsigned BitPosition)
Insert the bits from a smaller KnownBits value starting at BitPosition.
Definition: KnownBits.h:212
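Sketch (widenHigh and Hi32 are illustrative; Hi32 is assumed to be 32 bits wide):

#include "llvm/Support/KnownBits.h"

// Splice the known bits of a 32-bit piece into bits [32, 64) of a
// 64-bit value, e.g. when recombining a value split into two halves.
static llvm::KnownBits widenHigh(const llvm::KnownBits &Hi32) {
  llvm::KnownBits Known(64);   // nothing known initially
  Known.insertBits(Hi32, 32);  // assumes Hi32.getBitWidth() == 32
  return Known;
}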
This class contains a discriminated union of information about pointers in memory operands,...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
Type * RetTy
Same as OrigRetTy, or partially legalized for soft float libcalls.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
bool CombineTo(SDValue O, SDValue N)