LLVM 22.0.0git
NVPTXISelDAGToDAG.cpp
1//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines an instruction selector for the NVPTX target.
10//
11//===----------------------------------------------------------------------===//
12
13#include "NVPTXISelDAGToDAG.h"
14#include "NVPTX.h"
15#include "NVPTXUtilities.h"
16#include "llvm/ADT/APInt.h"
21#include "llvm/IR/GlobalValue.h"
23#include "llvm/IR/IntrinsicsNVPTX.h"
30#include <optional>
31
32using namespace llvm;
33
34#define DEBUG_TYPE "nvptx-isel"
35#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"
36
37static cl::opt<bool>
38 EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden,
39 cl::desc("Enable reciprocal sqrt optimization"));
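// Illustrative usage (an assumption about the standard llc driver, not part
// of this file's logic): being a cl::opt, the hidden flag above can be
// toggled on the command line when investigating rsqrt selection, e.g.
//   llc -mtriple=nvptx64-nvidia-cuda -nvptx-rsqrt-approx-opt=false input.ll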
40
41/// createNVPTXISelDag - This pass converts a legalized DAG into an
42/// NVPTX-specific DAG, ready for instruction scheduling.
44 llvm::CodeGenOptLevel OptLevel) {
45 return new NVPTXDAGToDAGISelLegacy(TM, OptLevel);
46}
47
49 CodeGenOptLevel OptLevel)
51 ID, std::make_unique<NVPTXDAGToDAGISel>(tm, OptLevel)) {}
52
54
56
58 CodeGenOptLevel OptLevel)
59 : SelectionDAGISel(tm, OptLevel), TM(tm) {}
60
65}
66
68NVPTXDAGToDAGISel::getDivF32Level(const SDNode *N) const {
70}
71
72bool NVPTXDAGToDAGISel::usePrecSqrtF32(const SDNode *N) const {
74}
75
76bool NVPTXDAGToDAGISel::useF32FTZ() const {
78}
79
80bool NVPTXDAGToDAGISel::allowFMA() const {
82 return TL->allowFMA(*MF, OptLevel);
83}
84
85bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; }
86
87/// Select - Select instructions not customized! Used for
88/// expanded, promoted and normal instructions.
89void NVPTXDAGToDAGISel::Select(SDNode *N) {
90
91 if (N->isMachineOpcode()) {
92 N->setNodeId(-1);
93 return; // Already selected.
94 }
95
96 switch (N->getOpcode()) {
97 case ISD::LOAD:
99 if (tryLoad(N))
100 return;
101 break;
102 case ISD::STORE:
104 if (tryStore(N))
105 return;
106 break;
108 if (tryFence(N))
109 return;
110 break;
112 tryUNPACK_VECTOR(N);
113 return;
115 if (tryEXTRACT_VECTOR_ELEMENT(N))
116 return;
117 break;
119 SelectSETP_F16X2(N);
120 return;
122 SelectSETP_BF16X2(N);
123 return;
124 case NVPTXISD::LoadV2:
125 case NVPTXISD::LoadV4:
126 case NVPTXISD::LoadV8:
127 if (tryLoadVector(N))
128 return;
129 break;
130 case NVPTXISD::LDUV2:
131 case NVPTXISD::LDUV4:
132 if (tryLDU(N))
133 return;
134 break;
138 if (tryStoreVector(N))
139 return;
140 break;
142 if (tryIntrinsicChain(N))
143 return;
144 break;
146 if (tryIntrinsicVoid(N))
147 return;
148 break;
149 case ISD::AND:
150 case ISD::SRA:
151 case ISD::SRL:
152 // Try to select BFE
153 if (tryBFE(N))
154 return;
155 break;
157 SelectAddrSpaceCast(N);
158 return;
159 case ISD::CopyToReg: {
160 if (N->getOperand(1).getValueType() == MVT::i128) {
161 SelectV2I64toI128(N);
162 return;
163 }
164 break;
165 }
166 case ISD::CopyFromReg: {
167 if (N->getOperand(1).getValueType() == MVT::i128) {
168 SelectI128toV2I64(N);
169 return;
170 }
171 break;
172 }
175 selectAtomicSwap128(N);
176 return;
177 case ISD::FADD:
178 case ISD::FMUL:
179 case ISD::FSUB:
180 if (tryBF16ArithToFMA(N))
181 return;
182 break;
183 default:
184 break;
185 }
186 SelectCode(N);
187}
188
189#define TCGEN05_LD_OPCODE(SHAPE, NUM) \
190 (enablePack ? NVPTX::TCGEN05_LD_##SHAPE##_##NUM##_PACK \
191 : NVPTX::TCGEN05_LD_##SHAPE##_##NUM)
192
193static unsigned getTcgen05LdOpcode(unsigned IID, bool enablePack) {
194 switch (IID) {
195 case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
196 return TCGEN05_LD_OPCODE(16x64b, x1);
197 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
198 return TCGEN05_LD_OPCODE(16x64b, x2);
199 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
200 return TCGEN05_LD_OPCODE(16x64b, x4);
201 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
202 return TCGEN05_LD_OPCODE(16x64b, x8);
203 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
204 return TCGEN05_LD_OPCODE(16x64b, x16);
205 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
206 return TCGEN05_LD_OPCODE(16x64b, x32);
207 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
208 return TCGEN05_LD_OPCODE(16x64b, x64);
209 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
210 return TCGEN05_LD_OPCODE(16x64b, x128);
211 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
212 return TCGEN05_LD_OPCODE(16x128b, x1);
213 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
214 return TCGEN05_LD_OPCODE(16x128b, x2);
215 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
216 return TCGEN05_LD_OPCODE(16x128b, x4);
217 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
218 return TCGEN05_LD_OPCODE(16x128b, x8);
219 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
220 return TCGEN05_LD_OPCODE(16x128b, x16);
221 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
222 return TCGEN05_LD_OPCODE(16x128b, x32);
223 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
224 return TCGEN05_LD_OPCODE(16x128b, x64);
225 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
226 return TCGEN05_LD_OPCODE(16x256b, x1);
227 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
228 return TCGEN05_LD_OPCODE(16x256b, x2);
229 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
230 return TCGEN05_LD_OPCODE(16x256b, x4);
231 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
232 return TCGEN05_LD_OPCODE(16x256b, x8);
233 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
234 return TCGEN05_LD_OPCODE(16x256b, x16);
235 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
236 return TCGEN05_LD_OPCODE(16x256b, x32);
237 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
238 return TCGEN05_LD_OPCODE(16x32bx2, x1);
239 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
240 return TCGEN05_LD_OPCODE(16x32bx2, x2);
241 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
242 return TCGEN05_LD_OPCODE(16x32bx2, x4);
243 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
244 return TCGEN05_LD_OPCODE(16x32bx2, x8);
245 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
246 return TCGEN05_LD_OPCODE(16x32bx2, x16);
247 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
248 return TCGEN05_LD_OPCODE(16x32bx2, x32);
249 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
250 return TCGEN05_LD_OPCODE(16x32bx2, x64);
251 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
252 return TCGEN05_LD_OPCODE(16x32bx2, x128);
253 case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
254 return TCGEN05_LD_OPCODE(32x32b, x1);
255 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
256 return TCGEN05_LD_OPCODE(32x32b, x2);
257 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
258 return TCGEN05_LD_OPCODE(32x32b, x4);
259 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
260 return TCGEN05_LD_OPCODE(32x32b, x8);
261 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
262 return TCGEN05_LD_OPCODE(32x32b, x16);
263 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
264 return TCGEN05_LD_OPCODE(32x32b, x32);
265 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
266 return TCGEN05_LD_OPCODE(32x32b, x64);
267 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
268 return TCGEN05_LD_OPCODE(32x32b, x128);
269 }
270 llvm_unreachable("unhandled tcgen05.ld lowering");
271}
272
273void NVPTXDAGToDAGISel::SelectTcgen05Ld(SDNode *N, bool hasOffset) {
274 SDLoc DL(N);
275 unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
276
277 if (hasOffset) {
278 bool enablePack = cast<ConstantSDNode>(N->getOperand(4))->getZExtValue();
279 auto OffsetNode = CurDAG->getTargetConstant(
280 cast<ConstantSDNode>(N->getOperand(3))->getZExtValue(), DL, MVT::i32);
282 getTcgen05LdOpcode(IID, enablePack), DL, N->getVTList(),
283 {N->getOperand(2), OffsetNode, N->getOperand(0)}));
284 } else {
285 bool enablePack = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
287 getTcgen05LdOpcode(IID, enablePack), DL, N->getVTList(),
288 {N->getOperand(2), N->getOperand(0)}));
289 }
290}
291
292bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
293 unsigned IID = N->getConstantOperandVal(1);
294 switch (IID) {
295 default:
296 return false;
297 case Intrinsic::nvvm_ldu_global_f:
298 case Intrinsic::nvvm_ldu_global_i:
299 case Intrinsic::nvvm_ldu_global_p:
300 return tryLDU(N);
301
302 case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
303 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
304 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
305 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
306 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
307 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
308 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
309 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
310 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
311 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
312 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
313 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
314 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
315 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
316 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
317 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
318 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
319 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
320 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
321 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
322 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
323 case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
324 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
325 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
326 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
327 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
328 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
329 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
330 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128: {
331 SelectTcgen05Ld(N);
332 return true;
333 }
334
335 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
336 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
337 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
338 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
339 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
340 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
341 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
342 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128: {
343 SelectTcgen05Ld(N, /* hasOffset */ true);
344 return true;
345 }
346 }
347}
348
349// Map ISD::CONDCODE value to the appropriate CmpMode expected by
350// NVPTXInstPrinter::printCmpMode()
351SDValue NVPTXDAGToDAGISel::getPTXCmpMode(const CondCodeSDNode &CondCode) {
353 const unsigned PTXCmpMode = [](ISD::CondCode CC) {
354 switch (CC) {
355 default:
356 llvm_unreachable("Unexpected condition code.");
357 case ISD::SETOEQ:
358 case ISD::SETEQ:
359 return CmpMode::EQ;
360 case ISD::SETOGT:
361 case ISD::SETGT:
362 return CmpMode::GT;
363 case ISD::SETOGE:
364 case ISD::SETGE:
365 return CmpMode::GE;
366 case ISD::SETOLT:
367 case ISD::SETLT:
368 return CmpMode::LT;
369 case ISD::SETOLE:
370 case ISD::SETLE:
371 return CmpMode::LE;
372 case ISD::SETONE:
373 case ISD::SETNE:
374 return CmpMode::NE;
375 case ISD::SETO:
376 return CmpMode::NUM;
377 case ISD::SETUO:
378 return CmpMode::NotANumber;
379 case ISD::SETUEQ:
380 return CmpMode::EQU;
381 case ISD::SETUGT:
382 return CmpMode::GTU;
383 case ISD::SETUGE:
384 return CmpMode::GEU;
385 case ISD::SETULT:
386 return CmpMode::LTU;
387 case ISD::SETULE:
388 return CmpMode::LEU;
389 case ISD::SETUNE:
390 return CmpMode::NEU;
391 }
392 }(CondCode.get());
393 return CurDAG->getTargetConstant(PTXCmpMode, SDLoc(), MVT::i32);
394}
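// For illustration (a sketch of the mapping above, not authoritative PTX
// documentation): an ordered ISD::SETLT becomes CmpMode::LT and an unordered
// ISD::SETULT becomes CmpMode::LTU, which NVPTXInstPrinter::printCmpMode()
// renders as the "lt" / "ltu" comparison operators of PTX setp, e.g.
// "setp.ltu.f16x2 ..." for the f16x2 case selected below.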
395
396bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
397 SDValue PTXCmpMode = getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)));
398 SDLoc DL(N);
400 NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1,
401 {N->getOperand(0), N->getOperand(1), PTXCmpMode,
402 CurDAG->getTargetConstant(useF32FTZ() ? 1 : 0, DL, MVT::i1)});
403 ReplaceNode(N, SetP);
404 return true;
405}
406
407bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
408 SDValue PTXCmpMode = getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)));
409 SDLoc DL(N);
411 NVPTX::SETP_bf16x2rr, DL, MVT::i1, MVT::i1,
412 {N->getOperand(0), N->getOperand(1), PTXCmpMode,
413 CurDAG->getTargetConstant(useF32FTZ() ? 1 : 0, DL, MVT::i1)});
414 ReplaceNode(N, SetP);
415 return true;
416}
417
418bool NVPTXDAGToDAGISel::tryUNPACK_VECTOR(SDNode *N) {
419 SDValue Vector = N->getOperand(0);
420 MVT EltVT = N->getSimpleValueType(0);
421
422 MachineSDNode *N2 =
423 CurDAG->getMachineNode(NVPTX::I64toV2I32, SDLoc(N), EltVT, EltVT, Vector);
424
425 ReplaceNode(N, N2);
426 return true;
427}
428
429// Find all instances of extract_vector_elt that use this v2f16 vector
430// and coalesce them into a scattering move instruction.
431bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
432 SDValue Vector = N->getOperand(0);
433
434 MVT VT = Vector.getSimpleValueType();
435 if (!(NVPTX::isPackedVectorTy(VT) && VT.getVectorNumElements() == 2))
436 return false;
437
438 unsigned Opcode;
439 if (VT.is32BitVector())
440 Opcode = NVPTX::I32toV2I16;
441 else if (VT.is64BitVector())
442 Opcode = NVPTX::I64toV2I32;
443 else
444 llvm_unreachable("Unhandled packed type");
445
446 // Find and record all uses of this vector that extract element 0 or 1.
448 for (auto *U : Vector.getNode()->users()) {
449 if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
450 continue;
451 if (U->getOperand(0) != Vector)
452 continue;
453 if (const ConstantSDNode *IdxConst =
454 dyn_cast<ConstantSDNode>(U->getOperand(1))) {
455 if (IdxConst->getZExtValue() == 0)
456 E0.push_back(U);
457 else if (IdxConst->getZExtValue() == 1)
458 E1.push_back(U);
459 else
460 llvm_unreachable("Invalid vector index.");
461 }
462 }
463
464 // There's no point scattering f16x2 if we only ever access one
465 // element of it.
466 if (E0.empty() || E1.empty())
467 return false;
468
469 // Merge (EltTy extractelt(V, 0), EltTy extractelt(V,1))
470 // into EltTy,EltTy Split[EltTy]x2(V)
471 MVT EltVT = VT.getVectorElementType();
472 SDNode *ScatterOp =
473 CurDAG->getMachineNode(Opcode, SDLoc(N), EltVT, EltVT, Vector);
474 for (auto *Node : E0)
475 ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
476 for (auto *Node : E1)
477 ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));
478
479 return true;
480}
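// Sketch of the rewrite performed above (DAG notation is approximate):
//
//   t0: v2f16 = ...
//   t1: f16 = extract_vector_elt t0, 0
//   t2: f16 = extract_vector_elt t0, 1
//
// becomes a single two-result machine node, with both extracts replaced by
// its results:
//
//   t3: f16, f16 = I32toV2I16 t0      ; uses of t1 -> t3:0, uses of t2 -> t3:1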
481
482static std::optional<NVPTX::AddressSpace> convertAS(unsigned AS) {
483 switch (AS) {
498 default:
499 return std::nullopt;
500 }
501}
502
504 return convertAS(N->getMemOperand()->getAddrSpace())
506}
507
508NVPTX::Ordering NVPTXDAGToDAGISel::getMemOrder(const MemSDNode *N) const {
509 // No "sem" orderings for SM/PTX versions which do not support memory ordering
512 auto Ordering = N->getMergedOrdering();
513 switch (Ordering) {
527 }
528 llvm_unreachable("Invalid atomic ordering");
529}
530
531NVPTX::Scope NVPTXDAGToDAGISel::getAtomicScope(const MemSDNode *N) const {
532 // No "scope" modifier for SM/PTX versions which do not support scoped atomics
533 // Functionally, these atomics are at device scope
534 if (!Subtarget->hasAtomScope())
536 return Scopes[N->getSyncScopeID()];
537}
538
539namespace {
540
541struct OperationOrderings {
543 OperationOrderings(NVPTX::Ordering IO = NVPTX::Ordering::NotAtomic,
544 NVPTX::Ordering FO = NVPTX::Ordering::NotAtomic)
545 : InstructionOrdering(IO), FenceOrdering(FO) {}
546};
547
548static OperationOrderings
549getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
550 AtomicOrdering Ordering = N->getSuccessOrdering();
551 auto CodeAddrSpace = NVPTXDAGToDAGISel::getAddrSpace(N);
552
553 bool HasMemoryOrdering = Subtarget->hasMemoryOrdering();
554 bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO();
555
556 // clang-format off
557
558  // Lowering for Load/Store Operations (note: AcquireRelease loads or stores are an error).
559 // Note: uses of Relaxed in the Atomic column of this table refer
560 // to LLVM AtomicOrdering::Monotonic.
561 //
562 // | Atomic | Volatile | Statespace | PTX sm_60- | PTX sm_70+ |
563 // |---------|----------|--------------------|------------|------------------------------|
564 // | No | No | All | plain | .weak |
565 // | No | Yes | Generic,Shared, | .volatile | .volatile |
566 // | | | Global [0] | | |
567 // | No | Yes | Local,Const,Param | plain [1] | .weak [1] |
568 // | Unorder | Yes/No | All | == Relaxed | == Relaxed |
569 // | Relaxed | No | Generic,Shared, | .volatile | <atomic sem> |
570 // | | | Global [0] | | |
571 // | Other | No | Generic,Shared, | Error [2] | <atomic sem> |
572 // | | | Global [0] | | |
573 // | Yes | No | Local,Const,Param | plain [1] | .weak [1] |
574 // | Relaxed | Yes | Generic,Shared [0] | .volatile | .volatile |
575 // | Relaxed | Yes | Global [0] | .volatile | .mmio.relaxed.sys (PTX 8.2+) |
576 // | | | | | or .volatile (PTX 8.1-) |
577 // | Relaxed | Yes | Local,Const,Param | plain [1] | .weak [1] |
578 // | Other | Yes | Generic, Shared, | Error [2] | <atomic sem> [3] |
579 // | | | / Global [0] | | |
580
581 // Lowering of CUDA C++ SequentiallyConsistent Operations and Fences to PTX
582 // by following the ABI proven sound in:
583 // Lustig et al, A Formal Analysis of the NVIDIA PTX Memory Consistency Model, ASPLOS’19.
584 // https://dl.acm.org/doi/pdf/10.1145/3297858.3304043
585 //
586 // | CUDA C++ Atomic Operation or Atomic Fence | PTX Atomic Operation or Fence |
587 // |------------------------------------------------------|-------------------------------|
588 // | cuda::atomic_thread_fence | fence.sc.<scope>; |
589 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | |
590 // |------------------------------------------------------|-------------------------------|
591 // | cuda::atomic_load | fence.sc.<scope>; |
592 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | ld.acquire.<scope>; |
593 // |------------------------------------------------------|-------------------------------|
594 // | cuda::atomic_store | fence.sc.<scope>; |
595 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | st.release.<scope>; |
596 // |------------------------------------------------------|-------------------------------|
597 // | cuda::atomic_fetch_<op> | fence.sc.<scope>; |
598 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | atom.acq_rel.<scope>; |
599
600 // clang-format on
601
602 // [0]: volatile and atomics are only supported on global or shared
603 // memory locations, accessed via generic/shared/global pointers.
604 // MMIO is only supported on global memory locations,
605 // accessed via generic/global pointers.
606 // TODO: Implement MMIO access via generic pointer to global.
607 // Currently implemented for global pointers only.
608
609 // [1]: Lowering volatile/atomic operations to non-volatile/non-atomic
610 // PTX instructions fails to preserve their C++ side-effects.
611 //
612 // Example (https://github.com/llvm/llvm-project/issues/62057):
613 //
614 // void example() {
615 // std::atomic<bool> True = true;
616 // while (True.load(std::memory_order_relaxed));
617 // }
618 //
619 // A C++ program that calls "example" is well-defined: the infinite loop
620 // performs an atomic operation. By lowering volatile/atomics to
621 // "weak" memory operations, we are transforming the above into:
622 //
623 // void undefined_behavior() {
624 // bool True = true;
625 // while (True);
626 // }
627 //
628 // which exhibits undefined behavior in both C++ and PTX.
629 //
630 // Calling "example" in CUDA C++ compiled for sm_60- exhibits undefined
631 // behavior due to lack of Independent Forward Progress. Lowering these
632 // to weak memory operations in sm_60- is therefore fine.
633 //
634 // TODO: lower atomic and volatile operations to memory locations
635 // in local, const, and param to two PTX instructions in sm_70+:
636 // - the "weak" memory instruction we are currently lowering to, and
637 // - some other instruction that preserves the side-effect, e.g.,
638 // a dead dummy volatile load.
639 if (CodeAddrSpace == NVPTX::AddressSpace::Local ||
640 CodeAddrSpace == NVPTX::AddressSpace::Const ||
641 CodeAddrSpace == NVPTX::AddressSpace::Param) {
643 }
644
645 // [2]: Atomics with Ordering different than Unordered or Relaxed are not
646 // supported on sm_60 and older; this includes volatile atomics.
647 if (!(Ordering == AtomicOrdering::NotAtomic ||
648 Ordering == AtomicOrdering::Unordered ||
649 Ordering == AtomicOrdering::Monotonic) &&
650 !HasMemoryOrdering) {
652        formatv("PTX does not support \"atomic\" for orderings different than "
653 "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order "
654 "is: \"{}\".",
655 toIRString(Ordering)));
656 }
657
658 // [3]: TODO: these should eventually use .mmio<.atomic sem>; for now we drop
659 // the volatile semantics and preserve the atomic ones.
660
661 // PTX volatile and PTX atomics are not available for statespace that differ
662 // from .generic, .global, or .shared. The behavior of PTX volatile and PTX
663 // atomics is undefined if the generic address does not refer to a .global or
664 // .shared memory location.
665 bool AddrGenericOrGlobalOrShared =
666 (CodeAddrSpace == NVPTX::AddressSpace::Generic ||
667 CodeAddrSpace == NVPTX::AddressSpace::Global ||
668 CodeAddrSpace == NVPTX::AddressSpace::Shared ||
669 CodeAddrSpace == NVPTX::AddressSpace::SharedCluster);
670 if (!AddrGenericOrGlobalOrShared)
672
673 bool UseRelaxedMMIO =
674 HasRelaxedMMIO && CodeAddrSpace == NVPTX::AddressSpace::Global;
675
676 switch (Ordering) {
678 return N->isVolatile() ? NVPTX::Ordering::Volatile
681 // We lower unordered in the exact same way as 'monotonic' to respect
682 // LLVM IR atomicity requirements.
684 if (N->isVolatile())
685 return UseRelaxedMMIO ? NVPTX::Ordering::RelaxedMMIO
687 else
688 return HasMemoryOrdering ? NVPTX::Ordering::Relaxed
690 // case AtomicOrdering::Consume: // If LLVM ever provides this, lower it to
691 // Acquire.
693 if (!N->readMem())
695 formatv("PTX only supports Acquire Ordering on reads: {}",
696 N->getOperationName()));
699 if (!N->writeMem())
701 formatv("PTX only supports Release Ordering on writes: {}",
702 N->getOperationName()));
706 formatv("NVPTX does not support AcquireRelease Ordering on "
707 "read-modify-write "
708 "yet and PTX does not support it on loads or stores: {}",
709 N->getOperationName()));
710 }
712 // LLVM-IR SequentiallyConsistent atomics map to a two-instruction PTX
713 // sequence including a "fence.sc.sco" and the memory instruction with an
714 // Ordering that differs from "sc": acq, rel, or acq_rel, depending on
715 // whether the memory operation is a read, write, or read-modify-write.
716 //
717 // This sets the ordering of the fence to SequentiallyConsistent, and
718 // sets the corresponding ordering for the instruction.
719 NVPTX::Ordering InstrOrder;
720 if (N->readMem())
721 InstrOrder = NVPTX::Ordering::Acquire;
722 else if (N->writeMem())
723 InstrOrder = NVPTX::Ordering::Release;
724 else
726 formatv("NVPTX does not support SequentiallyConsistent Ordering on "
727 "read-modify-writes yet: {}",
728 N->getOperationName()));
729 return OperationOrderings(InstrOrder,
731 }
732 }
734 formatv("NVPTX backend does not support AtomicOrdering \"{}\" yet.",
735 toIRString(Ordering)));
736}
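// Worked example (a rough sketch following the tables above, assuming sm_70+
// and PTX with memory-ordering support): an LLVM IR
//   load atomic i32, ptr %p seq_cst, align 4
// at system scope yields FenceOrdering = SequentiallyConsistent and
// InstructionOrdering = Acquire here, which downstream becomes roughly
//   fence.sc.sys;
//   ld.acquire.sys.b32 %r, [%p];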
737
738} // namespace
739
740NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N,
741 NVPTX::Ordering O) const {
742 switch (O) {
744 case NVPTX::Ordering::Volatile: // Non-atomic volatile operations
745 // NVPTX uses Thread scope as the scope of non-atomic operations.
748 // RelaxedMMIO operations are always system scope.
749 // If a RelaxedMMIO order was generated from an atomic volatile operation
750 // with a smaller thread scope, we bump it here to system scope.
757 auto S = Scopes[N->getSyncScopeID()];
758
759 // Atomic operations must have a scope greater than thread.
760 if (S == NVPTX::Scope::Thread)
762 formatv("Atomics need scope > \"{}\".", ScopeToString(S)));
763
764 // If scope is cluster, clusters must be supported.
765 if (S == NVPTX::Scope::Cluster)
766 Subtarget->failIfClustersUnsupported("cluster scope");
767
768 // If operation is volatile, then its scope is system.
769 return N->isVolatile() ? NVPTX::Scope::System : S;
770 }
771 llvm_unreachable("unhandled ordering");
772}
773
774static bool canLowerToLDG(const MemSDNode &N, const NVPTXSubtarget &Subtarget,
775 NVPTX::AddressSpace CodeAddrSpace) {
776 // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
777 // space.
778 return Subtarget.hasLDG() && CodeAddrSpace == NVPTX::AddressSpace::Global &&
779 N.isInvariant();
780}
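// For example (a sketch, not an exhaustive condition): an IR load from
// addrspace(1) carrying !invariant.load metadata reaches this point with
// N.isInvariant() set and is selected as an ld.global.nc ("ldg") instruction
// via tryLDG(), provided the subtarget reports hasLDG().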
781
782static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
783 NVPTXSubtarget const *T) {
784 if (S == NVPTX::Scope::Cluster)
785 T->failIfClustersUnsupported(".cluster scope fence");
786
787 // Fall back to .acq_rel if .acquire, .release is not supported.
788 if (!T->hasSplitAcquireAndReleaseFences() &&
791
792 switch (O) {
794 switch (S) {
796 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_sys
797 : NVPTX::INT_MEMBAR_SYS;
799 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_cta
800 : NVPTX::INT_MEMBAR_CTA;
802 return NVPTX::atomic_thread_fence_acquire_cluster;
804 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_gpu
805 : NVPTX::INT_MEMBAR_GL;
809 formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
810 ScopeToString(S)));
811 }
812 break;
814 switch (S) {
816 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_sys
817 : NVPTX::INT_MEMBAR_SYS;
819 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_cta
820 : NVPTX::INT_MEMBAR_CTA;
822 return NVPTX::atomic_thread_fence_release_cluster;
824 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_gpu
825 : NVPTX::INT_MEMBAR_GL;
829 formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
830 ScopeToString(S)));
831 }
832 break;
834 switch (S) {
836 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_sys
837 : NVPTX::INT_MEMBAR_SYS;
839 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_cta
840 : NVPTX::INT_MEMBAR_CTA;
842 return NVPTX::atomic_thread_fence_acq_rel_cluster;
844 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_gpu
845 : NVPTX::INT_MEMBAR_GL;
849 formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
850 ScopeToString(S)));
851 }
852 break;
853 }
855 switch (S) {
857 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_sys
858 : NVPTX::INT_MEMBAR_SYS;
860 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_cta
861 : NVPTX::INT_MEMBAR_CTA;
863 return NVPTX::atomic_thread_fence_seq_cst_cluster;
865 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_gpu
866 : NVPTX::INT_MEMBAR_GL;
869 report_fatal_error(formatv("Unsupported scope \"{}\" for seq_cst fence.",
870 ScopeToString(S)));
871 }
872 break;
873 }
879 formatv("Unsupported \"{}\" ordering and \"{}\" scope for fence.",
880 OrderingToString(O), ScopeToString(S)));
881 }
882 llvm_unreachable("unhandled ordering");
883}
884
885// Returns Memory Order and Scope of a memory instruction, and
886// inserts any fence before the instruction that's required to
887// implement its memory ordering.
888std::pair<NVPTX::Ordering, NVPTX::Scope>
889NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, SDValue &Chain,
890 MemSDNode *N) {
891 auto [InstructionOrdering, FenceOrdering] =
892 getOperationOrderings(N, Subtarget);
893 auto Scope = getOperationScope(N, InstructionOrdering);
894
895 // If a fence is required before the operation, insert it:
896 switch (NVPTX::Ordering(FenceOrdering)) {
898 break;
900 auto Op = getFenceOp(FenceOrdering, Scope, Subtarget);
901 Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0);
902 break;
903 }
904 default:
906 formatv("Unexpected fence ordering: \"{}\".",
907 OrderingToString(NVPTX::Ordering(FenceOrdering))));
908 }
909 return {InstructionOrdering, Scope};
910}
911
912void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
913 SDValue Src = N->getOperand(0);
914 AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
915 unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
916 unsigned DstAddrSpace = CastN->getDestAddressSpace();
917 SDLoc DL(N);
918 assert(SrcAddrSpace != DstAddrSpace &&
919 "addrspacecast must be between different address spaces");
920
921 if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
922 // Specific to generic
923
924 if (TM.is64Bit() && TM.getPointerSizeInBits(SrcAddrSpace) == 32) {
925 SDValue CvtNone =
927 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u64_u32, DL, MVT::i64,
928 Src, CvtNone);
929 Src = SDValue(Cvt, 0);
930 }
931
932 unsigned Opc;
933 switch (SrcAddrSpace) {
934 default: report_fatal_error("Bad address space in addrspacecast");
936 Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global;
937 break;
939 Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared;
940 break;
942 if (!TM.is64Bit())
944 "Shared cluster address space is only supported in 64-bit mode");
945 Opc = NVPTX::cvta_shared_cluster_64;
946 break;
948 Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const;
949 break;
951 Opc = TM.is64Bit() ? NVPTX::cvta_local_64 : NVPTX::cvta_local;
952 break;
954 Opc = TM.is64Bit() ? NVPTX::cvta_param_64 : NVPTX::cvta_param;
955 break;
956 }
957 ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getValueType(0), Src));
958 return;
959 } else {
960 // Generic to specific
961 if (SrcAddrSpace != 0)
962 report_fatal_error("Cannot cast between two non-generic address spaces");
963 unsigned Opc;
964 switch (DstAddrSpace) {
965 default: report_fatal_error("Bad address space in addrspacecast");
967 Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global;
968 break;
970 Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared;
971 break;
973 if (!TM.is64Bit())
975 "Shared cluster address space is only supported in 64-bit mode");
976 Opc = NVPTX::cvta_to_shared_cluster_64;
977 break;
979 Opc = TM.is64Bit() ? NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const;
980 break;
982 Opc = TM.is64Bit() ? NVPTX::cvta_to_local_64 : NVPTX::cvta_to_local;
983 break;
985 Opc = TM.is64Bit() ? NVPTX::cvta_to_param_64 : NVPTX::cvta_to_param;
986 break;
987 }
988
989 SDNode *CVTA = CurDAG->getMachineNode(Opc, DL, N->getValueType(0), Src);
990 if (TM.is64Bit() && TM.getPointerSizeInBits(DstAddrSpace) == 32) {
991 SDValue CvtNone =
993 CVTA = CurDAG->getMachineNode(NVPTX::CVT_u32_u64, DL, MVT::i32,
994 SDValue(CVTA, 0), CvtNone);
995 }
996
997 ReplaceNode(N, CVTA);
998 return;
999 }
1000}
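// Illustrative example of the selection above (a sketch; exact opcode names
// come from the NVPTX .td files): in 64-bit mode,
//   addrspacecast ptr addrspace(3) %p to ptr        ; shared -> generic
// selects cvta_shared_64 (PTX "cvta.shared.u64"), while the reverse
//   addrspacecast ptr %q to ptr addrspace(3)        ; generic -> shared
// selects cvta_to_shared_64 (PTX "cvta.to.shared.u64").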
1001
1002// Helper function to reduce the amount of boilerplate code for
1003// opcode selection.
1004static std::optional<unsigned>
1005pickOpcodeForVT(MVT::SimpleValueType VT, std::optional<unsigned> Opcode_i16,
1006 std::optional<unsigned> Opcode_i32,
1007 std::optional<unsigned> Opcode_i64) {
1008 switch (VT) {
1009 case MVT::f16:
1010 case MVT::i16:
1011 case MVT::bf16:
1012 return Opcode_i16;
1013 case MVT::v2f16:
1014 case MVT::v2bf16:
1015 case MVT::v2i16:
1016 case MVT::v4i8:
1017 case MVT::i32:
1018 case MVT::f32:
1019 return Opcode_i32;
1020 case MVT::v2f32:
1021 case MVT::i64:
1022 case MVT::f64:
1023 return Opcode_i64;
1024 default:
1025 return std::nullopt;
1026 }
1027}
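// Usage note (derived from the switch above): the three candidate opcodes are
// keyed by register width rather than element type, so e.g.
//   pickOpcodeForVT(MVT::f32, NVPTX::LD_i16, NVPTX::LD_i32, NVPTX::LD_i64)
// returns NVPTX::LD_i32, since f32 and the packed 32-bit vector types all
// live in 32-bit registers.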
1028
1029static inline bool isAddLike(const SDValue V) {
1030 return V.getOpcode() == ISD::ADD ||
1031 (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint());
1032}
1033
1035 if (N.getOpcode() == ISD::AssertAlign)
1036 N = N.getOperand(0);
1037 return N;
1038}
1039
1040// selectBaseADDR - Match a dag node which will serve as the base address for an
1041// ADDR operand pair.
1043 N = stripAssertAlign(N);
1044 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(N))
1045 return DAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N),
1046 GA->getValueType(0), GA->getOffset(),
1047 GA->getTargetFlags());
1048 if (const auto *ES = dyn_cast<ExternalSymbolSDNode>(N))
1049 return DAG->getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0),
1050 ES->getTargetFlags());
1051 if (const auto *FIN = dyn_cast<FrameIndexSDNode>(N))
1052 return DAG->getTargetFrameIndex(FIN->getIndex(), FIN->getValueType(0));
1053
1054 return N;
1055}
1056
1059 APInt AccumulatedOffset(64u, 0);
1060 while (isAddLike(Addr)) {
1061 const auto *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
1062 if (!CN)
1063 break;
1064
1065 const APInt CI = CN->getAPIntValue().sext(64);
1066 if (!(CI + AccumulatedOffset).isSignedIntN(32))
1067 break;
1068
1069 AccumulatedOffset += CI;
1070 Addr = stripAssertAlign(Addr->getOperand(0));
1071 }
1072 return DAG->getSignedTargetConstant(AccumulatedOffset.getSExtValue(), DL,
1073 MVT::i32);
1074}
1075
1076static std::pair<SDValue, SDValue> selectADDR(SDValue Addr, SelectionDAG *DAG) {
1079 return {Base, Offset};
1080}
1081
1082// Select a pair of operands which represent a valid PTX address, this could be
1083// one of the following things:
1084// - [var] - Offset is simply set to 0
1085// - [reg] - Offset is simply set to 0
1086// - [reg+immOff]
1087// - [var+immOff]
1088// Note that immOff must fit into a 32-bit signed integer.
1089bool NVPTXDAGToDAGISel::SelectADDR(SDValue Addr, SDValue &Base,
1090 SDValue &Offset) {
1091 std::tie(Base, Offset) = selectADDR(Addr, CurDAG);
1092 return true;
1093}
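// Example of the folding performed by selectADDR/accumulateOffset (a sketch):
//   (add (add %base, 8), 16)
// yields Base = %base and Offset = 24, so the memory access using it is
// emitted with a "[%reg+24]" style PTX address; offsets that would overflow
// a signed 32-bit immediate are left un-folded.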
1094
1095bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
1096 MemSDNode *LD = cast<MemSDNode>(N);
1097 assert(LD->readMem() && "Expected load");
1098
1099 // do not support pre/post inc/dec
1100 const LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(LD);
1101 if (PlainLoad && PlainLoad->isIndexed())
1102 return false;
1103
1104 // Address Space Setting
1105 const auto CodeAddrSpace = getAddrSpace(LD);
1106 if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace))
1107 return tryLDG(LD);
1108
1109 SDLoc DL(LD);
1110 SDValue Chain = N->getOperand(0);
1111 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
1112
1113 const unsigned FromTypeWidth = LD->getMemoryVT().getSizeInBits();
1114
1115 // Vector Setting
1116 const unsigned FromType =
1117 (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
1120
1121 assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
1122 FromTypeWidth <= 128 && "Invalid width for load");
1123
1124 // Create the machine instruction DAG
1125 const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG);
1126 SDValue Ops[] = {getI32Imm(Ordering, DL),
1127 getI32Imm(Scope, DL),
1128 getI32Imm(CodeAddrSpace, DL),
1129 getI32Imm(FromType, DL),
1130 getI32Imm(FromTypeWidth, DL),
1131 Base,
1132 Offset,
1133 Chain};
1134
1135 const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
1136 const std::optional<unsigned> Opcode =
1137 pickOpcodeForVT(TargetVT, NVPTX::LD_i16, NVPTX::LD_i32, NVPTX::LD_i64);
1138 if (!Opcode)
1139 return false;
1140
1141 SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
1142 if (!NVPTXLD)
1143 return false;
1144
1145 MachineMemOperand *MemRef = LD->getMemOperand();
1146 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});
1147
1148 ReplaceNode(LD, NVPTXLD);
1149 return true;
1150}
1151
1152static unsigned getStoreVectorNumElts(SDNode *N) {
1153 switch (N->getOpcode()) {
1154 case NVPTXISD::StoreV2:
1155 return 2;
1156 case NVPTXISD::StoreV4:
1157 return 4;
1158 case NVPTXISD::StoreV8:
1159 return 8;
1160 default:
1161 llvm_unreachable("Unexpected opcode");
1162 }
1163}
1164
1165bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
1166 MemSDNode *LD = cast<MemSDNode>(N);
1167
1168 // Address Space Setting
1169 const auto CodeAddrSpace = getAddrSpace(LD);
1170 if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace))
1171 return tryLDG(LD);
1172
1173 const MVT EltVT = LD->getSimpleValueType(0);
1174 SDLoc DL(LD);
1175 SDValue Chain = LD->getChain();
1176 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
1177
1178 // Type Setting: fromType + fromTypeWidth
1179 //
1180 // Sign : ISD::SEXTLOAD
1181  // Unsigned : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
1182 // type is integer
1183 // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
1184 // Read at least 8 bits (predicates are stored as 8-bit values)
1185 // The last operand holds the original LoadSDNode::getExtensionType() value
1186 const unsigned ExtensionType =
1187 N->getConstantOperandVal(N->getNumOperands() - 1);
1188 const unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
1191
1192 const unsigned FromTypeWidth = getFromTypeWidthForLoad(LD);
1193
1194 assert(!(EltVT.isVector() && ExtensionType != ISD::NON_EXTLOAD));
1195
1196 const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG);
1197 SDValue Ops[] = {getI32Imm(Ordering, DL),
1198 getI32Imm(Scope, DL),
1199 getI32Imm(CodeAddrSpace, DL),
1200 getI32Imm(FromType, DL),
1201 getI32Imm(FromTypeWidth, DL),
1202 Base,
1203 Offset,
1204 Chain};
1205
1206 std::optional<unsigned> Opcode;
1207 switch (N->getOpcode()) {
1208 default:
1209 llvm_unreachable("Unexpected opcode");
1210 case NVPTXISD::LoadV2:
1211 Opcode = pickOpcodeForVT(EltVT.SimpleTy, NVPTX::LDV_i16_v2,
1212 NVPTX::LDV_i32_v2, NVPTX::LDV_i64_v2);
1213 break;
1214 case NVPTXISD::LoadV4:
1215 Opcode = pickOpcodeForVT(EltVT.SimpleTy, NVPTX::LDV_i16_v4,
1216 NVPTX::LDV_i32_v4, NVPTX::LDV_i64_v4);
1217 break;
1218 case NVPTXISD::LoadV8:
1219 Opcode = pickOpcodeForVT(EltVT.SimpleTy, {/* no v8i16 */},
1220 NVPTX::LDV_i32_v8, {/* no v8i64 */});
1221 break;
1222 }
1223 if (!Opcode)
1224 return false;
1225
1226 SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
1227
1228 MachineMemOperand *MemRef = LD->getMemOperand();
1229 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});
1230
1231 ReplaceNode(LD, NVPTXLD);
1232 return true;
1233}
1234
1235bool NVPTXDAGToDAGISel::tryLDG(MemSDNode *LD) {
1236 SDLoc DL(LD);
1237
1238 unsigned ExtensionType;
1239 if (const auto *Load = dyn_cast<LoadSDNode>(LD)) {
1240 ExtensionType = Load->getExtensionType();
1241 } else {
1242 ExtensionType = LD->getConstantOperandVal(LD->getNumOperands() - 1);
1243 }
1244 const unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
1247
1248 const unsigned FromTypeWidth = getFromTypeWidthForLoad(LD);
1249
1250 assert(!(LD->getSimpleValueType(0).isVector() &&
1251 ExtensionType != ISD::NON_EXTLOAD));
1252
1253 const auto [Base, Offset] = selectADDR(LD->getOperand(1), CurDAG);
1254 SDValue Ops[] = {getI32Imm(FromType, DL), getI32Imm(FromTypeWidth, DL), Base,
1255 Offset, LD->getChain()};
1256
1257 const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
1258 std::optional<unsigned> Opcode;
1259 switch (LD->getOpcode()) {
1260 default:
1261 llvm_unreachable("Unexpected opcode");
1262 case ISD::LOAD:
1263 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_GLOBAL_NC_i16,
1264 NVPTX::LD_GLOBAL_NC_i32, NVPTX::LD_GLOBAL_NC_i64);
1265 break;
1266 case NVPTXISD::LoadV2:
1267 Opcode =
1268 pickOpcodeForVT(TargetVT, NVPTX::LD_GLOBAL_NC_v2i16,
1269 NVPTX::LD_GLOBAL_NC_v2i32, NVPTX::LD_GLOBAL_NC_v2i64);
1270 break;
1271 case NVPTXISD::LoadV4:
1272 Opcode =
1273 pickOpcodeForVT(TargetVT, NVPTX::LD_GLOBAL_NC_v4i16,
1274 NVPTX::LD_GLOBAL_NC_v4i32, NVPTX::LD_GLOBAL_NC_v4i64);
1275 break;
1276 case NVPTXISD::LoadV8:
1277 Opcode = pickOpcodeForVT(TargetVT, {/* no v8i16 */},
1278 NVPTX::LD_GLOBAL_NC_v8i32, {/* no v8i64 */});
1279 break;
1280 }
1281 if (!Opcode)
1282 return false;
1283
1284 SDNode *NVPTXLDG = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
1285
1286 ReplaceNode(LD, NVPTXLDG);
1287 return true;
1288}
1289
1291 auto TotalWidth = Mem->getMemoryVT().getSizeInBits();
1292 auto NumElts = Mem->getNumValues() - 1;
1293 auto ElementBitWidth = TotalWidth / NumElts;
1294 assert(isPowerOf2_32(ElementBitWidth) && ElementBitWidth >= 8 &&
1295 ElementBitWidth <= 128 && TotalWidth <= 256 &&
1296 "Invalid width for load");
1297 return ElementBitWidth;
1298}
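// For example (per the arithmetic above): an NVPTXISD::LoadV4 of four i32
// results has 5 node values (4 results + chain), so NumElts = 4, and with a
// 128-bit memory VT the per-element width returned is 32.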
1299
1300bool NVPTXDAGToDAGISel::tryLDU(SDNode *N) {
1301 auto *LD = cast<MemSDNode>(N);
1302
1303 SDLoc DL(N);
1304 const unsigned FromTypeWidth = getFromTypeWidthForLoad(LD);
1305 const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
1306
1307  // If this is an LDU intrinsic, the address is the third operand. If it's an
1308  // LDU SD node (from custom vector handling), then it's the second operand.
1309 SDValue Addr =
1310 LD->getOperand(LD->getOpcode() == ISD::INTRINSIC_W_CHAIN ? 2 : 1);
1311
1312 const auto [Base, Offset] = selectADDR(Addr, CurDAG);
1313 SDValue Ops[] = {getI32Imm(FromTypeWidth, DL), Base, Offset, LD->getChain()};
1314
1315 std::optional<unsigned> Opcode;
1316 switch (N->getOpcode()) {
1317 default:
1318 llvm_unreachable("Unexpected opcode");
1320 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LDU_GLOBAL_i16,
1321 NVPTX::LDU_GLOBAL_i32, NVPTX::LDU_GLOBAL_i64);
1322 break;
1323 case NVPTXISD::LDUV2:
1324 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LDU_GLOBAL_v2i16,
1325 NVPTX::LDU_GLOBAL_v2i32, NVPTX::LDU_GLOBAL_v2i64);
1326 break;
1327 case NVPTXISD::LDUV4:
1328 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LDU_GLOBAL_v4i16,
1329 NVPTX::LDU_GLOBAL_v4i32, {/* no v4i64 */});
1330 break;
1331 }
1332 if (!Opcode)
1333 return false;
1334
1335 SDNode *NVPTXLDU = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
1336
1337 ReplaceNode(LD, NVPTXLDU);
1338 return true;
1339}
1340
1341bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
1342 MemSDNode *ST = cast<MemSDNode>(N);
1343 assert(ST->writeMem() && "Expected store");
1344 StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(ST);
1345 AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(ST);
1346 assert((PlainStore || AtomicStore) && "Expected store");
1347
1348 // do not support pre/post inc/dec
1349 if (PlainStore && PlainStore->isIndexed())
1350 return false;
1351
1352 // Address Space Setting
1353 const auto CodeAddrSpace = getAddrSpace(ST);
1354
1355 SDLoc DL(ST);
1356 SDValue Chain = ST->getChain();
1357 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
1358
1359 // Vector Setting
1360 const unsigned ToTypeWidth = ST->getMemoryVT().getSizeInBits();
1361
1362 // Create the machine instruction DAG
1363 SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
1364
1365 assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 &&
1366 "Invalid width for store");
1367
1368 const auto [Base, Offset] = selectADDR(ST->getBasePtr(), CurDAG);
1369 SDValue Ops[] = {selectPossiblyImm(Value),
1370 getI32Imm(Ordering, DL),
1371 getI32Imm(Scope, DL),
1372 getI32Imm(CodeAddrSpace, DL),
1373 getI32Imm(ToTypeWidth, DL),
1374 Base,
1375 Offset,
1376 Chain};
1377
1378 const std::optional<unsigned> Opcode =
1379 pickOpcodeForVT(Value.getSimpleValueType().SimpleTy, NVPTX::ST_i16,
1380 NVPTX::ST_i32, NVPTX::ST_i64);
1381 if (!Opcode)
1382 return false;
1383
1384 SDNode *NVPTXST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
1385
1386 if (!NVPTXST)
1387 return false;
1388
1389 MachineMemOperand *MemRef = ST->getMemOperand();
1390 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
1391 ReplaceNode(ST, NVPTXST);
1392 return true;
1393}
1394
1395bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
1396 MemSDNode *ST = cast<MemSDNode>(N);
1397 const unsigned TotalWidth = ST->getMemoryVT().getSizeInBits();
1398
1399 // Address Space Setting
1400 const auto CodeAddrSpace = getAddrSpace(ST);
1401 if (CodeAddrSpace == NVPTX::AddressSpace::Const) {
1402 report_fatal_error("Cannot store to pointer that points to constant "
1403 "memory space");
1404 }
1405
1406 SDLoc DL(ST);
1407 SDValue Chain = ST->getChain();
1408 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
1409
1410 const unsigned NumElts = getStoreVectorNumElts(ST);
1411
1413 for (auto &V : ST->ops().slice(1, NumElts))
1414 Ops.push_back(selectPossiblyImm(V));
1415 SDValue Addr = N->getOperand(NumElts + 1);
1416 const unsigned ToTypeWidth = TotalWidth / NumElts;
1417
1418 assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 &&
1419 TotalWidth <= 256 && "Invalid width for store");
1420
1421 const auto [Base, Offset] = selectADDR(Addr, CurDAG);
1422 Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
1423 getI32Imm(CodeAddrSpace, DL), getI32Imm(ToTypeWidth, DL), Base,
1424 Offset, Chain});
1425
1426 const MVT::SimpleValueType EltVT =
1427 ST->getOperand(1).getSimpleValueType().SimpleTy;
1428 std::optional<unsigned> Opcode;
1429 switch (ST->getOpcode()) {
1430 default:
1431 return false;
1432 case NVPTXISD::StoreV2:
1433 Opcode = pickOpcodeForVT(EltVT, NVPTX::STV_i16_v2, NVPTX::STV_i32_v2,
1434 NVPTX::STV_i64_v2);
1435 break;
1436 case NVPTXISD::StoreV4:
1437 Opcode = pickOpcodeForVT(EltVT, NVPTX::STV_i16_v4, NVPTX::STV_i32_v4,
1438 NVPTX::STV_i64_v4);
1439 break;
1440 case NVPTXISD::StoreV8:
1441 Opcode = pickOpcodeForVT(EltVT, {/* no v8i16 */}, NVPTX::STV_i32_v8,
1442 {/* no v8i64 */});
1443 break;
1444 }
1445
1446 if (!Opcode)
1447 return false;
1448
1449 SDNode *NVPTXST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
1450
1451 MachineMemOperand *MemRef = ST->getMemOperand();
1452 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
1453
1454 ReplaceNode(ST, NVPTXST);
1455 return true;
1456}
1457
1458/// SelectBFE - Look for instruction sequences that can be made more efficient
1459/// by using the 'bfe' (bit-field extract) PTX instruction
1460bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
1461 SDLoc DL(N);
1462 SDValue LHS = N->getOperand(0);
1463 SDValue RHS = N->getOperand(1);
1464 SDValue Len;
1465 SDValue Start;
1466 SDValue Val;
1467 bool IsSigned = false;
1468
1469 if (N->getOpcode() == ISD::AND) {
1470 // Canonicalize the operands
1471 // We want 'and %val, %mask'
1472 if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
1473 std::swap(LHS, RHS);
1474 }
1475
1476 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
1477 if (!Mask) {
1478 // We need a constant mask on the RHS of the AND
1479 return false;
1480 }
1481
1482 // Extract the mask bits
1483 uint64_t MaskVal = Mask->getZExtValue();
1484 if (!isMask_64(MaskVal)) {
1485 // We *could* handle shifted masks here, but doing so would require an
1486      // 'and' operation to fix up the low-order bits, so we would trade
1487 // shr+and for bfe+and, which has the same throughput
1488 return false;
1489 }
1490
1491 // How many bits are in our mask?
1492 int64_t NumBits = countr_one(MaskVal);
1493 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
1494
1495 if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
1496 // We have a 'srl/and' pair, extract the effective start bit and length
1497 Val = LHS.getNode()->getOperand(0);
1498 Start = LHS.getNode()->getOperand(1);
1499 ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
1500 if (StartConst) {
1501 uint64_t StartVal = StartConst->getZExtValue();
1502 // How many "good" bits do we have left? "good" is defined here as bits
1503 // that exist in the original value, not shifted in.
1504 int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
1505 if (NumBits > GoodBits) {
1506 // Do not handle the case where bits have been shifted in. In theory
1507 // we could handle this, but the cost is likely higher than just
1508 // emitting the srl/and pair.
1509 return false;
1510 }
1511 Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
1512 } else {
1513 // Do not handle the case where the shift amount (can be zero if no srl
1514 // was found) is not constant. We could handle this case, but it would
1515 // require run-time logic that would be more expensive than just
1516 // emitting the srl/and pair.
1517 return false;
1518 }
1519 } else {
1520 // Do not handle the case where the LHS of the and is not a shift. While
1521 // it would be trivial to handle this case, it would just transform
1522      // 'and' -> 'bfe', but 'and' has higher throughput.
1523 return false;
1524 }
1525 } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
1526 if (LHS->getOpcode() == ISD::AND) {
1527 ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
1528 if (!ShiftCnst) {
1529 // Shift amount must be constant
1530 return false;
1531 }
1532
1533 uint64_t ShiftAmt = ShiftCnst->getZExtValue();
1534
1535 SDValue AndLHS = LHS->getOperand(0);
1536 SDValue AndRHS = LHS->getOperand(1);
1537
1538 // Canonicalize the AND to have the mask on the RHS
1539 if (isa<ConstantSDNode>(AndLHS)) {
1540 std::swap(AndLHS, AndRHS);
1541 }
1542
1543 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
1544 if (!MaskCnst) {
1545 // Mask must be constant
1546 return false;
1547 }
1548
1549 uint64_t MaskVal = MaskCnst->getZExtValue();
1550 uint64_t NumZeros;
1551 uint64_t NumBits;
1552 if (isMask_64(MaskVal)) {
1553 NumZeros = 0;
1554 // The number of bits in the result bitfield will be the number of
1555 // trailing ones (the AND) minus the number of bits we shift off
1556 NumBits = llvm::countr_one(MaskVal) - ShiftAmt;
1557 } else if (isShiftedMask_64(MaskVal)) {
1558 NumZeros = llvm::countr_zero(MaskVal);
1559 unsigned NumOnes = llvm::countr_one(MaskVal >> NumZeros);
1560 // The number of bits in the result bitfield will be the number of
1561 // trailing zeros plus the number of set bits in the mask minus the
1562 // number of bits we shift off
1563 NumBits = NumZeros + NumOnes - ShiftAmt;
1564 } else {
1565 // This is not a mask we can handle
1566 return false;
1567 }
1568
1569 if (ShiftAmt < NumZeros) {
1570 // Handling this case would require extra logic that would make this
1571 // transformation non-profitable
1572 return false;
1573 }
1574
1575 Val = AndLHS;
1576 Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
1577 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
1578
1579      // If the pre-shift AND includes the sign bit in the bitfield, we must use
1580      // signed BFE to replicate that bit during bitfield extraction. If the
1581      // sign bit is not part of the mask, unsigned BFE will zero out the upper
1582      // bits of the result.
1583 if (N->getOpcode() == ISD::SRA)
1584 IsSigned = (ShiftAmt + NumBits) == Val.getValueSizeInBits();
1585 } else if (LHS->getOpcode() == ISD::SHL) {
1586 // Here, we have a pattern like:
1587 //
1588 // (sra (shl val, NN), MM)
1589 // or
1590 // (srl (shl val, NN), MM)
1591 //
1592 // If MM >= NN, we can efficiently optimize this with bfe
1593 Val = LHS->getOperand(0);
1594
1595 SDValue ShlRHS = LHS->getOperand(1);
1596 ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
1597 if (!ShlCnst) {
1598 // Shift amount must be constant
1599 return false;
1600 }
1601 uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
1602
1603 SDValue ShrRHS = RHS;
1604 ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
1605 if (!ShrCnst) {
1606 // Shift amount must be constant
1607 return false;
1608 }
1609 uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
1610
1611 // To avoid extra codegen and be profitable, we need Outer >= Inner
1612 if (OuterShiftAmt < InnerShiftAmt) {
1613 return false;
1614 }
1615
1616 // If the outer shift is more than the type size, we have no bitfield to
1617 // extract (since we also check that the inner shift is <= the outer shift
1618 // then this also implies that the inner shift is < the type size)
1619 if (OuterShiftAmt >= Val.getValueSizeInBits()) {
1620 return false;
1621 }
1622
1623 Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
1624 MVT::i32);
1625 Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
1626 DL, MVT::i32);
1627
1628 if (N->getOpcode() == ISD::SRA) {
1629      // If we have an arithmetic right shift, we need to use the signed bfe
1630 // variant
1631 IsSigned = true;
1632 }
1633 } else {
1634 // No can do...
1635 return false;
1636 }
1637 } else {
1638 // No can do...
1639 return false;
1640 }
1641
1642
1643 unsigned Opc;
1644 // For the BFE operations we form here from "and" and "srl", always use the
1645 // unsigned variants.
1646 if (Val.getValueType() == MVT::i32) {
1647 if (IsSigned) {
1648 Opc = NVPTX::BFE_S32rii;
1649 } else {
1650 Opc = NVPTX::BFE_U32rii;
1651 }
1652 } else if (Val.getValueType() == MVT::i64) {
1653 if (IsSigned) {
1654 Opc = NVPTX::BFE_S64rii;
1655 } else {
1656 Opc = NVPTX::BFE_U64rii;
1657 }
1658 } else {
1659 // We cannot handle this type
1660 return false;
1661 }
1662
1663 SDValue Ops[] = {
1664 Val, Start, Len
1665 };
1666
1667 ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
1668 return true;
1669}
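// Examples of patterns accepted above (a sketch; the 'bfe' operands are
// (value, start, length)):
//   (and (srl i32 %x, 8), 255)   -> BFE_U32rii %x, 8, 8   ; "bfe.u32"
//   (sra (shl i32 %x, 24), 28)   -> BFE_S32rii %x, 4, 4   ; "bfe.s32"
// In the second case Start = 28 - 24 and Len = 32 - 28, and the arithmetic
// right shift forces the signed variant.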
1670
1671// Select bf16/bf16v2 FADD, FSUB, FMUL as fma on targets with only fma
1672bool NVPTXDAGToDAGISel::tryBF16ArithToFMA(SDNode *N) {
1673 EVT VT = SDValue(N, 0).getValueType();
1674 if (VT.getScalarType() != MVT::bf16)
1675 return false;
1676
1677 const NVPTXSubtarget *STI = TM.getSubtargetImpl();
1678 if (STI->hasNativeBF16Support(N->getOpcode()))
1679 return false;
1680
1681 const bool IsVec = VT.isVector();
1682 assert(!IsVec || VT.getVectorNumElements() == 2);
1683 SDLoc DL(N);
1684 SDValue N0 = N->getOperand(0);
1685 SDValue N1 = N->getOperand(1);
1687 auto GetConstant = [&](float Value) -> SDValue {
1688 // BF16 immediates must be legalized to integer register values
1689 APFloat APF(Value);
1690 bool LosesInfo;
1691 APF.convert(APFloat::BFloat(), APFloat::rmNearestTiesToEven, &LosesInfo);
1692 assert(!LosesInfo);
1693 if (IsVec) {
1694 auto API = APF.bitcastToAPInt();
1695 API = API.concat(API);
1696 auto Const = CurDAG->getTargetConstant(API, DL, MVT::i32);
1697 return SDValue(CurDAG->getMachineNode(NVPTX::MOV_B32_i, DL, VT, Const),
1698 0);
1699 }
1700 auto Const = CurDAG->getTargetConstantFP(APF, DL, VT);
1701 return SDValue(CurDAG->getMachineNode(NVPTX::MOV_BF16_i, DL, VT, Const), 0);
1702 };
1703
1704 switch (N->getOpcode()) {
1705 case ISD::FADD:
1706 // add(a, b) -> fma(a, 1.0, b)
1707 Operands = {N0, GetConstant(1.0), N1};
1708 break;
1709 case ISD::FSUB:
1710 // sub(a, b) -> fma(b, -1.0, a)
1711 Operands = {N1, GetConstant(-1.0), N0};
1712 break;
1713 case ISD::FMUL:
1714 // mul(a, b) -> fma(a, b, -0.0)
1715    // NOTE: The identity is -0.0, not 0.0, because x + (-0.0) == x for all x,
    //       whereas (-0.0) + 0.0 == +0.0.
1716 Operands = {N0, N1, GetConstant(-0.0)};
1717 break;
1718 default:
1719 llvm_unreachable("Unexpected opcode");
1720 };
1721
1722 int Opcode = IsVec ? NVPTX::FMA_BF16x2rrr : NVPTX::FMA_BF16rrr;
1724 ReplaceNode(N, FMA);
1725 return true;
1726}
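// Sketch of the rewrite above for the scalar case (machine-node names as
// used in this file):
//   (fadd bf16 %a, %b)  ->  FMA_BF16rrr %a, (MOV_BF16_i 1.0), %b
// For the v2bf16 case the constant is materialized as a MOV_B32_i of the
// bf16 bit pattern duplicated into both halves (e.g. 0x3F803F80 for 1.0).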
1727
1728SDValue NVPTXDAGToDAGISel::selectPossiblyImm(SDValue V) {
1729 if (V.getOpcode() == ISD::BITCAST)
1730 V = V.getOperand(0);
1731
1732 if (auto *CN = dyn_cast<ConstantSDNode>(V))
1733 return CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(V),
1734 V.getValueType());
1735 if (auto *CN = dyn_cast<ConstantFPSDNode>(V))
1736 return CurDAG->getTargetConstantFP(CN->getValueAPF(), SDLoc(V),
1737 V.getValueType());
1738 return V;
1739}
1740
1741/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
1742/// inline asm expressions.
1744 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
1745 std::vector<SDValue> &OutOps) {
1746 switch (ConstraintID) {
1747 default:
1748 return true;
1749 case InlineAsm::ConstraintCode::m: { // memory
1750 const auto [Base, Offset] = selectADDR(Op, CurDAG);
1751 OutOps.push_back(Base);
1752 OutOps.push_back(Offset);
1753 return false;
1754 }
1755 }
1756 return true;
1757}
1758
1759void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) {
1760 // Lower a CopyToReg with two 64-bit inputs
1761 // Dst:i128, lo:i64, hi:i64
1762 //
1763 // CopyToReg Dst, lo, hi;
1764 //
1765 // ==>
1766 //
1767 // tmp = V2I64toI128 {lo, hi};
1768 // CopyToReg Dst, tmp;
1769 SDValue Dst = N->getOperand(1);
1770 SDValue Lo = N->getOperand(2);
1771 SDValue Hi = N->getOperand(3);
1772
1773 SDLoc DL(N);
1774 SDNode *Mov =
1775 CurDAG->getMachineNode(NVPTX::V2I64toI128, DL, MVT::i128, {Lo, Hi});
1776
1777 SmallVector<SDValue, 4> NewOps(N->getNumOperands() - 1);
1778 NewOps[0] = N->getOperand(0);
1779 NewOps[1] = Dst;
1780 NewOps[2] = SDValue(Mov, 0);
1781 if (N->getNumOperands() == 5)
1782 NewOps[3] = N->getOperand(4);
1783 SDValue NewValue = CurDAG->getNode(ISD::CopyToReg, DL, SmallVector<EVT>(N->values()), NewOps);
1784
1785 ReplaceNode(N, NewValue.getNode());
1786}
1787
1788void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
1789  // Lower CopyFromReg from a 128-bit reg to two 64-bit regs
1790 // Dst:i128, Src:i128
1791 //
1792 // {lo, hi} = CopyFromReg Src
1793 //
1794 // ==>
1795 //
1796 // {lo, hi} = I128toV2I64 Src
1797 //
1798 SDValue Ch = N->getOperand(0);
1799 SDValue Src = N->getOperand(1);
1800 SDValue Glue = N->getOperand(2);
1801 SDLoc DL(N);
1802
1803  // Add Glue and Ch to the operands and results to avoid breaking the
1804  // execution order
1806 NVPTX::I128toV2I64, DL,
1807 {MVT::i64, MVT::i64, Ch.getValueType(), Glue.getValueType()},
1808 {Src, Ch, Glue});
1809
1810 ReplaceNode(N, Mov);
1811}
1812
1813bool NVPTXDAGToDAGISel::tryFence(SDNode *N) {
1814 SDLoc DL(N);
1815 assert(N->getOpcode() == ISD::ATOMIC_FENCE);
1816 unsigned int FenceOp =
1817 getFenceOp(NVPTX::Ordering(N->getConstantOperandVal(1)),
1818 Scopes[N->getConstantOperandVal(2)], Subtarget);
1819 SDValue Chain = N->getOperand(0);
1820 SDNode *FenceNode = CurDAG->getMachineNode(FenceOp, DL, MVT::Other, Chain);
1821 ReplaceNode(N, FenceNode);
1822 return true;
1823}
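// Editorial example (assumes the scope table populated below): an IR fence
// such as
//   fence syncscope("block") acquire
// carries its ordering in operand 1 and its syncscope ID in operand 2, so the
// lookups above yield NVPTX::Ordering::Acquire and NVPTX::Scope::Block, and
// getFenceOp maps that pair to the matching PTX fence instruction for the
// subtarget.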
1824
1825 void NVPTXScopes::insert(LLVMContext &C) {
1826 Scopes[C.getOrInsertSyncScopeID("singlethread")] = NVPTX::Scope::Thread;
1827 Scopes[C.getOrInsertSyncScopeID("")] = NVPTX::Scope::System;
1828 Scopes[C.getOrInsertSyncScopeID("block")] = NVPTX::Scope::Block;
1829 Scopes[C.getOrInsertSyncScopeID("cluster")] = NVPTX::Scope::Cluster;
1830 Scopes[C.getOrInsertSyncScopeID("device")] = NVPTX::Scope::Device;
1831}
1832
1833 NVPTX::Scope NVPTXScopes::operator[](SyncScope::ID ID) const {
1834 if (Scopes.empty())
1835 llvm_unreachable("NVPTX Scopes must be initialized before calling "
1836 "NVPTXScopes::operator[]");
1837
1838 auto S = Scopes.find(ID);
1839 if (S == Scopes.end()) {
1840 // TODO:
1841 // - Add API to LLVMContext to get the name of a single scope.
1842 // - Use that API here to print an error containing the name
1843 // of this Unknown ID.
1844 report_fatal_error(formatv("Could not find scope ID={}.", int(ID)));
1845 }
1846 return S->second;
1847}
1848
1849bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
1850
1851#define CP_ASYNC_BULK_TENSOR_OPCODE(dir, dim, mode, is_s32, suffix) \
1852 (is_s32 \
1853 ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix \
1854 : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)
1855
1856#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(dim, mode, is_ch, is_s32) \
1857 (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, _CH)) \
1858 : (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, )))
1859
1860#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode, is_mc, is_ch, is_s32) \
1861 [&]() -> auto { \
1862 if (is_mc && is_ch) \
1863 return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC_CH); \
1864 if (is_ch) \
1865 return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _CH); \
1866 if (is_mc) \
1867 return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC); \
1868 return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, ); \
1869 }()
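// Editorial example of the token pasting above (illustrative only):
//   GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, TILE, /*is_mc=*/true,
//                                       /*is_ch=*/false, /*is_s32=*/false)
// expands to NVPTX::CP_ASYNC_BULK_TENSOR_G2S_3D_TILE_MC, while passing
// is_s32=true would select the SHARED32 variant of the same opcode.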
1870
1871 static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim,
1872 bool IsShared32,
1873 bool IsCacheHint,
1874 bool IsIm2Col) {
1875 if (IsIm2Col) {
1876 switch (Dim) {
1877 case 3:
1878 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(3D, IM2COL, IsCacheHint,
1879 IsShared32);
1880 case 4:
1881 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(4D, IM2COL, IsCacheHint,
1882 IsShared32);
1883 case 5:
1884 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(5D, IM2COL, IsCacheHint,
1885 IsShared32);
1886 default:
1887 llvm_unreachable("Invalid Dimension in im2col mode for "
1888 "GetCpAsyncBulkTensorS2GReductionOpcode.");
1889 }
1890 } else {
1891 switch (Dim) {
1892 case 1:
1893 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(1D, TILE, IsCacheHint,
1894 IsShared32);
1895 case 2:
1896 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(2D, TILE, IsCacheHint,
1897 IsShared32);
1898 case 3:
1899 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(3D, TILE, IsCacheHint,
1900 IsShared32);
1901 case 4:
1902 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(4D, TILE, IsCacheHint,
1903 IsShared32);
1904 case 5:
1905 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(5D, TILE, IsCacheHint,
1906 IsShared32);
1907 default:
1908 llvm_unreachable("Invalid Dimension in tile mode for "
1909 "GetCpAsyncBulkTensorS2GReductionOpcode.");
1910 }
1911 }
1912}
1913
1914static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32,
1915 bool IsMultiCast,
1916 bool IsCacheHint, bool IsIm2Col) {
1917 if (IsIm2Col) {
1918 switch (Dim) {
1919 case 3:
1920 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, IM2COL, IsMultiCast,
1921 IsCacheHint, IsShared32);
1922 case 4:
1923 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, IM2COL, IsMultiCast,
1924 IsCacheHint, IsShared32);
1925 case 5:
1926 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, IM2COL, IsMultiCast,
1927 IsCacheHint, IsShared32);
1928 default:
1929 llvm_unreachable("Invalid Dimension in im2col mode for "
1930 "GetCpAsyncBulkTensorG2SOpcode.");
1931 }
1932 } else {
1933 switch (Dim) {
1934 case 1:
1935 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(1D, TILE, IsMultiCast,
1936 IsCacheHint, IsShared32);
1937 case 2:
1938 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(2D, TILE, IsMultiCast,
1939 IsCacheHint, IsShared32);
1940 case 3:
1941 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, TILE, IsMultiCast,
1942 IsCacheHint, IsShared32);
1943 case 4:
1944 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, TILE, IsMultiCast,
1945 IsCacheHint, IsShared32);
1946 case 5:
1947 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, TILE, IsMultiCast,
1948 IsCacheHint, IsShared32);
1949 default:
1950 llvm_unreachable(
1951 "Invalid Dimension in tile mode for GetCpAsyncBulkTensorG2SOpcode.");
1952 }
1953 }
1954}
1955
1956static size_t GetDimsFromIntrinsic(unsigned IID) {
1957 switch (IID) {
1958 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
1959 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
1960 return 3;
1961 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
1962 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
1963 return 4;
1964 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
1965 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
1966 return 5;
1967 default:
1968 llvm_unreachable("Invalid im2col intrinsic in GetDimsFromIntrinsic.");
1969 }
1970}
1971
1972void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
1973 bool IsIm2Col) {
1974 // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
1975 // {dst, mbar, src, dims{d0...dN}, im2col_offsets{dims-2},
1976 // multicast, cache_hint,
1977 // multicast_flag, cache_hint_flag, cta_group_flag}
1978 // NumOperands = {Chain, IID} + {Actual intrinsic args}
1979 // = {2} + {8 + dims + im2col_offsets}
1980 size_t NumOps = N->getNumOperands();
1981 size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1))
1982 : (NumOps - 10);
1983 // The offset count is always 'NumDims - 2', and offsets are present only in im2col mode
1984 size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
1985 bool IsCacheHint = N->getConstantOperandVal(NumOps - 2) == 1;
1986 bool IsMultiCast = N->getConstantOperandVal(NumOps - 3) == 1;
1987 size_t NumBaseArgs = NumDims + NumOffsets + 3; // for {dst, mbar, src}
1988 size_t MultiCastIdx = NumBaseArgs + 2; // for Chain and IID
1989
1990 unsigned CTAGroupVal = N->getConstantOperandVal(NumOps - 1);
1991 if ((CTAGroupVal > 0) && !Subtarget->hasCpAsyncBulkTensorCTAGroupSupport())
1992 report_fatal_error(
1993 formatv("CpAsyncBulkTensorG2S cta_group::1/2 is not supported on sm_{}",
1994 Subtarget->getSmVersion()));
1995
1996 SDLoc DL(N);
1997 SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumBaseArgs));
1998
1999 // Push MultiCast operand, if available
2000 if (IsMultiCast)
2001 Ops.push_back(N->getOperand(MultiCastIdx));
2002
2003 // Push CacheHint operand, if available
2004 if (IsCacheHint)
2005 Ops.push_back(N->getOperand(MultiCastIdx + 1));
2006
2007 // Flag for CTA Group
2008 Ops.push_back(getI32Imm(CTAGroupVal, DL));
2009
2010 // Finally, the chain operand
2011 Ops.push_back(N->getOperand(0));
2012
2013 bool IsShared32 =
2014 CurDAG->getDataLayout().getPointerSizeInBits(ADDR_SPACE_SHARED) == 32;
2015 unsigned Opcode = GetCpAsyncBulkTensorG2SOpcode(
2016 NumDims, IsShared32, IsMultiCast, IsCacheHint, IsIm2Col);
2017 ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
2018}
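// Editorial worked example of the operand accounting above: for the 3D
// tile-mode intrinsic, NumOps = 2 (Chain, IID) + 3 (dst, mbar, src) + 3 dims
// + 2 (multicast, cache_hint) + 3 flags = 13, so NumDims = NumOps - 10 = 3,
// NumOffsets = 0, MultiCastIdx = 8, and the multicast/cache-hint flags are
// read from operands NumOps - 3 and NumOps - 2, with the cta_group flag last.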
2019
2020void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
2021 unsigned RedOp,
2022 bool IsIm2Col) {
2023 // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
2024 // src, dst, dims{d0...dN}, cache_hint, cache_hint_flag
2025 // NumOperands = {Chain, IID} + {Actual intrinsic args}
2026 // = {2} + {4 + dims}
2027 size_t NumOps = N->getNumOperands();
2028 size_t NumDims = NumOps - 6;
2029 bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
2030 size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2); // src, dst, cache_hint
2031
2032 SDLoc DL(N);
2033 SmallVector<SDValue, 12> Ops(N->ops().slice(2, NumArgs));
2034 Ops.push_back(getI32Imm(RedOp, DL)); // Reduction Op
2035 Ops.push_back(N->getOperand(0)); // Chain operand
2036
2037 bool IsShared32 =
2038 CurDAG->getDataLayout().getPointerSizeInBits(ADDR_SPACE_SHARED) == 32;
2039 unsigned Opcode = GetCpAsyncBulkTensorS2GReductionOpcode(
2040 NumDims, IsShared32, IsCacheHint, IsIm2Col);
2041 ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
2042}
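// Editorial worked example: for a 2D tile-mode reduction, NumOps =
// 2 (Chain, IID) + 2 (src, dst) + 2 dims + 2 (cache_hint, cache_hint_flag)
// = 8, so NumDims = NumOps - 6 = 2, and the cache_hint operand is only
// included in Ops when the trailing flag is set.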
2043
2044#define TCGEN05_ST_OPCODE(SHAPE, NUM) \
2045 (enableUnpack ? NVPTX::TCGEN05_ST_##SHAPE##_##NUM##_UNPACK \
2046 : NVPTX::TCGEN05_ST_##SHAPE##_##NUM)
2047
2048static unsigned getTcgen05StOpcode(unsigned IID, bool enableUnpack) {
2049 switch (IID) {
2050 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
2051 return TCGEN05_ST_OPCODE(16x64b, x1);
2052 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
2053 return TCGEN05_ST_OPCODE(16x64b, x2);
2054 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
2055 return TCGEN05_ST_OPCODE(16x64b, x4);
2056 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
2057 return TCGEN05_ST_OPCODE(16x64b, x8);
2058 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
2059 return TCGEN05_ST_OPCODE(16x64b, x16);
2060 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
2061 return TCGEN05_ST_OPCODE(16x64b, x32);
2062 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
2063 return TCGEN05_ST_OPCODE(16x64b, x64);
2064 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
2065 return TCGEN05_ST_OPCODE(16x64b, x128);
2066 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
2067 return TCGEN05_ST_OPCODE(16x128b, x1);
2068 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
2069 return TCGEN05_ST_OPCODE(16x128b, x2);
2070 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
2071 return TCGEN05_ST_OPCODE(16x128b, x4);
2072 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
2073 return TCGEN05_ST_OPCODE(16x128b, x8);
2074 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
2075 return TCGEN05_ST_OPCODE(16x128b, x16);
2076 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
2077 return TCGEN05_ST_OPCODE(16x128b, x32);
2078 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
2079 return TCGEN05_ST_OPCODE(16x128b, x64);
2080 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
2081 return TCGEN05_ST_OPCODE(16x256b, x1);
2082 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
2083 return TCGEN05_ST_OPCODE(16x256b, x2);
2084 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
2085 return TCGEN05_ST_OPCODE(16x256b, x4);
2086 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
2087 return TCGEN05_ST_OPCODE(16x256b, x8);
2088 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
2089 return TCGEN05_ST_OPCODE(16x256b, x16);
2090 case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
2091 return TCGEN05_ST_OPCODE(16x256b, x32);
2092 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
2093 return TCGEN05_ST_OPCODE(16x32bx2, x1);
2094 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
2095 return TCGEN05_ST_OPCODE(16x32bx2, x2);
2096 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
2097 return TCGEN05_ST_OPCODE(16x32bx2, x4);
2098 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
2099 return TCGEN05_ST_OPCODE(16x32bx2, x8);
2100 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
2101 return TCGEN05_ST_OPCODE(16x32bx2, x16);
2102 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
2103 return TCGEN05_ST_OPCODE(16x32bx2, x32);
2104 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
2105 return TCGEN05_ST_OPCODE(16x32bx2, x64);
2106 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128:
2107 return TCGEN05_ST_OPCODE(16x32bx2, x128);
2108 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
2109 return TCGEN05_ST_OPCODE(32x32b, x1);
2110 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
2111 return TCGEN05_ST_OPCODE(32x32b, x2);
2112 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
2113 return TCGEN05_ST_OPCODE(32x32b, x4);
2114 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
2115 return TCGEN05_ST_OPCODE(32x32b, x8);
2116 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
2117 return TCGEN05_ST_OPCODE(32x32b, x16);
2118 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
2119 return TCGEN05_ST_OPCODE(32x32b, x32);
2120 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
2121 return TCGEN05_ST_OPCODE(32x32b, x64);
2122 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
2123 return TCGEN05_ST_OPCODE(32x32b, x128);
2124 }
2125 llvm_unreachable("unhandled tcgen05.st lowering");
2126}
2127
2128void NVPTXDAGToDAGISel::SelectTcgen05St(SDNode *N, bool hasOffset) {
2129 SDLoc DL(N);
2130 unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2131
2132 SmallVector<SDValue, 128> Operands = {
2133 N->getOperand(2) // taddr
2134 };
2135
2136 if (hasOffset)
2137 Operands.push_back(CurDAG->getTargetConstant(
2138 cast<ConstantSDNode>(N->getOperand(3))->getZExtValue(), DL,
2139 MVT::i32)); // Offset
2140
2141 for (unsigned I = hasOffset ? 4 : 3; I < (N->getNumOperands() - 1); I++)
2142 Operands.push_back(N->getOperand(I));
2143
2144 bool enableUnpack =
2145 cast<ConstantSDNode>(N->getOperand(N->getNumOperands() - 1))
2146 ->getZExtValue();
2147
2148 Operands.push_back(N->getOperand(0)); // Chain
2149 ReplaceNode(N, CurDAG->getMachineNode(getTcgen05StOpcode(IID, enableUnpack),
2150 DL, N->getVTList(), Operands));
2151}
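// Editorial sketch of the operand layout consumed above: a tcgen05.st
// intrinsic arrives as
//   {Chain, IID, taddr, [offset,] data0 ... dataN, unpack_flag}
// so the loop copies the data operands, the trailing flag picks the _UNPACK
// opcode variant via getTcgen05StOpcode, and the chain is appended last.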
2152
2153bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
2154 unsigned IID = N->getConstantOperandVal(1);
2155 using TMARedTy = llvm::nvvm::TMAReductionOp;
2156 auto CastTy = [](TMARedTy Op) { return static_cast<unsigned>(Op); };
2157 switch (IID) {
2158 default:
2159 return false;
2160 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
2161 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
2162 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
2163 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d:
2164 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d:
2165 SelectCpAsyncBulkTensorG2SCommon(N);
2166 return true;
2167 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
2168 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
2169 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
2170 SelectCpAsyncBulkTensorG2SCommon(N, /*IsIm2Col=*/true);
2171 return true;
2172 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
2173 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
2174 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d:
2175 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_4d:
2176 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_5d:
2177 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD));
2178 return true;
2179 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_3d:
2180 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_4d:
2181 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_5d:
2182 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD),
2183 /*IsIm2Col=*/true);
2184 return true;
2185 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_1d:
2186 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_2d:
2187 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_3d:
2188 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_4d:
2189 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_5d:
2190 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN));
2191 return true;
2192 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_3d:
2193 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_4d:
2194 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_5d:
2195 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN),
2196 /*IsIm2Col=*/true);
2197 return true;
2198 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_1d:
2199 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_2d:
2200 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_3d:
2201 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_4d:
2202 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_5d:
2203 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX));
2204 return true;
2205 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_3d:
2206 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_4d:
2207 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_5d:
2208 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX),
2209 /*IsIm2Col=*/true);
2210 return true;
2211 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_1d:
2212 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_2d:
2213 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_3d:
2214 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_4d:
2215 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_5d:
2216 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC));
2217 return true;
2218 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_3d:
2219 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_4d:
2220 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_5d:
2221 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC),
2222 /*IsIm2Col=*/true);
2223 return true;
2224 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_1d:
2225 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_2d:
2226 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_3d:
2227 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_4d:
2228 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_5d:
2229 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC));
2230 return true;
2231 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_3d:
2232 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_4d:
2233 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_5d:
2234 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC),
2235 /*IsIm2Col=*/true);
2236 return true;
2237 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_1d:
2238 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_2d:
2239 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_3d:
2240 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_4d:
2241 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_5d:
2242 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND));
2243 return true;
2244 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_3d:
2245 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_4d:
2246 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_5d:
2247 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND),
2248 /*IsIm2Col=*/true);
2249 return true;
2250 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_1d:
2251 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_2d:
2252 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_3d:
2253 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_4d:
2254 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_5d:
2255 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR));
2256 return true;
2257 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_3d:
2258 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_4d:
2259 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_5d:
2260 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR),
2261 /*IsIm2Col=*/true);
2262 return true;
2263 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_1d:
2264 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_2d:
2265 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_3d:
2266 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_4d:
2267 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_5d:
2268 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR));
2269 return true;
2270 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_3d:
2271 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_4d:
2272 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_5d:
2273 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR),
2274 /*IsIm2Col=*/true);
2275 return true;
2276
2277 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
2278 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
2279 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
2280 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
2281 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
2282 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
2283 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
2284 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
2285 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
2286 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
2287 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
2288 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
2289 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
2290 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
2291 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
2292 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
2293 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
2294 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
2295 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
2296 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
2297 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
2298 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
2299 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
2300 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
2301 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
2302 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
2303 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
2304 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
2305 case Intrinsic::nvvm_tcgen05_st_16x256b_x32: {
2306 SelectTcgen05St(N);
2307 return true;
2308 }
2309
2310 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
2311 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
2312 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
2313 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
2314 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
2315 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
2316 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
2317 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128: {
2318 SelectTcgen05St(N, /* hasOffset */ true);
2319 return true;
2320 }
2321 }
2322}
2323
2324void NVPTXDAGToDAGISel::selectAtomicSwap128(SDNode *N) {
2325 MemSDNode *AN = cast<MemSDNode>(N);
2326 SDLoc dl(N);
2327
2328 const SDValue Chain = N->getOperand(0);
2329 const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG);
2330 SmallVector<SDValue, 8> Ops{Base, Offset};
2331 Ops.append(N->op_begin() + 2, N->op_end());
2332 Ops.append({
2333 getI32Imm(getMemOrder(AN), dl),
2334 getI32Imm(getAtomicScope(AN), dl),
2335 getI32Imm(getAddrSpace(AN), dl),
2336 Chain,
2337 });
2338
2339 assert(N->getOpcode() == NVPTXISD::ATOMIC_CMP_SWAP_B128 ||
2340 N->getOpcode() == NVPTXISD::ATOMIC_SWAP_B128);
2341 unsigned Opcode = N->getOpcode() == NVPTXISD::ATOMIC_SWAP_B128
2342 ? NVPTX::ATOM_EXCH_B128
2343 : NVPTX::ATOM_CAS_B128;
2344
2345 auto *ATOM = CurDAG->getMachineNode(Opcode, dl, N->getVTList(), Ops);
2346 CurDAG->setNodeMemRefs(ATOM, AN->getMemOperand());
2347
2348 ReplaceNode(N, ATOM);
2349}
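// Editorial note (illustrative): a 128-bit atomicrmw xchg reaches this point
// as NVPTXISD::ATOMIC_SWAP_B128 and is selected to ATOM_EXCH_B128, with the
// address split into {Base, Offset} and the memory ordering, scope, and
// address space appended as immediates; the cmpxchg form takes the
// ATOM_CAS_B128 path instead.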