#include "llvm/IR/IntrinsicsNVPTX.h"

#define DEBUG_TYPE "nvptx-isel"
#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"

static cl::opt<bool>
    EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden,
                   cl::desc("Enable reciprocal sqrt optimization"));
NVPTX::DivPrecisionLevel
NVPTXDAGToDAGISel::getDivF32Level(const SDNode *N) const {

bool NVPTXDAGToDAGISel::usePrecSqrtF32(const SDNode *N) const {

bool NVPTXDAGToDAGISel::useF32FTZ() const {

bool NVPTXDAGToDAGISel::allowFMA() const {

bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; }
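// Note: the bodies of the accessors above are elided in this excerpt. Going by
// the NVPTXTargetLowering interface used elsewhere in this file (getDivF32Level,
// usePrecSqrtF32, useF32FTZ, allowFMA), they are presumably thin wrappers that
// forward each query to the target lowering for the current MachineFunction,
// while doRsqrtOpt() simply reports the nvptx-rsqrt-approx-opt flag.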
void NVPTXDAGToDAGISel::Select(SDNode *N) {

  if (N->isMachineOpcode()) {

  switch (N->getOpcode()) {

    if (tryEXTRACT_VECTOR_ELEMENT(N))

    SelectSETP_BF16X2(N);

    if (tryLoadVector(N))

    if (tryStoreVector(N))

    if (tryIntrinsicChain(N))

    if (tryIntrinsicVoid(N))

    SelectAddrSpaceCast(N);

    if (N->getOperand(1).getValueType() == MVT::i128) {
      SelectV2I64toI128(N);

    if (N->getOperand(1).getValueType() == MVT::i128) {
      SelectI128toV2I64(N);

    selectAtomicSwap128(N);

    if (tryBF16ArithToFMA(N))
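// Select() is the top-level dispatcher of this pass: each case first tries a
// hand-written selector (tryLoadVector, tryStoreVector, tryIntrinsicChain,
// tryIntrinsicVoid, tryBF16ArithToFMA, ...). Following the usual
// SelectionDAGISel pattern, when a try* helper declines (returns false) the
// node falls through to the TableGen-generated matcher.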
#define TCGEN05_LD_OPCODE(SHAPE, NUM)                                          \
  (enablePack ? NVPTX::TCGEN05_LD_##SHAPE##_##NUM##_PACK                       \
              : NVPTX::TCGEN05_LD_##SHAPE##_##NUM)
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
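// Each elided case body above presumably returns TCGEN05_LD_OPCODE(<shape>,
// <num>) for the matching shape/count, so the enablePack flag selects the
// *_PACK form of the tcgen05.ld machine opcode.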
void NVPTXDAGToDAGISel::SelectTcgen05Ld(SDNode *N, bool hasOffset) {
  unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();

  bool enablePack = cast<ConstantSDNode>(N->getOperand(4))->getZExtValue();
  SDValue OffsetNode = CurDAG->getTargetConstant(
      cast<ConstantSDNode>(N->getOperand(3))->getZExtValue(), DL, MVT::i32);
                 {N->getOperand(2), OffsetNode, N->getOperand(0)}));

  bool enablePack = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
                 {N->getOperand(2), N->getOperand(0)}));
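// The two enablePack blocks above correspond to the hasOffset and non-offset
// forms: with an offset (16x32bx2 shapes) the pack flag is operand 4 and the
// immediate offset is operand 3; otherwise the pack flag is operand 3. The
// elided call presumably builds the machine node with
// getTcgen05LdOpcode(IID, enablePack) and the operand lists shown.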
bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
  unsigned IID = N->getConstantOperandVal(1);

  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_p:

  case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x128: {

  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128: {
    SelectTcgen05Ld(N, true);
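// In this switch the nvvm_ldu_global_* intrinsics are presumably routed to
// tryLDU (below), the plain tcgen05.ld shapes call SelectTcgen05Ld(N), and the
// 16x32bx2 shapes pass hasOffset=true as shown.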
    return CmpMode::NotANumber;

bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
  SDValue PTXCmpMode = getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)));
      NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1,
      {N->getOperand(0), N->getOperand(1), PTXCmpMode,

bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
  SDValue PTXCmpMode = getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)));
      NVPTX::SETP_bf16x2rr, DL, MVT::i1, MVT::i1,
      {N->getOperand(0), N->getOperand(1), PTXCmpMode,
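// Both helpers lower a packed f16x2/bf16x2 setcc to a single
// SETP_{f16x2,bf16x2}rr machine node producing two i1 results, with the
// condition code translated by getPTXCmpMode.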
bool NVPTXDAGToDAGISel::tryUNPACK_VECTOR(SDNode *N) {
  MVT EltVT = N->getSimpleValueType(0);

bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {

    Opcode = NVPTX::I32toV2I16;
    Opcode = NVPTX::I64toV2I32;

  for (auto *U : Vector.getNode()->users()) {
    if (U->getOperand(0) != Vector)
    if (const auto *IdxConst = dyn_cast<ConstantSDNode>(U->getOperand(1))) {
      if (IdxConst->getZExtValue() == 0)
      else if (IdxConst->getZExtValue() == 1)

  for (auto *Node : E0)
  for (auto *Node : E1)
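// tryEXTRACT_VECTOR_ELEMENT scans the users of a packed two-element vector,
// bucketing extract_vector_elt uses by constant index (0 -> E0, 1 -> E1), and
// rewires them to the two results of a single I32toV2I16 / I64toV2I32 unpack
// node.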
static std::optional<NVPTX::AddressSpace> convertAS(unsigned AS) {

  return convertAS(N->getMemOperand()->getAddrSpace())

  auto Ordering = N->getMergedOrdering();

  return Scopes[N->getSyncScopeID()];
struct OperationOrderings {

static OperationOrderings

      !HasMemoryOrdering) {
        formatv("PTX does not support \"atomic\" for orderings different than "
                "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order "

  bool AddrGenericOrGlobalOrShared =

  if (!AddrGenericOrGlobalOrShared)

  bool UseRelaxedMMIO =

          formatv("PTX only supports Acquire Ordering on reads: {}",
                  N->getOperationName()));

          formatv("PTX only supports Release Ordering on writes: {}",
                  N->getOperationName()));

          formatv("NVPTX does not support AcquireRelease Ordering on "
                  "read-modify-writes "
                  "yet and PTX does not support it on loads or stores: {}",
                  N->getOperationName()));

    else if (N->writeMem())

          formatv("NVPTX does not support SequentiallyConsistent Ordering on "
                  "read-modify-writes yet: {}",
                  N->getOperationName()));
    return OperationOrderings(InstrOrder,

        formatv("NVPTX backend does not support AtomicOrdering \"{}\" yet.",

  auto S = Scopes[N->getSyncScopeID()];
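// getOperationOrderings validates the atomic ordering of a memory operation
// against what PTX can express (no "atomic" below Monotonic on sm_60 and
// older, acquire only on reads, release only on writes, no acq_rel or seq_cst
// read-modify-writes yet) and returns the instruction ordering plus any fence
// ordering to emit alongside it.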
    T->failIfClustersUnsupported(".cluster scope fence");

  if (!T->hasSplitAcquireAndReleaseFences() &&

      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_sys
                                    : NVPTX::INT_MEMBAR_SYS;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_cta
                                    : NVPTX::INT_MEMBAR_CTA;
      return NVPTX::atomic_thread_fence_acquire_cluster;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_gpu
                                    : NVPTX::INT_MEMBAR_GL;
          formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",

      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_sys
                                    : NVPTX::INT_MEMBAR_SYS;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_cta
                                    : NVPTX::INT_MEMBAR_CTA;
      return NVPTX::atomic_thread_fence_release_cluster;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_gpu
                                    : NVPTX::INT_MEMBAR_GL;
          formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",

      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_sys
                                    : NVPTX::INT_MEMBAR_SYS;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_cta
                                    : NVPTX::INT_MEMBAR_CTA;
      return NVPTX::atomic_thread_fence_acq_rel_cluster;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_gpu
                                    : NVPTX::INT_MEMBAR_GL;
          formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",

      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_sys
                                    : NVPTX::INT_MEMBAR_SYS;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_cta
                                    : NVPTX::INT_MEMBAR_CTA;
      return NVPTX::atomic_thread_fence_seq_cst_cluster;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_gpu
                                    : NVPTX::INT_MEMBAR_GL;

        formatv("Unsupported \"{}\" ordering and \"{}\" scope for fence.",
                OrderingToString(O), ScopeToString(S)));
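// getFenceOp maps an (Ordering, Scope) pair to a scoped fence opcode, falling
// back to the legacy membar instructions (INT_MEMBAR_*) when the subtarget
// lacks PTX memory-ordering support, and rejecting scopes the target cannot
// encode.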
std::pair<NVPTX::Ordering, NVPTX::Scope>
NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, SDValue &Chain,

        formatv("Unexpected fence ordering: \"{}\".",
void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
  assert(SrcAddrSpace != DstAddrSpace &&
         "addrspacecast must be between different address spaces");

    switch (SrcAddrSpace) {
      Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global;
      Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared;
             "Shared cluster address space is only supported in 64-bit mode");
      Opc = NVPTX::cvta_shared_cluster_64;
      Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const;
      Opc = TM.is64Bit() ? NVPTX::cvta_local_64 : NVPTX::cvta_local;
      Opc = TM.is64Bit() ? NVPTX::cvta_param_64 : NVPTX::cvta_param;

    if (SrcAddrSpace != 0)

    switch (DstAddrSpace) {
      Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global;
      Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared;
             "Shared cluster address space is only supported in 64-bit mode");
      Opc = NVPTX::cvta_to_shared_cluster_64;
      Opc = TM.is64Bit() ? NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const;
      Opc = TM.is64Bit() ? NVPTX::cvta_to_local_64 : NVPTX::cvta_to_local;
      Opc = TM.is64Bit() ? NVPTX::cvta_to_param_64 : NVPTX::cvta_to_param;
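// SelectAddrSpaceCast: casts from a specific address space into generic use
// the cvta_* opcodes, generic-to-specific casts use cvta_to_*, and the
// shared::cluster space is only available as a 64-bit opcode.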
static std::optional<unsigned>
pickOpcodeForVT(MVT::SimpleValueType VT, std::optional<unsigned> Opcode_i16,
                std::optional<unsigned> Opcode_i32,
                std::optional<unsigned> Opcode_i64) {

    return std::nullopt;

  return V.getOpcode() == ISD::ADD ||
         (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint());

    N = N.getOperand(0);

  if (const auto *GA = dyn_cast<GlobalAddressSDNode>(N))
                                        GA->getValueType(0), GA->getOffset(),
                                        GA->getTargetFlags());
  if (const auto *ES = dyn_cast<ExternalSymbolSDNode>(N))
                                          ES->getTargetFlags());
  if (const auto *FIN = dyn_cast<FrameIndexSDNode>(N))

  APInt AccumulatedOffset(64u, 0);
    const auto *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
    const APInt CI = CN->getAPIntValue().sext(64);
    if (!(CI + AccumulatedOffset).isSignedIntN(32))
    AccumulatedOffset += CI;
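// Address-selection helpers: isAddLike also accepts OR with the disjoint flag
// as an addition; selectBaseADDR wraps globals, external symbols and frame
// indexes as target nodes; accumulateOffset folds a chain of add-like nodes
// with constant right-hand sides into a single offset, bailing out once the
// accumulated value no longer fits in a signed 32-bit immediate.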
bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
  assert(LD->readMem() && "Expected load");
  const LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(LD);
  if (PlainLoad && PlainLoad->isIndexed())

  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);

  const unsigned FromTypeWidth = LD->getMemoryVT().getSizeInBits();
         FromTypeWidth <= 128 && "Invalid width for load");

  SDValue Ops[] = {getI32Imm(Ordering, DL),
                   getI32Imm(Scope, DL),
                   getI32Imm(CodeAddrSpace, DL),
                   getI32Imm(FromType, DL),
                   getI32Imm(FromTypeWidth, DL),

  const std::optional<unsigned> Opcode =
      pickOpcodeForVT(TargetVT, NVPTX::LD_i16, NVPTX::LD_i32, NVPTX::LD_i64);

  switch (N->getOpcode()) {
bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
  const MVT EltVT = LD->getSimpleValueType(0);

  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);

  const unsigned ExtensionType =
      N->getConstantOperandVal(N->getNumOperands() - 1);

  SDValue Ops[] = {getI32Imm(Ordering, DL),
                   getI32Imm(Scope, DL),
                   getI32Imm(CodeAddrSpace, DL),
                   getI32Imm(FromType, DL),
                   getI32Imm(FromTypeWidth, DL),

  std::optional<unsigned> Opcode;
  switch (N->getOpcode()) {
                              NVPTX::LDV_i32_v2, NVPTX::LDV_i64_v2);
                              NVPTX::LDV_i32_v4, NVPTX::LDV_i64_v4);
                              NVPTX::LDV_i32_v8, {});
bool NVPTXDAGToDAGISel::tryLDG(MemSDNode *LD) {
  unsigned ExtensionType;
  if (const auto *Load = dyn_cast<LoadSDNode>(LD)) {
    ExtensionType = Load->getExtensionType();
    ExtensionType = LD->getConstantOperandVal(LD->getNumOperands() - 1);

  assert(!(LD->getSimpleValueType(0).isVector() &&

  SDValue Ops[] = {getI32Imm(FromType, DL), getI32Imm(FromTypeWidth, DL), Base,

  std::optional<unsigned> Opcode;
  switch (LD->getOpcode()) {
                              NVPTX::LD_GLOBAL_NC_i32, NVPTX::LD_GLOBAL_NC_i64);
                              NVPTX::LD_GLOBAL_NC_v2i32, NVPTX::LD_GLOBAL_NC_v2i64);
                              NVPTX::LD_GLOBAL_NC_v4i32, NVPTX::LD_GLOBAL_NC_v4i64);
                              NVPTX::LD_GLOBAL_NC_v8i32, {});

  auto ElementBitWidth = TotalWidth / NumElts;
         ElementBitWidth <= 128 && TotalWidth <= 256 && "Invalid width for load");
  return ElementBitWidth;
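// Load selection: tryLoad/tryLoadVector first insert any fence required by the
// access ordering (insertMemoryInstructionFence), then encode ordering, scope,
// address space, type and width as immediate operands and pick the LD/LDV
// opcode by element type via pickOpcodeForVT. tryLDG selects the non-coherent
// global-load forms (LD_GLOBAL_NC_*), and the trailing helper computes the
// per-element width for such loads.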
bool NVPTXDAGToDAGISel::tryLDU(SDNode *N) {
  auto *LD = cast<MemSDNode>(N);

  std::optional<unsigned> Opcode;
  switch (N->getOpcode()) {
                              NVPTX::LDU_GLOBAL_i32, NVPTX::LDU_GLOBAL_i64);
                              NVPTX::LDU_GLOBAL_v2i32, NVPTX::LDU_GLOBAL_v2i64);
                              NVPTX::LDU_GLOBAL_v4i32, {});
bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
  assert(ST->writeMem() && "Expected store");
  StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(ST);
  AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(ST);
  assert((PlainStore || AtomicStore) && "Expected store");

  if (PlainStore && PlainStore->isIndexed())

  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);

  const unsigned ToTypeWidth = ST->getMemoryVT().getSizeInBits();
         "Invalid width for store");

                   getI32Imm(Ordering, DL),
                   getI32Imm(Scope, DL),
                   getI32Imm(CodeAddrSpace, DL),
                   getI32Imm(ToTypeWidth, DL),

  const std::optional<unsigned> Opcode =
                      NVPTX::ST_i32, NVPTX::ST_i64);
bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
  const unsigned TotalWidth = ST->getMemoryVT().getSizeInBits();

  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);

  for (auto &V : ST->ops().slice(1, NumElts))

  const unsigned ToTypeWidth = TotalWidth / NumElts;
         TotalWidth <= 256 && "Invalid width for store");

  Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
              getI32Imm(CodeAddrSpace, DL), getI32Imm(ToTypeWidth, DL), Base,

      ST->getOperand(1).getSimpleValueType().SimpleTy;
  std::optional<unsigned> Opcode;
  switch (ST->getOpcode()) {
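// Store selection mirrors the load path: insert the required fence, emit
// ordering/scope/address-space/width immediates, and select the ST/STV opcode
// from the stored value type; vector stores split the total memory width
// evenly across NumElts value operands.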
bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
  bool IsSigned = false;

  if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {

    Val = LHS.getNode()->getOperand(0);
    Start = LHS.getNode()->getOperand(1);

    int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
    if (NumBits > GoodBits) {

    if (isa<ConstantSDNode>(AndLHS)) {

    NumBits = NumZeros + NumOnes - ShiftAmt;

    if (ShiftAmt < NumZeros) {

    Val = LHS->getOperand(0);

    if (OuterShiftAmt < InnerShiftAmt) {

    Opc = NVPTX::BFE_S32rii;
    Opc = NVPTX::BFE_U32rii;
    Opc = NVPTX::BFE_S64rii;
    Opc = NVPTX::BFE_U64rii;
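// tryBFE pattern-matches shift/and combinations that extract a contiguous
// bit-field (tracking the start bit and the number of bits) and selects the
// PTX bfe instructions BFE_{S,U}{32,64}rii when the field is well-formed.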
bool NVPTXDAGToDAGISel::tryBF16ArithToFMA(SDNode *N) {

    auto API = APF.bitcastToAPInt();
    API = API.concat(API);

  switch (N->getOpcode()) {
    Operands = {N0, GetConstant(1.0), N1};
    Operands = {N1, GetConstant(-1.0), N0};
    Operands = {N0, N1, GetConstant(-0.0)};

  int Opcode = IsVec ? NVPTX::FMA_BF16x2rrr : NVPTX::FMA_BF16rrr;

    V = V.getOperand(0);
  if (auto *CN = dyn_cast<ConstantSDNode>(V))
  if (auto *CN = dyn_cast<ConstantFPSDNode>(V))

bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
    const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
    std::vector<SDValue> &OutOps) {
  switch (ConstraintID) {
    OutOps.push_back(Base);
    OutOps.push_back(Offset);
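// tryBF16ArithToFMA rewrites bf16 arithmetic as FMA, apparently for subtargets
// without native bf16 add/sub/mul: fadd a,b -> fma(a, 1.0, b), fsub a,b ->
// fma(b, -1.0, a), fmul a,b -> fma(a, b, -0.0); the APInt concat splats the
// scalar constant into both lanes for the bf16x2 form.
// SelectInlineAsmMemoryOperand returns the selected Base/Offset pair for an
// inline-asm memory constraint.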
void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) {

  NewOps[0] = N->getOperand(0);
  if (N->getNumOperands() == 5)
    NewOps[3] = N->getOperand(4);

void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
      NVPTX::I128toV2I64, DL,
bool NVPTXDAGToDAGISel::tryFence(SDNode *N) {
  unsigned int FenceOp =
      getFenceOp(NVPTX::Ordering(N->getConstantOperandVal(1)),
                 Scopes[N->getConstantOperandVal(2)], Subtarget);

             "NVPTXScopes::operator[]");
  auto S = Scopes.find(ID);
  if (S == Scopes.end()) {
#define CP_ASYNC_BULK_TENSOR_OPCODE(dir, dim, mode, is_s32, suffix)            \
  (is_s32                                                                      \
       ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix   \
       : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)

#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(dim, mode, is_ch, is_s32)      \
  (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, _CH))          \
         : (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, )))
#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode, is_mc, is_ch, is_s32)   \
  [&]() -> auto {                                                              \
    if (is_mc && is_ch)                                                        \
      return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC_CH);      \
    if (is_ch)                                                                 \
      return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _CH);         \
    if (is_mc)                                                                 \
      return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC);         \
    return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, );              \
  }()
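// Opcode-builder macros for the TMA (cp.async.bulk.tensor) instructions: the
// is_s32 flag picks the SHARED32 form of the shared-memory operand, the _CH
// suffix adds a cache hint, and _MC/_MC_CH add the multicast variants for the
// G2S (global-to-shared) direction.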
1888 "GetCpAsyncBulkTensorS2GReductionOpcode.");
1909 "GetCpAsyncBulkTensorS2GReductionOpcode.");
1916 bool IsCacheHint,
bool IsIm2Col) {
1921 IsCacheHint, IsShared32);
1924 IsCacheHint, IsShared32);
1927 IsCacheHint, IsShared32);
1930 "GetCpAsyncBulkTensorG2SOpcode.");
1936 IsCacheHint, IsShared32);
1939 IsCacheHint, IsShared32);
1942 IsCacheHint, IsShared32);
1945 IsCacheHint, IsShared32);
1948 IsCacheHint, IsShared32);
1951 "Invalid Dimension in tile mode for GetCpAsyncBulkTensorG2SOpcode.");
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
                                                         bool IsIm2Col) {

  size_t NumOps = N->getNumOperands();
  size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
  bool IsCacheHint = N->getConstantOperandVal(NumOps - 2) == 1;
  bool IsMultiCast = N->getConstantOperandVal(NumOps - 3) == 1;
  size_t NumBaseArgs = NumDims + NumOffsets + 3;
  size_t MultiCastIdx = NumBaseArgs + 2;

  unsigned CTAGroupVal = N->getConstantOperandVal(NumOps - 1);
        formatv("CpAsyncBulkTensorG2S cta_group::1/2 is not supported on sm_{}",

    Ops.push_back(N->getOperand(MultiCastIdx + 1));

      NumDims, IsShared32, IsMultiCast, IsCacheHint, IsIm2Col);
void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,

  size_t NumOps = N->getNumOperands();
  size_t NumDims = NumOps - 6;
  bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
  size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2);

      NumDims, IsShared32, IsCacheHint, IsIm2Col);
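// The two Common selectors decode the intrinsic's trailing flag operands
// (cache hint, multicast, and cta_group for G2S; cache hint for the
// reductions), derive the number of tensor dimensions from the operand count,
// and pick the final opcode through the GET_CP_ASYNC_BULK_TENSOR_OPCODE_*
// helpers above.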
#define TCGEN05_ST_OPCODE(SHAPE, NUM)                                          \
  (enableUnpack ? NVPTX::TCGEN05_ST_##SHAPE##_##NUM##_UNPACK                   \
                : NVPTX::TCGEN05_ST_##SHAPE##_##NUM)
  case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
void NVPTXDAGToDAGISel::SelectTcgen05St(SDNode *N, bool hasOffset) {
  unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();

        cast<ConstantSDNode>(N->getOperand(3))->getZExtValue(), DL,

  for (unsigned I = hasOffset ? 4 : 3; I < (N->getNumOperands() - 1); I++)

      cast<ConstantSDNode>(N->getOperand(N->getNumOperands() - 1))
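// SelectTcgen05St mirrors SelectTcgen05Ld: in the hasOffset (16x32bx2) form
// operand 3 carries the immediate offset, the value operands follow, and the
// last operand appears to be the unpack flag that feeds getTcgen05StOpcode.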
bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
  unsigned IID = N->getConstantOperandVal(1);
  auto CastTy = [](TMARedTy Op) { return static_cast<unsigned>(Op); };
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d:
    SelectCpAsyncBulkTensorG2SCommon(N);
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
    SelectCpAsyncBulkTensorG2SCommon(N, true);
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD), true);
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN), true);
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX), true);
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC), true);
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC), true);
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND), true);
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR), true);
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR), true);
  case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x32: {

  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128: {
    SelectTcgen05St(N, true);
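// tryIntrinsicVoid routes the TMA tile/im2col copy intrinsics to
// SelectCpAsyncBulkTensorG2SCommon, the reduction intrinsics to
// SelectCpAsyncBulkTensorReduceCommon with the reduction kind from TMARedTy,
// and the tcgen05.st intrinsics to SelectTcgen05St (hasOffset=true for the
// 16x32bx2 shapes); each elided case body presumably ends with a return.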
void NVPTXDAGToDAGISel::selectAtomicSwap128(SDNode *N) {
  const SDValue Chain = N->getOperand(0);

  Ops.append(N->op_begin() + 2, N->op_end());
              getI32Imm(getMemOrder(AN), dl),
              getI32Imm(getAtomicScope(AN), dl),

                         ? NVPTX::ATOM_EXCH_B128
                         : NVPTX::ATOM_CAS_B128;
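// selectAtomicSwap128 lowers the 128-bit atomic exchange / cmpxchg nodes to
// ATOM_EXCH_B128 or ATOM_CAS_B128, passing the memory ordering and scope as
// immediate operands alongside the original value operands.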