LLVM 22.0.0git
NVPTXSubtarget.h
Go to the documentation of this file.
1//=====-- NVPTXSubtarget.h - Define Subtarget for the NVPTX ---*- C++ -*--====//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file declares the NVPTX specific subclass of TargetSubtarget.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H
14#define LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H
15
#include "NVPTX.h"
#include "NVPTXFrameLowering.h"
#include "NVPTXISelLowering.h"
#include "NVPTXInstrInfo.h"
#include "NVPTXRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Support/NVPTXAddrSpace.h"
#include <string>
25
26#define GET_SUBTARGETINFO_HEADER
27#include "NVPTXGenSubtargetInfo.inc"
28
29namespace llvm {
30
32 virtual void anchor();
33 std::string TargetName;
34
35 // PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31
36 unsigned PTXVersion;
37
38 // Full SM version x.y is represented as 100*x+10*y+feature, e.g. 3.1 == 310
39 // sm_90a == 901
40 unsigned int FullSmVersion;
41
42 // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31. Derived from
43 // FullSmVersion.
44 unsigned int SmVersion;
45
46 NVPTXInstrInfo InstrInfo;
48 std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;
49
50 // NVPTX does not have any call stack frame, but need a NVPTX specific
51 // FrameLowering class because TargetFrameLowering is abstract.
52 NVPTXFrameLowering FrameLowering;
53
54public:
55 /// This constructor initializes the data members to match that
56 /// of the specified module.
57 ///
58 NVPTXSubtarget(const Triple &TT, const std::string &CPU,
59 const std::string &FS, const NVPTXTargetMachine &TM);
60
61 ~NVPTXSubtarget() override;
62
63 const TargetFrameLowering *getFrameLowering() const override {
64 return &FrameLowering;
65 }
66 const NVPTXInstrInfo *getInstrInfo() const override { return &InstrInfo; }
67 const NVPTXRegisterInfo *getRegisterInfo() const override {
68 return &InstrInfo.getRegisterInfo();
69 }
70 const NVPTXTargetLowering *getTargetLowering() const override {
71 return &TLInfo;
72 }
73
74 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;
75
76 bool has256BitVectorLoadStore(unsigned AS) const {
77 return SmVersion >= 100 && PTXVersion >= 88 &&
79 }
80 bool hasAtomAddF64() const { return SmVersion >= 60; }
81 bool hasAtomScope() const { return SmVersion >= 60; }
82 bool hasAtomBitwise64() const { return SmVersion >= 32; }
83 bool hasAtomMinMax64() const { return SmVersion >= 32; }
84 bool hasAtomCas16() const { return SmVersion >= 70 && PTXVersion >= 63; }
85 bool hasAtomSwap128() const { return SmVersion >= 90 && PTXVersion >= 83; }
86 bool hasClusters() const { return SmVersion >= 90 && PTXVersion >= 78; }
87 bool hasLDG() const { return SmVersion >= 32; }
88 bool hasHWROT32() const { return SmVersion >= 32; }
89 bool hasFP16Math() const { return SmVersion >= 53; }
90 bool hasBF16Math() const { return SmVersion >= 80; }
91 bool allowFP16Math() const;
92 bool hasMaskOperator() const { return PTXVersion >= 71; }
93 bool hasNoReturn() const { return SmVersion >= 30 && PTXVersion >= 64; }
94 // Does SM & PTX support memory orderings (weak and atomic: relaxed, acquire,
95 // release, acq_rel, sc) ?
96 bool hasMemoryOrdering() const { return SmVersion >= 70 && PTXVersion >= 60; }
97 // Does SM & PTX support .acquire and .release qualifiers for fence?
99 return SmVersion >= 90 && PTXVersion >= 86;
100 }
101 // Does SM & PTX support atomic relaxed MMIO operations ?
102 bool hasRelaxedMMIO() const { return SmVersion >= 70 && PTXVersion >= 82; }
103 bool hasDotInstructions() const {
104 return SmVersion >= 61 && PTXVersion >= 50;
105 }
106 // Tcgen05 instructions in Blackwell family
108 bool HasTcgen05 = false;
109 unsigned MinPTXVersion = 86;
110 switch (FullSmVersion) {
111 default:
112 break;
113 case 1003: // sm_100a
114 case 1013: // sm_101a
115 HasTcgen05 = true;
116 break;
117 case 1033: // sm_103a
118 HasTcgen05 = true;
119 MinPTXVersion = 88;
120 break;
121 }
122
123 return HasTcgen05 && PTXVersion >= MinPTXVersion;
124 }
125 // f32x2 instructions in Blackwell family
126 bool hasF32x2Instructions() const;
127
128 // TMA G2S copy with cta_group::1/2 support
130 // TODO: Update/tidy-up after the family-conditional support arrives
131 switch (FullSmVersion) {
132 case 1003:
133 case 1013:
134 return PTXVersion >= 86;
135 case 1033:
136 return PTXVersion >= 88;
137 default:
138 return false;
139 }
140 }
141
142 // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction
143 // terminates a basic block. Instead, it would assume that control flow
144 // continued to the next instruction. The next instruction could be in the
145 // block that's lexically below it. This would lead to a phantom CFG edges
146 // being created within ptxas. This issue was fixed in CUDA 12.3. Thus, when
147 // PTX ISA versions 8.3+ we can confidently say that the bug will not be
148 // present.
149 bool hasPTXASUnreachableBug() const { return PTXVersion < 83; }
150 bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; }
151 unsigned int getFullSmVersion() const { return FullSmVersion; }
152 unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
153 // GPUs with "a" suffix have architecture-accelerated features that are
154 // supported on the specified architecture only, hence such targets do not
155 // follow the onion layer model. hasArchAccelFeatures() allows distinguishing
156 // such GPU variants from the base GPU architecture.
157 // - false represents non-accelerated architecture.
158 // - true represents architecture-accelerated variant.
159 bool hasArchAccelFeatures() const {
160 return (getFullSmVersion() & 1) && PTXVersion >= 80;
161 }
162 // GPUs with 'f' suffix have architecture-accelerated features which are
163 // portable across all future architectures under same SM major. For example,
164 // sm_100f features will work for sm_10X*f*/sm_10X*a* future architectures.
165 // - false represents non-family-specific architecture.
166 // - true represents family-specific variant.
168 return getFullSmVersion() % 10 == 2 ? PTXVersion >= 88
170 }
171 // If the user did not provide a target we default to the `sm_30` target.
172 std::string getTargetName() const {
173 return TargetName.empty() ? "sm_30" : TargetName;
174 }
175 bool hasTargetName() const { return !TargetName.empty(); }
176
177 bool hasNativeBF16Support(int Opcode) const;
178
179 // Get maximum value of required alignments among the supported data types.
180 // From the PTX ISA doc, section 8.2.3:
181 // The memory consistency model relates operations executed on memory
182 // locations with scalar data-types, which have a maximum size and alignment
183 // of 64 bits. Memory operations with a vector data-type are modelled as a
184 // set of equivalent memory operations with a scalar data-type, executed in
185 // an unspecified order on the elements in the vector.
186 unsigned getMaxRequiredAlignment() const { return 8; }
187 // Get the smallest cmpxchg word size that the hardware supports.
188 unsigned getMinCmpXchgSizeInBits() const { return 32; }
189
190 unsigned getPTXVersion() const { return PTXVersion; }
191
194
195 void failIfClustersUnsupported(std::string const &FailureMessage) const;
196};
197
198} // End llvm namespace
199
200#endif
NVPTX address space definition.
bool hasCpAsyncBulkTensorCTAGroupSupport() const
const NVPTXInstrInfo * getInstrInfo() const override
void failIfClustersUnsupported(std::string const &FailureMessage) const
std::string getTargetName() const
unsigned getMaxRequiredAlignment() const
bool hasAtomMinMax64() const
bool hasAtomAddF64() const
bool hasHWROT32() const
bool hasSplitAcquireAndReleaseFences() const
bool hasClusters() const
bool hasMaskOperator() const
const NVPTXTargetLowering * getTargetLowering() const override
void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS)
unsigned getMinCmpXchgSizeInBits() const
unsigned getPTXVersion() const
bool hasCvtaParam() const
~NVPTXSubtarget() override
bool hasNativeBF16Support(int Opcode) const
const NVPTXRegisterInfo * getRegisterInfo() const override
unsigned int getFullSmVersion() const
unsigned int getSmVersion() const
bool hasDotInstructions() const
bool hasFamilySpecificFeatures() const
bool hasTcgen05Instructions() const
bool hasAtomBitwise64() const
bool hasRelaxedMMIO() const
bool hasTargetName() const
bool hasBF16Math() const
bool hasAtomSwap128() const
bool hasF32x2Instructions() const
const TargetFrameLowering * getFrameLowering() const override
bool hasAtomScope() const
bool hasAtomCas16() const
NVPTXSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const NVPTXTargetMachine &TM)
This constructor initializes the data members to match that of the specified module.
bool hasMemoryOrdering() const
bool hasArchAccelFeatures() const
NVPTXSubtarget & initializeSubtargetDependencies(StringRef CPU, StringRef FS)
const SelectionDAGTargetInfo * getSelectionDAGInfo() const override
bool has256BitVectorLoadStore(unsigned AS) const
bool hasPTXASUnreachableBug() const
bool hasFP16Math() const
bool hasNoReturn() const
Targets can subclass this to parameterize the SelectionDAG lowering and instruction selection process...
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Information about stack frame layout on the target.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
This is an optimization pass for GlobalISel generic memory operations.