LLVM 22.0.0git
AMDGPUSubtarget.h
Go to the documentation of this file.
1//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// Base class for AMDGPU specific classes of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
15#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
16
18#include "llvm/IR/CallingConv.h"
21
22namespace llvm {
23
25class Function;
26class Instruction;
27class MachineFunction;
28class TargetMachine;
29
31public:
34 R600 = 1,
35 R700 = 2,
41 GFX9 = 8,
42 GFX10 = 9,
43 GFX11 = 10,
44 GFX12 = 11,
45 };
46
47private:
48 Triple TargetTriple;
49
50protected:
51 bool GCN3Encoding = false;
52 bool Has16BitInsts = false;
53 bool HasTrue16BitInsts = false;
59 bool HasCvtPkF16F32Inst = false;
62 bool HasBF16TransInsts = false;
64 bool HasBF16PackedInsts = false;
65 bool HasMadMixInsts = false;
66 bool HasMadMacF32Insts = false;
67 bool HasDsSrc2Insts = false;
68 bool HasSDWA = false;
69 bool HasVOP3PInsts = false;
70 bool HasMulI24 = true;
71 bool HasMulU24 = true;
72 bool HasSMulHi = false;
73 bool HasInv2PiInlineImm = false;
74 bool HasFminFmaxLegacy = true;
75 bool EnablePromoteAlloca = false;
76 bool HasTrigReducedRange = false;
77 bool FastFMAF32 = false;
78 unsigned EUsPerCU = 4;
79 unsigned MaxWavesPerEU = 10;
80 unsigned LocalMemorySize = 0;
83
84public:
86
87 static const AMDGPUSubtarget &get(const MachineFunction &MF);
88 static const AMDGPUSubtarget &get(const TargetMachine &TM,
89 const Function &F);
90
91 /// \returns Default flat work group size range for a calling convention.
92 std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
93
94 /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
95 /// for function \p F, or minimum/maximum flat work group sizes explicitly
96 /// requested using "amdgpu-flat-work-group-size" attribute attached to
97 /// function \p F.
98 ///
99 /// \returns Subtarget's default values if explicitly requested values cannot
100 /// be converted to integer, or violate subtarget's specifications.
101 std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
102
103 /// \returns The required size of workgroups that will be used to execute \p F
104 /// in the \p Dim dimension, if it is known (from `!reqd_work_group_size`
105 /// metadata). Otherwise, returns std::nullopt.
106 std::optional<unsigned> getReqdWorkGroupSize(const Function &F,
107 unsigned Dim) const;
108
109 /// \returns true if \p F will execute in a manner that leaves the X
110 /// dimensions of the workitem ID evenly tiling wavefronts - that is, if X /
111 /// wavefrontsize is uniform. This is true if either the Y and Z block
112 /// dimensions are known to always be 1 or if the X dimension will always be a
113 /// power of 2. If \p RequireUniformYZ is true, it also ensures that the Y and
114 /// Z workitem IDs will be uniform (so, while a (32, 2, 1) launch with
115 /// wavesize64 would ordinarily pass this test, it won't with
116 /// \p RequiresUniformYZ).
117 ///
118 /// This information is currently only gathered from the !reqd_work_group_size
119 /// metadata on \p F, but this may be improved in the future.
121 bool REquiresUniformYZ = false) const;
122
123 /// \returns Subtarget's default pair of minimum/maximum number of waves per
124 /// execution unit for function \p F, or minimum/maximum number of waves per
125 /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
126 /// attached to function \p F.
127 ///
128 /// \returns Subtarget's default values if explicitly requested values cannot
129 /// be converted to integer, violate subtarget's specifications, or are not
130 /// compatible with minimum/maximum number of waves limited by flat work group
131 /// size, register usage, and/or lds usage.
132 std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
133
134 /// Overload which uses the specified values for the flat work group sizes,
135 /// rather than querying the function itself. \p FlatWorkGroupSizes should
136 /// correspond to the function's value for getFlatWorkGroupSizes.
137 std::pair<unsigned, unsigned>
139 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
140
141 /// Overload which uses the specified values for the flat workgroup sizes and
142 /// LDS space rather than querying the function itself. \p FlatWorkGroupSizes
143 /// should correspond to the function's value for getFlatWorkGroupSizes and \p
144 /// LDSBytes to the per-workgroup LDS allocation.
145 std::pair<unsigned, unsigned>
146 getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
147 unsigned LDSBytes, const Function &F) const;
148
149 /// Returns the target minimum/maximum number of waves per EU. This is based
150 /// on the minimum/maximum number of \p RequestedWavesPerEU and further
151 /// limited by the maximum achievable occupancy derived from the range of \p
152 /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup.
153 std::pair<unsigned, unsigned>
154 getEffectiveWavesPerEU(std::pair<unsigned, unsigned> RequestedWavesPerEU,
155 std::pair<unsigned, unsigned> FlatWorkGroupSizes,
156 unsigned LDSBytes) const;
157
158 /// Return the amount of LDS that can be used that will not restrict the
159 /// occupancy lower than WaveCount.
160 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
161 const Function &) const;
162
163 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
164 /// be achieved when the only function running on a CU is \p F and each
165 /// workgroup running the function requires \p LDSBytes bytes of LDS space.
166 /// This notably depends on the range of allowed flat group sizes for the
167 /// function and hardware characteristics.
168 std::pair<unsigned, unsigned>
171 }
172
173 /// Overload which uses the specified values for the flat work group sizes,
174 /// rather than querying the function itself. \p FlatWorkGroupSizes should
175 /// correspond to the function's value for getFlatWorkGroupSizes.
176 std::pair<unsigned, unsigned> getOccupancyWithWorkGroupSizes(
177 uint32_t LDSBytes,
178 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
179
180 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
181 /// be achieved when the only function running on a CU is \p MF. This notably
182 /// depends on the range of allowed flat group sizes for the function, the
183 /// amount of per-workgroup LDS space required by the function, and hardware
184 /// characteristics.
185 std::pair<unsigned, unsigned>
187
188 bool isAmdHsaOS() const {
189 return TargetTriple.getOS() == Triple::AMDHSA;
190 }
191
192 bool isAmdPalOS() const {
193 return TargetTriple.getOS() == Triple::AMDPAL;
194 }
195
196 bool isMesa3DOS() const {
197 return TargetTriple.getOS() == Triple::Mesa3D;
198 }
199
200 bool isMesaKernel(const Function &F) const;
201
202 bool isAmdHsaOrMesa(const Function &F) const {
203 return isAmdHsaOS() || isMesaKernel(F);
204 }
205
206 bool isGCN() const { return TargetTriple.isAMDGCN(); }
207
208 bool isGCN3Encoding() const {
209 return GCN3Encoding;
210 }
211
212 bool has16BitInsts() const {
213 return Has16BitInsts;
214 }
215
216 /// Return true if the subtarget supports True16 instructions.
217 bool hasTrue16BitInsts() const { return HasTrue16BitInsts; }
218
219 /// Return true if real (non-fake) variants of True16 instructions using
220 /// 16-bit registers should be code-generated. Fake True16 instructions are
221 /// identical to non-fake ones except that they take 32-bit registers as
222 /// operands and always use their low halves.
223 // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
224 // supported and the support for fake True16 instructions is removed.
225 bool useRealTrue16Insts() const;
226
227 bool hasBF16TransInsts() const { return HasBF16TransInsts; }
228
231 }
232
233 bool hasBF16PackedInsts() const { return HasBF16PackedInsts; }
234
235 bool hasMadMixInsts() const {
236 return HasMadMixInsts;
237 }
238
240
242
244
247 }
248
251 }
252
253 bool hasCvtPkF16F32Inst() const { return HasCvtPkF16F32Inst; }
254
257 }
258
259 bool hasMadMacF32Insts() const {
260 return HasMadMacF32Insts || !isGCN();
261 }
262
263 bool hasDsSrc2Insts() const {
264 return HasDsSrc2Insts;
265 }
266
267 bool hasSDWA() const {
268 return HasSDWA;
269 }
270
271 bool hasVOP3PInsts() const {
272 return HasVOP3PInsts;
273 }
274
275 bool hasMulI24() const {
276 return HasMulI24;
277 }
278
279 bool hasMulU24() const {
280 return HasMulU24;
281 }
282
283 bool hasSMulHi() const {
284 return HasSMulHi;
285 }
286
287 bool hasInv2PiInlineImm() const {
288 return HasInv2PiInlineImm;
289 }
290
291 bool hasFminFmaxLegacy() const {
292 return HasFminFmaxLegacy;
293 }
294
295 bool hasTrigReducedRange() const {
296 return HasTrigReducedRange;
297 }
298
299 bool hasFastFMAF32() const {
300 return FastFMAF32;
301 }
302
304 return EnablePromoteAlloca;
305 }
306
307 unsigned getWavefrontSize() const {
308 return 1 << WavefrontSizeLog2;
309 }
310
311 unsigned getWavefrontSizeLog2() const {
312 return WavefrontSizeLog2;
313 }
314
315 /// Return the maximum number of bytes of LDS available for all workgroups
316 /// running on the same WGP or CU.
317 /// For GFX10-GFX12 in WGP mode this is 128k even though each workgroup is
318 /// limited to 64k.
319 unsigned getLocalMemorySize() const {
320 return LocalMemorySize;
321 }
322
323 /// Return the maximum number of bytes of LDS that can be allocated to a
324 /// single workgroup.
325 /// For GFX10-GFX12 in WGP mode this is limited to 64k even though the WGP has
326 /// 128k in total.
329 }
330
331 /// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the
332 /// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs.
333 /// CU mode into account.
334 unsigned getEUsPerCU() const { return EUsPerCU; }
335
337 return isAmdHsaOS() ? Align(8) : Align(4);
338 }
339
340 /// Returns the offset in bytes from the start of the input buffer
341 /// of the first explicit kernel argument.
342 unsigned getExplicitKernelArgOffset() const {
343 switch (TargetTriple.getOS()) {
344 case Triple::AMDHSA:
345 case Triple::AMDPAL:
346 case Triple::Mesa3D:
347 return 0;
349 default:
350 // For legacy reasons unknown/other is treated as a different version of
351 // mesa.
352 return 36;
353 }
354
355 llvm_unreachable("invalid triple OS");
356 }
357
358 /// \returns Maximum number of work groups per compute unit supported by the
359 /// subtarget and limited by given \p FlatWorkGroupSize.
360 virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0;
361
362 /// \returns Minimum flat work group size supported by the subtarget.
363 virtual unsigned getMinFlatWorkGroupSize() const = 0;
364
365 /// \returns Maximum flat work group size supported by the subtarget.
366 virtual unsigned getMaxFlatWorkGroupSize() const = 0;
367
368 /// \returns Number of waves per execution unit required to support the given
369 /// \p FlatWorkGroupSize.
370 virtual unsigned
371 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = 0;
372
373 /// \returns Minimum number of waves per execution unit supported by the
374 /// subtarget.
375 virtual unsigned getMinWavesPerEU() const = 0;
376
377 /// \returns Maximum number of waves per execution unit supported by the
378 /// subtarget without any kind of limitation.
379 unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; }
380
381 /// Return the maximum workitem ID value in the function, for the given (0, 1,
382 /// 2) dimension.
383 unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
384
385 /// Return the number of work groups for the function.
387
388 /// Return true if only a single workitem can be active in a wave.
389 bool isSingleLaneExecution(const Function &Kernel) const;
390
391 /// Creates value range metadata on a workitemid.* intrinsic call or load.
393
394 /// \returns Number of bytes of arguments that are passed to a shader or
395 /// kernel in addition to the explicit ones declared for the function.
396 unsigned getImplicitArgNumBytes(const Function &F) const;
397 uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
398 unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;
399
400 /// \returns Corresponding DWARF register number mapping flavour for the
401 /// \p WavefrontSize.
403
404 virtual ~AMDGPUSubtarget() = default;
405};
406
407} // end namespace llvm
408
409#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
This file defines the SmallVector class.
bool hasFP8ConversionScaleInsts() const
bool hasFminFmaxLegacy() const
std::pair< unsigned, unsigned > getDefaultFlatWorkGroupSize(CallingConv::ID CC) const
bool hasBF16PackedInsts() const
bool hasFP4ConversionScaleInsts() const
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
unsigned getEUsPerCU() const
Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the "CU" is the unit onto whic...
bool isMesaKernel(const Function &F) const
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
bool hasCvtPkF16F32Inst() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
std::pair< unsigned, unsigned > getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
virtual unsigned getMinWavesPerEU() const =0
bool hasBF16ConversionInsts() const
bool hasFP6BF6ConversionScaleInsts() const
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
bool makeLIDRangeMetadata(Instruction *I) const
Creates value range metadata on a workitemid.* intrinsic call or load.
bool hasBF8ConversionScaleInsts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
unsigned getImplicitArgNumBytes(const Function &F) const
unsigned getLocalMemorySize() const
Return the maximum number of bytes of LDS available for all workgroups running on the same WGP or CU.
unsigned getAddressableLocalMemorySize() const
Return the maximum number of bytes of LDS that can be allocated to a single workgroup.
SmallVector< unsigned > getMaxNumWorkGroups(const Function &F) const
Return the number of work groups for the function.
bool isGCN3Encoding() const
virtual unsigned getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const =0
virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const =0
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
bool HasF16BF16ToFP6BF6ConversionScaleInsts
bool has16BitInsts() const
virtual ~AMDGPUSubtarget()=default
bool hasTrue16BitInsts() const
Return true if the subtarget supports True16 instructions.
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool isPromoteAllocaEnabled() const
bool hasTrigReducedRange() const
AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const
Return the amount of LDS that can be used that will not restrict the occupancy lower than WaveCount.
virtual unsigned getMaxFlatWorkGroupSize() const =0
bool hasDsSrc2Insts() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
bool hasF16BF16ToFP6BF6ConversionScaleInsts() const
unsigned getMaxWavesPerEU() const
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const
unsigned AddressableLocalMemorySize
bool isSingleLaneExecution(const Function &Kernel) const
Return true if only a single workitem can be active in a wave.
static const AMDGPUSubtarget & get(const MachineFunction &MF)
bool hasBF16TransInsts() const
unsigned getWavefrontSize() const
virtual unsigned getMinFlatWorkGroupSize() const =0
std::pair< unsigned, unsigned > getEffectiveWavesPerEU(std::pair< unsigned, unsigned > RequestedWavesPerEU, std::pair< unsigned, unsigned > FlatWorkGroupSizes, unsigned LDSBytes) const
Returns the target minimum/maximum number of waves per EU.
bool hasInv2PiInlineImm() const
bool hasF32ToF16BF16ConversionSRInsts() const
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F, std::pair< unsigned, unsigned > FlatWorkGroupSizes) const
Overload which uses the specified values for the flat work group sizes, rather than querying the func...
bool hasVOP3PInsts() const
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:83
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:47
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:417
bool isAMDGCN() const
Tests whether the target is AMDGCN.
Definition: Triple.h:901
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39