LLVM 22.0.0git
AMDGPUSubtarget.h
Go to the documentation of this file.
1//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// Base class for AMDGPU specific classes of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
15#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
16
18#include "llvm/IR/CallingConv.h"
21
22namespace llvm {
23
24enum AMDGPUDwarfFlavour : unsigned;
25class Function;
26class Instruction;
27class MachineFunction;
28class TargetMachine;
29
31public:
46
47private:
48 Triple TargetTriple;
49
50protected:
51 bool GCN3Encoding = false;
52 bool Has16BitInsts = false;
53 bool HasTrue16BitInsts = false;
59 bool HasCvtPkF16F32Inst = false;
63 bool HasBF16TransInsts = false;
65 bool HasBF16PackedInsts = false;
66 bool HasMadMixInsts = false;
67 bool HasMadMacF32Insts = false;
68 bool HasDsSrc2Insts = false;
69 bool HasSDWA = false;
70 bool HasVOP3PInsts = false;
71 bool HasMulI24 = true;
72 bool HasMulU24 = true;
73 bool HasSMulHi = false;
74 bool HasInv2PiInlineImm = false;
75 bool HasFminFmaxLegacy = true;
76 bool EnablePromoteAlloca = false;
77 bool HasTrigReducedRange = false;
78 bool FastFMAF32 = false;
79 unsigned EUsPerCU = 4;
80 unsigned MaxWavesPerEU = 10;
81 unsigned LocalMemorySize = 0;
84
85public:
87
88 static const AMDGPUSubtarget &get(const MachineFunction &MF);
89 static const AMDGPUSubtarget &get(const TargetMachine &TM,
90 const Function &F);
91
92 /// \returns Default range flat work group size for a calling convention.
93 std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
94
95 /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
96 /// for function \p F, or minimum/maximum flat work group sizes explicitly
97 /// requested using "amdgpu-flat-work-group-size" attribute attached to
98 /// function \p F.
99 ///
100 /// \returns Subtarget's default values if explicitly requested values cannot
101 /// be converted to integer, or violate subtarget's specifications.
102 std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
103
104 /// \returns The required size of workgroups that will be used to execute \p F
105 /// in the \p Dim dimension, if it is known (from `!reqd_work_group_size`
106 /// metadata). Otherwise, returns std::nullopt.
107 std::optional<unsigned> getReqdWorkGroupSize(const Function &F,
108 unsigned Dim) const;
109
110 /// \returns true if \p F will execute in a manner that leaves the X
111 /// dimensions of the workitem ID evenly tiling wavefronts - that is, if X /
112 /// wavefrontsize is uniform. This is true if either the Y and Z block
113 /// dimensions are known to always be 1 or if the X dimension will always be a
114 /// power of 2. If \p RequiresUniformYZ is true, it also ensures that the Y
115 /// and Z workitem IDs will be uniform (so, while a (32, 2, 1) launch with
116 /// wavesize64 would ordinarily pass this test, it won't with
117 /// \p RequiresUniformYZ).
118 ///
119 /// This information is currently only gathered from the !reqd_work_group_size
120 /// metadata on \p F, but this may be improved in the future.
122 bool REquiresUniformYZ = false) const;
123
124 /// \returns Subtarget's default pair of minimum/maximum number of waves per
125 /// execution unit for function \p F, or minimum/maximum number of waves per
126 /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
127 /// attached to function \p F.
128 ///
129 /// \returns Subtarget's default values if explicitly requested values cannot
130 /// be converted to integer, violate subtarget's specifications, or are not
131 /// compatible with minimum/maximum number of waves limited by flat work group
132 /// size, register usage, and/or lds usage.
133 std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
134
135 /// Overload which uses the specified values for the flat work group sizes,
136 /// rather than querying the function itself. \p FlatWorkGroupSizes should
137 /// correspond to the function's value for getFlatWorkGroupSizes.
138 std::pair<unsigned, unsigned>
140 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
141
142 /// Overload which uses the specified values for the flat workgroup sizes and
143 /// LDS space rather than querying the function itself. \p FlatWorkGroupSizes
144 /// should correspond to the function's value for getFlatWorkGroupSizes and \p
145 /// LDSBytes to the per-workgroup LDS allocation.
146 std::pair<unsigned, unsigned>
147 getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
148 unsigned LDSBytes, const Function &F) const;
149
150 /// Returns the target minimum/maximum number of waves per EU. This is based
151 /// on the minimum/maximum number of \p RequestedWavesPerEU and further
152 /// limited by the maximum achievable occupancy derived from the range of \p
153 /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup.
154 std::pair<unsigned, unsigned>
155 getEffectiveWavesPerEU(std::pair<unsigned, unsigned> RequestedWavesPerEU,
156 std::pair<unsigned, unsigned> FlatWorkGroupSizes,
157 unsigned LDSBytes) const;
158
159 /// Return the amount of LDS that can be used that will not restrict the
160 /// occupancy lower than WaveCount.
161 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
162 const Function &) const;
163
164 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
165 /// be achieved when the only function running on a CU is \p F and each
166 /// workgroup running the function requires \p LDSBytes bytes of LDS space.
167 /// This notably depends on the range of allowed flat group sizes for the
168 /// function and hardware characteristics.
169 std::pair<unsigned, unsigned>
173
174 /// Overload which uses the specified values for the flat work group sizes,
175 /// rather than querying the function itself. \p FlatWorkGroupSizes should
176 /// correspond to the function's value for getFlatWorkGroupSizes.
177 std::pair<unsigned, unsigned> getOccupancyWithWorkGroupSizes(
178 uint32_t LDSBytes,
179 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
180
181 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
182 /// be achieved when the only function running on a CU is \p MF. This notably
183 /// depends on the range of allowed flat group sizes for the function, the
184 /// amount of per-workgroup LDS space required by the function, and hardware
185 /// characteristics.
186 std::pair<unsigned, unsigned>
188
189 bool isAmdHsaOS() const {
190 return TargetTriple.getOS() == Triple::AMDHSA;
191 }
192
193 bool isAmdPalOS() const {
194 return TargetTriple.getOS() == Triple::AMDPAL;
195 }
196
197 bool isMesa3DOS() const {
198 return TargetTriple.getOS() == Triple::Mesa3D;
199 }
200
201 bool isMesaKernel(const Function &F) const;
202
203 bool isAmdHsaOrMesa(const Function &F) const {
204 return isAmdHsaOS() || isMesaKernel(F);
205 }
206
207 bool isGCN() const { return TargetTriple.isAMDGCN(); }
208
209 bool isGCN3Encoding() const {
210 return GCN3Encoding;
211 }
212
213 bool has16BitInsts() const {
214 return Has16BitInsts;
215 }
216
217 /// Return true if the subtarget supports True16 instructions.
218 bool hasTrue16BitInsts() const { return HasTrue16BitInsts; }
219
220 /// Return true if real (non-fake) variants of True16 instructions using
221 /// 16-bit registers should be code-generated. Fake True16 instructions are
222 /// identical to non-fake ones except that they take 32-bit registers as
223 /// operands and always use their low halves.
224 // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
225 // supported and the support for fake True16 instructions is removed.
226 bool useRealTrue16Insts() const;
227
228 bool hasD16Writes32BitVgpr() const;
229
230 bool hasBF16TransInsts() const { return HasBF16TransInsts; }
231
234 }
235
236 bool hasBF16PackedInsts() const { return HasBF16PackedInsts; }
237
238 bool hasMadMixInsts() const {
239 return HasMadMixInsts;
240 }
241
243
245
247
251
255
256 bool hasCvtPkF16F32Inst() const { return HasCvtPkF16F32Inst; }
257
261
262 bool hasMadMacF32Insts() const {
263 return HasMadMacF32Insts || !isGCN();
264 }
265
266 bool hasDsSrc2Insts() const {
267 return HasDsSrc2Insts;
268 }
269
270 bool hasSDWA() const {
271 return HasSDWA;
272 }
273
274 bool hasVOP3PInsts() const {
275 return HasVOP3PInsts;
276 }
277
278 bool hasMulI24() const {
279 return HasMulI24;
280 }
281
282 bool hasMulU24() const {
283 return HasMulU24;
284 }
285
286 bool hasSMulHi() const {
287 return HasSMulHi;
288 }
289
290 bool hasInv2PiInlineImm() const {
291 return HasInv2PiInlineImm;
292 }
293
294 bool hasFminFmaxLegacy() const {
295 return HasFminFmaxLegacy;
296 }
297
298 bool hasTrigReducedRange() const {
299 return HasTrigReducedRange;
300 }
301
302 bool hasFastFMAF32() const {
303 return FastFMAF32;
304 }
305
307 return EnablePromoteAlloca;
308 }
309
310 unsigned getWavefrontSize() const {
311 return 1 << WavefrontSizeLog2;
312 }
313
314 unsigned getWavefrontSizeLog2() const {
315 return WavefrontSizeLog2;
316 }
317
318 /// Return the maximum number of bytes of LDS available for all workgroups
319 /// running on the same WGP or CU.
320 /// For GFX10-GFX12 in WGP mode this is 128k even though each workgroup is
321 /// limited to 64k.
322 unsigned getLocalMemorySize() const {
323 return LocalMemorySize;
324 }
325
326 /// Return the maximum number of bytes of LDS that can be allocated to a
327 /// single workgroup.
328 /// For GFX10-GFX12 in WGP mode this is limited to 64k even though the WGP has
329 /// 128k in total.
332 }
333
334 /// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the
335 /// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs.
336 /// CU mode into account.
337 unsigned getEUsPerCU() const { return EUsPerCU; }
338
340 return isAmdHsaOS() ? Align(8) : Align(4);
341 }
342
343 /// Returns the offset in bytes from the start of the input buffer
344 /// of the first explicit kernel argument.
345 unsigned getExplicitKernelArgOffset() const {
346 switch (TargetTriple.getOS()) {
347 case Triple::AMDHSA:
348 case Triple::AMDPAL:
349 case Triple::Mesa3D:
350 return 0;
352 default:
353 // For legacy reasons unknown/other is treated as a different version of
354 // mesa.
355 return 36;
356 }
357
358 llvm_unreachable("invalid triple OS");
359 }
360
361 /// \returns Maximum number of work groups per compute unit supported by the
362 /// subtarget and limited by given \p FlatWorkGroupSize.
363 virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0;
364
365 /// \returns Minimum flat work group size supported by the subtarget.
366 virtual unsigned getMinFlatWorkGroupSize() const = 0;
367
368 /// \returns Maximum flat work group size supported by the subtarget.
369 virtual unsigned getMaxFlatWorkGroupSize() const = 0;
370
371 /// \returns Number of waves per execution unit required to support the given
372 /// \p FlatWorkGroupSize.
373 virtual unsigned
374 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = 0;
375
376 /// \returns Minimum number of waves per execution unit supported by the
377 /// subtarget.
378 virtual unsigned getMinWavesPerEU() const = 0;
379
380 /// \returns Maximum number of waves per execution unit supported by the
381 /// subtarget without any kind of limitation.
382 unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; }
383
384 /// Return the maximum workitem ID value in the function, for the given (0, 1,
385 /// 2) dimension.
386 unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
387
388 /// Return the number of work groups for the function.
390
391 /// Return true if only a single workitem can be active in a wave.
392 bool isSingleLaneExecution(const Function &Kernel) const;
393
394 /// Creates value range metadata on a workitemid.* intrinsic call or load.
396
397 /// \returns Number of bytes of arguments that are passed to a shader or
398 /// kernel in addition to the explicit ones declared for the function.
399 unsigned getImplicitArgNumBytes(const Function &F) const;
400 uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
401 unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;
402
403 /// \returns Corresponding DWARF register number mapping flavour for the
404 /// \p WavefrontSize.
406
407 virtual ~AMDGPUSubtarget() = default;
408};
409
410} // end namespace llvm
411
412#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
#define AMDGPUSubtarget
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
This file defines the SmallVector class.
bool hasFP8ConversionScaleInsts() const
bool hasFminFmaxLegacy() const
std::pair< unsigned, unsigned > getDefaultFlatWorkGroupSize(CallingConv::ID CC) const
bool hasBF16PackedInsts() const
bool hasFP4ConversionScaleInsts() const
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
unsigned getEUsPerCU() const
Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the "CU" is the unit onto whic...
bool isMesaKernel(const Function &F) const
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
bool hasCvtPkF16F32Inst() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
std::pair< unsigned, unsigned > getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
virtual unsigned getMinWavesPerEU() const =0
bool hasBF16ConversionInsts() const
bool hasFP6BF6ConversionScaleInsts() const
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
bool makeLIDRangeMetadata(Instruction *I) const
Creates value range metadata on a workitemid.* intrinsic call or load.
bool hasBF8ConversionScaleInsts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
unsigned getImplicitArgNumBytes(const Function &F) const
unsigned getLocalMemorySize() const
Return the maximum number of bytes of LDS available for all workgroups running on the same WGP or CU.
unsigned getAddressableLocalMemorySize() const
Return the maximum number of bytes of LDS that can be allocated to a single workgroup.
SmallVector< unsigned > getMaxNumWorkGroups(const Function &F) const
Return the number of work groups for the function.
virtual unsigned getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const =0
virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const =0
unsigned getWavefrontSizeLog2() const
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
bool HasF16BF16ToFP6BF6ConversionScaleInsts
virtual ~AMDGPUSubtarget()=default
bool hasTrue16BitInsts() const
Return true if the subtarget supports True16 instructions.
bool isAmdHsaOrMesa(const Function &F) const
bool isPromoteAllocaEnabled() const
bool hasTrigReducedRange() const
AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const
Return the amount of LDS that can be used that will not restrict the occupancy lower than WaveCount.
virtual unsigned getMaxFlatWorkGroupSize() const =0
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
bool hasF16BF16ToFP6BF6ConversionScaleInsts() const
unsigned getMaxWavesPerEU() const
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const
unsigned AddressableLocalMemorySize
bool isSingleLaneExecution(const Function &Kernel) const
Return true if only a single workitem can be active in a wave.
static const AMDGPUSubtarget & get(const MachineFunction &MF)
bool hasBF16TransInsts() const
unsigned getWavefrontSize() const
virtual unsigned getMinFlatWorkGroupSize() const =0
std::pair< unsigned, unsigned > getEffectiveWavesPerEU(std::pair< unsigned, unsigned > RequestedWavesPerEU, std::pair< unsigned, unsigned > FlatWorkGroupSizes, unsigned LDSBytes) const
Returns the target minimum/maximum number of waves per EU.
bool hasInv2PiInlineImm() const
bool hasF32ToF16BF16ConversionSRInsts() const
bool hasD16Writes32BitVgpr() const
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F, std::pair< unsigned, unsigned > FlatWorkGroupSizes) const
Overload which uses the specified values for the flat work group sizes, rather than querying the func...
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Primary interface to the complete machine description for the target machine.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
This is an optimization pass for GlobalISel generic memory operations.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39