AMDGPULegalizerInfo.cpp
1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the MachineLegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
17#include "AMDGPUGlobalISelUtils.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUTargetMachine.h"
21#include "GCNSubtarget.h"
22#include "SIInstrInfo.h"
23#include "SIMachineFunctionInfo.h"
24#include "SIRegisterInfo.h"
25#include "Utils/AMDGPUBaseInfo.h"
26#include "llvm/ADT/ScopeExit.h"
35#include "llvm/IR/IntrinsicsAMDGPU.h"
36#include "llvm/IR/IntrinsicsR600.h"
37
38#define DEBUG_TYPE "amdgpu-legalinfo"
39
40using namespace llvm;
41using namespace LegalizeActions;
42using namespace LegalizeMutations;
43using namespace LegalityPredicates;
44using namespace MIPatternMatch;
45
46// Hack until load/store selection patterns support any tuple of legal types.
48 "amdgpu-global-isel-new-legality",
49 cl::desc("Use GlobalISel desired legality, rather than try to use"
50 "rules compatible with selection patterns"),
51 cl::init(false),
53
54static constexpr unsigned MaxRegisterSize = 1024;
55
56// Round the number of elements to the next power of two elements
57static LLT getPow2VectorType(LLT Ty) {
58 unsigned NElts = Ty.getNumElements();
59 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
60 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
61}
62
63// Round the number of bits to the next power of two bits
64static LLT getPow2ScalarType(LLT Ty) {
65 unsigned Bits = Ty.getSizeInBits();
66 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
67 return LLT::scalar(Pow2Bits);
68}
69
70/// \returns true if this is an odd sized vector which should widen by adding an
71/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
72/// excludes s1 vectors, which should always be scalarized.
73static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
74 return [=](const LegalityQuery &Query) {
75 const LLT Ty = Query.Types[TypeIdx];
76 if (!Ty.isVector())
77 return false;
78
79 const LLT EltTy = Ty.getElementType();
80 const unsigned EltSize = EltTy.getSizeInBits();
81 return Ty.getNumElements() % 2 != 0 &&
82 EltSize > 1 && EltSize < 32 &&
83 Ty.getSizeInBits() % 32 != 0;
84 };
85}
86
87static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
88 return [=](const LegalityQuery &Query) {
89 const LLT Ty = Query.Types[TypeIdx];
90 return Ty.getSizeInBits() % 32 == 0;
91 };
92}
93
94static LegalityPredicate isWideVec16(unsigned TypeIdx) {
95 return [=](const LegalityQuery &Query) {
96 const LLT Ty = Query.Types[TypeIdx];
97 const LLT EltTy = Ty.getScalarType();
98 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
99 };
100}
101
102static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
103 return [=](const LegalityQuery &Query) {
104 const LLT Ty = Query.Types[TypeIdx];
105 const LLT EltTy = Ty.getElementType();
106 return std::pair(TypeIdx,
107 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
108 };
109}
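// Illustrative note (annotation, not part of the upstream file): isSmallOddVector
// and oneMoreElement are written to be used together as a predicate/mutation pair
// on a rule builder. For example, the G_SELECT rules later in this file use
//   .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
// so that an odd type such as <3 x s16> is padded to <4 x s16> before any other
// legalization action is attempted.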
110
111static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
112 return [=](const LegalityQuery &Query) {
113 const LLT Ty = Query.Types[TypeIdx];
114 const LLT EltTy = Ty.getElementType();
115 unsigned Size = Ty.getSizeInBits();
116 unsigned Pieces = (Size + 63) / 64;
117 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
118 return std::pair(TypeIdx, LLT::scalarOrVector(
119 ElementCount::getFixed(NewNumElts), EltTy));
120 };
121}
122
123// Increase the number of vector elements to reach the next multiple of 32-bit
124// type.
125static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
126 return [=](const LegalityQuery &Query) {
127 const LLT Ty = Query.Types[TypeIdx];
128
129 const LLT EltTy = Ty.getElementType();
130 const int Size = Ty.getSizeInBits();
131 const int EltSize = EltTy.getSizeInBits();
132 const int NextMul32 = (Size + 31) / 32;
133
134 assert(EltSize < 32);
135
136 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
137 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
138 };
139}
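// Worked example (annotation, not part of the upstream file): for <3 x s8> the
// computation above gives Size = 24, NextMul32 = (24 + 31) / 32 = 1 and
// NewNumElts = (32 * 1 + 8 - 1) / 8 = 4, so the type is padded to <4 x s8>,
// which fills exactly one 32-bit register.
static_assert((32 * ((24 + 31) / 32) + 8 - 1) / 8 == 4,
              "<3 x s8> is padded to <4 x s8> (one full dword)");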
140
141// Retrieves the scalar type that's the same size as the mem desc
143 return [=](const LegalityQuery &Query) {
144 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
145 return std::make_pair(TypeIdx, LLT::scalar(MemSize));
146 };
147}
148
149// Increase the number of vector elements to reach the next legal RegClass.
150static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
151 return [=](const LegalityQuery &Query) {
152 const LLT Ty = Query.Types[TypeIdx];
153 const unsigned NumElts = Ty.getNumElements();
154 const unsigned EltSize = Ty.getElementType().getSizeInBits();
155 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
156
157 assert(EltSize == 32 || EltSize == 64);
159
160 unsigned NewNumElts;
161 // Find the nearest legal RegClass that is larger than the current type.
162 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
163 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
164 break;
165 }
166 return std::pair(TypeIdx,
167 LLT::fixed_vector(NewNumElts, Ty.getElementType()));
168 };
169}
170
171static LLT getBufferRsrcScalarType(const LLT Ty) {
172 if (!Ty.isVector())
173 return LLT::scalar(128);
174 const ElementCount NumElems = Ty.getElementCount();
175 return LLT::vector(NumElems, LLT::scalar(128));
176}
177
178static LLT getBufferRsrcRegisterType(const LLT Ty) {
179 if (!Ty.isVector())
180 return LLT::fixed_vector(4, LLT::scalar(32));
181 const unsigned NumElems = Ty.getElementCount().getFixedValue();
182 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
183}
184
185static LLT getBitcastRegisterType(const LLT Ty) {
186 const unsigned Size = Ty.getSizeInBits();
187
188 if (Size <= 32) {
189 // <2 x s8> -> s16
190 // <4 x s8> -> s32
191 return LLT::scalar(Size);
192 }
193
194 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
195}
196
197static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
198 return [=](const LegalityQuery &Query) {
199 const LLT Ty = Query.Types[TypeIdx];
200 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
201 };
202}
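// Illustrative note (annotation, not part of the upstream file): the mapping
// above sends small vectors to the scalar of the same width and everything
// larger to 32-bit pieces, e.g. <2 x s8> -> s16, <4 x s8> -> s32,
// <6 x s16> (96 bits) -> <3 x s32>, and <8 x s16> (128 bits) -> <4 x s32>.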
203
204static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
205 return [=](const LegalityQuery &Query) {
206 const LLT Ty = Query.Types[TypeIdx];
207 unsigned Size = Ty.getSizeInBits();
208 assert(Size % 32 == 0);
209 return std::pair(
210 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
211 };
212}
213
214static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
215 return [=](const LegalityQuery &Query) {
216 const LLT QueryTy = Query.Types[TypeIdx];
217 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
218 };
219}
220
221static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
222 return [=](const LegalityQuery &Query) {
223 const LLT QueryTy = Query.Types[TypeIdx];
224 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
225 };
226}
227
228static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
229 return [=](const LegalityQuery &Query) {
230 const LLT QueryTy = Query.Types[TypeIdx];
231 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
232 };
233}
234
235static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size) {
236 return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
237 Size <= MaxRegisterSize;
238}
239
240static bool isRegisterVectorElementType(LLT EltTy) {
241 const int EltSize = EltTy.getSizeInBits();
242 return EltSize == 16 || EltSize % 32 == 0;
243}
244
245static bool isRegisterVectorType(LLT Ty) {
246 const int EltSize = Ty.getElementType().getSizeInBits();
247 return EltSize == 32 || EltSize == 64 ||
248 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
249 EltSize == 128 || EltSize == 256;
250}
251
252// TODO: replace all uses of isRegisterType with isRegisterClassType
253static bool isRegisterType(const GCNSubtarget &ST, LLT Ty) {
254 if (!isRegisterSize(ST, Ty.getSizeInBits()))
255 return false;
256
257 if (Ty.isVector())
258 return isRegisterVectorType(Ty);
259
260 return true;
261}
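// Illustrative note (annotation, not part of the upstream file): under these
// rules s32, s96, v2s16 and v4s32 all count as register types, while v3s16
// (48 bits, not a multiple of 32) and v3s8 (24 bits) do not; a lone s16
// additionally qualifies only when the subtarget has real true16 instructions.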
262
263// Any combination of 32 or 64-bit elements up to the maximum register size, and
264// multiples of v2s16.
265static LegalityPredicate isRegisterType(const GCNSubtarget &ST,
266 unsigned TypeIdx) {
267 return [=, &ST](const LegalityQuery &Query) {
268 return isRegisterType(ST, Query.Types[TypeIdx]);
269 };
270}
271
272// RegisterType that doesn't have a corresponding RegClass.
273// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
274// should be removed.
275static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST,
276 unsigned TypeIdx) {
277 return [=, &ST](const LegalityQuery &Query) {
278 LLT Ty = Query.Types[TypeIdx];
279 return isRegisterType(ST, Ty) &&
280 !isRegisterClassType(ST, Ty);
281 };
282}
283
284static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
285 return [=](const LegalityQuery &Query) {
286 const LLT QueryTy = Query.Types[TypeIdx];
287 if (!QueryTy.isVector())
288 return false;
289 const LLT EltTy = QueryTy.getElementType();
290 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
291 };
292}
293
294constexpr LLT S1 = LLT::scalar(1);
295constexpr LLT S8 = LLT::scalar(8);
296constexpr LLT S16 = LLT::scalar(16);
297constexpr LLT S32 = LLT::scalar(32);
298constexpr LLT F32 = LLT::float32();
299constexpr LLT S64 = LLT::scalar(64);
300constexpr LLT F64 = LLT::float64();
301constexpr LLT S96 = LLT::scalar(96);
302constexpr LLT S128 = LLT::scalar(128);
303constexpr LLT S160 = LLT::scalar(160);
304constexpr LLT S192 = LLT::scalar(192);
305constexpr LLT S224 = LLT::scalar(224);
306constexpr LLT S256 = LLT::scalar(256);
307constexpr LLT S512 = LLT::scalar(512);
308constexpr LLT S1024 = LLT::scalar(1024);
310
311constexpr LLT V2S8 = LLT::fixed_vector(2, 8);
312constexpr LLT V2S16 = LLT::fixed_vector(2, 16);
313constexpr LLT V4S16 = LLT::fixed_vector(4, 16);
314constexpr LLT V6S16 = LLT::fixed_vector(6, 16);
315constexpr LLT V8S16 = LLT::fixed_vector(8, 16);
316constexpr LLT V10S16 = LLT::fixed_vector(10, 16);
317constexpr LLT V12S16 = LLT::fixed_vector(12, 16);
318constexpr LLT V16S16 = LLT::fixed_vector(16, 16);
319
320constexpr LLT V2F16 = LLT::fixed_vector(2, LLT::float16());
321constexpr LLT V2BF16 = V2F16; // FIXME
322
323constexpr LLT V2S32 = LLT::fixed_vector(2, 32);
324constexpr LLT V3S32 = LLT::fixed_vector(3, 32);
325constexpr LLT V4S32 = LLT::fixed_vector(4, 32);
326constexpr LLT V5S32 = LLT::fixed_vector(5, 32);
327constexpr LLT V6S32 = LLT::fixed_vector(6, 32);
328constexpr LLT V7S32 = LLT::fixed_vector(7, 32);
329constexpr LLT V8S32 = LLT::fixed_vector(8, 32);
330constexpr LLT V9S32 = LLT::fixed_vector(9, 32);
331constexpr LLT V10S32 = LLT::fixed_vector(10, 32);
332constexpr LLT V11S32 = LLT::fixed_vector(11, 32);
333constexpr LLT V12S32 = LLT::fixed_vector(12, 32);
334constexpr LLT V16S32 = LLT::fixed_vector(16, 32);
335constexpr LLT V32S32 = LLT::fixed_vector(32, 32);
336
337constexpr LLT V2S64 = LLT::fixed_vector(2, 64);
338constexpr LLT V3S64 = LLT::fixed_vector(3, 64);
339constexpr LLT V4S64 = LLT::fixed_vector(4, 64);
340constexpr LLT V5S64 = LLT::fixed_vector(5, 64);
341constexpr LLT V6S64 = LLT::fixed_vector(6, 64);
342constexpr LLT V7S64 = LLT::fixed_vector(7, 64);
343constexpr LLT V8S64 = LLT::fixed_vector(8, 64);
344constexpr LLT V16S64 = LLT::fixed_vector(16, 64);
345
346constexpr LLT V2S128 = LLT::fixed_vector(2, 128);
347constexpr LLT V4S128 = LLT::fixed_vector(4, 128);
348
349constexpr std::initializer_list<LLT> AllScalarTypes = {
351
352constexpr std::initializer_list<LLT> AllS16Vectors{
354
355constexpr std::initializer_list<LLT> AllS32Vectors = {
356 V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
357 V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};
358
359constexpr std::initializer_list<LLT> AllS64Vectors = {
360 V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
361
362constexpr std::initializer_list<LLT> AllVectors{
367
368// Checks whether a type is in the list of legal register types.
369static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
370 if (Ty.isPointerOrPointerVector())
371 Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
372
373 return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
374 is_contained(AllScalarTypes, Ty) ||
375 (ST.useRealTrue16Insts() && Ty == S16) ||
376 is_contained(AllS16Vectors, Ty);
377}
378
379static LegalityPredicate isRegisterClassType(const GCNSubtarget &ST,
380 unsigned TypeIdx) {
381 return [&ST, TypeIdx](const LegalityQuery &Query) {
382 return isRegisterClassType(ST, Query.Types[TypeIdx]);
383 };
384}
385
386// If we have a truncating store or an extending load with a data size larger
387// than 32-bits, we need to reduce to a 32-bit type.
388static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
389 return [=](const LegalityQuery &Query) {
390 const LLT Ty = Query.Types[TypeIdx];
391 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
392 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
393 };
394}
395
396// If we have a truncating store or an extending load with a data size larger
397// than 32-bits and the memory location size is a power of 2
398static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx) {
399 return [=](const LegalityQuery &Query) {
400 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
401 return isWideScalarExtLoadTruncStore(TypeIdx)(Query) &&
402 isPowerOf2_64(MemSize);
403 };
404}
405
406// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
407// handle some operations by just promoting the register during
408// selection. There are also d16 loads on GFX9+ which preserve the high bits.
409static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
410 bool IsLoad, bool IsAtomic) {
411 switch (AS) {
412 case AMDGPUAS::PRIVATE_ADDRESS:
413 // FIXME: Private element size.
414 return ST.enableFlatScratch() ? 128 : 32;
415 case AMDGPUAS::LOCAL_ADDRESS:
416 return ST.useDS128() ? 128 : 64;
417 case AMDGPUAS::GLOBAL_ADDRESS:
418 case AMDGPUAS::CONSTANT_ADDRESS:
419 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
420 case AMDGPUAS::BUFFER_RESOURCE:
421 // Treat constant and global as identical. SMRD loads are sometimes usable for
422 // global loads (ideally constant address space should be eliminated)
423 // depending on the context. Legality cannot be context dependent, but
424 // RegBankSelect can split the load as necessary depending on the pointer
425 // register bank/uniformity and if the memory is invariant or not written in a
426 // kernel.
427 return IsLoad ? 512 : 128;
428 default:
429 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
430 // if they may alias scratch depending on the subtarget. This needs to be
431 // moved to custom handling to use addressMayBeAccessedAsPrivate
432 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
433 }
434}
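// Illustrative note (annotation, not part of the upstream file): with these
// limits a 256-bit global or constant load is left whole (loads may be up to
// 512 bits there, stores 128), a 256-bit LDS access must be broken up (at most
// 128 bits with useDS128, otherwise 64), and scratch without flat-scratch
// support is limited to a single 32-bit dword per access.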
435
436static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
437 const LegalityQuery &Query) {
438 const LLT Ty = Query.Types[0];
439
440 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
441 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
442
443 unsigned RegSize = Ty.getSizeInBits();
444 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
445 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
446 unsigned AS = Query.Types[1].getAddressSpace();
447
448 // All of these need to be custom lowered to cast the pointer operand.
449 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
450 return false;
451
452 // Do not handle extending vector loads.
453 if (Ty.isVector() && MemSize != RegSize)
454 return false;
455
456 // TODO: We should be able to widen loads if the alignment is high enough, but
457 // we also need to modify the memory access size.
458#if 0
459 // Accept widening loads based on alignment.
460 if (IsLoad && MemSize < Size)
461 MemSize = std::max(MemSize, Align);
462#endif
463
464 // Only 1-byte and 2-byte to 32-bit extloads are valid.
465 if (MemSize != RegSize && RegSize != 32)
466 return false;
467
468 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
469 Query.MMODescrs[0].Ordering !=
470 AtomicOrdering::NotAtomic))
471 return false;
472
473 switch (MemSize) {
474 case 8:
475 case 16:
476 case 32:
477 case 64:
478 case 128:
479 break;
480 case 96:
481 if (!ST.hasDwordx3LoadStores())
482 return false;
483 break;
484 case 256:
485 case 512:
486 // These may contextually need to be broken down.
487 break;
488 default:
489 return false;
490 }
491
492 assert(RegSize >= MemSize);
493
494 if (AlignBits < MemSize) {
495 const SITargetLowering *TLI = ST.getTargetLowering();
496 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
497 Align(AlignBits / 8)))
498 return false;
499 }
500
501 return true;
502}
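// Illustrative note (annotation, not part of the upstream file): the checks
// above accept an extending scalar load such as s32 from an s8 or s16 memory
// access, but reject e.g. an s64 result extended from s16 (the register size is
// not 32), any extending vector load, and 96-bit accesses on subtargets without
// dwordx3 load/store support.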
503
504// The newer buffer intrinsic forms take their resource arguments as
505// pointers in address space 8, aka s128 values. However, in order to not break
506// SelectionDAG, the underlying operations have to continue to take v4i32
507// arguments. Therefore, we convert resource pointers (or vectors of them)
508// to integer values here.
509static bool hasBufferRsrcWorkaround(const LLT Ty) {
510 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
511 return true;
512 if (Ty.isVector()) {
513 const LLT ElemTy = Ty.getElementType();
514 return hasBufferRsrcWorkaround(ElemTy);
515 }
516 return false;
517}
518
519// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
520// workaround this. Eventually it should ignore the type for loads and only care
521// about the size. Return true in cases where we will workaround this for now by
522// bitcasting.
523static bool loadStoreBitcastWorkaround(const LLT Ty) {
524 if (EnableNewLegality)
525 return false;
526
527 const unsigned Size = Ty.getSizeInBits();
528 if (Ty.isPointerVector())
529 return true;
530 if (Size <= 64)
531 return false;
532 // Address space 8 pointers get their own workaround.
533 if (hasBufferRsrcWorkaround(Ty))
534 return false;
535 if (!Ty.isVector())
536 return true;
537
538 unsigned EltSize = Ty.getScalarSizeInBits();
539 return EltSize != 32 && EltSize != 64;
540}
541
542static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
543 const LLT Ty = Query.Types[0];
544 return isRegisterType(ST, Ty) && isLoadStoreSizeLegal(ST, Query) &&
545 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
546}
547
548/// Return true if a load or store of the type should be lowered with a bitcast
549/// to a different type.
550static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
551 const LLT MemTy) {
552 const unsigned MemSizeInBits = MemTy.getSizeInBits();
553 const unsigned Size = Ty.getSizeInBits();
554 if (Size != MemSizeInBits)
555 return Size <= 32 && Ty.isVector();
556
557 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(ST, Ty))
558 return true;
559
560 // Don't try to handle bitcasting vector ext loads for now.
561 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
562 (Size <= 32 || isRegisterSize(ST, Size)) &&
563 !isRegisterVectorElementType(Ty.getElementType());
564}
565
566/// Return true if we should legalize a load by widening an odd sized memory
567/// access up to the alignment. Note that in this case the memory access itself
568/// changes, not the size of the result register.
569static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
570 uint64_t AlignInBits, unsigned AddrSpace,
571 unsigned Opcode) {
572 unsigned SizeInBits = MemoryTy.getSizeInBits();
573 // We don't want to widen cases that are naturally legal.
574 if (isPowerOf2_32(SizeInBits))
575 return false;
576
577 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
578 // end up widening these for a scalar load during RegBankSelect, if we don't
579 // have 96-bit scalar loads.
580 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
581 return false;
582
583 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
584 return false;
585
586 // A load is known dereferenceable up to the alignment, so it's legal to widen
587 // to it.
588 //
589 // TODO: Could check dereferenceable for less aligned cases.
590 unsigned RoundedSize = NextPowerOf2(SizeInBits);
591 if (AlignInBits < RoundedSize)
592 return false;
593
594 // Do not widen if it would introduce a slow unaligned load.
595 const SITargetLowering *TLI = ST.getTargetLowering();
596 unsigned Fast = 0;
597 return TLI->allowsMisalignedMemoryAccessesImpl(
598 RoundedSize, AddrSpace, Align(AlignInBits / 8),
599 MachineMemOperand::MOLoad, &Fast) &&
600 Fast;
601}
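// Worked example (annotation, not part of the upstream file): a non-atomic
// 96-bit global load on a subtarget without 96-bit memory operations is widened
// only when the alignment covers the rounded-up size, i.e. NextPowerOf2(96) =
// 128 bits (16-byte alignment), and only if the target reports that unaligned
// 128-bit access as fast; otherwise the load is split instead.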
602
603static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
604 unsigned Opcode) {
605 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
606 return false;
607
608 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
609 Query.MMODescrs[0].AlignInBits,
610 Query.Types[1].getAddressSpace(), Opcode);
611}
612
613/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
614/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
615/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
616static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
617 MachineRegisterInfo &MRI, unsigned Idx) {
618 MachineOperand &MO = MI.getOperand(Idx);
619
620 const LLT PointerTy = MRI.getType(MO.getReg());
621
622 // Paranoidly prevent us from doing this multiple times.
623 if (!hasBufferRsrcWorkaround(PointerTy))
624 return PointerTy;
625
626 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
627 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
628 if (!PointerTy.isVector()) {
629 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
630 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
631 const LLT S32 = LLT::scalar(32);
632
633 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
634 std::array<Register, 4> VectorElems;
635 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
636 for (unsigned I = 0; I < NumParts; ++I)
637 VectorElems[I] =
638 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
639 B.buildMergeValues(MO, VectorElems);
640 MO.setReg(VectorReg);
641 return VectorTy;
642 }
643 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
644 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
645 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
646 B.buildIntToPtr(MO, Scalar);
647 MO.setReg(BitcastReg);
648
649 return VectorTy;
650}
651
652/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
653/// the form in which the value must be in order to be passed to the low-level
654/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
655/// needed in order to account for the fact that we can't define a register
656/// class for s128 without breaking SelectionDAG.
657static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
658 MachineRegisterInfo &MRI = *B.getMRI();
659 const LLT PointerTy = MRI.getType(Pointer);
660 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
661 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
662
663 if (!PointerTy.isVector()) {
664 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
665 SmallVector<Register, 4> PointerParts;
666 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
667 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
668 for (unsigned I = 0; I < NumParts; ++I)
669 PointerParts.push_back(Unmerged.getReg(I));
670 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
671 }
672 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
673 return B.buildBitcast(VectorTy, Scalar).getReg(0);
674}
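// Illustrative note (annotation, not part of the upstream file): the helper
// ending here and its p8-producing counterpart above are the two directions of
// the same workaround. A p8 value feeding a MUBUF/MTBUF-style operation is
// unmerged into four s32 values and rebuilt as <4 x s32>, while a <4 x s32>
// produced for a p8 result is reassembled into the pointer (by merging the
// pieces, or by bitcast plus inttoptr for vectors of p8), so the rest of the
// pipeline keeps seeing address-space-8 pointers.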
675
676static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
677 unsigned Idx) {
678 MachineOperand &MO = MI.getOperand(Idx);
679
680 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
681 // Paranoidly prevent us from doing this multiple times.
682 if (!hasBufferRsrcWorkaround(PointerTy))
683 return;
684 MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
685}
686
687AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
688 const GCNTargetMachine &TM)
689 : ST(ST_) {
690 using namespace TargetOpcode;
691
692 auto GetAddrSpacePtr = [&TM](unsigned AS) {
693 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
694 };
695
696 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
697 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
698 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
699 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
700 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
701 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
702 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
703 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
704 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
705 const LLT BufferStridedPtr =
706 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
707
708 const LLT CodePtr = FlatPtr;
709
710 const std::initializer_list<LLT> AddrSpaces64 = {
711 GlobalPtr, ConstantPtr, FlatPtr
712 };
713
714 const std::initializer_list<LLT> AddrSpaces32 = {
715 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
716 };
717
718 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
719
720 const std::initializer_list<LLT> FPTypesBase = {
721 S32, S64
722 };
723
724 const std::initializer_list<LLT> FPTypes16 = {
725 S32, S64, S16
726 };
727
728 const std::initializer_list<LLT> FPTypesPK16 = {
729 S32, S64, S16, V2S16
730 };
731
732 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
733
734 // s1 for VCC branches, s32 for SCC branches.
735 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
736
737 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
738 // elements for v3s16
739 getActionDefinitionsBuilder(G_PHI)
740 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
741 .legalFor(AllS32Vectors)
742 .legalFor(AllS64Vectors)
743 .legalFor(AddrSpaces64)
744 .legalFor(AddrSpaces32)
745 .legalFor(AddrSpaces128)
746 .legalIf(isPointer(0))
747 .clampScalar(0, S16, S256)
749 .clampMaxNumElements(0, S32, 16)
751 .scalarize(0);
752
753 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
754 // Full set of gfx9 features.
755 if (ST.hasScalarAddSub64()) {
756 getActionDefinitionsBuilder({G_ADD, G_SUB})
757 .legalFor({S64, S32, S16, V2S16})
758 .clampMaxNumElementsStrict(0, S16, 2)
759 .scalarize(0)
760 .minScalar(0, S16)
762 .maxScalar(0, S32);
763 } else {
764 getActionDefinitionsBuilder({G_ADD, G_SUB})
765 .legalFor({S32, S16, V2S16})
766 .clampMaxNumElementsStrict(0, S16, 2)
767 .scalarize(0)
768 .minScalar(0, S16)
770 .maxScalar(0, S32);
771 }
772
773 if (ST.hasScalarSMulU64()) {
774 getActionDefinitionsBuilder(G_MUL)
775 .legalFor({S64, S32, S16, V2S16})
776 .clampMaxNumElementsStrict(0, S16, 2)
777 .scalarize(0)
778 .minScalar(0, S16)
780 .custom();
781 } else {
782 getActionDefinitionsBuilder(G_MUL)
783 .legalFor({S32, S16, V2S16})
784 .clampMaxNumElementsStrict(0, S16, 2)
785 .scalarize(0)
786 .minScalar(0, S16)
788 .custom();
789 }
790 assert(ST.hasMad64_32());
791
792 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
793 .legalFor({S32, S16, V2S16}) // Clamp modifier
794 .minScalarOrElt(0, S16)
796 .scalarize(0)
798 .lower();
799 } else if (ST.has16BitInsts()) {
800 getActionDefinitionsBuilder({G_ADD, G_SUB})
801 .legalFor({S32, S16})
802 .minScalar(0, S16)
804 .maxScalar(0, S32)
805 .scalarize(0);
806
807 getActionDefinitionsBuilder(G_MUL)
808 .legalFor({S32, S16})
809 .scalarize(0)
810 .minScalar(0, S16)
811 .widenScalarToNextMultipleOf(0, 32)
812 .custom();
813 assert(ST.hasMad64_32());
814
815 // Technically the saturating operations require clamp bit support, but this
816 // was introduced at the same time as 16-bit operations.
817 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
818 .legalFor({S32, S16}) // Clamp modifier
819 .minScalar(0, S16)
820 .scalarize(0)
822 .lower();
823
824 // We're just lowering this, but it helps get a better result to try to
825 // coerce to the desired type first.
826 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
827 .minScalar(0, S16)
828 .scalarize(0)
829 .lower();
830 } else {
831 getActionDefinitionsBuilder({G_ADD, G_SUB})
832 .legalFor({S32})
833 .widenScalarToNextMultipleOf(0, 32)
834 .clampScalar(0, S32, S32)
835 .scalarize(0);
836
837 auto &Mul = getActionDefinitionsBuilder(G_MUL)
838 .legalFor({S32})
839 .scalarize(0)
840 .minScalar(0, S32)
841 .widenScalarToNextMultipleOf(0, 32);
842
843 if (ST.hasMad64_32())
844 Mul.custom();
845 else
846 Mul.maxScalar(0, S32);
847
848 if (ST.hasIntClamp()) {
849 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
850 .legalFor({S32}) // Clamp modifier.
851 .scalarize(0)
852 .minScalarOrElt(0, S32)
853 .lower();
854 } else {
855 // Clamp bit support was added in VI, along with 16-bit operations.
856 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
857 .minScalar(0, S32)
858 .scalarize(0)
859 .lower();
860 }
861
862 // FIXME: DAG expansion gets better results. The widening uses the smaller
863 // range values and goes for the min/max lowering directly.
864 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
865 .minScalar(0, S32)
866 .scalarize(0)
867 .lower();
868 }
869
870 getActionDefinitionsBuilder(
871 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
872 .customFor({S32, S64})
873 .clampScalar(0, S32, S64)
875 .scalarize(0);
876
877 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
878 .legalFor({S32})
879 .maxScalar(0, S32);
880
881 if (ST.hasVOP3PInsts()) {
882 Mulh
883 .clampMaxNumElements(0, S8, 2)
884 .lowerFor({V2S8});
885 }
886
887 Mulh
888 .scalarize(0)
889 .lower();
890
891 // Report legal for any types we can handle anywhere. For the cases only legal
892 // on the SALU, RegBankSelect will be able to re-legalize.
893 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
894 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
895 .clampScalar(0, S32, S64)
901 .scalarize(0);
902
903 getActionDefinitionsBuilder(
904 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
905 .legalFor({{S32, S1}, {S32, S32}})
906 .clampScalar(0, S32, S32)
907 .scalarize(0);
908
910 // Don't worry about the size constraint.
912 .lower();
913
915 .legalFor({S1, S32, S64, S16, GlobalPtr,
916 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
917 .legalIf(isPointer(0))
918 .clampScalar(0, S32, S64)
920
921 getActionDefinitionsBuilder(G_FCONSTANT)
922 .legalFor({S32, S64, S16})
923 .clampScalar(0, S16, S64);
924
925 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
926 .legalIf(isRegisterClassType(ST, 0))
927 // s1 and s16 are special cases because they have legal operations on
928 // them, but don't really occupy registers in the normal way.
929 .legalFor({S1, S16})
930 .clampNumElements(0, V16S32, V32S32)
934 .clampMaxNumElements(0, S32, 16);
935
936 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
937
938 // If the amount is divergent, we have to do a wave reduction to get the
939 // maximum value, so this is expanded during RegBankSelect.
940 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
941 .legalFor({{PrivatePtr, S32}});
942
943 getActionDefinitionsBuilder(G_STACKSAVE)
944 .customFor({PrivatePtr});
945 getActionDefinitionsBuilder(G_STACKRESTORE)
946 .legalFor({PrivatePtr});
947
948 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
949
950 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
951 .customIf(typeIsNot(0, PrivatePtr));
952
953 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
954
955 auto &FPOpActions = getActionDefinitionsBuilder(
956 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
957 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
958 .legalFor({S32, S64});
959 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
960 .customFor({S32, S64});
961 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
962 .customFor({S32, S64});
963
964 if (ST.has16BitInsts()) {
965 if (ST.hasVOP3PInsts())
966 FPOpActions.legalFor({S16, V2S16});
967 else
968 FPOpActions.legalFor({S16});
969
970 TrigActions.customFor({S16});
971 FDIVActions.customFor({S16});
972 }
973
974 if (ST.hasPackedFP32Ops()) {
975 FPOpActions.legalFor({V2S32});
976 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
977 }
978
979 auto &MinNumMaxNum = getActionDefinitionsBuilder(
980 {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM, G_FMINNUM_IEEE,
981 G_FMAXNUM_IEEE});
982
983 if (ST.hasVOP3PInsts()) {
984 MinNumMaxNum.customFor(FPTypesPK16)
987 .clampScalar(0, S16, S64)
988 .scalarize(0);
989 } else if (ST.has16BitInsts()) {
990 MinNumMaxNum.customFor(FPTypes16)
991 .clampScalar(0, S16, S64)
992 .scalarize(0);
993 } else {
994 MinNumMaxNum.customFor(FPTypesBase)
995 .clampScalar(0, S32, S64)
996 .scalarize(0);
997 }
998
999 if (ST.hasVOP3PInsts())
1000 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
1001
1002 FPOpActions
1003 .scalarize(0)
1004 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1005
1006 TrigActions
1007 .scalarize(0)
1008 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1009
1010 FDIVActions
1011 .scalarize(0)
1012 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1013
1014 getActionDefinitionsBuilder({G_FNEG, G_FABS})
1015 .legalFor(FPTypesPK16)
1017 .scalarize(0)
1018 .clampScalar(0, S16, S64);
1019
1020 if (ST.has16BitInsts()) {
1021 getActionDefinitionsBuilder(G_FSQRT)
1022 .legalFor({S16})
1023 .customFor({S32, S64})
1024 .scalarize(0)
1025 .unsupported();
1027 .legalFor({S32, S64, S16})
1028 .scalarize(0)
1029 .clampScalar(0, S16, S64);
1030
1031 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1032 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
1033 .scalarize(0)
1034 .maxScalarIf(typeIs(0, S16), 1, S16)
1035 .clampScalar(1, S32, S32)
1036 .lower();
1037
1038 getActionDefinitionsBuilder(G_FFREXP)
1039 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1040 .scalarize(0)
1041 .lower();
1042 } else {
1043 getActionDefinitionsBuilder(G_FSQRT)
1044 .customFor({S32, S64, S16})
1045 .scalarize(0)
1046 .unsupported();
1047
1048
1049 if (ST.hasFractBug()) {
1050 getActionDefinitionsBuilder(G_FFLOOR)
1051 .customFor({S64})
1052 .legalFor({S32, S64})
1053 .scalarize(0)
1054 .clampScalar(0, S32, S64);
1055 } else {
1057 .legalFor({S32, S64})
1058 .scalarize(0)
1059 .clampScalar(0, S32, S64);
1060 }
1061
1062 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1063 .legalFor({{S32, S32}, {S64, S32}})
1064 .scalarize(0)
1065 .clampScalar(0, S32, S64)
1066 .clampScalar(1, S32, S32)
1067 .lower();
1068
1069 getActionDefinitionsBuilder(G_FFREXP)
1070 .customFor({{S32, S32}, {S64, S32}})
1071 .scalarize(0)
1072 .minScalar(0, S32)
1073 .clampScalar(1, S32, S32)
1074 .lower();
1075 }
1076
1077 auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC);
1078 if (ST.hasCvtPkF16F32Inst()) {
1079 FPTruncActions.legalFor({{S32, S64}, {S16, S32}, {V2S16, V2S32}})
1080 .clampMaxNumElements(0, S16, 2);
1081 } else {
1082 FPTruncActions.legalFor({{S32, S64}, {S16, S32}});
1083 }
1084 FPTruncActions.scalarize(0).lower();
1085
1086 getActionDefinitionsBuilder(G_FPEXT)
1087 .legalFor({{S64, S32}, {S32, S16}})
1088 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1089 .scalarize(0);
1090
1091 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1092 if (ST.has16BitInsts()) {
1093 FSubActions
1094 // Use actual fsub instruction
1095 .legalFor({S32, S16})
1096 // Must use fadd + fneg
1097 .lowerFor({S64, V2S16});
1098 } else {
1099 FSubActions
1100 // Use actual fsub instruction
1101 .legalFor({S32})
1102 // Must use fadd + fneg
1103 .lowerFor({S64, S16, V2S16});
1104 }
1105
1106 FSubActions
1107 .scalarize(0)
1108 .clampScalar(0, S32, S64);
1109
1110 // Whether this is legal depends on the floating point mode for the function.
1111 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1112 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1113 FMad.customFor({S32, S16});
1114 else if (ST.hasMadMacF32Insts())
1115 FMad.customFor({S32});
1116 else if (ST.hasMadF16())
1117 FMad.customFor({S16});
1118 FMad.scalarize(0)
1119 .lower();
1120
1121 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1122 if (ST.has16BitInsts()) {
1123 FRem.customFor({S16, S32, S64});
1124 } else {
1125 FRem.minScalar(0, S32)
1126 .customFor({S32, S64});
1127 }
1128 FRem.scalarize(0);
1129
1130 // TODO: Do we need to clamp maximum bitwidth?
1131 getActionDefinitionsBuilder(G_TRUNC)
1132 .legalIf(isScalar(0))
1133 .legalFor({{V2S16, V2S32}})
1134 .clampMaxNumElements(0, S16, 2)
1135 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1136 // situations (like an invalid implicit use), we don't want to infinite loop
1137 // in the legalizer.
1139 .alwaysLegal();
1140
1141 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1142 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1143 {S32, S1}, {S64, S1}, {S16, S1}})
1144 .scalarize(0)
1145 .clampScalar(0, S32, S64)
1146 .widenScalarToNextPow2(1, 32);
1147
1148 // TODO: Split s1->s64 during regbankselect for VALU.
1149 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1150 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1151 .lowerIf(typeIs(1, S1))
1152 .customFor({{S32, S64}, {S64, S64}});
1153 if (ST.has16BitInsts())
1154 IToFP.legalFor({{S16, S16}});
1155 IToFP.clampScalar(1, S32, S64)
1156 .minScalar(0, S32)
1157 .scalarize(0)
1159
1160 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1161 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1162 .customFor({{S64, S32}, {S64, S64}})
1163 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1164 if (ST.has16BitInsts())
1165 FPToI.legalFor({{S16, S16}});
1166 else
1167 FPToI.minScalar(1, S32);
1168
1169 FPToI.minScalar(0, S32)
1170 .widenScalarToNextPow2(0, 32)
1171 .scalarize(0)
1172 .lower();
1173
1174 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1175 .clampScalar(0, S16, S64)
1176 .scalarize(0)
1177 .lower();
1178
1179 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1180 .legalFor({S16, S32})
1181 .scalarize(0)
1182 .lower();
1183
1184 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1185 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1186 .scalarize(0)
1187 .lower();
1188
1189 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1190 .clampScalar(0, S16, S64)
1191 .scalarize(0)
1192 .lower();
1193
1194 if (ST.has16BitInsts()) {
1195 getActionDefinitionsBuilder(
1196 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1197 .legalFor({S16, S32, S64})
1198 .clampScalar(0, S16, S64)
1199 .scalarize(0);
1200 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1201 getActionDefinitionsBuilder(
1202 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1203 .legalFor({S32, S64})
1204 .clampScalar(0, S32, S64)
1205 .scalarize(0);
1206 } else {
1207 getActionDefinitionsBuilder(
1208 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1209 .legalFor({S32})
1210 .customFor({S64})
1211 .clampScalar(0, S32, S64)
1212 .scalarize(0);
1213 }
1214
1215 getActionDefinitionsBuilder(G_PTR_ADD)
1216 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1217 .legalIf(all(isPointer(0), sameSize(0, 1)))
1218 .scalarize(0)
1219 .scalarSameSizeAs(1, 0);
1220
1221 getActionDefinitionsBuilder(G_PTRMASK)
1222 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1223 .scalarSameSizeAs(1, 0)
1224 .scalarize(0);
1225
1226 auto &CmpBuilder =
1227 getActionDefinitionsBuilder(G_ICMP)
1228 // The compare output type differs based on the register bank of the output,
1229 // so make both s1 and s32 legal.
1230 //
1231 // Scalar compares producing output in scc will be promoted to s32, as that
1232 // is the allocatable register type that will be needed for the copy from
1233 // scc. This will be promoted during RegBankSelect, and we assume something
1234 // before that won't try to use s32 result types.
1235 //
1236 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1237 // bank.
1238 .legalForCartesianProduct(
1239 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1240 .legalForCartesianProduct(
1241 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1242 if (ST.has16BitInsts()) {
1243 CmpBuilder.legalFor({{S1, S16}});
1244 }
1245
1246 CmpBuilder
1247 .widenScalarToNextPow2(1)
1248 .clampScalar(1, S32, S64)
1249 .scalarize(0)
1250 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1251
1252 auto &FCmpBuilder =
1253 getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1254 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1255
1256 if (ST.hasSALUFloatInsts())
1257 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1258
1259 FCmpBuilder
1260 .widenScalarToNextPow2(1)
1261 .clampScalar(1, S32, S64)
1262 .scalarize(0);
1263
1264 // FIXME: fpow has a selection pattern that should move to custom lowering.
1265 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1266 if (ST.has16BitInsts())
1267 ExpOps.customFor({{S32}, {S16}});
1268 else
1269 ExpOps.customFor({S32});
1270 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1271 .scalarize(0);
1272
1273 getActionDefinitionsBuilder(G_FPOWI)
1274 .clampScalar(0, MinScalarFPTy, S32)
1275 .lower();
1276
1277 auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
1278 Log2Ops.customFor({S32});
1279 if (ST.has16BitInsts())
1280 Log2Ops.legalFor({S16});
1281 else
1282 Log2Ops.customFor({S16});
1283 Log2Ops.scalarize(0)
1284 .lower();
1285
1286 auto &LogOps =
1287 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1288 LogOps.customFor({S32, S16});
1289 LogOps.clampScalar(0, MinScalarFPTy, S32)
1290 .scalarize(0);
1291
1292 // The 64-bit versions produce 32-bit results, but only on the SALU.
1293 getActionDefinitionsBuilder(G_CTPOP)
1294 .legalFor({{S32, S32}, {S32, S64}})
1295 .clampScalar(0, S32, S32)
1296 .widenScalarToNextPow2(1, 32)
1297 .clampScalar(1, S32, S64)
1298 .scalarize(0)
1299 .widenScalarToNextPow2(0, 32);
1300
1301 // If no 16 bit instr is available, lower into different instructions.
1302 if (ST.has16BitInsts())
1303 getActionDefinitionsBuilder(G_IS_FPCLASS)
1304 .legalForCartesianProduct({S1}, FPTypes16)
1305 .widenScalarToNextPow2(1)
1306 .scalarize(0)
1307 .lower();
1308 else
1309 getActionDefinitionsBuilder(G_IS_FPCLASS)
1310 .legalForCartesianProduct({S1}, FPTypesBase)
1311 .lowerFor({S1, S16})
1312 .widenScalarToNextPow2(1)
1313 .scalarize(0)
1314 .lower();
1315
1316 // The hardware instructions return a different result on 0 than the generic
1317 // instructions expect. The hardware produces -1, but these produce the
1318 // bitwidth.
1319 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1320 .scalarize(0)
1321 .clampScalar(0, S32, S32)
1322 .clampScalar(1, S32, S64)
1323 .widenScalarToNextPow2(0, 32)
1324 .widenScalarToNextPow2(1, 32)
1325 .custom();
1326
1327 // The 64-bit versions produce 32-bit results, but only on the SALU.
1328 getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
1329 .legalFor({{S32, S32}, {S32, S64}})
1330 .customIf(scalarNarrowerThan(1, 32))
1331 .clampScalar(0, S32, S32)
1332 .clampScalar(1, S32, S64)
1333 .scalarize(0)
1334 .widenScalarToNextPow2(0, 32)
1335 .widenScalarToNextPow2(1, 32);
1336
1337 getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
1338 .legalFor({{S32, S32}, {S32, S64}})
1339 .clampScalar(0, S32, S32)
1340 .clampScalar(1, S32, S64)
1341 .scalarize(0)
1342 .widenScalarToNextPow2(0, 32)
1343 .widenScalarToNextPow2(1, 32);
1344
1345 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1346 // RegBankSelect.
1347 getActionDefinitionsBuilder(G_BITREVERSE)
1348 .legalFor({S32, S64})
1349 .clampScalar(0, S32, S64)
1350 .scalarize(0)
1351 .widenScalarToNextPow2(0);
1352
1353 if (ST.has16BitInsts()) {
1354 getActionDefinitionsBuilder(G_BSWAP)
1355 .legalFor({S16, S32, V2S16})
1356 .clampMaxNumElementsStrict(0, S16, 2)
1357 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1358 // narrowScalar limitation.
1359 .widenScalarToNextPow2(0)
1360 .clampScalar(0, S16, S32)
1361 .scalarize(0);
1362
1363 if (ST.hasVOP3PInsts()) {
1364 getActionDefinitionsBuilder(G_ABS)
1365 .legalFor({S32, S16, V2S16})
1366 .clampMaxNumElements(0, S16, 2)
1367 .minScalar(0, S16)
1368 .widenScalarToNextPow2(0)
1369 .scalarize(0)
1370 .lower();
1371 if (ST.hasIntMinMax64()) {
1372 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1373 .legalFor({S32, S16, S64, V2S16})
1374 .clampMaxNumElements(0, S16, 2)
1375 .minScalar(0, S16)
1376 .widenScalarToNextPow2(0)
1377 .scalarize(0)
1378 .lower();
1379 } else {
1380 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1381 .legalFor({S32, S16, V2S16})
1382 .clampMaxNumElements(0, S16, 2)
1383 .minScalar(0, S16)
1384 .widenScalarToNextPow2(0)
1385 .scalarize(0)
1386 .lower();
1387 }
1388 } else {
1389 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1390 .legalFor({S32, S16})
1391 .widenScalarToNextPow2(0)
1392 .minScalar(0, S16)
1393 .scalarize(0)
1394 .lower();
1395 }
1396 } else {
1397 // TODO: Should have same legality without v_perm_b32
1398 getActionDefinitionsBuilder(G_BSWAP)
1399 .legalFor({S32})
1400 .lowerIf(scalarNarrowerThan(0, 32))
1401 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1402 // narrowScalar limitation.
1403 .widenScalarToNextPow2(0)
1404 .maxScalar(0, S32)
1405 .scalarize(0)
1406 .lower();
1407
1408 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1409 .legalFor({S32})
1410 .minScalar(0, S32)
1411 .widenScalarToNextPow2(0)
1412 .scalarize(0)
1413 .lower();
1414 }
1415
1416 getActionDefinitionsBuilder(G_INTTOPTR)
1417 // List the common cases
1418 .legalForCartesianProduct(AddrSpaces64, {S64})
1419 .legalForCartesianProduct(AddrSpaces32, {S32})
1420 .scalarize(0)
1421 // Accept any address space as long as the size matches
1422 .legalIf(sameSize(0, 1))
1423 .widenScalarIf(smallerThan(1, 0),
1424 [](const LegalityQuery &Query) {
1425 return std::pair(
1426 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1427 })
1428 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1429 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1430 });
1431
1432 getActionDefinitionsBuilder(G_PTRTOINT)
1433 // List the common cases
1434 .legalForCartesianProduct(AddrSpaces64, {S64})
1435 .legalForCartesianProduct(AddrSpaces32, {S32})
1436 .scalarize(0)
1437 // Accept any address space as long as the size matches
1438 .legalIf(sameSize(0, 1))
1439 .widenScalarIf(smallerThan(0, 1),
1440 [](const LegalityQuery &Query) {
1441 return std::pair(
1442 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1443 })
1444 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1445 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1446 });
1447
1448 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1449 .scalarize(0)
1450 .custom();
1451
1452 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1453 bool IsLoad) -> bool {
1454 const LLT DstTy = Query.Types[0];
1455
1456 // Split vector extloads.
1457 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1458
1459 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1460 return true;
1461
1462 const LLT PtrTy = Query.Types[1];
1463 unsigned AS = PtrTy.getAddressSpace();
1464 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1465 Query.MMODescrs[0].Ordering !=
1466 AtomicOrdering::NotAtomic))
1467 return true;
1468
1469 // Catch weird sized loads that don't evenly divide into the access sizes
1470 // TODO: May be able to widen depending on alignment etc.
1471 unsigned NumRegs = (MemSize + 31) / 32;
1472 if (NumRegs == 3) {
1473 if (!ST.hasDwordx3LoadStores())
1474 return true;
1475 } else {
1476 // If the alignment allows, these should have been widened.
1477 if (!isPowerOf2_32(NumRegs))
1478 return true;
1479 }
1480
1481 return false;
1482 };
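// Worked example (annotation, not part of the upstream file): a non-atomic
// <8 x s32> (256-bit) LDS access is split here because it exceeds the 64- or
// 128-bit LDS limit, and a 96-bit access occupies three dwords, which is only
// kept intact on subtargets with dwordx3 load/stores; otherwise it is split too.
static_assert((96 + 31) / 32 == 3, "a 96-bit access occupies three dwords");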
1483
1484 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1485 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1486 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1487
1488 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1489 // LDS
1490 // TODO: Unsupported flat for SI.
1491
1492 for (unsigned Op : {G_LOAD, G_STORE}) {
1493 const bool IsStore = Op == G_STORE;
1494
1495 auto &Actions = getActionDefinitionsBuilder(Op);
1496 // Explicitly list some common cases.
1497 // TODO: Does this help compile time at all?
1498 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1499 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1500 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1501 {S64, GlobalPtr, S64, GlobalAlign32},
1502 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1503 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1504 {S32, GlobalPtr, S8, GlobalAlign8},
1505 {S32, GlobalPtr, S16, GlobalAlign16},
1506
1507 {S32, LocalPtr, S32, 32},
1508 {S64, LocalPtr, S64, 32},
1509 {V2S32, LocalPtr, V2S32, 32},
1510 {S32, LocalPtr, S8, 8},
1511 {S32, LocalPtr, S16, 16},
1512 {V2S16, LocalPtr, S32, 32},
1513
1514 {S32, PrivatePtr, S32, 32},
1515 {S32, PrivatePtr, S8, 8},
1516 {S32, PrivatePtr, S16, 16},
1517 {V2S16, PrivatePtr, S32, 32},
1518
1519 {S32, ConstantPtr, S32, GlobalAlign32},
1520 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1521 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1522 {S64, ConstantPtr, S64, GlobalAlign32},
1523 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1524 Actions.legalIf(
1525 [=](const LegalityQuery &Query) -> bool {
1526 return isLoadStoreLegal(ST, Query);
1527 });
1528
1529 // The custom pointers (fat pointers, buffer resources) don't work with load
1530 // and store at this level. Fat pointers should have been lowered to
1531 // intrinsics before the translation to MIR.
1532 Actions.unsupportedIf(
1533 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1534
1535 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1536 // ptrtoint. This is needed to account for the fact that we can't have i128
1537 // as a register class for SelectionDAG reasons.
1538 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1539 return hasBufferRsrcWorkaround(Query.Types[0]);
1540 });
1541
1542 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1543 // 64-bits.
1544 //
1545 // TODO: Should generalize bitcast action into coerce, which will also cover
1546 // inserting addrspacecasts.
1547 Actions.customIf(typeIs(1, Constant32Ptr));
1548
1549 // Turn any illegal element vectors into something easier to deal
1550 // with. These will ultimately produce 32-bit scalar shifts to extract the
1551 // parts anyway.
1552 //
1553 // For odd 16-bit element vectors, prefer to split those into pieces with
1554 // 16-bit vector parts.
1555 Actions.bitcastIf(
1556 [=](const LegalityQuery &Query) -> bool {
1557 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1558 Query.MMODescrs[0].MemoryTy);
1559 }, bitcastToRegisterType(0));
1560
1561 if (!IsStore) {
1562 // Widen suitably aligned loads by loading extra bytes. The standard
1563 // legalization actions can't properly express widening memory operands.
1564 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1565 return shouldWidenLoad(ST, Query, G_LOAD);
1566 });
1567 }
1568
1569 // FIXME: load/store narrowing should be moved to lower action
1570 Actions
1571 .narrowScalarIf(
1572 [=](const LegalityQuery &Query) -> bool {
1573 return !Query.Types[0].isVector() &&
1574 needToSplitMemOp(Query, Op == G_LOAD);
1575 },
1576 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1577 const LLT DstTy = Query.Types[0];
1578 const LLT PtrTy = Query.Types[1];
1579
1580 const unsigned DstSize = DstTy.getSizeInBits();
1581 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1582
1583 // Split extloads.
1584 if (DstSize > MemSize)
1585 return std::pair(0, LLT::scalar(MemSize));
1586
1587 unsigned MaxSize = maxSizeForAddrSpace(
1588 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1589 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1590 if (MemSize > MaxSize)
1591 return std::pair(0, LLT::scalar(MaxSize));
1592
1593 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1594 return std::pair(0, LLT::scalar(Align));
1595 })
1596 .fewerElementsIf(
1597 [=](const LegalityQuery &Query) -> bool {
1598 return Query.Types[0].isVector() &&
1599 needToSplitMemOp(Query, Op == G_LOAD);
1600 },
1601 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1602 const LLT DstTy = Query.Types[0];
1603 const LLT PtrTy = Query.Types[1];
1604
1605 LLT EltTy = DstTy.getElementType();
1606 unsigned MaxSize = maxSizeForAddrSpace(
1607 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1608 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1609
1610 // FIXME: Handle widened to power of 2 results better. This ends
1611 // up scalarizing.
1612 // FIXME: 3 element stores scalarized on SI
1613
1614 // Split if it's too large for the address space.
1615 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1616 if (MemSize > MaxSize) {
1617 unsigned NumElts = DstTy.getNumElements();
1618 unsigned EltSize = EltTy.getSizeInBits();
1619
1620 if (MaxSize % EltSize == 0) {
1621 return std::pair(
1622 0, LLT::scalarOrVector(
1623 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1624 }
1625
1626 unsigned NumPieces = MemSize / MaxSize;
1627
1628 // FIXME: Refine when odd breakdowns handled
1629 // The scalars will need to be re-legalized.
1630 if (NumPieces == 1 || NumPieces >= NumElts ||
1631 NumElts % NumPieces != 0)
1632 return std::pair(0, EltTy);
1633
1634 return std::pair(0,
1635 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1636 }
1637
1638 // FIXME: We could probably handle weird extending loads better.
1639 if (DstTy.getSizeInBits() > MemSize)
1640 return std::pair(0, EltTy);
1641
1642 unsigned EltSize = EltTy.getSizeInBits();
1643 unsigned DstSize = DstTy.getSizeInBits();
1644 if (!isPowerOf2_32(DstSize)) {
1645 // We're probably decomposing an odd sized store. Try to split
1646 // to the widest type. TODO: Account for alignment. As-is it
1647 // should be OK, since the new parts will be further legalized.
1648 unsigned FloorSize = llvm::bit_floor(DstSize);
1649 return std::pair(
1650 0, LLT::scalarOrVector(
1651 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1652 }
1653
1654 // May need relegalization for the scalars.
1655 return std::pair(0, EltTy);
1656 })
1657 .minScalar(0, S32)
1658 .narrowScalarIf(isTruncStoreToSizePowerOf2(0),
1660 .widenScalarToNextPow2(0)
1661 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1662 .lower();
1663 }
1664
1665 // FIXME: Unaligned accesses not lowered.
1666 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1667 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1668 {S32, GlobalPtr, S16, 2 * 8},
1669 {S32, LocalPtr, S8, 8},
1670 {S32, LocalPtr, S16, 16},
1671 {S32, PrivatePtr, S8, 8},
1672 {S32, PrivatePtr, S16, 16},
1673 {S32, ConstantPtr, S8, 8},
1674 {S32, ConstantPtr, S16, 2 * 8}})
1675 .legalIf(
1676 [=](const LegalityQuery &Query) -> bool {
1677 return isLoadStoreLegal(ST, Query);
1678 });
1679
1680 if (ST.hasFlatAddressSpace()) {
1681 ExtLoads.legalForTypesWithMemDesc(
1682 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1683 }
1684
1685 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1686 // 64-bits.
1687 //
1688 // TODO: Should generalize bitcast action into coerce, which will also cover
1689 // inserting addrspacecasts.
1690 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1691
1692 ExtLoads.clampScalar(0, S32, S32)
1693 .widenScalarToNextPow2(0)
1694 .lower();
1695
1696 auto &Atomics = getActionDefinitionsBuilder(
1697 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1698 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1699 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1700 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1701 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1702 {S64, GlobalPtr}, {S64, LocalPtr},
1703 {S32, RegionPtr}, {S64, RegionPtr}});
1704 if (ST.hasFlatAddressSpace()) {
1705 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1706 }
1707
1708 // TODO: v2bf16 operations, and fat buffer pointer support.
1709 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1710 if (ST.hasLDSFPAtomicAddF32()) {
1711 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1712 if (ST.hasLdsAtomicAddF64())
1713 Atomic.legalFor({{S64, LocalPtr}});
1714 if (ST.hasAtomicDsPkAdd16Insts())
1715 Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1716 }
1717 if (ST.hasAtomicFaddInsts())
1718 Atomic.legalFor({{S32, GlobalPtr}});
1719 if (ST.hasFlatAtomicFaddF32Inst())
1720 Atomic.legalFor({{S32, FlatPtr}});
1721
1722 if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
1723 // These are legal with some caveats, and should have undergone expansion in
1724 // the IR in most situations
1725 // TODO: Move atomic expansion into legalizer
1726 Atomic.legalFor({
1727 {S32, GlobalPtr},
1728 {S64, GlobalPtr},
1729 {S64, FlatPtr}
1730 });
1731 }
1732
1733 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1734 ST.hasAtomicBufferGlobalPkAddF16Insts())
1735 Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1736 if (ST.hasAtomicGlobalPkAddBF16Inst())
1737 Atomic.legalFor({{V2BF16, GlobalPtr}});
1738 if (ST.hasAtomicFlatPkAdd16Insts())
1739 Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1740
1741
1742 // Most of the legalization work here is done by AtomicExpand. We could
1743 // probably use a simpler legality rule that just assumes anything is OK.
1744 auto &AtomicFMinFMax =
1745 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1746 .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1747
1748 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1749 AtomicFMinFMax.legalFor({{F32, GlobalPtr},{F32, BufferFatPtr}});
1750 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1751 AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
1752 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1753 AtomicFMinFMax.legalFor({F32, FlatPtr});
1754 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1755 AtomicFMinFMax.legalFor({F64, FlatPtr});
1756
1757 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1758 // demarshalling
1759 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1760 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1761 {S32, FlatPtr}, {S64, FlatPtr}})
1762 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1763 {S32, RegionPtr}, {S64, RegionPtr}});
1764 // TODO: Pointer types, any 32-bit or 64-bit vector
1765
1766 // Condition should be s32 for scalar, s1 for vector.
1767 getActionDefinitionsBuilder(G_SELECT)
1768 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1769 LocalPtr, FlatPtr, PrivatePtr,
1770 LLT::fixed_vector(2, LocalPtr),
1771 LLT::fixed_vector(2, PrivatePtr)},
1772 {S1, S32})
1773 .clampScalar(0, S16, S64)
1774 .scalarize(1)
1775 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1776 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1777 .clampMaxNumElements(0, S32, 2)
1778 .clampMaxNumElements(0, LocalPtr, 2)
1779 .clampMaxNumElements(0, PrivatePtr, 2)
1780 .scalarize(0)
1781 .widenScalarToNextPow2(0)
1782 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1783
1784 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1785 // be more flexible with the shift amount type.
1786 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1787 .legalFor({{S32, S32}, {S64, S32}});
1788 if (ST.has16BitInsts()) {
1789 if (ST.hasVOP3PInsts()) {
1790 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1791 .clampMaxNumElements(0, S16, 2);
1792 } else
1793 Shifts.legalFor({{S16, S16}});
1794
1795 // TODO: Support 16-bit shift amounts for all types
1796 Shifts.widenScalarIf(
1797 [=](const LegalityQuery &Query) {
1798 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1799 // 32-bit amount.
1800 const LLT ValTy = Query.Types[0];
1801 const LLT AmountTy = Query.Types[1];
1802 return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&
1803 AmountTy.getSizeInBits() < 16;
1804 }, changeTo(1, S16));
1805 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1806 Shifts.clampScalar(1, S32, S32);
1807 Shifts.widenScalarToNextPow2(0, 16);
1808 Shifts.clampScalar(0, S16, S64);
1809
1810 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1811 .minScalar(0, S16)
1812 .scalarize(0)
1813 .lower();
1814 } else {
1815 // Make sure we legalize the shift amount type first, as the general
1816 // expansion for the shifted type will produce much worse code if it hasn't
1817 // been truncated already.
1818 Shifts.clampScalar(1, S32, S32);
1819 Shifts.widenScalarToNextPow2(0, 32);
1820 Shifts.clampScalar(0, S32, S64);
1821
1822 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1823 .minScalar(0, S32)
1824 .scalarize(0)
1825 .lower();
1826 }
1827 Shifts.scalarize(0);
1828
1829 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1830 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1831 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1832 unsigned IdxTypeIdx = 2;
1833
1834 getActionDefinitionsBuilder(Op)
1835 .customIf([=](const LegalityQuery &Query) {
1836 const LLT EltTy = Query.Types[EltTypeIdx];
1837 const LLT VecTy = Query.Types[VecTypeIdx];
1838 const LLT IdxTy = Query.Types[IdxTypeIdx];
1839 const unsigned EltSize = EltTy.getSizeInBits();
1840 const bool isLegalVecType =
1842 // Address space 8 pointers are 128-bit wide values, but the logic
1843 // below will try to bitcast them to 2N x s64, which will fail.
 1844 // Therefore, as an intermediate step, wrap the extract/insert in a
 1845 // ptrtoint of the vector and scalar arguments (and an inttoptr of the
 1846 // extraction result) to produce a vector operation that can be handled
 1847 // by the logic below.
1848 if (EltTy.isPointer() && EltSize > 64)
1849 return true;
1850 return (EltSize == 32 || EltSize == 64) &&
1851 VecTy.getSizeInBits() % 32 == 0 &&
1852 VecTy.getSizeInBits() <= MaxRegisterSize &&
1853 IdxTy.getSizeInBits() == 32 &&
1854 isLegalVecType;
1855 })
1856 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1857 scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1858 bitcastToVectorElement32(VecTypeIdx))
1859 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1860 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1861 scalarOrEltWiderThan(VecTypeIdx, 64)),
1862 [=](const LegalityQuery &Query) {
1863 // For > 64-bit element types, try to turn this into a
1864 // 64-bit element vector since we may be able to do better
1865 // indexing if this is scalar. If not, fall back to 32.
1866 const LLT EltTy = Query.Types[EltTypeIdx];
1867 const LLT VecTy = Query.Types[VecTypeIdx];
1868 const unsigned DstEltSize = EltTy.getSizeInBits();
1869 const unsigned VecSize = VecTy.getSizeInBits();
1870
1871 const unsigned TargetEltSize =
1872 DstEltSize % 64 == 0 ? 64 : 32;
1873 return std::pair(VecTypeIdx,
1874 LLT::fixed_vector(VecSize / TargetEltSize,
1875 TargetEltSize));
1876 })
1877 .clampScalar(EltTypeIdx, S32, S64)
1878 .clampScalar(VecTypeIdx, S32, S64)
1879 .clampScalar(IdxTypeIdx, S32, S32)
1880 .clampMaxNumElements(VecTypeIdx, S32, 32)
1881 // TODO: Clamp elements for 64-bit vectors?
1882 .moreElementsIf(isIllegalRegisterType(ST, VecTypeIdx),
1884 // It should only be necessary with variable indexes.
1885 // As a last resort, lower to the stack
1886 .lower();
1887 }
1888
1889 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1890 .unsupportedIf([=](const LegalityQuery &Query) {
1891 const LLT &EltTy = Query.Types[1].getElementType();
1892 return Query.Types[0] != EltTy;
1893 });
1894
1895 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1896 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1897 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1898
1899 // FIXME: Doesn't handle extract of illegal sizes.
1900 getActionDefinitionsBuilder(Op)
1901 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1902 .lowerIf([=](const LegalityQuery &Query) {
 1903 // Sub-vector (or single element) insert and extract.
1904 // TODO: verify immediate offset here since lower only works with
1905 // whole elements.
1906 const LLT BigTy = Query.Types[BigTyIdx];
1907 return BigTy.isVector();
1908 })
1909 // FIXME: Multiples of 16 should not be legal.
1910 .legalIf([=](const LegalityQuery &Query) {
1911 const LLT BigTy = Query.Types[BigTyIdx];
1912 const LLT LitTy = Query.Types[LitTyIdx];
1913 return (BigTy.getSizeInBits() % 32 == 0) &&
1914 (LitTy.getSizeInBits() % 16 == 0);
1915 })
1916 .widenScalarIf(
1917 [=](const LegalityQuery &Query) {
1918 const LLT BigTy = Query.Types[BigTyIdx];
1919 return (BigTy.getScalarSizeInBits() < 16);
1920 },
1922 .widenScalarIf(
1923 [=](const LegalityQuery &Query) {
1924 const LLT LitTy = Query.Types[LitTyIdx];
1925 return (LitTy.getScalarSizeInBits() < 16);
1926 },
1928 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1929 .widenScalarToNextPow2(BigTyIdx, 32);
1930
1931 }
1932
1933 auto &BuildVector =
1934 getActionDefinitionsBuilder(G_BUILD_VECTOR)
1935 .legalForCartesianProduct(AllS32Vectors, {S32})
1936 .legalForCartesianProduct(AllS64Vectors, {S64})
1937 .clampNumElements(0, V16S32, V32S32)
1938 .clampNumElements(0, V2S64, V16S64)
1939 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
1940 .moreElementsIf(isIllegalRegisterType(ST, 0),
1942
1943 if (ST.hasScalarPackInsts()) {
1944 BuildVector
1945 // FIXME: Should probably widen s1 vectors straight to s32
1946 .minScalarOrElt(0, S16)
1947 .minScalar(1, S16);
1948
1949 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1950 .legalFor({V2S16, S32})
1951 .lower();
1952 } else {
1953 BuildVector.customFor({V2S16, S16});
1954 BuildVector.minScalarOrElt(0, S32);
1955
1956 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1957 .customFor({V2S16, S32})
1958 .lower();
1959 }
1960
1961 BuildVector.legalIf(isRegisterType(ST, 0));
1962
1963 // FIXME: Clamp maximum size
1964 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1965 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
1966 .clampMaxNumElements(0, S32, 32)
1967 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1968 .clampMaxNumElements(0, S16, 64);
1969
1970 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1971
1972 // Merge/Unmerge
1973 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1974 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1975 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1976
1977 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1978 const LLT Ty = Query.Types[TypeIdx];
1979 if (Ty.isVector()) {
1980 const LLT &EltTy = Ty.getElementType();
1981 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1982 return true;
1983 if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
1984 return true;
1985 }
1986 return false;
1987 };
1988
1989 auto &Builder =
1990 getActionDefinitionsBuilder(Op)
1991 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
1992 .lowerFor({{S16, V2S16}})
1993 .lowerIf([=](const LegalityQuery &Query) {
1994 const LLT BigTy = Query.Types[BigTyIdx];
1995 return BigTy.getSizeInBits() == 32;
1996 })
1997 // Try to widen to s16 first for small types.
1998 // TODO: Only do this on targets with legal s16 shifts
1999 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
2000 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
2001 .moreElementsIf(isSmallOddVector(BigTyIdx),
2002 oneMoreElement(BigTyIdx))
2003 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
2004 elementTypeIs(1, S16)),
2005 changeTo(1, V2S16))
 2006 // Clamp the little scalar to s32-s512 and make it a power of 2. It's
2007 // not worth considering the multiples of 64 since 2*192 and 2*384
2008 // are not valid.
2009 .clampScalar(LitTyIdx, S32, S512)
2010 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
2011 // Break up vectors with weird elements into scalars
2012 .fewerElementsIf(
2013 [=](const LegalityQuery &Query) {
2014 return notValidElt(Query, LitTyIdx);
2015 },
2016 scalarize(0))
2017 .fewerElementsIf(
2018 [=](const LegalityQuery &Query) {
2019 return notValidElt(Query, BigTyIdx);
2020 },
2021 scalarize(1))
2022 .clampScalar(BigTyIdx, S32, MaxScalar);
2023
2024 if (Op == G_MERGE_VALUES) {
2025 Builder.widenScalarIf(
2026 // TODO: Use 16-bit shifts if legal for 8-bit values?
2027 [=](const LegalityQuery &Query) {
2028 const LLT Ty = Query.Types[LitTyIdx];
2029 return Ty.getSizeInBits() < 32;
2030 },
2031 changeTo(LitTyIdx, S32));
2032 }
2033
2034 Builder.widenScalarIf(
2035 [=](const LegalityQuery &Query) {
2036 const LLT Ty = Query.Types[BigTyIdx];
2037 return Ty.getSizeInBits() % 16 != 0;
2038 },
2039 [=](const LegalityQuery &Query) {
 2040 // Pick the next power of 2, or a multiple of 64 over 128,
 2041 // whichever is smaller.
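 // For example, a 96-bit type widens to the next power of 2 (128 bits),
 // while a 160-bit type would widen to 256 bits, so it is instead rounded
 // up to the next multiple of 64, i.e. 192 bits.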
2042 const LLT &Ty = Query.Types[BigTyIdx];
2043 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
2044 if (NewSizeInBits >= 256) {
2045 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
2046 if (RoundedTo < NewSizeInBits)
2047 NewSizeInBits = RoundedTo;
2048 }
2049 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
2050 })
2051 // Any vectors left are the wrong size. Scalarize them.
2052 .scalarize(0)
2053 .scalarize(1);
2054 }
2055
2056 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
2057 // RegBankSelect.
2058 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
2059 .legalFor({{S32}, {S64}})
2060 .clampScalar(0, S32, S64);
2061
2062 if (ST.hasVOP3PInsts()) {
2063 SextInReg.lowerFor({{V2S16}})
2064 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
2065 // get more vector shift opportunities, since we'll get those when
2066 // expanded.
2067 .clampMaxNumElementsStrict(0, S16, 2);
2068 } else if (ST.has16BitInsts()) {
2069 SextInReg.lowerFor({{S32}, {S64}, {S16}});
2070 } else {
2071 // Prefer to promote to s32 before lowering if we don't have 16-bit
 2072 // shifts. This avoids a lot of intermediate truncate and extend operations.
2073 SextInReg.lowerFor({{S32}, {S64}});
2074 }
2075
2076 SextInReg
2077 .scalarize(0)
2078 .clampScalar(0, S32, S64)
2079 .lower();
2080
2081 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2082 .scalarize(0)
2083 .lower();
2084
 2085 // TODO: Only try to form v2s16 with legal packed instructions.
2086 getActionDefinitionsBuilder(G_FSHR)
2087 .legalFor({{S32, S32}})
2088 .lowerFor({{V2S16, V2S16}})
2089 .clampMaxNumElementsStrict(0, S16, 2)
2090 .scalarize(0)
2091 .lower();
2092
2093 if (ST.hasVOP3PInsts()) {
2094 getActionDefinitionsBuilder(G_FSHL)
2095 .lowerFor({{V2S16, V2S16}})
2096 .clampMaxNumElementsStrict(0, S16, 2)
2097 .scalarize(0)
2098 .lower();
2099 } else {
2100 getActionDefinitionsBuilder(G_FSHL)
2101 .scalarize(0)
2102 .lower();
2103 }
2104
2105 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2106 .legalFor({S64});
2107
2108 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
2109
2110 getActionDefinitionsBuilder(G_FENCE)
2111 .alwaysLegal();
2112
2113 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2114 .scalarize(0)
2115 .minScalar(0, S32)
2116 .lower();
2117
2118 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2119 .legalFor({{S32, S32}, {S64, S32}})
2120 .clampScalar(1, S32, S32)
2121 .clampScalar(0, S32, S64)
2122 .widenScalarToNextPow2(0)
2123 .scalarize(0);
2124
2125 getActionDefinitionsBuilder(
2126 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2127 G_FCOPYSIGN,
2128
2129 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2130 G_READ_REGISTER, G_WRITE_REGISTER,
2131
2132 G_SADDO, G_SSUBO})
2133 .lower();
2134
2135 if (ST.hasIEEEMinimumMaximumInsts()) {
2136 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2137 .legalFor(FPTypesPK16)
2138 .clampMaxNumElements(0, S16, 2)
2139 .scalarize(0);
2140 } else {
2141 // TODO: Implement
2142 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
2143 }
2144
2145 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2146 .lower();
2147
2148 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2149
2150 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2151 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2152 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2153 .unsupported();
2154
2155 getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2156
2157 getActionDefinitionsBuilder(
2158 {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2159 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2160 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2161 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2162 .legalFor(AllVectors)
2163 .scalarize(1)
2164 .lower();
2165
2166 getLegacyLegalizerInfo().computeTables();
2167 verify(*ST.getInstrInfo());
2168}
2169
2172 LostDebugLocObserver &LocObserver) const {
2173 MachineIRBuilder &B = Helper.MIRBuilder;
2174 MachineRegisterInfo &MRI = *B.getMRI();
2175
2176 switch (MI.getOpcode()) {
2177 case TargetOpcode::G_ADDRSPACE_CAST:
2178 return legalizeAddrSpaceCast(MI, MRI, B);
2179 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2180 return legalizeFroundeven(MI, MRI, B);
2181 case TargetOpcode::G_FCEIL:
2182 return legalizeFceil(MI, MRI, B);
2183 case TargetOpcode::G_FREM:
2184 return legalizeFrem(MI, MRI, B);
2185 case TargetOpcode::G_INTRINSIC_TRUNC:
2186 return legalizeIntrinsicTrunc(MI, MRI, B);
2187 case TargetOpcode::G_SITOFP:
2188 return legalizeITOFP(MI, MRI, B, true);
2189 case TargetOpcode::G_UITOFP:
2190 return legalizeITOFP(MI, MRI, B, false);
2191 case TargetOpcode::G_FPTOSI:
2192 return legalizeFPTOI(MI, MRI, B, true);
2193 case TargetOpcode::G_FPTOUI:
2194 return legalizeFPTOI(MI, MRI, B, false);
2195 case TargetOpcode::G_FMINNUM:
2196 case TargetOpcode::G_FMAXNUM:
2197 case TargetOpcode::G_FMINIMUMNUM:
2198 case TargetOpcode::G_FMAXIMUMNUM:
2199 case TargetOpcode::G_FMINNUM_IEEE:
2200 case TargetOpcode::G_FMAXNUM_IEEE:
2201 return legalizeMinNumMaxNum(Helper, MI);
2202 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2203 return legalizeExtractVectorElt(MI, MRI, B);
2204 case TargetOpcode::G_INSERT_VECTOR_ELT:
2205 return legalizeInsertVectorElt(MI, MRI, B);
2206 case TargetOpcode::G_FSIN:
2207 case TargetOpcode::G_FCOS:
2208 return legalizeSinCos(MI, MRI, B);
2209 case TargetOpcode::G_GLOBAL_VALUE:
2210 return legalizeGlobalValue(MI, MRI, B);
2211 case TargetOpcode::G_LOAD:
2212 case TargetOpcode::G_SEXTLOAD:
2213 case TargetOpcode::G_ZEXTLOAD:
2214 return legalizeLoad(Helper, MI);
2215 case TargetOpcode::G_STORE:
2216 return legalizeStore(Helper, MI);
2217 case TargetOpcode::G_FMAD:
2218 return legalizeFMad(MI, MRI, B);
2219 case TargetOpcode::G_FDIV:
2220 return legalizeFDIV(MI, MRI, B);
2221 case TargetOpcode::G_FFREXP:
2222 return legalizeFFREXP(MI, MRI, B);
2223 case TargetOpcode::G_FSQRT:
2224 return legalizeFSQRT(MI, MRI, B);
2225 case TargetOpcode::G_UDIV:
2226 case TargetOpcode::G_UREM:
2227 case TargetOpcode::G_UDIVREM:
2228 return legalizeUnsignedDIV_REM(MI, MRI, B);
2229 case TargetOpcode::G_SDIV:
2230 case TargetOpcode::G_SREM:
2231 case TargetOpcode::G_SDIVREM:
2232 return legalizeSignedDIV_REM(MI, MRI, B);
2233 case TargetOpcode::G_ATOMIC_CMPXCHG:
2234 return legalizeAtomicCmpXChg(MI, MRI, B);
2235 case TargetOpcode::G_FLOG2:
2236 return legalizeFlog2(MI, B);
2237 case TargetOpcode::G_FLOG:
2238 case TargetOpcode::G_FLOG10:
2239 return legalizeFlogCommon(MI, B);
2240 case TargetOpcode::G_FEXP2:
2241 return legalizeFExp2(MI, B);
2242 case TargetOpcode::G_FEXP:
2243 case TargetOpcode::G_FEXP10:
2244 return legalizeFExp(MI, B);
2245 case TargetOpcode::G_FPOW:
2246 return legalizeFPow(MI, B);
2247 case TargetOpcode::G_FFLOOR:
2248 return legalizeFFloor(MI, MRI, B);
2249 case TargetOpcode::G_BUILD_VECTOR:
2250 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2251 return legalizeBuildVector(MI, MRI, B);
2252 case TargetOpcode::G_MUL:
2253 return legalizeMul(Helper, MI);
2254 case TargetOpcode::G_CTLZ:
2255 case TargetOpcode::G_CTTZ:
2256 return legalizeCTLZ_CTTZ(MI, MRI, B);
2257 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2258 return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
2259 case TargetOpcode::G_STACKSAVE:
2260 return legalizeStackSave(MI, B);
2261 case TargetOpcode::G_GET_FPENV:
2262 return legalizeGetFPEnv(MI, MRI, B);
2263 case TargetOpcode::G_SET_FPENV:
2264 return legalizeSetFPEnv(MI, MRI, B);
2265 case TargetOpcode::G_TRAP:
2266 return legalizeTrap(MI, MRI, B);
2267 case TargetOpcode::G_DEBUGTRAP:
2268 return legalizeDebugTrap(MI, MRI, B);
2269 default:
2270 return false;
2271 }
2272
2273 llvm_unreachable("expected switch to return");
2274}
2275
2277 unsigned AS,
2279 MachineIRBuilder &B) const {
2280 MachineFunction &MF = B.getMF();
2281 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2282 const LLT S32 = LLT::scalar(32);
2283 const LLT S64 = LLT::scalar(64);
2284
2286
2287 if (ST.hasApertureRegs()) {
2288 // Note: this register is somewhat broken. When used as a 32-bit operand,
2289 // it only returns zeroes. The real value is in the upper 32 bits.
 2290 // Thus, we must extract the high 32 bits.
2291 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2292 ? AMDGPU::SRC_SHARED_BASE
2293 : AMDGPU::SRC_PRIVATE_BASE;
2294 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
2295 !ST.hasGloballyAddressableScratch()) &&
2296 "Cannot use src_private_base with globally addressable scratch!");
2297 // FIXME: It would be more natural to emit a COPY here, but then copy
2298 // coalescing would kick in and it would think it's okay to use the "HI"
2299 // subregister (instead of extracting the HI 32 bits) which is an artificial
2300 // (unusable) register.
2301 // Register TableGen definitions would need an overhaul to get rid of the
2302 // artificial "HI" aperture registers and prevent this kind of issue from
2303 // happening.
2304 Register Dst = MRI.createGenericVirtualRegister(S64);
2305 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2306 B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
2307 return B.buildUnmerge(S32, Dst).getReg(1);
2308 }
2309
2310 // TODO: can we be smarter about machine pointer info?
2312 Register LoadAddr = MRI.createGenericVirtualRegister(
2314 // For code object version 5, private_base and shared_base are passed through
2315 // implicit kernargs.
2322 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2323
2324 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2326
2327 if (!loadInputValue(KernargPtrReg, B,
2329 return Register();
2330
2332 PtrInfo,
2336
2337 // Pointer address
2338 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
2339 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2340 // Load address
2341 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2342 }
2343
2344 Register QueuePtr = MRI.createGenericVirtualRegister(
2346
2348 return Register();
2349
2350 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2351 // private_segment_aperture_base_hi.
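 // The loaded value is used as the high half of the flat address for the
 // segment; it lives in the queue descriptor at queue_ptr + 0x40 (LDS) or
 // queue_ptr + 0x44 (scratch).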
2352 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2353
2355 PtrInfo,
2358 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2359
2360 B.buildObjectPtrOffset(
2361 LoadAddr, QueuePtr,
2362 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2363 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2364}
2365
2366/// Return true if the value is a known valid address, such that a null check is
2367/// not necessary.
2369 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2370 MachineInstr *Def = MRI.getVRegDef(Val);
2371 switch (Def->getOpcode()) {
2372 case AMDGPU::G_FRAME_INDEX:
2373 case AMDGPU::G_GLOBAL_VALUE:
2374 case AMDGPU::G_BLOCK_ADDR:
2375 return true;
2376 case AMDGPU::G_CONSTANT: {
2377 const ConstantInt *CI = Def->getOperand(1).getCImm();
2378 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2379 }
2380 default:
2381 return false;
2382 }
2383
2384 return false;
2385}
2386
2389 MachineIRBuilder &B) const {
2390 MachineFunction &MF = B.getMF();
2391
2392 // MI can either be a G_ADDRSPACE_CAST or a
2393 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2394 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2395 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2396 Intrinsic::amdgcn_addrspacecast_nonnull));
2397
2398 const LLT S32 = LLT::scalar(32);
2399 Register Dst = MI.getOperand(0).getReg();
2400 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2401 : MI.getOperand(1).getReg();
2402 LLT DstTy = MRI.getType(Dst);
2403 LLT SrcTy = MRI.getType(Src);
2404 unsigned DestAS = DstTy.getAddressSpace();
2405 unsigned SrcAS = SrcTy.getAddressSpace();
2406
2407 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2408 // vector element.
2409 assert(!DstTy.isVector());
2410
2411 const AMDGPUTargetMachine &TM
2412 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2413
2414 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2415 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2416 return true;
2417 }
2418
2419 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2420 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2421 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2422 auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
2423 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2425 // flat -> private with globally addressable scratch: subtract
2426 // src_flat_scratch_base_lo.
2427 const LLT S32 = LLT::scalar(32);
2428 Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
2429 Register FlatScratchBaseLo =
2430 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
2431 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
2432 .getReg(0);
2433 MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
2434 Register Sub = B.buildSub(S32, SrcLo, FlatScratchBaseLo).getReg(0);
2435 return B.buildIntToPtr(Dst, Sub).getReg(0);
2436 }
2437
2438 // Extract low 32-bits of the pointer.
2439 return B.buildExtract(Dst, Src, 0).getReg(0);
2440 };
2441
2442 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2443 // G_ADDRSPACE_CAST we need to guess.
2444 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2445 castFlatToLocalOrPrivate(Dst);
2446 MI.eraseFromParent();
2447 return true;
2448 }
2449
2450 unsigned NullVal = TM.getNullPointerValue(DestAS);
2451
2452 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2453 auto FlatNull = B.buildConstant(SrcTy, 0);
2454
2455 // Extract low 32-bits of the pointer.
2456 auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
2457
2458 auto CmpRes =
2459 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2460 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2461
2462 MI.eraseFromParent();
2463 return true;
2464 }
2465
2466 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2467 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2468 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2469 auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
2470 // Coerce the type of the low half of the result so we can use
2471 // merge_values.
2472 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2473
2474 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
2476 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
2477 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
2478 Register AllOnes = B.buildConstant(S32, -1).getReg(0);
2479 Register ThreadID = B.buildConstant(S32, 0).getReg(0);
2480 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
2481 .addUse(AllOnes)
2482 .addUse(ThreadID)
2483 .getReg(0);
2484 if (ST.isWave64()) {
2485 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
2486 .addUse(AllOnes)
2487 .addUse(ThreadID)
2488 .getReg(0);
2489 }
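 // The thread ID forms the high 32 bits of the pointer, so the shift below
 // is 57 - 32 - log2(wavesize): 20 for wave32 (TID at bit 52 of the full
 // address) and 19 for wave64 (bit 51), matching the formulas above.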
2490 Register ShAmt =
2491 B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
2492 Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
2493 Register CvtPtr =
2494 B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
2495 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
2496 // 64-bit hi:lo value.
2497 Register FlatScratchBase =
2498 B.buildInstr(AMDGPU::S_MOV_B64, {S64},
2499 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
2500 .getReg(0);
2501 MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
2502 return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
2503 }
2504
2505 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2506 if (!ApertureReg.isValid())
2507 return false;
2508
2509 // TODO: Should we allow mismatched types but matching sizes in merges to
2510 // avoid the ptrtoint?
2511 return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
2512 };
2513
2514 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2515 // G_ADDRSPACE_CAST we need to guess.
2516 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2517 castLocalOrPrivateToFlat(Dst);
2518 MI.eraseFromParent();
2519 return true;
2520 }
2521
2522 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2523
2524 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2525 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2526
2527 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2528 SegmentNull.getReg(0));
2529
2530 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2531
2532 MI.eraseFromParent();
2533 return true;
2534 }
2535
2536 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2537 SrcTy.getSizeInBits() == 64) {
2538 // Truncate.
2539 B.buildExtract(Dst, Src, 0);
2540 MI.eraseFromParent();
2541 return true;
2542 }
2543
2544 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2545 DstTy.getSizeInBits() == 64) {
2547 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2548 auto PtrLo = B.buildPtrToInt(S32, Src);
2549 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2550 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2551 MI.eraseFromParent();
2552 return true;
2553 }
2554
2555 // Invalid casts are poison.
2556 // TODO: Should return poison
2557 B.buildUndef(Dst);
2558 MI.eraseFromParent();
2559 return true;
2560}
2561
2564 MachineIRBuilder &B) const {
2565 Register Src = MI.getOperand(1).getReg();
2566 LLT Ty = MRI.getType(Src);
2567 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2568
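 // Classic round-to-nearest-even trick: adding and then subtracting
 // copysign(2^52, x) rounds x to an integer in the default
 // round-to-nearest-even mode. Inputs with |x| > 2^52 - 0.5 are already
 // integral and are passed through by the final select.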
2569 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2570 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2571
2572 auto C1 = B.buildFConstant(Ty, C1Val);
2573 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2574
2575 // TODO: Should this propagate fast-math-flags?
2576 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2577 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2578
2579 auto C2 = B.buildFConstant(Ty, C2Val);
2580 auto Fabs = B.buildFAbs(Ty, Src);
2581
2582 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2583 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2584 MI.eraseFromParent();
2585 return true;
2586}
2587
2590 MachineIRBuilder &B) const {
2591
2592 const LLT S1 = LLT::scalar(1);
2593 const LLT S64 = LLT::scalar(64);
2594
2595 Register Src = MI.getOperand(1).getReg();
2596 assert(MRI.getType(Src) == S64);
2597
2598 // result = trunc(src)
2599 // if (src > 0.0 && src != result)
2600 // result += 1.0
2601
2602 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2603
2604 const auto Zero = B.buildFConstant(S64, 0.0);
2605 const auto One = B.buildFConstant(S64, 1.0);
2606 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2607 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2608 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2609 auto Add = B.buildSelect(S64, And, One, Zero);
2610
2611 // TODO: Should this propagate fast-math-flags?
2612 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2613 MI.eraseFromParent();
2614 return true;
2615}
2616
2619 MachineIRBuilder &B) const {
2620 Register DstReg = MI.getOperand(0).getReg();
2621 Register Src0Reg = MI.getOperand(1).getReg();
2622 Register Src1Reg = MI.getOperand(2).getReg();
2623 auto Flags = MI.getFlags();
2624 LLT Ty = MRI.getType(DstReg);
2625
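 // frem(x, y) is expanded as x - trunc(x / y) * y, with the multiply and
 // subtract folded into a single FMA using the negated truncated quotient.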
2626 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2627 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2628 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2629 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2630 MI.eraseFromParent();
2631 return true;
2632}
2633
2636 const unsigned FractBits = 52;
2637 const unsigned ExpBits = 11;
2638 LLT S32 = LLT::scalar(32);
2639
2640 auto Const0 = B.buildConstant(S32, FractBits - 32);
2641 auto Const1 = B.buildConstant(S32, ExpBits);
2642
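 // The biased exponent of an f64 occupies bits [62:52], i.e. bits [30:20]
 // of the high word, so extract ExpBits (11) bits starting at bit
 // FractBits - 32 (20) and subtract the IEEE-754 bias of 1023.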
2643 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2644 .addUse(Hi)
2645 .addUse(Const0.getReg(0))
2646 .addUse(Const1.getReg(0));
2647
2648 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2649}
2650
2653 MachineIRBuilder &B) const {
2654 const LLT S1 = LLT::scalar(1);
2655 const LLT S32 = LLT::scalar(32);
2656 const LLT S64 = LLT::scalar(64);
2657
2658 Register Src = MI.getOperand(1).getReg();
2659 assert(MRI.getType(Src) == S64);
2660
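 // Software trunc for f64: compute the unbiased exponent, then clear the
 // fraction bits that lie below it. Exponents < 0 produce a signed zero,
 // and exponents > 51 mean the value is already integral, so the source is
 // returned unchanged.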
2661 // TODO: Should this use extract since the low half is unused?
2662 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2663 Register Hi = Unmerge.getReg(1);
2664
2665 // Extract the upper half, since this is where we will find the sign and
2666 // exponent.
2667 auto Exp = extractF64Exponent(Hi, B);
2668
2669 const unsigned FractBits = 52;
2670
2671 // Extract the sign bit.
2672 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2673 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2674
2675 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2676
2677 const auto Zero32 = B.buildConstant(S32, 0);
2678
2679 // Extend back to 64-bits.
2680 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2681
2682 auto Shr = B.buildAShr(S64, FractMask, Exp);
2683 auto Not = B.buildNot(S64, Shr);
2684 auto Tmp0 = B.buildAnd(S64, Src, Not);
2685 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2686
2687 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2688 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2689
2690 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2691 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2692 MI.eraseFromParent();
2693 return true;
2694}
2695
2698 MachineIRBuilder &B, bool Signed) const {
2699
2700 Register Dst = MI.getOperand(0).getReg();
2701 Register Src = MI.getOperand(1).getReg();
2702
2703 const LLT S64 = LLT::scalar(64);
2704 const LLT S32 = LLT::scalar(32);
2705
2706 assert(MRI.getType(Src) == S64);
2707
2708 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2709 auto ThirtyTwo = B.buildConstant(S32, 32);
2710
2711 if (MRI.getType(Dst) == S64) {
2712 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2713 : B.buildUITOFP(S64, Unmerge.getReg(1));
2714
2715 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2716 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2717
2718 // TODO: Should this propagate fast-math-flags?
2719 B.buildFAdd(Dst, LdExp, CvtLo);
2720 MI.eraseFromParent();
2721 return true;
2722 }
2723
2724 assert(MRI.getType(Dst) == S32);
2725
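 // For a 64-bit source and a 32-bit result: shift the value left so its
 // significant bits land in the high word, fold any nonzero shifted-out low
 // bits into a sticky bit (so rounding still observes them), convert the
 // high word, and rescale the result with ldexp by (32 - shift).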
2726 auto One = B.buildConstant(S32, 1);
2727
2728 MachineInstrBuilder ShAmt;
2729 if (Signed) {
2730 auto ThirtyOne = B.buildConstant(S32, 31);
2731 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2732 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2733 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2734 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2735 .addUse(Unmerge.getReg(1));
2736 auto LS2 = B.buildSub(S32, LS, One);
2737 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2738 } else
2739 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2740 auto Norm = B.buildShl(S64, Src, ShAmt);
2741 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2742 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2743 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2744 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2745 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2746 B.buildFLdexp(Dst, FVal, Scale);
2747 MI.eraseFromParent();
2748 return true;
2749}
2750
2751// TODO: Copied from DAG implementation. Verify logic and document how this
2752// actually works.
2756 bool Signed) const {
2757
2758 Register Dst = MI.getOperand(0).getReg();
2759 Register Src = MI.getOperand(1).getReg();
2760
2761 const LLT S64 = LLT::scalar(64);
2762 const LLT S32 = LLT::scalar(32);
2763
2764 const LLT SrcLT = MRI.getType(Src);
2765 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2766
2767 unsigned Flags = MI.getFlags();
2768
2769 // The basic idea of converting a floating point number into a pair of 32-bit
2770 // integers is illustrated as follows:
2771 //
2772 // tf := trunc(val);
2773 // hif := floor(tf * 2^-32);
2774 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2775 // hi := fptoi(hif);
2776 // lo := fptoi(lof);
2777 //
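 // Illustrative example: val = 2^33 + 7 gives tf = 8589934599.0,
 // hif = floor(tf * 2^-32) = 2.0 and lof = tf - 2.0 * 2^32 = 7.0, so
 // hi = 2 and lo = 7, i.e. the merged result is 2 * 2^32 + 7.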
2778 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2780 if (Signed && SrcLT == S32) {
2781 // However, a 32-bit floating point number has only 23 bits mantissa and
2782 // it's not enough to hold all the significant bits of `lof` if val is
 2783 // negative. To avoid the loss of precision, we need to take the absolute
2784 // value after truncating and flip the result back based on the original
2785 // signedness.
2786 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2787 Trunc = B.buildFAbs(S32, Trunc, Flags);
2788 }
2789 MachineInstrBuilder K0, K1;
2790 if (SrcLT == S64) {
2791 K0 = B.buildFConstant(
2792 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2793 K1 = B.buildFConstant(
2794 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2795 } else {
2796 K0 = B.buildFConstant(
2797 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2798 K1 = B.buildFConstant(
2799 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2800 }
2801
2802 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2803 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2804 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2805
2806 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2807 : B.buildFPTOUI(S32, FloorMul);
2808 auto Lo = B.buildFPTOUI(S32, Fma);
2809
2810 if (Signed && SrcLT == S32) {
2811 // Flip the result based on the signedness, which is either all 0s or 1s.
2812 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2813 // r := xor({lo, hi}, sign) - sign;
2814 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2815 Sign);
2816 } else
2817 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2818 MI.eraseFromParent();
2819
2820 return true;
2821}
2822
2824 MachineInstr &MI) const {
2825 MachineFunction &MF = Helper.MIRBuilder.getMF();
2827
2828 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2829 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2830
2831 // With ieee_mode disabled, the instructions have the correct behavior
2832 // already for G_FMINIMUMNUM/G_FMAXIMUMNUM.
2833 //
2834 // FIXME: G_FMINNUM/G_FMAXNUM should match the behavior with ieee_mode
2835 // enabled.
2836 if (!MFI->getMode().IEEE) {
2837 if (MI.getOpcode() == AMDGPU::G_FMINIMUMNUM ||
2838 MI.getOpcode() == AMDGPU::G_FMAXIMUMNUM)
2839 return true;
2840
2841 return !IsIEEEOp;
2842 }
2843
2844 if (IsIEEEOp)
2845 return true;
2846
2848}
2849
2852 MachineIRBuilder &B) const {
2853 // TODO: Should move some of this into LegalizerHelper.
2854
2855 // TODO: Promote dynamic indexing of s16 to s32
2856
2857 Register Dst = MI.getOperand(0).getReg();
2858 Register Vec = MI.getOperand(1).getReg();
2859
2860 LLT VecTy = MRI.getType(Vec);
2861 LLT EltTy = VecTy.getElementType();
2862 assert(EltTy == MRI.getType(Dst));
2863
2864 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
 2865 // but we can't go directly to that logic because you can't bitcast a vector
2866 // of pointers to a vector of integers. Therefore, introduce an intermediate
2867 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2868 // drive the legalization forward.
2869 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2870 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2871 LLT IntVecTy = VecTy.changeElementType(IntTy);
2872
2873 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2874 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2875 B.buildIntToPtr(Dst, IntElt);
2876
2877 MI.eraseFromParent();
2878 return true;
2879 }
2880
2881 // FIXME: Artifact combiner probably should have replaced the truncated
2882 // constant before this, so we shouldn't need
2883 // getIConstantVRegValWithLookThrough.
2884 std::optional<ValueAndVReg> MaybeIdxVal =
2885 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2886 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2887 return true;
2888 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2889
2890 if (IdxVal < VecTy.getNumElements()) {
2891 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2892 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2893 } else {
2894 B.buildUndef(Dst);
2895 }
2896
2897 MI.eraseFromParent();
2898 return true;
2899}
2900
2903 MachineIRBuilder &B) const {
2904 // TODO: Should move some of this into LegalizerHelper.
2905
2906 // TODO: Promote dynamic indexing of s16 to s32
2907
2908 Register Dst = MI.getOperand(0).getReg();
2909 Register Vec = MI.getOperand(1).getReg();
2910 Register Ins = MI.getOperand(2).getReg();
2911
2912 LLT VecTy = MRI.getType(Vec);
2913 LLT EltTy = VecTy.getElementType();
2914 assert(EltTy == MRI.getType(Ins));
2915
2916 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
 2917 // but we can't go directly to that logic because you can't bitcast a vector
2918 // of pointers to a vector of integers. Therefore, make the pointer vector
2919 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2920 // new value, and then inttoptr the result vector back. This will then allow
2921 // the rest of legalization to take over.
2922 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2923 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2924 LLT IntVecTy = VecTy.changeElementType(IntTy);
2925
2926 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2927 auto IntIns = B.buildPtrToInt(IntTy, Ins);
2928 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2929 MI.getOperand(3));
2930 B.buildIntToPtr(Dst, IntVecDest);
2931 MI.eraseFromParent();
2932 return true;
2933 }
2934
2935 // FIXME: Artifact combiner probably should have replaced the truncated
2936 // constant before this, so we shouldn't need
2937 // getIConstantVRegValWithLookThrough.
2938 std::optional<ValueAndVReg> MaybeIdxVal =
2939 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2940 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2941 return true;
2942
2943 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2944
2945 unsigned NumElts = VecTy.getNumElements();
2946 if (IdxVal < NumElts) {
2948 for (unsigned i = 0; i < NumElts; ++i)
2949 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2950 B.buildUnmerge(SrcRegs, Vec);
2951
2952 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2953 B.buildMergeLikeInstr(Dst, SrcRegs);
2954 } else {
2955 B.buildUndef(Dst);
2956 }
2957
2958 MI.eraseFromParent();
2959 return true;
2960}
2961
2964 MachineIRBuilder &B) const {
2965
2966 Register DstReg = MI.getOperand(0).getReg();
2967 Register SrcReg = MI.getOperand(1).getReg();
2968 LLT Ty = MRI.getType(DstReg);
2969 unsigned Flags = MI.getFlags();
2970
2971 Register TrigVal;
2972 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2973 if (ST.hasTrigReducedRange()) {
2974 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2975 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
2976 .addUse(MulVal.getReg(0))
2977 .setMIFlags(Flags)
2978 .getReg(0);
2979 } else
2980 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2981
2982 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2983 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2984 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
2985 .addUse(TrigVal)
2986 .setMIFlags(Flags);
2987 MI.eraseFromParent();
2988 return true;
2989}
2990
2993 const GlobalValue *GV,
2994 int64_t Offset,
2995 unsigned GAFlags) const {
2996 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2997 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2998 // to the following code sequence:
2999 //
3000 // For constant address space:
3001 // s_getpc_b64 s[0:1]
3002 // s_add_u32 s0, s0, $symbol
3003 // s_addc_u32 s1, s1, 0
3004 //
3005 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3006 // a fixup or relocation is emitted to replace $symbol with a literal
3007 // constant, which is a pc-relative offset from the encoding of the $symbol
3008 // operand to the global variable.
3009 //
3010 // For global address space:
3011 // s_getpc_b64 s[0:1]
3012 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
3013 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
3014 //
3015 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3016 // fixups or relocations are emitted to replace $symbol@*@lo and
3017 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
3018 // which is a 64-bit pc-relative offset from the encoding of the $symbol
3019 // operand to the global variable.
3020
3022
3023 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
3024 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
3025
3026 if (ST.has64BitLiterals()) {
3027 assert(GAFlags != SIInstrInfo::MO_NONE);
3028
3030 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
3031 MIB.addGlobalAddress(GV, Offset, GAFlags + 2);
3032 } else {
3034 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);
3035
3036 MIB.addGlobalAddress(GV, Offset, GAFlags);
3037 if (GAFlags == SIInstrInfo::MO_NONE)
3038 MIB.addImm(0);
3039 else
3040 MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
3041 }
3042
3043 if (!B.getMRI()->getRegClassOrNull(PCReg))
3044 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
3045
3046 if (PtrTy.getSizeInBits() == 32)
3047 B.buildExtract(DstReg, PCReg, 0);
3048 return true;
3049}
3050
 3051// Emit an ABS32_LO / ABS32_HI relocation stub.
3053 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
3054 MachineRegisterInfo &MRI) const {
3055 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
3056
3057 if (RequiresHighHalf && ST.has64BitLiterals()) {
3058 if (!MRI.getRegClassOrNull(DstReg))
3059 MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
3060 B.buildInstr(AMDGPU::S_MOV_B64)
3061 .addDef(DstReg)
3062 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS64);
3063 return;
3064 }
3065
3066 LLT S32 = LLT::scalar(32);
3067
3068 // Use the destination directly, if and only if we store the lower address
3069 // part only and we don't have a register class being set.
3070 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
3071 ? DstReg
3072 : MRI.createGenericVirtualRegister(S32);
3073
3074 if (!MRI.getRegClassOrNull(AddrLo))
3075 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
3076
3077 // Write the lower half.
3078 B.buildInstr(AMDGPU::S_MOV_B32)
3079 .addDef(AddrLo)
3080 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
3081
3082 // If required, write the upper half as well.
3083 if (RequiresHighHalf) {
3084 assert(PtrTy.getSizeInBits() == 64 &&
3085 "Must provide a 64-bit pointer type!");
3086
3087 Register AddrHi = MRI.createGenericVirtualRegister(S32);
3088 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
3089
3090 B.buildInstr(AMDGPU::S_MOV_B32)
3091 .addDef(AddrHi)
3092 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
3093
3094 // Use the destination directly, if and only if we don't have a register
3095 // class being set.
3096 Register AddrDst = !MRI.getRegClassOrNull(DstReg)
3097 ? DstReg
3098 : MRI.createGenericVirtualRegister(LLT::scalar(64));
3099
3100 if (!MRI.getRegClassOrNull(AddrDst))
3101 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
3102
3103 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
3104
3105 // If we created a new register for the destination, cast the result into
3106 // the final output.
3107 if (AddrDst != DstReg)
3108 B.buildCast(DstReg, AddrDst);
3109 } else if (AddrLo != DstReg) {
3110 // If we created a new register for the destination, cast the result into
3111 // the final output.
3112 B.buildCast(DstReg, AddrLo);
3113 }
3114}
3115
3118 MachineIRBuilder &B) const {
3119 Register DstReg = MI.getOperand(0).getReg();
3120 LLT Ty = MRI.getType(DstReg);
3121 unsigned AS = Ty.getAddressSpace();
3122
3123 const GlobalValue *GV = MI.getOperand(1).getGlobal();
3124 MachineFunction &MF = B.getMF();
3126
3128 if (!MFI->isModuleEntryFunction() &&
3129 GV->getName() != "llvm.amdgcn.module.lds" &&
3130 !AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV))) {
3131 const Function &Fn = MF.getFunction();
3133 Fn, "local memory global used by non-kernel function",
3134 MI.getDebugLoc(), DS_Warning));
3135
3136 // We currently don't have a way to correctly allocate LDS objects that
3137 // aren't directly associated with a kernel. We do force inlining of
3138 // functions that use local objects. However, if these dead functions are
3139 // not eliminated, we don't want a compile time error. Just emit a warning
3140 // and a trap, since there should be no callable path here.
3141 B.buildTrap();
3142 B.buildUndef(DstReg);
3143 MI.eraseFromParent();
3144 return true;
3145 }
3146
3147 // TODO: We could emit code to handle the initialization somewhere.
3148 // We ignore the initializer for now and legalize it to allow selection.
 3149 // Assembly emission will error out on the initializer anyway.
3150 const SITargetLowering *TLI = ST.getTargetLowering();
3151 if (!TLI->shouldUseLDSConstAddress(GV)) {
3152 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
3153 return true; // Leave in place;
3154 }
3155
3156 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
3157 Type *Ty = GV->getValueType();
3158 // HIP uses an unsized array `extern __shared__ T s[]` or similar
3159 // zero-sized type in other languages to declare the dynamic shared
 3160 // memory whose size is not known at compile time. Such arrays are
 3161 // allocated by the runtime and placed directly after the statically
 3162 // allocated ones. They all share the same offset.
3163 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
3164 // Adjust alignment for that dynamic shared memory array.
3165 MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
3166 LLT S32 = LLT::scalar(32);
3167 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
3168 B.buildIntToPtr(DstReg, Sz);
3169 MI.eraseFromParent();
3170 return true;
3171 }
3172 }
3173
3174 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
3175 *cast<GlobalVariable>(GV)));
3176 MI.eraseFromParent();
3177 return true;
3178 }
3179
3180 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3181 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
3182 MI.eraseFromParent();
3183 return true;
3184 }
3185
3186 const SITargetLowering *TLI = ST.getTargetLowering();
3187
3188 if (TLI->shouldEmitFixup(GV)) {
3189 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
3190 MI.eraseFromParent();
3191 return true;
3192 }
3193
3194 if (TLI->shouldEmitPCReloc(GV)) {
3195 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
3196 MI.eraseFromParent();
3197 return true;
3198 }
3199
3201 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
3202
3203 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3208 LoadTy, Align(8));
3209
3210 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3211
3212 if (Ty.getSizeInBits() == 32) {
3213 // Truncate if this is a 32-bit constant address.
3214 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3215 B.buildExtract(DstReg, Load, 0);
3216 } else
3217 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3218
3219 MI.eraseFromParent();
3220 return true;
3221}
3222
3224 if (Ty.isVector())
3225 return Ty.changeElementCount(
3228}
3229
3231 MachineInstr &MI) const {
3232 MachineIRBuilder &B = Helper.MIRBuilder;
3233 MachineRegisterInfo &MRI = *B.getMRI();
3234 GISelChangeObserver &Observer = Helper.Observer;
3235
3236 Register PtrReg = MI.getOperand(1).getReg();
3237 LLT PtrTy = MRI.getType(PtrReg);
3238 unsigned AddrSpace = PtrTy.getAddressSpace();
3239
3240 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3242 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3243 Observer.changingInstr(MI);
3244 MI.getOperand(1).setReg(Cast.getReg(0));
3245 Observer.changedInstr(MI);
3246 return true;
3247 }
3248
3249 if (MI.getOpcode() != AMDGPU::G_LOAD)
3250 return false;
3251
3252 Register ValReg = MI.getOperand(0).getReg();
3253 LLT ValTy = MRI.getType(ValReg);
3254
3255 if (hasBufferRsrcWorkaround(ValTy)) {
3256 Observer.changingInstr(MI);
3258 Observer.changedInstr(MI);
3259 return true;
3260 }
3261
3262 MachineMemOperand *MMO = *MI.memoperands_begin();
3263 const unsigned ValSize = ValTy.getSizeInBits();
3264 const LLT MemTy = MMO->getMemoryType();
3265 const Align MemAlign = MMO->getAlign();
3266 const unsigned MemSize = MemTy.getSizeInBits();
3267 const uint64_t AlignInBits = 8 * MemAlign.value();
3268
3269 // Widen non-power-of-2 loads to the alignment if needed
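 // (e.g. a 96-bit <3 x s32> load may become a 128-bit load whose extra
 // element is stripped again below)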
3270 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3271 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3272
3273 // This was already the correct extending load result type, so just adjust
3274 // the memory type.
3275 if (WideMemSize == ValSize) {
3276 MachineFunction &MF = B.getMF();
3277
3278 MachineMemOperand *WideMMO =
3279 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3280 Observer.changingInstr(MI);
3281 MI.setMemRefs(MF, {WideMMO});
3282 Observer.changedInstr(MI);
3283 return true;
3284 }
3285
 3286 // Don't bother handling an edge case that should probably never be produced.
3287 if (ValSize > WideMemSize)
3288 return false;
3289
3290 LLT WideTy = widenToNextPowerOf2(ValTy);
3291
3292 Register WideLoad;
3293 if (!WideTy.isVector()) {
3294 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3295 B.buildTrunc(ValReg, WideLoad).getReg(0);
3296 } else {
3297 // Extract the subvector.
3298
3299 if (isRegisterType(ST, ValTy)) {
3300 // If this a case where G_EXTRACT is legal, use it.
3301 // (e.g. <3 x s32> -> <4 x s32>)
3302 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3303 B.buildExtract(ValReg, WideLoad, 0);
3304 } else {
3305 // For cases where the widened type isn't a nice register value, unmerge
3306 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3307 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3308 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3309 }
3310 }
3311
3312 MI.eraseFromParent();
3313 return true;
3314 }
3315
3316 return false;
3317}
3318
3320 MachineInstr &MI) const {
3321 MachineIRBuilder &B = Helper.MIRBuilder;
3322 MachineRegisterInfo &MRI = *B.getMRI();
3323 GISelChangeObserver &Observer = Helper.Observer;
3324
3325 Register DataReg = MI.getOperand(0).getReg();
3326 LLT DataTy = MRI.getType(DataReg);
3327
3328 if (hasBufferRsrcWorkaround(DataTy)) {
3329 Observer.changingInstr(MI);
3331 Observer.changedInstr(MI);
3332 return true;
3333 }
3334 return false;
3335}
3336
3339 MachineIRBuilder &B) const {
3340 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3341 assert(Ty.isScalar());
3342
3343 MachineFunction &MF = B.getMF();
3345
3346 // TODO: Always legal with future ftz flag.
3347 // FIXME: Do we need just output?
3348 if (Ty == LLT::float32() &&
3350 return true;
3351 if (Ty == LLT::float16() &&
3353 return true;
3354
3355 MachineIRBuilder HelperBuilder(MI);
3356 GISelObserverWrapper DummyObserver;
3357 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3358 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3359}
3360
3363 Register DstReg = MI.getOperand(0).getReg();
3364 Register PtrReg = MI.getOperand(1).getReg();
3365 Register CmpVal = MI.getOperand(2).getReg();
3366 Register NewVal = MI.getOperand(3).getReg();
3367
3368 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3369 "this should not have been custom lowered");
3370
3371 LLT ValTy = MRI.getType(CmpVal);
3372 LLT VecTy = LLT::fixed_vector(2, ValTy);
3373
3374 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3375
3376 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3377 .addDef(DstReg)
3378 .addUse(PtrReg)
3379 .addUse(PackedVal)
3380 .setMemRefs(MI.memoperands());
3381
3382 MI.eraseFromParent();
3383 return true;
3384}
3385
3386/// Return true if it's known that \p Src can never be an f32 denormal value.
3388 Register Src) {
3389 const MachineInstr *DefMI = MRI.getVRegDef(Src);
3390 switch (DefMI->getOpcode()) {
3391 case TargetOpcode::G_INTRINSIC: {
3392 switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
3393 case Intrinsic::amdgcn_frexp_mant:
3394 return true;
3395 default:
3396 break;
3397 }
3398
3399 break;
3400 }
3401 case TargetOpcode::G_FFREXP: {
3402 if (DefMI->getOperand(0).getReg() == Src)
3403 return true;
3404 break;
3405 }
3406 case TargetOpcode::G_FPEXT: {
3407 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3408 }
3409 default:
3410 return false;
3411 }
3412
3413 return false;
3414}
3415
3416static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3417 return Flags & MachineInstr::FmAfn;
3418}
3419
3421 unsigned Flags) {
3422 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3425}
3426
3427std::pair<Register, Register>
3429 unsigned Flags) const {
3430 if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3431 return {};
3432
3433 const LLT F32 = LLT::scalar(32);
3434 auto SmallestNormal = B.buildFConstant(
3436 auto IsLtSmallestNormal =
3437 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3438
3439 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3440 auto One = B.buildFConstant(F32, 1.0);
3441 auto ScaleFactor =
3442 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3443 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3444
3445 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3446}
3447
3449 MachineIRBuilder &B) const {
3450 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3451 // If we have to handle denormals, scale up the input and adjust the result.
3452
3453 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3454 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
3455
3456 Register Dst = MI.getOperand(0).getReg();
3457 Register Src = MI.getOperand(1).getReg();
3458 LLT Ty = B.getMRI()->getType(Dst);
3459 unsigned Flags = MI.getFlags();
3460
3461 if (Ty == LLT::scalar(16)) {
3462 const LLT F32 = LLT::scalar(32);
3463 // Nothing in half is a denormal when promoted to f32.
3464 auto Ext = B.buildFPExt(F32, Src, Flags);
3465 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3466 .addUse(Ext.getReg(0))
3467 .setMIFlags(Flags);
3468 B.buildFPTrunc(Dst, Log2, Flags);
3469 MI.eraseFromParent();
3470 return true;
3471 }
3472
3473 assert(Ty == LLT::scalar(32));
3474
3475 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3476 if (!ScaledInput) {
3477 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3478 .addUse(Src)
3479 .setMIFlags(Flags);
3480 MI.eraseFromParent();
3481 return true;
3482 }
3483
3484 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3485 .addUse(ScaledInput)
3486 .setMIFlags(Flags);
3487
3488 auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3489 auto Zero = B.buildFConstant(Ty, 0.0);
3490 auto ResultOffset =
3491 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3492 B.buildFSub(Dst, Log2, ResultOffset, Flags);
3493
3494 MI.eraseFromParent();
3495 return true;
3496}
3497
3498static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
3499 Register Z, unsigned Flags) {
3500 auto FMul = B.buildFMul(Ty, X, Y, Flags);
3501 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3502}
3503
3504bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
3505 MachineIRBuilder &B) const {
3506 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3507 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3508
3509 MachineRegisterInfo &MRI = *B.getMRI();
3510 Register Dst = MI.getOperand(0).getReg();
3511 Register X = MI.getOperand(1).getReg();
3512 unsigned Flags = MI.getFlags();
3513 const LLT Ty = MRI.getType(X);
3514 MachineFunction &MF = B.getMF();
3515
3516 const LLT F32 = LLT::scalar(32);
3517 const LLT F16 = LLT::scalar(16);
3518
3519 const AMDGPUTargetMachine &TM =
3520 static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3521
3522 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn)) {
3523 if (Ty == F16 && !ST.has16BitInsts()) {
3524 Register LogVal = MRI.createGenericVirtualRegister(F32);
3525 auto PromoteSrc = B.buildFPExt(F32, X);
3526 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3527 B.buildFPTrunc(Dst, LogVal);
3528 } else {
3529 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3530 }
3531
3532 MI.eraseFromParent();
3533 return true;
3534 }
3535
3536 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3537 if (ScaledInput)
3538 X = ScaledInput;
3539
3540 auto Y =
3541 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3542
3543 Register R;
3544 if (ST.hasFastFMAF32()) {
3545 // c+cc are ln(2)/ln(10) to more than 49 bits
3546 const float c_log10 = 0x1.344134p-2f;
3547 const float cc_log10 = 0x1.09f79ep-26f;
3548
3549 // c + cc is ln(2) to more than 49 bits
3550 const float c_log = 0x1.62e42ep-1f;
3551 const float cc_log = 0x1.efa39ep-25f;
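 // Worked numbers for the natural-log case (illustrative only):
 //   c_log  = 0x1.62e42ep-1  ~= 0.69314712286
 //   cc_log = 0x1.efa39ep-25 ~= 5.770e-8
 //   c_log + cc_log ~= 0.69314718056 ~= ln(2), far closer than any single f32
 // constant can get, and the FMA chain below preserves that extra precision.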
3552
3553 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3554 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3555
3556 R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
3557 auto NegR = B.buildFNeg(Ty, R, Flags);
3558 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
3559 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
3560 R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3561 } else {
3562 // ch+ct is ln(2)/ln(10) to more than 36 bits
3563 const float ch_log10 = 0x1.344000p-2f;
3564 const float ct_log10 = 0x1.3509f6p-18f;
3565
3566 // ch + ct is ln(2) to more than 36 bits
3567 const float ch_log = 0x1.62e000p-1f;
3568 const float ct_log = 0x1.0bfbe8p-15f;
3569
3570 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3571 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3572
3573 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3574 auto YH = B.buildAnd(Ty, Y, MaskConst);
3575 auto YT = B.buildFSub(Ty, Y, YH, Flags);
3576 auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
3577
3578 Register Mad0 =
3579 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
3580 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
3581 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
3582 }
3583
3584 const bool IsFiniteOnly =
3585 (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3586 (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3587
3588 if (!IsFiniteOnly) {
3589 // Expand isfinite(x) => fabs(x) < inf
3590 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3591 auto Fabs = B.buildFAbs(Ty, Y);
3592 auto IsFinite =
3593 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3594 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3595 }
3596
3597 if (ScaledInput) {
3598 auto Zero = B.buildFConstant(Ty, 0.0);
3599 auto ShiftK =
3600 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3601 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3602 B.buildFSub(Dst, R, Shift, Flags);
3603 } else {
3604 B.buildCopy(Dst, R);
3605 }
3606
3607 MI.eraseFromParent();
3608 return true;
3609}
3610
3611bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3612 Register Src, bool IsLog10,
3613 unsigned Flags) const {
3614 const double Log2BaseInverted =
3615 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
3616
3617 LLT Ty = B.getMRI()->getType(Dst);
3618
3619 if (Ty == LLT::scalar(32)) {
3620 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3621 if (ScaledInput) {
3622 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3623 .addUse(Src)
3624 .setMIFlags(Flags);
3625 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3626 auto Zero = B.buildFConstant(Ty, 0.0);
3627 auto ResultOffset =
3628 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3629 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3630
3631 if (ST.hasFastFMAF32())
3632 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3633 else {
3634 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3635 B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3636 }
3637
3638 return true;
3639 }
3640 }
3641
3642 auto Log2Operand = Ty == LLT::scalar(16)
3643 ? B.buildFLog2(Ty, Src, Flags)
3644 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3645 .addUse(Src)
3646 .setMIFlags(Flags);
3647 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3648 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3649 return true;
3650}
3651
3652bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3653 MachineIRBuilder &B) const {
3654 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3655 // If we have to handle denormals, scale up the input and adjust the result.
3656
3657 Register Dst = MI.getOperand(0).getReg();
3658 Register Src = MI.getOperand(1).getReg();
3659 unsigned Flags = MI.getFlags();
3660 LLT Ty = B.getMRI()->getType(Dst);
3661 const LLT F16 = LLT::scalar(16);
3662 const LLT F32 = LLT::scalar(32);
3663
3664 if (Ty == F16) {
3665 // Nothing in half is a denormal when promoted to f32.
3666 auto Ext = B.buildFPExt(F32, Src, Flags);
3667 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3668 .addUse(Ext.getReg(0))
3669 .setMIFlags(Flags);
3670 B.buildFPTrunc(Dst, Log2, Flags);
3671 MI.eraseFromParent();
3672 return true;
3673 }
3674
3675 assert(Ty == F32);
3676
3677 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3678 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3679 .addUse(Src)
3680 .setMIFlags(Flags);
3681 MI.eraseFromParent();
3682 return true;
3683 }
3684
3685 // bool needs_scaling = x < -0x1.f80000p+6f;
3686 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
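 // Worked numbers (illustrative only): -0x1.f80000p+6f is -126.0, and 2^-126
 // is already the smallest normal f32, so any input below the threshold would
 // produce a denormal result. Adding 64 keeps v_exp_f32 inside the normal
 // range, and the final multiply by 0x1.0p-64f (2^-64) undoes the bias exactly.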
3687
3688 // -nextafter(128.0, -1)
3689 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3690 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3691 RangeCheckConst, Flags);
3692
3693 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3694 auto Zero = B.buildFConstant(Ty, 0.0);
3695 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3696 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3697
3698 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3699 .addUse(AddInput.getReg(0))
3700 .setMIFlags(Flags);
3701
3702 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3703 auto One = B.buildFConstant(Ty, 1.0);
3704 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3705 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3706 MI.eraseFromParent();
3707 return true;
3708}
3709
3710bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3711 Register X, unsigned Flags) const {
3712 LLT Ty = B.getMRI()->getType(Dst);
3713 LLT F32 = LLT::scalar(32);
3714
3715 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3716 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3717 auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
3718
3719 if (Ty == F32) {
3720 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3721 .addUse(Mul.getReg(0))
3722 .setMIFlags(Flags);
3723 } else {
3724 B.buildFExp2(Dst, Mul.getReg(0), Flags);
3725 }
3726
3727 return true;
3728 }
3729
3730 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3731 auto NeedsScaling =
3732 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3733 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3734 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3735 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3736
3737 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3738 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3739
3740 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3741 .addUse(ExpInput.getReg(0))
3742 .setMIFlags(Flags);
3743
3744 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3745 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3746 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3747 return true;
3748}
3749
3750bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3751 MachineIRBuilder &B) const {
3752 Register Dst = MI.getOperand(0).getReg();
3753 Register X = MI.getOperand(1).getReg();
3754 const unsigned Flags = MI.getFlags();
3755 MachineFunction &MF = B.getMF();
3756 MachineRegisterInfo &MRI = *B.getMRI();
3757 LLT Ty = MRI.getType(Dst);
3758 const LLT F16 = LLT::scalar(16);
3759 const LLT F32 = LLT::scalar(32);
3760 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3761
3762 if (Ty == F16) {
3763 // v_exp_f16 (fmul x, log2e)
3764 if (allowApproxFunc(MF, Flags)) {
3765 // TODO: Does this really require fast?
3766 legalizeFExpUnsafe(B, Dst, X, Flags);
3767 MI.eraseFromParent();
3768 return true;
3769 }
3770
3771 // exp(f16 x) ->
3772 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3773
3774 // Nothing in half is a denormal when promoted to f32.
3775 auto Ext = B.buildFPExt(F32, X, Flags);
3776 Register Lowered = MRI.createGenericVirtualRegister(F32);
3777 legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
3778 B.buildFPTrunc(Dst, Lowered, Flags);
3779 MI.eraseFromParent();
3780 return true;
3781 }
3782
3783 assert(Ty == F32);
3784
3785 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3786 // library behavior. Also, is known-not-daz source sufficient?
3787 if (allowApproxFunc(MF, Flags)) {
3788 legalizeFExpUnsafe(B, Dst, X, Flags);
3789 MI.eraseFromParent();
3790 return true;
3791 }
3792
3793 // Algorithm:
3794 //
3795 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3796 //
3797 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3798 // n = 64*m + j, 0 <= j < 64
3799 //
3800 // e^x = 2^((64*m + j + f)/64)
3801 // = (2^m) * (2^(j/64)) * 2^(f/64)
3802 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3803 //
3804 // f = x*(64/ln(2)) - n
3805 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3806 //
3807 // e^x = (2^m) * (2^(j/64)) * e^r
3808 //
3809 // (2^(j/64)) is precomputed
3810 //
3811 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3812 // e^r = 1 + q
3813 //
3814 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3815 //
3816 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
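 //
 // Worked example of the decomposition (illustrative only), for x = 10:
 //   n = round(10 * 64/ln(2)) = round(923.32) = 923, so m = 14, j = 27
 //   r = 10 - 923 * ln(2)/64 ~= 0.00352
 //   e^10 ~= 2^14 * 2^(27/64) * e^r ~= 16384 * 1.3397 * 1.0035 ~= 2.2026e+4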
3817 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3818 Register PH, PL;
3819
3820 if (ST.hasFastFMAF32()) {
3821 const float c_exp = numbers::log2ef;
3822 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3823 const float c_exp10 = 0x1.a934f0p+1f;
3824 const float cc_exp10 = 0x1.2f346ep-24f;
3825
3826 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3827 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
3828 auto NegPH = B.buildFNeg(Ty, PH, Flags);
3829 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
3830
3831 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3832 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
3833 } else {
3834 const float ch_exp = 0x1.714000p+0f;
3835 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3836
3837 const float ch_exp10 = 0x1.a92000p+1f;
3838 const float cl_exp10 = 0x1.4f0978p-11f;
3839
3840 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3841 auto XH = B.buildAnd(Ty, X, MaskConst);
3842 auto XL = B.buildFSub(Ty, X, XH, Flags);
3843
3844 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3845 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
3846
3847 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3848 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
3849
3850 Register Mad0 =
3851 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
3852 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3853 }
3854
3855 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
3856
3857 // It is unsafe to contract this fsub into the PH multiply.
3858 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
3859 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
3860 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
3861
3862 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3863 .addUse(A.getReg(0))
3864 .setMIFlags(Flags);
3865 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
3866
3867 auto UnderflowCheckConst =
3868 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3869 auto Zero = B.buildFConstant(Ty, 0.0);
3870 auto Underflow =
3871 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
3872
3873 R = B.buildSelect(Ty, Underflow, Zero, R);
3874
3875 const auto &Options = MF.getTarget().Options;
3876
3877 if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3878 auto OverflowCheckConst =
3879 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3880
3881 auto Overflow =
3882 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
3883 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3884 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
3885 }
3886
3887 B.buildCopy(Dst, R);
3888 MI.eraseFromParent();
3889 return true;
3890}
3891
3892bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3893 MachineIRBuilder &B) const {
3894 Register Dst = MI.getOperand(0).getReg();
3895 Register Src0 = MI.getOperand(1).getReg();
3896 Register Src1 = MI.getOperand(2).getReg();
3897 unsigned Flags = MI.getFlags();
3898 LLT Ty = B.getMRI()->getType(Dst);
3899 const LLT F16 = LLT::float16();
3900 const LLT F32 = LLT::float32();
3901
3902 if (Ty == F32) {
3903 auto Log = B.buildFLog2(F32, Src0, Flags);
3904 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3905 .addUse(Log.getReg(0))
3906 .addUse(Src1)
3907 .setMIFlags(Flags);
3908 B.buildFExp2(Dst, Mul, Flags);
3909 } else if (Ty == F16) {
3910 // There's no f16 fmul_legacy, so we need to convert for it.
3911 auto Log = B.buildFLog2(F16, Src0, Flags);
3912 auto Ext0 = B.buildFPExt(F32, Log, Flags);
3913 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
3914 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3915 .addUse(Ext0.getReg(0))
3916 .addUse(Ext1.getReg(0))
3917 .setMIFlags(Flags);
3918 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
3919 } else
3920 return false;
3921
3922 MI.eraseFromParent();
3923 return true;
3924}
3925
3926// Find a source register, ignoring any possible source modifiers.
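// e.g. for %m = G_FNEG (G_FABS %x), this returns %x (illustrative only).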
3927static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3928 Register ModSrc = OrigSrc;
3929 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
3930 ModSrc = SrcFNeg->getOperand(1).getReg();
3931 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3932 ModSrc = SrcFAbs->getOperand(1).getReg();
3933 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3934 ModSrc = SrcFAbs->getOperand(1).getReg();
3935 return ModSrc;
3936}
3937
3938bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3939 MachineRegisterInfo &MRI,
3940 MachineIRBuilder &B) const {
3941
3942 const LLT S1 = LLT::scalar(1);
3943 const LLT F64 = LLT::float64();
3944 Register Dst = MI.getOperand(0).getReg();
3945 Register OrigSrc = MI.getOperand(1).getReg();
3946 unsigned Flags = MI.getFlags();
3947 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
3948 "this should not have been custom lowered");
3949
3950 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3951 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3952 // efficient way to implement it is using V_FRACT_F64. The workaround for the
3953 // V_FRACT bug is:
3954 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3955 //
3956 // Convert floor(x) to (x - fract(x))
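 //
 // A scalar sketch of the lowering below (illustrative only; fract_hw stands
 // in for the raw V_FRACT_F64 result and is not a real helper):
 //   double F = fmin(fract_hw(x), 0x1.fffffffffffffp-1); // clamp just below 1.0
 //   double Fract = isnan(x) ? x : F;
 //   double Floor = x - Fract;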
3957
3958 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
3959 .addUse(OrigSrc)
3960 .setMIFlags(Flags);
3961
3962 // Give source modifier matching some assistance before obscuring a foldable
3963 // pattern.
3964
3965 // TODO: We can avoid the neg on the fract? The input sign to fract
3966 // shouldn't matter?
3967 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3968
3969 auto Const =
3970 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
3971
3972 Register Min = MRI.createGenericVirtualRegister(F64);
3973
3974 // We don't need to concern ourselves with the snan handling difference, so
3975 // use the one which will directly select.
3976 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3977 if (MFI->getMode().IEEE)
3978 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
3979 else
3980 B.buildFMinNum(Min, Fract, Const, Flags);
3981
3982 Register CorrectedFract = Min;
3983 if (!MI.getFlag(MachineInstr::FmNoNans)) {
3984 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
3985 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
3986 }
3987
3988 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
3989 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
3990
3991 MI.eraseFromParent();
3992 return true;
3993}
3994
3995// Turn an illegal packed v2s16 build vector into bit operations.
3996// TODO: This should probably be a bitcast action in LegalizerHelper.
3997bool AMDGPULegalizerInfo::legalizeBuildVector(
3998 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3999 Register Dst = MI.getOperand(0).getReg();
4000 const LLT S32 = LLT::scalar(32);
4001 const LLT S16 = LLT::scalar(16);
4002 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
4003
4004 Register Src0 = MI.getOperand(1).getReg();
4005 Register Src1 = MI.getOperand(2).getReg();
4006
4007 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
4008 assert(MRI.getType(Src0) == S32);
4009 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
4010 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
4011 }
4012
4013 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
4014 B.buildBitcast(Dst, Merge);
4015
4016 MI.eraseFromParent();
4017 return true;
4018}
4019
4020// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
4021//
4022// Source and accumulation registers must all be 32-bits.
4023//
4024// TODO: When the multiply is uniform, we should produce a code sequence
4025// that is better suited to instruction selection on the SALU. Instead of
4026// the outer loop going over parts of the result, the outer loop should go
4027// over parts of one of the factors. This should result in instruction
4028// selection that makes full use of S_ADDC_U32 instructions.
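//
// For example, a 64 x 64 -> 64 bit multiply over 32-bit parts decomposes as
// (illustrative only):
//   a * b = lo(a0*b0) + ((hi(a0*b0) + lo(a0*b1) + lo(a1*b0)) << 32)
// where each ai*bj is a 32 x 32 -> 64 bit product formed by MAD_64_32 and the
// bits at or above 2^64 are discarded.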
4029void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
4030 MutableArrayRef<Register> Accum,
4031 ArrayRef<Register> Src0,
4032 ArrayRef<Register> Src1,
4033 bool UsePartialMad64_32,
4034 bool SeparateOddAlignedProducts) const {
4035 // Use (possibly empty) vectors of S1 registers to represent the set of
4036 // carries from one pair of positions to the next.
4037 using Carry = SmallVector<Register, 2>;
4038
4039 MachineIRBuilder &B = Helper.MIRBuilder;
4040 GISelValueTracking &VT = *Helper.getValueTracking();
4041
4042 const LLT S1 = LLT::scalar(1);
4043 const LLT S32 = LLT::scalar(32);
4044 const LLT S64 = LLT::scalar(64);
4045
4046 Register Zero32;
4047 Register Zero64;
4048
4049 auto getZero32 = [&]() -> Register {
4050 if (!Zero32)
4051 Zero32 = B.buildConstant(S32, 0).getReg(0);
4052 return Zero32;
4053 };
4054 auto getZero64 = [&]() -> Register {
4055 if (!Zero64)
4056 Zero64 = B.buildConstant(S64, 0).getReg(0);
4057 return Zero64;
4058 };
4059
4060 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
4061 for (unsigned i = 0; i < Src0.size(); ++i) {
4062 Src0KnownZeros.push_back(VT.getKnownBits(Src0[i]).isZero());
4063 Src1KnownZeros.push_back(VT.getKnownBits(Src1[i]).isZero());
4064 }
4065
4066 // Merge the given carries into the 32-bit LocalAccum, which is modified
4067 // in-place.
4068 //
4069 // Returns the carry-out, which is a single S1 register or null.
4070 auto mergeCarry =
4071 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
4072 if (CarryIn.empty())
4073 return Register();
4074
4075 bool HaveCarryOut = true;
4076 Register CarryAccum;
4077 if (CarryIn.size() == 1) {
4078 if (!LocalAccum) {
4079 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4080 return Register();
4081 }
4082
4083 CarryAccum = getZero32();
4084 } else {
4085 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4086 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
4087 CarryAccum =
4088 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
4089 .getReg(0);
4090 }
4091
4092 if (!LocalAccum) {
4093 LocalAccum = getZero32();
4094 HaveCarryOut = false;
4095 }
4096 }
4097
4098 auto Add =
4099 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
4100 LocalAccum = Add.getReg(0);
4101 return HaveCarryOut ? Add.getReg(1) : Register();
4102 };
4103
4104 // Build a multiply-add chain to compute
4105 //
4106 // LocalAccum + (partial products at DstIndex)
4107 // + (opportunistic subset of CarryIn)
4108 //
4109 // LocalAccum is an array of one or two 32-bit registers that are updated
4110 // in-place. The incoming registers may be null.
4111 //
4112 // In some edge cases, carry-ins can be consumed "for free". In that case,
4113 // the consumed carry bits are removed from CarryIn in-place.
4114 auto buildMadChain =
4115 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
4116 -> Carry {
4117 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
4118 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
4119
4120 Carry CarryOut;
4121 unsigned j0 = 0;
4122
4123 // Use plain 32-bit multiplication for the most significant part of the
4124 // result by default.
4125 if (LocalAccum.size() == 1 &&
4126 (!UsePartialMad64_32 || !CarryIn.empty())) {
4127 do {
4128 // Skip multiplication if one of the operands is 0
4129 unsigned j1 = DstIndex - j0;
4130 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4131 ++j0;
4132 continue;
4133 }
4134 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
4135 if (!LocalAccum[0] || VT.getKnownBits(LocalAccum[0]).isZero()) {
4136 LocalAccum[0] = Mul.getReg(0);
4137 } else {
4138 if (CarryIn.empty()) {
4139 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
4140 } else {
4141 LocalAccum[0] =
4142 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
4143 .getReg(0);
4144 CarryIn.pop_back();
4145 }
4146 }
4147 ++j0;
4148 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4149 }
4150
4151 // Build full 64-bit multiplies.
4152 if (j0 <= DstIndex) {
4153 bool HaveSmallAccum = false;
4154 Register Tmp;
4155
4156 if (LocalAccum[0]) {
4157 if (LocalAccum.size() == 1) {
4158 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
4159 HaveSmallAccum = true;
4160 } else if (LocalAccum[1]) {
4161 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
4162 HaveSmallAccum = false;
4163 } else {
4164 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
4165 HaveSmallAccum = true;
4166 }
4167 } else {
4168 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4169 Tmp = getZero64();
4170 HaveSmallAccum = true;
4171 }
4172
4173 do {
4174 unsigned j1 = DstIndex - j0;
4175 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4176 ++j0;
4177 continue;
4178 }
4179 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
4180 {Src0[j0], Src1[j1], Tmp});
4181 Tmp = Mad.getReg(0);
4182 if (!HaveSmallAccum)
4183 CarryOut.push_back(Mad.getReg(1));
4184 HaveSmallAccum = false;
4185
4186 ++j0;
4187 } while (j0 <= DstIndex);
4188
4189 auto Unmerge = B.buildUnmerge(S32, Tmp);
4190 LocalAccum[0] = Unmerge.getReg(0);
4191 if (LocalAccum.size() > 1)
4192 LocalAccum[1] = Unmerge.getReg(1);
4193 }
4194
4195 return CarryOut;
4196 };
4197
4198 // Outer multiply loop, iterating over destination parts from least
4199 // significant to most significant parts.
4200 //
4201 // The columns of the following diagram correspond to the destination parts
4202 // affected by one iteration of the outer loop (ignoring boundary
4203 // conditions).
4204 //
4205 // Dest index relative to 2 * i: 1 0 -1
4206 // ------
4207 // Carries from previous iteration: e o
4208 // Even-aligned partial product sum: E E .
4209 // Odd-aligned partial product sum: O O
4210 //
4211 // 'o' is OddCarry, 'e' is EvenCarry.
4212 // EE and OO are computed from partial products via buildMadChain and use
4213 // accumulation where possible and appropriate.
4214 //
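 // Example with Accum.size() == 4 (a 128-bit result), iteration i == 1
 // (illustrative only): the even-aligned chain accumulates the column-2
 // products Src0[0]*Src1[2], Src0[1]*Src1[1], Src0[2]*Src1[0] into
 // Accum[2..3], while the odd-aligned chain accumulates the column-1
 // products Src0[0]*Src1[1], Src0[1]*Src1[0] into Accum[1..2].
 //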
4215 Register SeparateOddCarry;
4216 Carry EvenCarry;
4217 Carry OddCarry;
4218
4219 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4220 Carry OddCarryIn = std::move(OddCarry);
4221 Carry EvenCarryIn = std::move(EvenCarry);
4222 OddCarry.clear();
4223 EvenCarry.clear();
4224
4225 // Partial products at offset 2 * i.
4226 if (2 * i < Accum.size()) {
4227 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4228 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4229 }
4230
4231 // Partial products at offset 2 * i - 1.
4232 if (i > 0) {
4233 if (!SeparateOddAlignedProducts) {
4234 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4235 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4236 } else {
4237 bool IsHighest = 2 * i >= Accum.size();
4238 Register SeparateOddOut[2];
4239 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4240 .take_front(IsHighest ? 1 : 2);
4241 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4242
4243 MachineInstr *Lo;
4244
4245 if (i == 1) {
4246 if (!IsHighest)
4247 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4248 else
4249 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4250 } else {
4251 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4252 SeparateOddCarry);
4253 }
4254 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4255
4256 if (!IsHighest) {
4257 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4258 Lo->getOperand(1).getReg());
4259 Accum[2 * i] = Hi.getReg(0);
4260 SeparateOddCarry = Hi.getReg(1);
4261 }
4262 }
4263 }
4264
4265 // Add in the carries from the previous iteration
4266 if (i > 0) {
4267 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4268 EvenCarryIn.push_back(CarryOut);
4269
4270 if (2 * i < Accum.size()) {
4271 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4272 OddCarry.push_back(CarryOut);
4273 }
4274 }
4275 }
4276}
4277
4278// Custom narrowing of wide multiplies using wide multiply-add instructions.
4279//
4280// TODO: If the multiply is followed by an addition, we should attempt to
4281// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4282bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4283 MachineInstr &MI) const {
4284 assert(ST.hasMad64_32());
4285 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4286
4287 MachineIRBuilder &B = Helper.MIRBuilder;
4288 MachineRegisterInfo &MRI = *B.getMRI();
4289
4290 Register DstReg = MI.getOperand(0).getReg();
4291 Register Src0 = MI.getOperand(1).getReg();
4292 Register Src1 = MI.getOperand(2).getReg();
4293
4294 LLT Ty = MRI.getType(DstReg);
4295 assert(Ty.isScalar());
4296
4297 unsigned Size = Ty.getSizeInBits();
4298 if (ST.hasVectorMulU64() && Size == 64)
4299 return true;
4300
4301 unsigned NumParts = Size / 32;
4302 assert((Size % 32) == 0);
4303 assert(NumParts >= 2);
4304
4305 // Whether to use MAD_64_32 for partial products whose high half is
4306 // discarded. This avoids some ADD instructions but risks false dependency
4307 // stalls on some subtargets in some cases.
4308 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4309
4310 // Whether to compute odd-aligned partial products separately. This is
4311 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4312 // in an even-aligned VGPR.
4313 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4314
4315 LLT S32 = LLT::scalar(32);
4316 SmallVector<Register, 2> Src0Parts, Src1Parts;
4317 for (unsigned i = 0; i < NumParts; ++i) {
4318 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
4319 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
4320 }
4321 B.buildUnmerge(Src0Parts, Src0);
4322 B.buildUnmerge(Src1Parts, Src1);
4323
4324 SmallVector<Register, 2> AccumRegs(NumParts);
4325 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4326 SeparateOddAlignedProducts);
4327
4328 B.buildMergeLikeInstr(DstReg, AccumRegs);
4329 MI.eraseFromParent();
4330 return true;
4331}
4332
4333// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4334// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4335// case with a single min instruction instead of a compare+select.
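// e.g. for a 32-bit source, ffbh/ffbl return -1 (0xffffffff) on a zero input,
// so umin(ffbh(x), 32) yields the 32 expected from ctlz(0) while leaving all
// nonzero inputs unchanged (illustrative only).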
4336bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4337 MachineRegisterInfo &MRI,
4338 MachineIRBuilder &B) const {
4339 Register Dst = MI.getOperand(0).getReg();
4340 Register Src = MI.getOperand(1).getReg();
4341 LLT DstTy = MRI.getType(Dst);
4342 LLT SrcTy = MRI.getType(Src);
4343
4344 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4345 ? AMDGPU::G_AMDGPU_FFBH_U32
4346 : AMDGPU::G_AMDGPU_FFBL_B32;
4347 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4348 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4349
4350 MI.eraseFromParent();
4351 return true;
4352}
4353
4354bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
4355 MachineRegisterInfo &MRI,
4356 MachineIRBuilder &B) const {
4357 Register Dst = MI.getOperand(0).getReg();
4358 Register Src = MI.getOperand(1).getReg();
4359 LLT SrcTy = MRI.getType(Src);
4360 TypeSize NumBits = SrcTy.getSizeInBits();
4361
4362 assert(NumBits < 32u);
4363
4364 auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
4365 auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4366 auto Shift = B.buildShl(S32, Extend, ShiftAmt);
4367 auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4368 B.buildTrunc(Dst, Ctlz);
4369 MI.eraseFromParent();
4370 return true;
4371}
4372
4373// Check that this is a G_XOR x, -1
4374static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4375 if (MI.getOpcode() != TargetOpcode::G_XOR)
4376 return false;
4377 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4378 return ConstVal == -1;
4379}
4380
4381// Return the use branch instruction, otherwise null if the usage is invalid.
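// Expected shape of the use (illustrative MIR sketch, not from a real test):
//   %c:_(s1) = G_INTRINSIC intrinsic(...)   ; MI
//   G_BRCOND %c(s1), %bb.then               ; UseMI, possibly via a G_XOR %c, -1
//   G_BR %bb.else                           ; Br, or fall through to the next MBB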
4382static MachineInstr *
4383verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4384 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4385 Register CondDef = MI.getOperand(0).getReg();
4386 if (!MRI.hasOneNonDBGUse(CondDef))
4387 return nullptr;
4388
4389 MachineBasicBlock *Parent = MI.getParent();
4390 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4391
4392 if (isNot(MRI, *UseMI)) {
4393 Register NegatedCond = UseMI->getOperand(0).getReg();
4394 if (!MRI.hasOneNonDBGUse(NegatedCond))
4395 return nullptr;
4396
4397 // We're deleting the def of this value, so we need to remove it.
4398 eraseInstr(*UseMI, MRI);
4399
4400 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4401 Negated = true;
4402 }
4403
4404 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4405 return nullptr;
4406
4407 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4408 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4409 if (Next == Parent->end()) {
4410 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4411 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4412 return nullptr;
4413 UncondBrTarget = &*NextMBB;
4414 } else {
4415 if (Next->getOpcode() != AMDGPU::G_BR)
4416 return nullptr;
4417 Br = &*Next;
4418 UncondBrTarget = Br->getOperand(0).getMBB();
4419 }
4420
4421 return UseMI;
4422}
4423
4424void AMDGPULegalizerInfo::buildLoadInputValue(Register DstReg,
4425 MachineIRBuilder &B,
4426 const ArgDescriptor *Arg,
4427 const TargetRegisterClass *ArgRC,
4428 LLT ArgTy) const {
4429 MCRegister SrcReg = Arg->getRegister();
4430 assert(SrcReg.isPhysical() && "Physical register expected");
4431 assert(DstReg.isVirtual() && "Virtual register expected");
4432
4433 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4434 *ArgRC, B.getDebugLoc(), ArgTy);
4435 if (Arg->isMasked()) {
4436 // TODO: Should we try to emit this once in the entry block?
4437 const LLT S32 = LLT::scalar(32);
4438 const unsigned Mask = Arg->getMask();
4439 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4440
4441 Register AndMaskSrc = LiveIn;
4442
4443 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4444 // 0.
4445 if (Shift != 0) {
4446 auto ShiftAmt = B.buildConstant(S32, Shift);
4447 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4448 }
4449
4450 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4451 } else {
4452 B.buildCopy(DstReg, LiveIn);
4453 }
4454}
4455
4456bool AMDGPULegalizerInfo::loadInputValue(
4457 Register DstReg, MachineIRBuilder &B,
4458 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4459 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4460 const ArgDescriptor *Arg = nullptr;
4461 const TargetRegisterClass *ArgRC;
4462 LLT ArgTy;
4463
4464 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4465 const ArgDescriptor WorkGroupIDX =
4466 ArgDescriptor::createRegister(AMDGPU::TTMP9);
4467 // If GridZ is not programmed in an entry function then the hardware will set
4468 // it to all zeros, so there is no need to mask the GridY value in the low
4469 // order bits.
4470 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4471 AMDGPU::TTMP7,
4472 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4473 const ArgDescriptor WorkGroupIDZ =
4474 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4475 if (ST.hasArchitectedSGPRs() &&
4477 switch (ArgType) {
4478 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4479 Arg = &WorkGroupIDX;
4480 ArgRC = &AMDGPU::SReg_32RegClass;
4481 ArgTy = LLT::scalar(32);
4482 break;
4483 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4484 Arg = &WorkGroupIDY;
4485 ArgRC = &AMDGPU::SReg_32RegClass;
4486 ArgTy = LLT::scalar(32);
4487 break;
4488 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4489 Arg = &WorkGroupIDZ;
4490 ArgRC = &AMDGPU::SReg_32RegClass;
4491 ArgTy = LLT::scalar(32);
4492 break;
4493 default:
4494 break;
4495 }
4496 }
4497
4498 if (!Arg)
4499 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4500
4501 if (!Arg) {
4502 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4503 // The intrinsic may appear when we have a 0 sized kernarg segment, in which
4504 // case the pointer argument may be missing and we use null.
4505 B.buildConstant(DstReg, 0);
4506 return true;
4507 }
4508
4509 // It's undefined behavior if a function marked with the amdgpu-no-*
4510 // attributes uses the corresponding intrinsic.
4511 B.buildUndef(DstReg);
4512 return true;
4513 }
4514
4515 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4516 return false; // TODO: Handle these
4517 buildLoadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4518 return true;
4519}
4520
4521bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4522 MachineInstr &MI, MachineIRBuilder &B,
4523 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4524 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4525 return false;
4526
4527 MI.eraseFromParent();
4528 return true;
4529}
4530
4531static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4532 int64_t C) {
4533 B.buildConstant(MI.getOperand(0).getReg(), C);
4534 MI.eraseFromParent();
4535 return true;
4536}
4537
4538bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4539 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4540 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4541 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4542 if (MaxID == 0)
4543 return replaceWithConstant(B, MI, 0);
4544
4545 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4546 const ArgDescriptor *Arg;
4547 const TargetRegisterClass *ArgRC;
4548 LLT ArgTy;
4549 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4550
4551 Register DstReg = MI.getOperand(0).getReg();
4552 if (!Arg) {
4553 // It's undefined behavior if a function marked with the amdgpu-no-*
4554 // attributes uses the corresponding intrinsic.
4555 B.buildUndef(DstReg);
4556 MI.eraseFromParent();
4557 return true;
4558 }
4559
4560 if (Arg->isMasked()) {
4561 // Don't bother inserting AssertZext for packed IDs since we're emitting the
4562 // masking operations anyway.
4563 //
4564 // TODO: We could assert the top bit is 0 for the source copy.
4565 if (!loadInputValue(DstReg, B, ArgType))
4566 return false;
4567 } else {
4568 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
4569 if (!loadInputValue(TmpReg, B, ArgType))
4570 return false;
4571 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
4572 }
4573
4574 MI.eraseFromParent();
4575 return true;
4576}
4577
4578Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4579 int64_t Offset) const {
4580 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
4581 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4582
4583 // TODO: If we passed in the base kernel offset we could have a better
4584 // alignment than 4, but we don't really need it.
4585 if (!loadInputValue(KernArgReg, B,
4586 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4587 llvm_unreachable("failed to find kernarg segment ptr");
4588
4589 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
4590 return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);
4591}
4592
4593/// Legalize a value that's loaded from kernel arguments. This is only used by
4594/// legacy intrinsics.
4595bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4596 MachineIRBuilder &B,
4597 uint64_t Offset,
4598 Align Alignment) const {
4599 Register DstReg = MI.getOperand(0).getReg();
4600
4601 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4602 "unexpected kernarg parameter type");
4603
4604 Register Ptr = getKernargParameterPtr(B, Offset);
4605 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4606 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
4607 MachineMemOperand::MODereferenceable |
4608 MachineMemOperand::MOInvariant);
4609 MI.eraseFromParent();
4610 return true;
4611}
4612
4613bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4614 MachineRegisterInfo &MRI,
4615 MachineIRBuilder &B) const {
4616 Register Dst = MI.getOperand(0).getReg();
4617 LLT DstTy = MRI.getType(Dst);
4618 LLT S16 = LLT::scalar(16);
4619 LLT S32 = LLT::scalar(32);
4620 LLT S64 = LLT::scalar(64);
4621
4622 if (DstTy == S16)
4623 return legalizeFDIV16(MI, MRI, B);
4624 if (DstTy == S32)
4625 return legalizeFDIV32(MI, MRI, B);
4626 if (DstTy == S64)
4627 return legalizeFDIV64(MI, MRI, B);
4628
4629 return false;
4630}
4631
4632void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4633 Register DstDivReg,
4634 Register DstRemReg,
4635 Register X,
4636 Register Y) const {
4637 const LLT S1 = LLT::scalar(1);
4638 const LLT S32 = LLT::scalar(32);
4639
4640 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4641 // algorithm used here.
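 //
 // In scalar form the sequence below is roughly (illustrative only):
 //   uint32_t Z = (uint32_t)(rcp(uitofp(Y)) * 0x1.fffffcp+31f); // ~2^32 / Y
 //   Z += umulh(Z, (0u - Y) * Z);                               // one Newton-Raphson step
 //   uint32_t Q = umulh(X, Z), R = X - Q * Y;
 //   if (R >= Y) { ++Q; R -= Y; }                               // first refinement
 //   if (R >= Y) { ++Q; R -= Y; }                               // second refinement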
4642
4643 // Initial estimate of inv(y).
4644 auto FloatY = B.buildUITOFP(S32, Y);
4645 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
4646 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
4647 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
4648 auto Z = B.buildFPTOUI(S32, ScaledY);
4649
4650 // One round of UNR.
4651 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
4652 auto NegYZ = B.buildMul(S32, NegY, Z);
4653 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
4654
4655 // Quotient/remainder estimate.
4656 auto Q = B.buildUMulH(S32, X, Z);
4657 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
4658
4659 // First quotient/remainder refinement.
4660 auto One = B.buildConstant(S32, 1);
4661 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4662 if (DstDivReg)
4663 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
4664 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
4665
4666 // Second quotient/remainder refinement.
4667 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4668 if (DstDivReg)
4669 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
4670
4671 if (DstRemReg)
4672 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
4673}
4674
4675// Build integer reciprocal sequence around V_RCP_IFLAG_F32
4676//
4677// Return lo, hi of result
4678//
4679// %cvt.lo = G_UITOFP Val.lo
4680// %cvt.hi = G_UITOFP Val.hi
4681// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4682// %rcp = G_AMDGPU_RCP_IFLAG %mad
4683// %mul1 = G_FMUL %rcp, 0x5f7ffffc
4684// %mul2 = G_FMUL %mul1, 2**(-32)
4685// %trunc = G_INTRINSIC_TRUNC %mul2
4686// %mad2 = G_FMAD %trunc, -(2**32), %mul1
4687// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
4688static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4689 Register Val) {
4690 const LLT S32 = LLT::scalar(32);
4691 auto Unmerge = B.buildUnmerge(S32, Val);
4692
4693 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
4694 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
4695
4696 auto Mad = B.buildFMAD(
4697 S32, CvtHi, // 2**32
4698 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
4699
4700 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
4701 auto Mul1 = B.buildFMul(
4702 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
4703
4704 // 2**(-32)
4705 auto Mul2 = B.buildFMul(
4706 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
4707 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
4708
4709 // -(2**32)
4710 auto Mad2 = B.buildFMAD(
4711 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
4712 Mul1);
4713
4714 auto ResultLo = B.buildFPTOUI(S32, Mad2);
4715 auto ResultHi = B.buildFPTOUI(S32, Trunc);
4716
4717 return {ResultLo.getReg(0), ResultHi.getReg(0)};
4718}
4719
4720void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4721 Register DstDivReg,
4722 Register DstRemReg,
4723 Register Numer,
4724 Register Denom) const {
4725 const LLT S32 = LLT::scalar(32);
4726 const LLT S64 = LLT::scalar(64);
4727 const LLT S1 = LLT::scalar(1);
4728 Register RcpLo, RcpHi;
4729
4730 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
4731
4732 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
4733
4734 auto Zero64 = B.buildConstant(S64, 0);
4735 auto NegDenom = B.buildSub(S64, Zero64, Denom);
4736
4737 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
4738 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
4739
4740 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
4741 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
4742 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
4743
4744 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
4745 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4746 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
4747
4748 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
4749 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
4750 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
4751 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
4752 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
4753
4754 auto Zero32 = B.buildConstant(S32, 0);
4755 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
4756 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
4757 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
4758
4759 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
4760 Register NumerLo = UnmergeNumer.getReg(0);
4761 Register NumerHi = UnmergeNumer.getReg(1);
4762
4763 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
4764 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
4765 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
4766 Register Mul3_Lo = UnmergeMul3.getReg(0);
4767 Register Mul3_Hi = UnmergeMul3.getReg(1);
4768 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
4769 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
4770 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
4771 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
4772
4773 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
4774 Register DenomLo = UnmergeDenom.getReg(0);
4775 Register DenomHi = UnmergeDenom.getReg(1);
4776
4777 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
4778 auto C1 = B.buildSExt(S32, CmpHi);
4779
4780 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
4781 auto C2 = B.buildSExt(S32, CmpLo);
4782
4783 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
4784 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
4785
4786 // TODO: Here and below portions of the code can be enclosed into if/endif.
4787 // Currently control flow is unconditional and we have 4 selects after
4788 // potential endif to substitute PHIs.
4789
4790 // if C3 != 0 ...
4791 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
4792 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
4793 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
4794 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
4795
4796 auto One64 = B.buildConstant(S64, 1);
4797 auto Add3 = B.buildAdd(S64, MulHi3, One64);
4798
4799 auto C4 =
4800 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
4801 auto C5 =
4802 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
4803 auto C6 = B.buildSelect(
4804 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
4805
4806 // if (C6 != 0)
4807 auto Add4 = B.buildAdd(S64, Add3, One64);
4808 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
4809
4810 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
4811 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
4812 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
4813
4814 // endif C6
4815 // endif C3
4816
4817 if (DstDivReg) {
4818 auto Sel1 = B.buildSelect(
4819 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
4820 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4821 Sel1, MulHi3);
4822 }
4823
4824 if (DstRemReg) {
4825 auto Sel2 = B.buildSelect(
4826 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
4827 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4828 Sel2, Sub1);
4829 }
4830}
4831
4832bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
4833 MachineRegisterInfo &MRI,
4834 MachineIRBuilder &B) const {
4835 Register DstDivReg, DstRemReg;
4836 switch (MI.getOpcode()) {
4837 default:
4838 llvm_unreachable("Unexpected opcode!");
4839 case AMDGPU::G_UDIV: {
4840 DstDivReg = MI.getOperand(0).getReg();
4841 break;
4842 }
4843 case AMDGPU::G_UREM: {
4844 DstRemReg = MI.getOperand(0).getReg();
4845 break;
4846 }
4847 case AMDGPU::G_UDIVREM: {
4848 DstDivReg = MI.getOperand(0).getReg();
4849 DstRemReg = MI.getOperand(1).getReg();
4850 break;
4851 }
4852 }
4853
4854 const LLT S64 = LLT::scalar(64);
4855 const LLT S32 = LLT::scalar(32);
4856 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4857 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
4858 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4859 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4860
4861 if (Ty == S32)
4862 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
4863 else if (Ty == S64)
4864 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
4865 else
4866 return false;
4867
4868 MI.eraseFromParent();
4869 return true;
4870}
4871
4872bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
4873 MachineRegisterInfo &MRI,
4874 MachineIRBuilder &B) const {
4875 const LLT S64 = LLT::scalar(64);
4876 const LLT S32 = LLT::scalar(32);
4877
4878 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4879 if (Ty != S32 && Ty != S64)
4880 return false;
4881
4882 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4883 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
4884 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4885
4886 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
4887 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
4888 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
4889
4890 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
4891 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
4892
4893 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
4894 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
4895
4896 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
4897 switch (MI.getOpcode()) {
4898 default:
4899 llvm_unreachable("Unexpected opcode!");
4900 case AMDGPU::G_SDIV: {
4901 DstDivReg = MI.getOperand(0).getReg();
4902 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4903 break;
4904 }
4905 case AMDGPU::G_SREM: {
4906 DstRemReg = MI.getOperand(0).getReg();
4907 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4908 break;
4909 }
4910 case AMDGPU::G_SDIVREM: {
4911 DstDivReg = MI.getOperand(0).getReg();
4912 DstRemReg = MI.getOperand(1).getReg();
4913 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4914 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4915 break;
4916 }
4917 }
4918
4919 if (Ty == S32)
4920 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4921 else
4922 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4923
4924 if (DstDivReg) {
4925 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
4926 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
4927 B.buildSub(DstDivReg, SignXor, Sign);
4928 }
4929
4930 if (DstRemReg) {
4931 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
4932 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
4933 B.buildSub(DstRemReg, SignXor, Sign);
4934 }
4935
4936 MI.eraseFromParent();
4937 return true;
4938}
4939
4940bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
4941 MachineRegisterInfo &MRI,
4942 MachineIRBuilder &B) const {
4943 Register Res = MI.getOperand(0).getReg();
4944 Register LHS = MI.getOperand(1).getReg();
4945 Register RHS = MI.getOperand(2).getReg();
4946 uint16_t Flags = MI.getFlags();
4947 LLT ResTy = MRI.getType(Res);
4948
4949 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
4950
4951 if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) {
4952 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
4953 return false;
4954
4955 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
4956 // the CI documentation they have a worst case error of 1 ulp.
4957 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4958 // use it as long as we aren't trying to use denormals.
4959 //
4960 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
4961
4962 // 1 / x -> RCP(x)
4963 if (CLHS->isExactlyValue(1.0)) {
4964 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4965 .addUse(RHS)
4966 .setMIFlags(Flags);
4967
4968 MI.eraseFromParent();
4969 return true;
4970 }
4971
4972 // -1 / x -> RCP( FNEG(x) )
4973 if (CLHS->isExactlyValue(-1.0)) {
4974 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
4975 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4976 .addUse(FNeg.getReg(0))
4977 .setMIFlags(Flags);
4978
4979 MI.eraseFromParent();
4980 return true;
4981 }
4982 }
4983
4984 // For f16 require afn or arcp.
4985 // For f32 require afn.
4986 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
4987 !MI.getFlag(MachineInstr::FmArcp)))
4988 return false;
4989
4990 // x / y -> x * (1.0 / y)
4991 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4992 .addUse(RHS)
4993 .setMIFlags(Flags);
4994 B.buildFMul(Res, LHS, RCP, Flags);
4995
4996 MI.eraseFromParent();
4997 return true;
4998}
4999
5000bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
5001 MachineRegisterInfo &MRI,
5002 MachineIRBuilder &B) const {
5003 Register Res = MI.getOperand(0).getReg();
5004 Register X = MI.getOperand(1).getReg();
5005 Register Y = MI.getOperand(2).getReg();
5006 uint16_t Flags = MI.getFlags();
5007 LLT ResTy = MRI.getType(Res);
5008
5009 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5010
5011 if (!AllowInaccurateRcp)
5012 return false;
5013
5014 auto NegY = B.buildFNeg(ResTy, Y);
5015 auto One = B.buildFConstant(ResTy, 1.0);
5016
5017 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5018 .addUse(Y)
5019 .setMIFlags(Flags);
5020
5021 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
5022 R = B.buildFMA(ResTy, Tmp0, R, R);
5023
5024 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
5025 R = B.buildFMA(ResTy, Tmp1, R, R);
5026
5027 auto Ret = B.buildFMul(ResTy, X, R);
5028 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
5029
5030 B.buildFMA(Res, Tmp2, R, Ret);
5031 MI.eraseFromParent();
5032 return true;
5033}
5034
5035bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
5036 MachineRegisterInfo &MRI,
5037 MachineIRBuilder &B) const {
5038 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5039 return true;
5040
5041 Register Res = MI.getOperand(0).getReg();
5042 Register LHS = MI.getOperand(1).getReg();
5043 Register RHS = MI.getOperand(2).getReg();
5044
5045 uint16_t Flags = MI.getFlags();
5046
5047 LLT S16 = LLT::scalar(16);
5048 LLT S32 = LLT::scalar(32);
5049
5050 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
5051 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
5052 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
5053 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
5054 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5055 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
5056 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5057 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
5058 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
5059 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
5060 // q16.u = opx(V_CVT_F16_F32, q32.u);
5061 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
5062
5063 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
5064 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
5065 auto NegRHSExt = B.buildFNeg(S32, RHSExt);
5066 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5067 .addUse(RHSExt.getReg(0))
5068 .setMIFlags(Flags);
5069 auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
5070 MachineInstrBuilder Err;
5071 if (ST.hasMadMacF32Insts()) {
5072 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5073 Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
5074 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5075 } else {
5076 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5077 Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
5078 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5079 }
5080 auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
5081 Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
5082 Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
5083 auto RDst = B.buildFPTrunc(S16, Quot, Flags);
5084 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5085 .addUse(RDst.getReg(0))
5086 .addUse(RHS)
5087 .addUse(LHS)
5088 .setMIFlags(Flags);
5089
5090 MI.eraseFromParent();
5091 return true;
5092}
5093
5094static constexpr unsigned SPDenormModeBitField =
5095 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 4, 2);
5096
5097// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
5098// to enable denorm mode. When 'Enable' is false, disable denorm mode.
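// e.g. on subtargets with S_DENORM_MODE, enabling f32 denormals while the
// f64/f16 mode stays at its IEEE default emits S_DENORM_MODE 0xf
// (SP mode 3 in bits [1:0], DP mode 3 in bits [3:2]); illustrative only.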
5099static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
5100 const GCNSubtarget &ST,
5101 SIModeRegisterDefaults Mode) {
5102 // Set SP denorm mode to this value.
5103 unsigned SPDenormMode =
5104 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
5105
5106 if (ST.hasDenormModeInst()) {
5107 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
5108 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
5109
5110 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
5111 B.buildInstr(AMDGPU::S_DENORM_MODE)
5112 .addImm(NewDenormModeValue);
5113
5114 } else {
5115 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
5116 .addImm(SPDenormMode)
5117 .addImm(SPDenormModeBitField);
5118 }
5119}
5120
5121bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
5122 MachineRegisterInfo &MRI,
5123 MachineIRBuilder &B) const {
5124 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5125 return true;
5126
5127 Register Res = MI.getOperand(0).getReg();
5128 Register LHS = MI.getOperand(1).getReg();
5129 Register RHS = MI.getOperand(2).getReg();
5130 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5131 SIModeRegisterDefaults Mode = MFI->getMode();
5132
5133 uint16_t Flags = MI.getFlags();
5134
5135 LLT S32 = LLT::scalar(32);
5136 LLT S1 = LLT::scalar(1);
5137
5138 auto One = B.buildFConstant(S32, 1.0f);
5139
5140 auto DenominatorScaled =
5141 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5142 .addUse(LHS)
5143 .addUse(RHS)
5144 .addImm(0)
5145 .setMIFlags(Flags);
5146 auto NumeratorScaled =
5147 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5148 .addUse(LHS)
5149 .addUse(RHS)
5150 .addImm(1)
5151 .setMIFlags(Flags);
5152
5153 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5154 .addUse(DenominatorScaled.getReg(0))
5155 .setMIFlags(Flags);
5156 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
5157
5158 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
5159 const bool HasDynamicDenormals =
5160 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
5161 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
5162
5163 Register SavedSPDenormMode;
5164 if (!PreservesDenormals) {
5165 if (HasDynamicDenormals) {
5166 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5167 B.buildInstr(AMDGPU::S_GETREG_B32)
5168 .addDef(SavedSPDenormMode)
5169 .addImm(SPDenormModeBitField);
5170 }
5171 toggleSPDenormMode(true, B, ST, Mode);
5172 }
5173
5174 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
5175 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5176 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
5177 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
5178 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
5179 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5180
5181 if (!PreservesDenormals) {
5182 if (HasDynamicDenormals) {
5183 assert(SavedSPDenormMode);
5184 B.buildInstr(AMDGPU::S_SETREG_B32)
5185 .addReg(SavedSPDenormMode)
5186 .addImm(SPDenormModeBitField);
5187 } else
5188 toggleSPDenormMode(false, B, ST, Mode);
5189 }
5190
5191 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
5192 .addUse(Fma4.getReg(0))
5193 .addUse(Fma1.getReg(0))
5194 .addUse(Fma3.getReg(0))
5195 .addUse(NumeratorScaled.getReg(1))
5196 .setMIFlags(Flags);
5197
5198 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5199 .addUse(Fmas.getReg(0))
5200 .addUse(RHS)
5201 .addUse(LHS)
5202 .setMIFlags(Flags);
5203
5204 MI.eraseFromParent();
5205 return true;
5206}
5207
5208bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
5209 MachineRegisterInfo &MRI,
5210 MachineIRBuilder &B) const {
5211 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5212 return true;
5213
5214 Register Res = MI.getOperand(0).getReg();
5215 Register LHS = MI.getOperand(1).getReg();
5216 Register RHS = MI.getOperand(2).getReg();
5217
5218 uint16_t Flags = MI.getFlags();
5219
5220 LLT S64 = LLT::scalar(64);
5221 LLT S1 = LLT::scalar(1);
5222
5223 auto One = B.buildFConstant(S64, 1.0);
5224
5225 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5226 .addUse(LHS)
5227 .addUse(RHS)
5228 .addImm(0)
5229 .setMIFlags(Flags);
5230
5231 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
5232
5233 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
5234 .addUse(DivScale0.getReg(0))
5235 .setMIFlags(Flags);
5236
5237 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
5238 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
5239 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
5240
5241 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5242 .addUse(LHS)
5243 .addUse(RHS)
5244 .addImm(1)
5245 .setMIFlags(Flags);
5246
5247 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
5248 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
5249 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
5250
5251 Register Scale;
5252 if (!ST.hasUsableDivScaleConditionOutput()) {
5253 // Workaround a hardware bug on SI where the condition output from div_scale
5254 // is not usable.
5255
5256 LLT S32 = LLT::scalar(32);
5257
5258 auto NumUnmerge = B.buildUnmerge(S32, LHS);
5259 auto DenUnmerge = B.buildUnmerge(S32, RHS);
5260 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5261 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5262
5263 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
5264 Scale1Unmerge.getReg(1));
5265 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
5266 Scale0Unmerge.getReg(1));
5267 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
5268 } else {
5269 Scale = DivScale1.getReg(1);
5270 }
5271
5272 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5273 .addUse(Fma4.getReg(0))
5274 .addUse(Fma3.getReg(0))
5275 .addUse(Mul.getReg(0))
5276 .addUse(Scale)
5277 .setMIFlags(Flags);
5278
5279 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5280 .addUse(Fmas.getReg(0))
5281 .addUse(RHS)
5282 .addUse(LHS)
5283 .setMIFlags(Flags);
5284
5285 MI.eraseFromParent();
5286 return true;
5287}
5288
5291 MachineIRBuilder &B) const {
5292 Register Res0 = MI.getOperand(0).getReg();
5293 Register Res1 = MI.getOperand(1).getReg();
5294 Register Val = MI.getOperand(2).getReg();
5295 uint16_t Flags = MI.getFlags();
5296
5297 LLT Ty = MRI.getType(Res0);
5298 LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5299
5300 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5301 .addUse(Val)
5302 .setMIFlags(Flags);
5303 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5304 .addUse(Val)
5305 .setMIFlags(Flags);
5306
5307 if (ST.hasFractBug()) {
5308 auto Fabs = B.buildFAbs(Ty, Val);
5309 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5310 auto IsFinite =
5311 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5312 auto Zero = B.buildConstant(InstrExpTy, 0);
5313 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5314 Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5315 }
5316
5317 B.buildCopy(Res0, Mant);
5318 B.buildSExtOrTrunc(Res1, Exp);
5319
5320 MI.eraseFromParent();
5321 return true;
5322}
5323
5326 MachineIRBuilder &B) const {
5327 Register Res = MI.getOperand(0).getReg();
5328 Register LHS = MI.getOperand(2).getReg();
5329 Register RHS = MI.getOperand(3).getReg();
5330 uint16_t Flags = MI.getFlags();
5331
5332 LLT S32 = LLT::scalar(32);
5333 LLT S1 = LLT::scalar(1);
5334
5335 auto Abs = B.buildFAbs(S32, RHS, Flags);
5336 const APFloat C0Val(1.0f);
5337
5338 auto C0 = B.buildFConstant(S32, 0x1p+96f);
5339 auto C1 = B.buildFConstant(S32, 0x1p-32f);
5340 auto C2 = B.buildFConstant(S32, 1.0f);
5341
5342 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5343 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5344
5345 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5346
5347 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5348 .addUse(Mul0.getReg(0))
5349 .setMIFlags(Flags);
5350
5351 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5352
5353 B.buildFMul(Res, Sel, Mul1, Flags);
5354
5355 MI.eraseFromParent();
5356 return true;
5357}
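// For illustration: the select above pre-scales a huge denominator
// (|RHS| > 0x1p+96) by 0x1p-32 so the reciprocal does not flush to zero as a
// denormal, and the same scale factor is folded back into the final multiply,
// keeping the overall result equal to LHS / RHS.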
5358
5361 MachineIRBuilder &B) const {
5362 // Bypass the correct expansion that a standard promotion through G_FSQRT
5363 // would get. The f32 op is accurate enough for the f16 case.
5364 unsigned Flags = MI.getFlags();
5365 assert(!ST.has16BitInsts());
5366 const LLT F32 = LLT::scalar(32);
5367 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5368 auto Sqrt = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5369 .addUse(Ext.getReg(0))
5370 .setMIFlags(Flags);
5371 B.buildFPTrunc(MI.getOperand(0), Sqrt, Flags);
5372 MI.eraseFromParent();
5373 return true;
5374}
5375
5378 MachineIRBuilder &B) const {
5379 MachineFunction &MF = B.getMF();
5380 Register Dst = MI.getOperand(0).getReg();
5381 Register X = MI.getOperand(1).getReg();
5382 const unsigned Flags = MI.getFlags();
5383 const LLT S1 = LLT::scalar(1);
5384 const LLT F32 = LLT::scalar(32);
5385 const LLT I32 = LLT::scalar(32);
5386
5387 if (allowApproxFunc(MF, Flags)) {
5388 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5389 .addUse(X)
5390 .setMIFlags(Flags);
5391 MI.eraseFromParent();
5392 return true;
5393 }
5394
5395 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5396 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5397 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5398 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5399 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5400
5401 Register SqrtS = MRI.createGenericVirtualRegister(F32);
5402 if (needsDenormHandlingF32(MF, X, Flags)) {
5403 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5404 .addUse(SqrtX.getReg(0))
5405 .setMIFlags(Flags);
5406
5407 auto NegOne = B.buildConstant(I32, -1);
5408 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5409
5410 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5411 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5412
5413 auto PosOne = B.buildConstant(I32, 1);
5414 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5415
5416 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5417 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5418
5419 auto Zero = B.buildFConstant(F32, 0.0f);
5420 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5421
5422 SqrtS =
5423 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5424
5425 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5426 SqrtS =
5427 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5428 } else {
5429 auto SqrtR =
5430 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5431 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5432
5433 auto Half = B.buildFConstant(F32, 0.5f);
5434 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5435 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5436 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5437 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5438 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5439 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5440 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5441 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5442 }
5443
5444 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5445
5446 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5447
5448 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5449
5450 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5451 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5452
5453 MI.eraseFromParent();
5454 return true;
5455}
5456
5459 MachineIRBuilder &B) const {
5460 // For the double type, the SQRT and RSQ instructions don't have the required
5461 // precision, so we apply Goldschmidt's algorithm to improve the result:
5462 //
5463 // y0 = rsq(x)
5464 // g0 = x * y0
5465 // h0 = 0.5 * y0
5466 //
5467 // r0 = 0.5 - h0 * g0
5468 // g1 = g0 * r0 + g0
5469 // h1 = h0 * r0 + h0
5470 //
5471 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5472 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
5473 // h2 = h1 * r1 + h1
5474 //
5475 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5476 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
5477 //
5478 // sqrt(x) = g3
5479
5480 const LLT S1 = LLT::scalar(1);
5481 const LLT S32 = LLT::scalar(32);
5482 const LLT F64 = LLT::scalar(64);
5483
5484 Register Dst = MI.getOperand(0).getReg();
5485 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5486
5487 Register X = MI.getOperand(1).getReg();
5488 unsigned Flags = MI.getFlags();
5489
5490 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
5491
5492 auto ZeroInt = B.buildConstant(S32, 0);
5493 auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
5494
5495 // Scale up input if it is too small.
5496 auto ScaleUpFactor = B.buildConstant(S32, 256);
5497 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
5498 auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
5499
5500 auto SqrtY =
5501 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));
5502
5503 auto Half = B.buildFConstant(F64, 0.5);
5504 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
5505 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
5506
5507 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
5508 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
5509
5510 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
5511 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
5512
5513 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
5514 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
5515
5516 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
5517
5518 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
5519 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
5520
5521 auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
5522
5523 // Scale down the result.
5524 auto ScaleDownFactor = B.buildConstant(S32, -128);
5525 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
5526 SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
5527
5528 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
5529 // with finite only or nsz because rsq(+/-0) = +/-inf
5530
5531 // TODO: Check for DAZ and expand to subnormals
5532 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5533
5534 // If x is +INF, +0, or -0, use its original value
5535 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
5536
5537 MI.eraseFromParent();
5538 return true;
5539}
5540
5543 MachineIRBuilder &B) const {
5544 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5545 if (Ty == LLT::scalar(32))
5546 return legalizeFSQRTF32(MI, MRI, B);
5547 if (Ty == LLT::scalar(64))
5548 return legalizeFSQRTF64(MI, MRI, B);
5549 if (Ty == LLT::scalar(16))
5550 return legalizeFSQRTF16(MI, MRI, B);
5551 return false;
5552}
5553
5554// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5555// FIXME: Why do we handle this one but not other removed instructions?
5556//
5557// Reciprocal square root. The clamp prevents infinite results, clamping
5558// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
5559// +-max_float.
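// In other words, the expansion below computes roughly
//   fmax(fmin(rsq(x), +max_float), -max_float),
// using the IEEE or non-IEEE min/max variants depending on the FP mode.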
5562 MachineIRBuilder &B) const {
5564 return true;
5565
5566 Register Dst = MI.getOperand(0).getReg();
5567 Register Src = MI.getOperand(2).getReg();
5568 auto Flags = MI.getFlags();
5569
5570 LLT Ty = MRI.getType(Dst);
5571
5572 const fltSemantics *FltSemantics;
5573 if (Ty == LLT::scalar(32))
5574 FltSemantics = &APFloat::IEEEsingle();
5575 else if (Ty == LLT::scalar(64))
5576 FltSemantics = &APFloat::IEEEdouble();
5577 else
5578 return false;
5579
5580 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
5581 .addUse(Src)
5582 .setMIFlags(Flags);
5583
5584 // We don't need to concern ourselves with the snan handling difference, since
5585 // the rsq has already quieted it (or not); use whichever variant selects directly.
5586 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5587 const bool UseIEEE = MFI->getMode().IEEE;
5588
5589 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
5590 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
5591 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
5592
5593 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
5594
5595 if (UseIEEE)
5596 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
5597 else
5598 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
5599 MI.eraseFromParent();
5600 return true;
5601}
5602
5603// TODO: Fix pointer type handling
5606 Intrinsic::ID IID) const {
5607
5608 MachineIRBuilder &B = Helper.MIRBuilder;
5609 MachineRegisterInfo &MRI = *B.getMRI();
5610
5611 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5612 IID == Intrinsic::amdgcn_permlanex16;
5613 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
5614 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
5615
5616 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
5617 Register Src2, LLT VT) -> Register {
5618 auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
5619 switch (IID) {
5620 case Intrinsic::amdgcn_readfirstlane:
5621 case Intrinsic::amdgcn_permlane64:
5622 return LaneOp.getReg(0);
5623 case Intrinsic::amdgcn_readlane:
5624 case Intrinsic::amdgcn_set_inactive:
5625 case Intrinsic::amdgcn_set_inactive_chain_arg:
5626 return LaneOp.addUse(Src1).getReg(0);
5627 case Intrinsic::amdgcn_writelane:
5628 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
5629 case Intrinsic::amdgcn_permlane16:
5630 case Intrinsic::amdgcn_permlanex16: {
5631 Register Src3 = MI.getOperand(5).getReg();
5632 int64_t Src4 = MI.getOperand(6).getImm();
5633 int64_t Src5 = MI.getOperand(7).getImm();
5634 return LaneOp.addUse(Src1)
5635 .addUse(Src2)
5636 .addUse(Src3)
5637 .addImm(Src4)
5638 .addImm(Src5)
5639 .getReg(0);
5640 }
5641 case Intrinsic::amdgcn_mov_dpp8:
5642 return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
5643 case Intrinsic::amdgcn_update_dpp:
5644 return LaneOp.addUse(Src1)
5645 .addImm(MI.getOperand(4).getImm())
5646 .addImm(MI.getOperand(5).getImm())
5647 .addImm(MI.getOperand(6).getImm())
5648 .addImm(MI.getOperand(7).getImm())
5649 .getReg(0);
5650 default:
5651 llvm_unreachable("unhandled lane op");
5652 }
5653 };
5654
5655 Register DstReg = MI.getOperand(0).getReg();
5656 Register Src0 = MI.getOperand(2).getReg();
5657 Register Src1, Src2;
5658 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
5659 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
5660 Src1 = MI.getOperand(3).getReg();
5661 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
5662 Src2 = MI.getOperand(4).getReg();
5663 }
5664 }
5665
5666 LLT Ty = MRI.getType(DstReg);
5667 unsigned Size = Ty.getSizeInBits();
5668
5669 unsigned SplitSize = 32;
5670 if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
5671 ST.hasDPALU_DPP() &&
5672 AMDGPU::isLegalDPALU_DPPControl(ST, MI.getOperand(4).getImm()))
5673 SplitSize = 64;
5674
5675 if (Size == SplitSize) {
5676 // Already legal
5677 return true;
5678 }
5679
5680 if (Size < 32) {
5681 Src0 = B.buildAnyExt(S32, Src0).getReg(0);
5682
5683 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5684 Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
5685
5686 if (IID == Intrinsic::amdgcn_writelane)
5687 Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
5688
5689 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
5690 B.buildTrunc(DstReg, LaneOpDst);
5691 MI.eraseFromParent();
5692 return true;
5693 }
5694
5695 if (Size % SplitSize != 0)
5696 return false;
5697
5698 LLT PartialResTy = LLT::scalar(SplitSize);
5699 bool NeedsBitcast = false;
5700 if (Ty.isVector()) {
5701 LLT EltTy = Ty.getElementType();
5702 unsigned EltSize = EltTy.getSizeInBits();
5703 if (EltSize == SplitSize) {
5704 PartialResTy = EltTy;
5705 } else if (EltSize == 16 || EltSize == 32) {
5706 unsigned NElem = SplitSize / EltSize;
5707 PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem));
5708 } else {
5709 // Handle all other cases via S32/S64 pieces
5710 NeedsBitcast = true;
5711 }
5712 }
5713
5714 SmallVector<Register, 4> PartialRes;
5715 unsigned NumParts = Size / SplitSize;
5716 MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
5717 MachineInstrBuilder Src1Parts, Src2Parts;
5718
5719 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5720 Src1Parts = B.buildUnmerge(PartialResTy, Src1);
5721
5722 if (IID == Intrinsic::amdgcn_writelane)
5723 Src2Parts = B.buildUnmerge(PartialResTy, Src2);
5724
5725 for (unsigned i = 0; i < NumParts; ++i) {
5726 Src0 = Src0Parts.getReg(i);
5727
5728 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5729 Src1 = Src1Parts.getReg(i);
5730
5731 if (IID == Intrinsic::amdgcn_writelane)
5732 Src2 = Src2Parts.getReg(i);
5733
5734 PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
5735 }
5736
5737 if (NeedsBitcast)
5738 B.buildBitcast(DstReg, B.buildMergeLikeInstr(
5739 LLT::scalar(Ty.getSizeInBits()), PartialRes));
5740 else
5741 B.buildMergeLikeInstr(DstReg, PartialRes);
5742
5743 MI.eraseFromParent();
5744 return true;
5745}
5746
5749 MachineIRBuilder &B) const {
5753 LLT DstTy = MRI.getType(DstReg);
5754 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
5755
5756 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
5757 if (!loadInputValue(KernargPtrReg, B,
5758 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5759 return false;
5760
5761 B.buildObjectPtrOffset(DstReg, KernargPtrReg,
5762 B.buildConstant(IdxTy, Offset).getReg(0));
5763 return true;
5764}
5765
5766/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
5767/// bits of the pointer and replace them with the stride argument, then
5768/// merge_values everything together. In the common case of a raw buffer (the
5769/// stride component is 0), we can just AND off the upper half.
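// A worked example (values chosen for illustration): for
// Pointer == 0x0000123489ABCDEF and Stride == 16, the resulting descriptor
// dwords are { 0x89ABCDEF, (0x1234 & 0xffff) | (16 << 16), NumRecords, Flags }.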
5772 Register Result = MI.getOperand(0).getReg();
5773 Register Pointer = MI.getOperand(2).getReg();
5774 Register Stride = MI.getOperand(3).getReg();
5775 Register NumRecords = MI.getOperand(4).getReg();
5776 Register Flags = MI.getOperand(5).getReg();
5777
5778 LLT S32 = LLT::scalar(32);
5779
5780 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5781 auto Unmerge = B.buildUnmerge(S32, Pointer);
5782 Register LowHalf = Unmerge.getReg(0);
5783 Register HighHalf = Unmerge.getReg(1);
5784
5785 auto AndMask = B.buildConstant(S32, 0x0000ffff);
5786 auto Masked = B.buildAnd(S32, HighHalf, AndMask);
5787
5788 MachineInstrBuilder NewHighHalf = Masked;
5789 std::optional<ValueAndVReg> StrideConst =
5791 if (!StrideConst || !StrideConst->Value.isZero()) {
5792 MachineInstrBuilder ShiftedStride;
5793 if (StrideConst) {
5794 uint32_t StrideVal = StrideConst->Value.getZExtValue();
5795 uint32_t ShiftedStrideVal = StrideVal << 16;
5796 ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
5797 } else {
5798 auto ExtStride = B.buildAnyExt(S32, Stride);
5799 auto ShiftConst = B.buildConstant(S32, 16);
5800 ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
5801 }
5802 NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
5803 }
5804 Register NewHighHalfReg = NewHighHalf.getReg(0);
5805 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
5806 MI.eraseFromParent();
5807 return true;
5808}
5809
5812 MachineIRBuilder &B) const {
5813 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5814 if (!MFI->isEntryFunction()) {
5817 }
5818
5819 Register DstReg = MI.getOperand(0).getReg();
5820 if (!getImplicitArgPtr(DstReg, MRI, B))
5821 return false;
5822
5823 MI.eraseFromParent();
5824 return true;
5825}
5826
5829 MachineIRBuilder &B) const {
5830 Function &F = B.getMF().getFunction();
5831 std::optional<uint32_t> KnownSize =
5832 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
5833 if (KnownSize.has_value())
5834 B.buildConstant(DstReg, *KnownSize);
5835 return false;
5836}
5837
5840 MachineIRBuilder &B) const {
5841
5842 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5843 if (!MFI->isEntryFunction()) {
5846 }
5847
5848 Register DstReg = MI.getOperand(0).getReg();
5849 if (!getLDSKernelId(DstReg, MRI, B))
5850 return false;
5851
5852 MI.eraseFromParent();
5853 return true;
5854}
5855
5859 unsigned AddrSpace) const {
5860 const LLT S32 = LLT::scalar(32);
5861 auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());
5862 Register Hi32 = Unmerge.getReg(1);
5863
5864 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS &&
5866 Register FlatScratchBaseHi =
5867 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
5868 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
5869 .getReg(0);
5870 MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
5871 // Test bits 63..58 against the aperture address.
5872 Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);
5873 B.buildICmp(ICmpInst::ICMP_ULT, MI.getOperand(0), XOR,
5874 B.buildConstant(S32, 1u << 26));
5875 } else {
5876 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
5877 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
5878 }
5879 MI.eraseFromParent();
5880 return true;
5881}
5882
5883// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
5884// offset (the offset that is included in bounds checking and swizzling, to be
5885// split between the instruction's voffset and immoffset fields) and soffset
5886// (the offset that is excluded from bounds checking and swizzling, to go in
5887// the instruction's soffset field). This function takes the first kind of
5888// offset and figures out how to split it between voffset and immoffset.
5889std::pair<Register, unsigned>
5890AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
5891 Register OrigOffset) const {
5892 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
5893 Register BaseReg;
5894 unsigned ImmOffset;
5895 const LLT S32 = LLT::scalar(32);
5896 MachineRegisterInfo &MRI = *B.getMRI();
5897
5898 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
5899 // being added, so we can only safely match a 32-bit addition with no unsigned
5900 // overflow.
5901 bool CheckNUW = AMDGPU::isGFX1250(ST);
5902 std::tie(BaseReg, ImmOffset) = AMDGPU::getBaseWithConstantOffset(
5903 MRI, OrigOffset, /*KnownBits=*/nullptr, CheckNUW);
5904
5905 // If BaseReg is a pointer, convert it to int.
5906 if (MRI.getType(BaseReg).isPointer())
5907 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
5908
5909 // If the immediate value is too big for the immoffset field, put only bits
5910 // that would normally fit in the immoffset field. The remaining value that
5911 // is copied/added for the voffset field is a large power of 2, and it
5912 // stands more chance of being CSEd with the copy/add for another similar
5913 // load/store.
5914 // However, do not do that rounding down if that is a negative
5915 // number, as it appears to be illegal to have a negative offset in the
5916 // vgpr, even if adding the immediate offset makes it positive.
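  // For example (assuming MaxImm == 4095 for illustration): a constant offset
  // of 8196 splits into Overflow == 8192, materialized into the voffset
  // register, and ImmOffset == 4, which fits in the immoffset field.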
5917 unsigned Overflow = ImmOffset & ~MaxImm;
5918 ImmOffset -= Overflow;
5919 if ((int32_t)Overflow < 0) {
5920 Overflow += ImmOffset;
5921 ImmOffset = 0;
5922 }
5923
5924 if (Overflow != 0) {
5925 if (!BaseReg) {
5926 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
5927 } else {
5928 auto OverflowVal = B.buildConstant(S32, Overflow);
5929 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
5930 }
5931 }
5932
5933 if (!BaseReg)
5934 BaseReg = B.buildConstant(S32, 0).getReg(0);
5935
5936 return std::pair(BaseReg, ImmOffset);
5937}
5938
5939/// Handle register layout difference for f16 images for some subtargets.
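// For illustration: on subtargets with unpacked D16 VMEM, a <4 x s16> value is
// widened to <4 x s32> by any-extending each element; on packed subtargets a
// <3 x s16> value is simply padded with undef to <4 x s16>.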
5942 Register Reg,
5943 bool ImageStore) const {
5944 const LLT S16 = LLT::scalar(16);
5945 const LLT S32 = LLT::scalar(32);
5946 LLT StoreVT = MRI.getType(Reg);
5947 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
5948
5949 if (ST.hasUnpackedD16VMem()) {
5950 auto Unmerge = B.buildUnmerge(S16, Reg);
5951
5952 SmallVector<Register, 4> WideRegs;
5953 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5954 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
5955
5956 int NumElts = StoreVT.getNumElements();
5957
5958 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
5959 .getReg(0);
5960 }
5961
5962 if (ImageStore && ST.hasImageStoreD16Bug()) {
5963 if (StoreVT.getNumElements() == 2) {
5964 SmallVector<Register, 4> PackedRegs;
5965 Reg = B.buildBitcast(S32, Reg).getReg(0);
5966 PackedRegs.push_back(Reg);
5967 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
5968 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
5969 .getReg(0);
5970 }
5971
5972 if (StoreVT.getNumElements() == 3) {
5973 SmallVector<Register, 4> PackedRegs;
5974 auto Unmerge = B.buildUnmerge(S16, Reg);
5975 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5976 PackedRegs.push_back(Unmerge.getReg(I));
5977 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
5978 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
5979 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
5980 }
5981
5982 if (StoreVT.getNumElements() == 4) {
5983 SmallVector<Register, 4> PackedRegs;
5984 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
5985 auto Unmerge = B.buildUnmerge(S32, Reg);
5986 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5987 PackedRegs.push_back(Unmerge.getReg(I));
5988 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
5989 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
5990 .getReg(0);
5991 }
5992
5993 llvm_unreachable("invalid data type");
5994 }
5995
5996 if (StoreVT == LLT::fixed_vector(3, S16)) {
5997 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
5998 .getReg(0);
5999 }
6000 return Reg;
6001}
6002
6004 Register VData, LLT MemTy,
6005 bool IsFormat) const {
6006 MachineRegisterInfo *MRI = B.getMRI();
6007 LLT Ty = MRI->getType(VData);
6008
6009 const LLT S16 = LLT::scalar(16);
6010
6011 // Fixup buffer resources themselves, which need to be cast to v4i32.
6012 if (hasBufferRsrcWorkaround(Ty))
6013 return castBufferRsrcToV4I32(VData, B);
6014
6015 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6016 Ty = getBitcastRegisterType(Ty);
6017 VData = B.buildBitcast(Ty, VData).getReg(0);
6018 }
6019 // Fixup illegal register types for 8-bit and 16-bit stores.
6020 if (Ty == LLT::scalar(8) || Ty == S16) {
6021 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
6022 return AnyExt;
6023 }
6024
6025 if (Ty.isVector()) {
6026 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
6027 if (IsFormat)
6028 return handleD16VData(B, *MRI, VData);
6029 }
6030 }
6031
6032 return VData;
6033}
6034
6036 LegalizerHelper &Helper,
6037 bool IsTyped,
6038 bool IsFormat) const {
6039 MachineIRBuilder &B = Helper.MIRBuilder;
6040 MachineRegisterInfo &MRI = *B.getMRI();
6041
6042 Register VData = MI.getOperand(1).getReg();
6043 LLT Ty = MRI.getType(VData);
6044 LLT EltTy = Ty.getScalarType();
6045 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6046 const LLT S32 = LLT::scalar(32);
6047
6048 MachineMemOperand *MMO = *MI.memoperands_begin();
6049 const int MemSize = MMO->getSize().getValue();
6050 LLT MemTy = MMO->getMemoryType();
6051
6052 VData = fixStoreSourceType(B, VData, MemTy, IsFormat);
6053
6055 Register RSrc = MI.getOperand(2).getReg();
6056
6057 unsigned ImmOffset;
6058
6059 // The typed intrinsics add an immediate after the registers.
6060 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6061
6062 // The struct intrinsic variants add one additional operand over raw.
6063 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6064 Register VIndex;
6065 int OpOffset = 0;
6066 if (HasVIndex) {
6067 VIndex = MI.getOperand(3).getReg();
6068 OpOffset = 1;
6069 } else {
6070 VIndex = B.buildConstant(S32, 0).getReg(0);
6071 }
6072
6073 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6074 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6075
6076 unsigned Format = 0;
6077 if (IsTyped) {
6078 Format = MI.getOperand(5 + OpOffset).getImm();
6079 ++OpOffset;
6080 }
6081
6082 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6083
6084 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6085
6086 unsigned Opc;
6087 if (IsTyped) {
6088 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
6089 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
6090 } else if (IsFormat) {
6091 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
6092 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
6093 } else {
6094 switch (MemSize) {
6095 case 1:
6096 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
6097 break;
6098 case 2:
6099 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
6100 break;
6101 default:
6102 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
6103 break;
6104 }
6105 }
6106
6107 auto MIB = B.buildInstr(Opc)
6108 .addUse(VData) // vdata
6109 .addUse(RSrc) // rsrc
6110 .addUse(VIndex) // vindex
6111 .addUse(VOffset) // voffset
6112 .addUse(SOffset) // soffset
6113 .addImm(ImmOffset); // offset(imm)
6114
6115 if (IsTyped)
6116 MIB.addImm(Format);
6117
6118 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6119 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6120 .addMemOperand(MMO);
6121
6122 MI.eraseFromParent();
6123 return true;
6124}
6125
6126static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
6127 Register VIndex, Register VOffset, Register SOffset,
6128 unsigned ImmOffset, unsigned Format,
6129 unsigned AuxiliaryData, MachineMemOperand *MMO,
6130 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
6131 auto MIB = B.buildInstr(Opc)
6132 .addDef(LoadDstReg) // vdata
6133 .addUse(RSrc) // rsrc
6134 .addUse(VIndex) // vindex
6135 .addUse(VOffset) // voffset
6136 .addUse(SOffset) // soffset
6137 .addImm(ImmOffset); // offset(imm)
6138
6139 if (IsTyped)
6140 MIB.addImm(Format);
6141
6142 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6143 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6144 .addMemOperand(MMO);
6145}
6146
6148 LegalizerHelper &Helper,
6149 bool IsFormat,
6150 bool IsTyped) const {
6151 MachineIRBuilder &B = Helper.MIRBuilder;
6152 MachineRegisterInfo &MRI = *B.getMRI();
6153 GISelChangeObserver &Observer = Helper.Observer;
6154
6155 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
6156 MachineMemOperand *MMO = *MI.memoperands_begin();
6157 const LLT MemTy = MMO->getMemoryType();
6158 const LLT S32 = LLT::scalar(32);
6159
6160 Register Dst = MI.getOperand(0).getReg();
6161
6162 Register StatusDst;
6163 int OpOffset = 0;
6164 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
6165 bool IsTFE = MI.getNumExplicitDefs() == 2;
6166 if (IsTFE) {
6167 StatusDst = MI.getOperand(1).getReg();
6168 ++OpOffset;
6169 }
6170
6171 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
6172 Register RSrc = MI.getOperand(2 + OpOffset).getReg();
6173
6174 // The typed intrinsics add an immediate after the registers.
6175 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6176
6177 // The struct intrinsic variants add one additional operand over raw.
6178 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
6179 Register VIndex;
6180 if (HasVIndex) {
6181 VIndex = MI.getOperand(3 + OpOffset).getReg();
6182 ++OpOffset;
6183 } else {
6184 VIndex = B.buildConstant(S32, 0).getReg(0);
6185 }
6186
6187 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6188 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6189
6190 unsigned Format = 0;
6191 if (IsTyped) {
6192 Format = MI.getOperand(5 + OpOffset).getImm();
6193 ++OpOffset;
6194 }
6195
6196 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6197 unsigned ImmOffset;
6198
6199 LLT Ty = MRI.getType(Dst);
6200 // Turn loads of addrspace 8 pointers into 4xs32 loads here, so the rest of
6201 // the logic doesn't have to handle that case.
6202 if (hasBufferRsrcWorkaround(Ty)) {
6203 Observer.changingInstr(MI);
6204 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
6205 Observer.changedInstr(MI);
6206 Dst = MI.getOperand(0).getReg();
6207 B.setInsertPt(B.getMBB(), MI);
6208 }
6209 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6210 Ty = getBitcastRegisterType(Ty);
6211 Observer.changingInstr(MI);
6212 Helper.bitcastDst(MI, Ty, 0);
6213 Observer.changedInstr(MI);
6214 Dst = MI.getOperand(0).getReg();
6215 B.setInsertPt(B.getMBB(), MI);
6216 }
6217
6218 LLT EltTy = Ty.getScalarType();
6219 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6220 const bool Unpacked = ST.hasUnpackedD16VMem();
6221
6222 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6223
6224 unsigned Opc;
6225
6226 // TODO: Support TFE for typed and narrow loads.
6227 if (IsTyped) {
6228 if (IsTFE)
6229 return false;
6230 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6231 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6232 } else if (IsFormat) {
6233 if (IsD16) {
6234 if (IsTFE)
6235 return false;
6236 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6237 } else {
6238 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6239 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6240 }
6241 } else {
6242 switch (MemTy.getSizeInBits()) {
6243 case 8:
6244 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6245 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6246 break;
6247 case 16:
6248 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6249 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6250 break;
6251 default:
6252 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6253 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6254 break;
6255 }
6256 }
6257
6258 if (IsTFE) {
6259 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
6260 unsigned NumLoadDWords = NumValueDWords + 1;
6261 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
6262 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
6263 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6264 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6265 if (MemTy.getSizeInBits() < 32) {
6266 Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
6267 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6268 B.buildTrunc(Dst, ExtDst);
6269 } else if (NumValueDWords == 1) {
6270 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6271 } else {
6272 SmallVector<Register, 5> LoadElts;
6273 for (unsigned I = 0; I != NumValueDWords; ++I)
6274 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
6275 LoadElts.push_back(StatusDst);
6276 B.buildUnmerge(LoadElts, LoadDstReg);
6277 LoadElts.truncate(NumValueDWords);
6278 B.buildMergeLikeInstr(Dst, LoadElts);
6279 }
6280 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6281 (IsD16 && !Ty.isVector())) {
6282 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
6283 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6284 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6285 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6286 B.buildTrunc(Dst, LoadDstReg);
6287 } else if (Unpacked && IsD16 && Ty.isVector()) {
6288 LLT UnpackedTy = Ty.changeElementSize(32);
6289 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6290 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6291 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6292 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6293 // FIXME: G_TRUNC should work, but legalization currently fails
6294 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
6296 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6297 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
6298 B.buildMergeLikeInstr(Dst, Repack);
6299 } else {
6300 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6301 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6302 }
6303
6304 MI.eraseFromParent();
6305 return true;
6306}
6307
6308static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6309 switch (IntrID) {
6310 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6311 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6312 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6313 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6314 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6315 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6316 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6317 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6318 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6319 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6320 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6321 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6322 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6323 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6324 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6325 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6326 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6327 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6328 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6329 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6330 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6331 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6332 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6333 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6334 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6335 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6336 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6337 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6338 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6339 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6340 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6341 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6342 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6343 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6344 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6345 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6346 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6347 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6348 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6349 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6350 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6351 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6352 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6353 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6354 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6355 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6356 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6357 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6358 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6359 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6360 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6361 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6362 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6363 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6364 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6365 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6366 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6367 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6368 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6369 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6370 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6371 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6372 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6373 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6374 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6375 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6376 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6377 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6378 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6379 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6380 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6381 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6382 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6383 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6384 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6385 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6386 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6387 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6388 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6389 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6390 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6391 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6392 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6393 default:
6394 llvm_unreachable("unhandled atomic opcode");
6395 }
6396}
6397
6400 Intrinsic::ID IID) const {
6401 const bool IsCmpSwap =
6402 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6403 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6404 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6405 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6406
6407 Register Dst = MI.getOperand(0).getReg();
6408 // Since we don't have 128-bit atomics, we don't need to handle the case of
6409 // p8 arguments to the atomic itself.
6410 Register VData = MI.getOperand(2).getReg();
6411
6412 Register CmpVal;
6413 int OpOffset = 0;
6414
6415 if (IsCmpSwap) {
6416 CmpVal = MI.getOperand(3).getReg();
6417 ++OpOffset;
6418 }
6419
6420 castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
6421 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
6422 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6423
6424 // The struct intrinsic variants add one additional operand over raw.
6425 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6426 Register VIndex;
6427 if (HasVIndex) {
6428 VIndex = MI.getOperand(4 + OpOffset).getReg();
6429 ++OpOffset;
6430 } else {
6431 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
6432 }
6433
6434 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
6435 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
6436 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
6437
6438 MachineMemOperand *MMO = *MI.memoperands_begin();
6439
6440 unsigned ImmOffset;
6441 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6442
6443 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
6444 .addDef(Dst)
6445 .addUse(VData); // vdata
6446
6447 if (IsCmpSwap)
6448 MIB.addReg(CmpVal);
6449
6450 MIB.addUse(RSrc) // rsrc
6451 .addUse(VIndex) // vindex
6452 .addUse(VOffset) // voffset
6453 .addUse(SOffset) // soffset
6454 .addImm(ImmOffset) // offset(imm)
6455 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6456 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6457 .addMemOperand(MMO);
6458
6459 MI.eraseFromParent();
6460 return true;
6461}
6462
6463/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
6464/// vector with s16 typed elements.
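// For illustration: two 16-bit coordinates (x, y) are packed into a single
// <2 x s16> register; a trailing odd coordinate, or the bias operand when A16
// is enabled, is paired with an undef high half instead.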
6466 SmallVectorImpl<Register> &PackedAddrs,
6467 unsigned ArgOffset,
6469 bool IsA16, bool IsG16) {
6470 const LLT S16 = LLT::scalar(16);
6471 const LLT V2S16 = LLT::fixed_vector(2, 16);
6472 auto EndIdx = Intr->VAddrEnd;
6473
6474 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6475 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6476 if (!SrcOp.isReg())
6477 continue; // _L to _LZ may have eliminated this.
6478
6479 Register AddrReg = SrcOp.getReg();
6480
6481 if ((I < Intr->GradientStart) ||
6482 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6483 (I >= Intr->CoordStart && !IsA16)) {
6484 if ((I < Intr->GradientStart) && IsA16 &&
6485 (B.getMRI()->getType(AddrReg) == S16)) {
6486 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6487 // Special handling of bias when A16 is on. Bias is of type half but
6488 // occupies a full 32-bit register.
6489 PackedAddrs.push_back(
6490 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6491 .getReg(0));
6492 } else {
6493 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6494 "Bias needs to be converted to 16 bit in A16 mode");
6495 // Handle any gradient or coordinate operands that should not be packed
6496 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
6497 PackedAddrs.push_back(AddrReg);
6498 }
6499 } else {
6500 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6501 // derivatives dx/dh and dx/dv are packed with undef.
6502 if (((I + 1) >= EndIdx) ||
6503 ((Intr->NumGradients / 2) % 2 == 1 &&
6504 (I == static_cast<unsigned>(Intr->GradientStart +
6505 (Intr->NumGradients / 2) - 1) ||
6506 I == static_cast<unsigned>(Intr->GradientStart +
6507 Intr->NumGradients - 1))) ||
6508 // Check for _L to _LZ optimization
6509 !MI.getOperand(ArgOffset + I + 1).isReg()) {
6510 PackedAddrs.push_back(
6511 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6512 .getReg(0));
6513 } else {
6514 PackedAddrs.push_back(
6515 B.buildBuildVector(
6516 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
6517 .getReg(0));
6518 ++I;
6519 }
6520 }
6521 }
6522}
6523
6524/// Convert from separate vaddr components to a single vector address register,
6525/// and replace the remaining operands with $noreg.
6527 int DimIdx, int NumVAddrs) {
6528 const LLT S32 = LLT::scalar(32);
6529 (void)S32;
6530 SmallVector<Register, 8> AddrRegs;
6531 for (int I = 0; I != NumVAddrs; ++I) {
6532 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6533 if (SrcOp.isReg()) {
6534 AddrRegs.push_back(SrcOp.getReg());
6535 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6536 }
6537 }
6538
6539 int NumAddrRegs = AddrRegs.size();
6540 if (NumAddrRegs != 1) {
6541 auto VAddr =
6542 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
6543 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
6544 }
6545
6546 for (int I = 1; I != NumVAddrs; ++I) {
6547 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6548 if (SrcOp.isReg())
6549 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
6550 }
6551}
6552
6553/// Rewrite image intrinsics to use register layouts expected by the subtarget.
6554///
6555/// Depending on the subtarget, load/store with 16-bit element data need to be
6556/// rewritten to use the low half of 32-bit registers, or directly use a packed
6557/// layout. 16-bit addresses should also sometimes be packed into 32-bit
6558/// registers.
6559///
6560/// We don't want to directly select image instructions just yet, but also want
6561/// to expose all register repacking to the legalizer/combiners. We also don't
6562/// want a selected instruction entering RegBankSelect. In order to avoid
6563/// defining a multitude of intermediate image instructions, directly hack on
6564/// the intrinsic's arguments. In cases like a16 addresses, this requires
6565/// padding now unnecessary arguments with $noreg.
6568 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
6569
6570 const MachineFunction &MF = *MI.getMF();
6571 const unsigned NumDefs = MI.getNumExplicitDefs();
6572 const unsigned ArgOffset = NumDefs + 1;
6573 bool IsTFE = NumDefs == 2;
6574 // We are only processing the operands of d16 image operations on subtargets
6575 // that use the unpacked register layout, or need to repack the TFE result.
6576
6577 // TODO: Do we need to guard against already legalized intrinsics?
6578 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6580
6581 MachineRegisterInfo *MRI = B.getMRI();
6582 const LLT S32 = LLT::scalar(32);
6583 const LLT S16 = LLT::scalar(16);
6584 const LLT V2S16 = LLT::fixed_vector(2, 16);
6585
6586 unsigned DMask = 0;
6587 Register VData;
6588 LLT Ty;
6589
6590 if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
6591 VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
6592 Ty = MRI->getType(VData);
6593 }
6594
6595 const bool IsAtomicPacked16Bit =
6596 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6597 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6598
6599 // Check for 16 bit addresses and pack if true.
6600 LLT GradTy =
6601 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
6602 LLT AddrTy =
6603 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
6604 const bool IsG16 =
6605 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6606 const bool IsA16 = AddrTy == S16;
6607 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
6608
6609 int DMaskLanes = 0;
6610 if (!BaseOpcode->Atomic) {
6611 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
6612 if (BaseOpcode->Gather4) {
6613 DMaskLanes = 4;
6614 } else if (DMask != 0) {
6615 DMaskLanes = llvm::popcount(DMask);
6616 } else if (!IsTFE && !BaseOpcode->Store) {
6617 // If dmask is 0, this is a no-op load. This can be eliminated.
6618 B.buildUndef(MI.getOperand(0));
6619 MI.eraseFromParent();
6620 return true;
6621 }
6622 }
6623
6624 Observer.changingInstr(MI);
6625 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
6626
6627 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6628 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6629 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6630 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6631 unsigned NewOpcode = LoadOpcode;
6632 if (BaseOpcode->Store)
6633 NewOpcode = StoreOpcode;
6634 else if (BaseOpcode->NoReturn)
6635 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
6636
6637 // Track that we legalized this
6638 MI.setDesc(B.getTII().get(NewOpcode));
6639
6640 // Expecting to get an error flag since TFC is on and dmask is 0. Force dmask
6641 // to be at least 1, otherwise the instruction will fail.
6642 if (IsTFE && DMask == 0) {
6643 DMask = 0x1;
6644 DMaskLanes = 1;
6645 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
6646 }
6647
6648 if (BaseOpcode->Atomic) {
6649 Register VData0 = MI.getOperand(2).getReg();
6650 LLT Ty = MRI->getType(VData0);
6651
6652 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6653 if (Ty.isVector() && !IsAtomicPacked16Bit)
6654 return false;
6655
6656 if (BaseOpcode->AtomicX2) {
6657 Register VData1 = MI.getOperand(3).getReg();
6658 // The two values are packed in one register.
6659 LLT PackedTy = LLT::fixed_vector(2, Ty);
6660 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
6661 MI.getOperand(2).setReg(Concat.getReg(0));
6662 MI.getOperand(3).setReg(AMDGPU::NoRegister);
6663 }
6664 }
6665
6666 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6667
6668 // Rewrite the addressing register layout before doing anything else.
6669 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6670 // 16 bit gradients are supported, but are tied to the A16 control
6671 // so both gradients and addresses must be 16 bit
6672 return false;
6673 }
6674
6675 if (IsA16 && !ST.hasA16()) {
6676 // A16 not supported
6677 return false;
6678 }
6679
6680 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
6681 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
6682
6683 if (IsA16 || IsG16) {
6684 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6685 // instructions expect VGPR_32
6686 SmallVector<Register, 4> PackedRegs;
6687
6688 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
6689
6690 // See also below in the non-a16 branch
6691 const bool UseNSA = ST.hasNSAEncoding() &&
6692 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
6693 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
6694 const bool UsePartialNSA =
6695 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
6696
6697 if (UsePartialNSA) {
6698 // Pack registers that would go over NSAMaxSize into last VAddr register
6699 LLT PackedAddrTy =
6700 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
6701 auto Concat = B.buildConcatVectors(
6702 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
6703 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
6704 PackedRegs.resize(NSAMaxSize);
6705 } else if (!UseNSA && PackedRegs.size() > 1) {
6706 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
6707 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
6708 PackedRegs[0] = Concat.getReg(0);
6709 PackedRegs.resize(1);
6710 }
6711
6712 const unsigned NumPacked = PackedRegs.size();
6713 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6714 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6715 if (!SrcOp.isReg()) {
6716 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
6717 continue;
6718 }
6719
6720 assert(SrcOp.getReg() != AMDGPU::NoRegister);
6721
6722 if (I - Intr->VAddrStart < NumPacked)
6723 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
6724 else
6725 SrcOp.setReg(AMDGPU::NoRegister);
6726 }
6727 } else {
6728 // If the register allocator cannot place the address registers contiguously
6729 // without introducing moves, then using the non-sequential address encoding
6730 // is always preferable, since it saves VALU instructions and is usually a
6731 // wash in terms of code size or even better.
6732 //
6733 // However, we currently have no way of hinting to the register allocator
6734 // that MIMG addresses should be placed contiguously when it is possible to
6735 // do so, so force non-NSA for the common 2-address case as a heuristic.
6736 //
6737 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
6738 // allocation when possible.
6739 //
6740 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
6741 // set of the remaining addresses.
6742 const bool UseNSA = ST.hasNSAEncoding() &&
6743 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
6744 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
6745 const bool UsePartialNSA =
6746 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
6747
6748 if (UsePartialNSA) {
6750 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
6751 Intr->NumVAddrs - NSAMaxSize + 1);
6752 } else if (!UseNSA && Intr->NumVAddrs > 1) {
6753 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
6754 Intr->NumVAddrs);
6755 }
6756 }
6757
6758 int Flags = 0;
6759 if (IsA16)
6760 Flags |= 1;
6761 if (IsG16)
6762 Flags |= 2;
6763 MI.addOperand(MachineOperand::CreateImm(Flags));
6764
6765 if (BaseOpcode->NoReturn) { // No TFE for stores?
6766 // TODO: Handle dmask trim
6767 if (!Ty.isVector() || !IsD16)
6768 return true;
6769
6770 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
6771 if (RepackedReg != VData) {
6772 MI.getOperand(1).setReg(RepackedReg);
6773 }
6774
6775 return true;
6776 }
6777
6778 Register DstReg = MI.getOperand(0).getReg();
6779 const LLT EltTy = Ty.getScalarType();
6780 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
6781
6782 // Confirm that the return type is large enough for the dmask specified
6783 if (NumElts < DMaskLanes)
6784 return false;
6785
6786 if (NumElts > 4 || DMaskLanes > 4)
6787 return false;
6788
6789 // Image atomic instructions use DMask to specify how many bits the
6790 // input/output data will have: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
6791 // DMaskLanes for image atomic has default value '0'.
6792 // We must be sure that atomic variants (especially packed) will not be
6793 // truncated from v2s16 or v4s16 to s16 type.
6794 //
6795 // ChangeElementCount will be needed for image load where Ty is always scalar.
6796 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6797 const LLT AdjustedTy =
6798 DMaskLanes == 0
6799 ? Ty
6800 : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
6801
6802 // The raw dword aligned data component of the load. The only legal cases
6803 // where this matters should be when using the packed D16 format, for
6804 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
6805 LLT RoundedTy;
6806
6807 // S32 vector to cover all data, plus TFE result element.
6808 LLT TFETy;
6809
6810 // Register type to use for each loaded component. Will be S32 or V2S16.
6811 LLT RegTy;
6812
6813 if (IsD16 && ST.hasUnpackedD16VMem()) {
6814 RoundedTy =
6815 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
6816 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
6817 RegTy = S32;
6818 } else {
6819 unsigned EltSize = EltTy.getSizeInBits();
6820 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
6821 unsigned RoundedSize = 32 * RoundedElts;
6822 RoundedTy = LLT::scalarOrVector(
6823 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
6824 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
6825 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
6826 }
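   // Worked example (packed D16, no TFE): for Ty = <3 x s16>, EltSize = 16 and
   // RoundedElts = (48 + 31) / 32 = 2, so RoundedTy = <4 x s16>,
   // TFETy = <3 x s32> and RegTy = <2 x s16>.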
6827
6828 // The return type does not need adjustment.
6829 // TODO: Should we change s16 case to s32 or <2 x s16>?
6830 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
6831 return true;
6832
6833 Register Dst1Reg;
6834
6835 // Insert after the instruction.
6836 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
6837
6838 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
6839 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
6840 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
6841 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
6842
6843 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
6844
6845 MI.getOperand(0).setReg(NewResultReg);
6846
6847   // In the IR, TFE is supposed to be used with a 2-element struct return
6848 // type. The instruction really returns these two values in one contiguous
6849 // register, with one additional dword beyond the loaded data. Rewrite the
6850 // return type to use a single register result.
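   // For example, an s32 TFE load is rewritten to produce a <2 x s32> register,
   // which is then unmerged into the loaded dword and the status dword.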
6851
6852 if (IsTFE) {
6853 Dst1Reg = MI.getOperand(1).getReg();
6854 if (MRI->getType(Dst1Reg) != S32)
6855 return false;
6856
6857 // TODO: Make sure the TFE operand bit is set.
6858 MI.removeOperand(1);
6859
6860 // Handle the easy case that requires no repack instructions.
6861 if (Ty == S32) {
6862 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
6863 return true;
6864 }
6865 }
6866
6867 // Now figure out how to copy the new result register back into the old
6868 // result.
6869 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
6870
6871 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
6872
6873 if (ResultNumRegs == 1) {
6874 assert(!IsTFE);
6875 ResultRegs[0] = NewResultReg;
6876 } else {
6877 // We have to repack into a new vector of some kind.
6878 for (int I = 0; I != NumDataRegs; ++I)
6879 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
6880 B.buildUnmerge(ResultRegs, NewResultReg);
6881
6882 // Drop the final TFE element to get the data part. The TFE result is
6883 // directly written to the right place already.
6884 if (IsTFE)
6885 ResultRegs.resize(NumDataRegs);
6886 }
6887
6888 // For an s16 scalar result, we form an s32 result with a truncate regardless
6889 // of packed vs. unpacked.
6890 if (IsD16 && !Ty.isVector()) {
6891 B.buildTrunc(DstReg, ResultRegs[0]);
6892 return true;
6893 }
6894
6895 // Avoid a build/concat_vector of 1 entry.
6896 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
6897 B.buildBitcast(DstReg, ResultRegs[0]);
6898 return true;
6899 }
6900
6901 assert(Ty.isVector());
6902
6903 if (IsD16) {
6904 // For packed D16 results with TFE enabled, all the data components are
6905 // S32. Cast back to the expected type.
6906 //
6907     // TODO: We don't really need to load s32 elements. We would only need one
6908 // cast for the TFE result if a multiple of v2s16 was used.
6909 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
6910 for (Register &Reg : ResultRegs)
6911 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
6912 } else if (ST.hasUnpackedD16VMem()) {
6913 for (Register &Reg : ResultRegs)
6914 Reg = B.buildTrunc(S16, Reg).getReg(0);
6915 }
6916 }
6917
6918 auto padWithUndef = [&](LLT Ty, int NumElts) {
6919 if (NumElts == 0)
6920 return;
6921 Register Undef = B.buildUndef(Ty).getReg(0);
6922 for (int I = 0; I != NumElts; ++I)
6923 ResultRegs.push_back(Undef);
6924 };
6925
6926 // Pad out any elements eliminated due to the dmask.
6927 LLT ResTy = MRI->getType(ResultRegs[0]);
6928 if (!ResTy.isVector()) {
6929 padWithUndef(ResTy, NumElts - ResultRegs.size());
6930 B.buildBuildVector(DstReg, ResultRegs);
6931 return true;
6932 }
6933
6934 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
6935 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
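   // e.g. for Ty = <3 x s16>, RegsToCover = (48 + 31) / 32 = 2.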
6936
6937 // Deal with the one annoying legal case.
6938 const LLT V3S16 = LLT::fixed_vector(3, 16);
6939 if (Ty == V3S16) {
6940 if (IsTFE) {
6941 if (ResultRegs.size() == 1) {
6942 NewResultReg = ResultRegs[0];
6943 } else if (ResultRegs.size() == 2) {
6944 LLT V4S16 = LLT::fixed_vector(4, 16);
6945 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
6946 } else {
6947 return false;
6948 }
6949 }
6950
6951 if (MRI->getType(DstReg).getNumElements() <
6952 MRI->getType(NewResultReg).getNumElements()) {
6953 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
6954 } else {
6955 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
6956 }
6957 return true;
6958 }
6959
6960 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
6961 B.buildConcatVectors(DstReg, ResultRegs);
6962 return true;
6963}
6964
6965 bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
6966                                               MachineInstr &MI) const {
6967 MachineIRBuilder &B = Helper.MIRBuilder;
6968 GISelChangeObserver &Observer = Helper.Observer;
6969
6970 Register OrigDst = MI.getOperand(0).getReg();
6971 Register Dst;
6972 LLT Ty = B.getMRI()->getType(OrigDst);
6973 unsigned Size = Ty.getSizeInBits();
6974 MachineFunction &MF = B.getMF();
6975 unsigned Opc = 0;
6976 if (Size < 32 && ST.hasScalarSubwordLoads()) {
6977 assert(Size == 8 || Size == 16);
6978 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
6979 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
6980     // The 8-bit and 16-bit scalar buffer load instructions have a 32-bit
6981     // destination register.
6982 Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
6983 } else {
6984 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
6985 Dst = OrigDst;
6986 }
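   // For sub-dword loads the result is produced in a 32-bit temporary register
   // and truncated back to the original type after the memory operand is
   // attached below.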
6987
6988 Observer.changingInstr(MI);
6989
6990 // Handle needing to s.buffer.load() a p8 value.
6991 if (hasBufferRsrcWorkaround(Ty)) {
6992 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
6993 B.setInsertPt(B.getMBB(), MI);
6994 }
6996 Ty = getBitcastRegisterType(Ty);
6997 Helper.bitcastDst(MI, Ty, 0);
6998 B.setInsertPt(B.getMBB(), MI);
6999 }
7000
7001 // FIXME: We don't really need this intermediate instruction. The intrinsic
7002 // should be fixed to have a memory operand. Since it's readnone, we're not
7003 // allowed to add one.
7004 MI.setDesc(B.getTII().get(Opc));
7005 MI.removeOperand(1); // Remove intrinsic ID
7006
7007 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
7008 const unsigned MemSize = (Size + 7) / 8;
7009 const Align MemAlign = B.getDataLayout().getABITypeAlign(
7015 MemSize, MemAlign);
7016 MI.addMemOperand(MF, MMO);
7017 if (Dst != OrigDst) {
7018 MI.getOperand(0).setReg(Dst);
7019 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
7020 B.buildTrunc(OrigDst, Dst);
7021 }
7022
7023 // If we don't have 96-bit result scalar loads, widening to 128-bit should
7024 // always be legal. We may need to restore this to a 96-bit result if it turns
7025 // out this needs to be converted to a vector load during RegBankSelect.
7026 if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
7027 if (Ty.isVector())
7028       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
7029     else
7030 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
7031 }
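   // For example, a 96-bit result on a subtarget without scalar dwordx3 loads
   // is widened to 128 bits here.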
7032
7033 Observer.changedInstr(MI);
7034 return true;
7035}
7036
7037 bool AMDGPULegalizerInfo::legalizeSBufferPrefetch(LegalizerHelper &Helper,
7038                                                   MachineInstr &MI) const {
7039 MachineIRBuilder &B = Helper.MIRBuilder;
7040 GISelChangeObserver &Observer = Helper.Observer;
7041 Observer.changingInstr(MI);
7042 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7043 MI.removeOperand(0); // Remove intrinsic ID
7045 Observer.changedInstr(MI);
7046 return true;
7047}
7048
7049// TODO: Move to selection
7052 MachineIRBuilder &B) const {
7053 if (!ST.isTrapHandlerEnabled() ||
7055 return legalizeTrapEndpgm(MI, MRI, B);
7056
7057 return ST.supportsGetDoorbellID() ?
7059}
7060
7063 const DebugLoc &DL = MI.getDebugLoc();
7064 MachineBasicBlock &BB = B.getMBB();
7065 MachineFunction *MF = BB.getParent();
7066
7067 if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
7068 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7069 .addImm(0);
7070 MI.eraseFromParent();
7071 return true;
7072 }
7073
7074 // We need a block split to make the real endpgm a terminator. We also don't
7075 // want to break phis in successor blocks, so we can't just delete to the
7076 // end of the block.
7077 BB.splitAt(MI, false /*UpdateLiveIns*/);
7079 MF->push_back(TrapBB);
7080 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7081 .addImm(0);
7082 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
7083 .addMBB(TrapBB);
7084
7085 BB.addSuccessor(TrapBB);
7086 MI.eraseFromParent();
7087 return true;
7088}
7089
7092 MachineFunction &MF = B.getMF();
7093 const LLT S64 = LLT::scalar(64);
7094
7095 Register SGPR01(AMDGPU::SGPR0_SGPR1);
7096 // For code object version 5, queue_ptr is passed through implicit kernarg.
7102 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
7103
7104 Register KernargPtrReg = MRI.createGenericVirtualRegister(
7106
7107 if (!loadInputValue(KernargPtrReg, B,
7109 return false;
7110
7111 // TODO: can we be smarter about machine pointer info?
7114 PtrInfo,
7118
7119 // Pointer address
7120 Register LoadAddr = MRI.createGenericVirtualRegister(
7122 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
7123 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
7124 // Load address
7125 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
7126 B.buildCopy(SGPR01, Temp);
7127 B.buildInstr(AMDGPU::S_TRAP)
7128 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7129 .addReg(SGPR01, RegState::Implicit);
7130 MI.eraseFromParent();
7131 return true;
7132 }
7133
7134 // Pass queue pointer to trap handler as input, and insert trap instruction
7135 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
7136 Register LiveIn =
7137 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
7139 return false;
7140
7141 B.buildCopy(SGPR01, LiveIn);
7142 B.buildInstr(AMDGPU::S_TRAP)
7143 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7144 .addReg(SGPR01, RegState::Implicit);
7145
7146 MI.eraseFromParent();
7147 return true;
7148}
7149
7152 MachineIRBuilder &B) const {
7153 // We need to simulate the 's_trap 2' instruction on targets that run in
7154 // PRIV=1 (where it is treated as a nop).
7155 if (ST.hasPrivEnabledTrap2NopBug()) {
7156 ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
7157 MI.getDebugLoc());
7158 MI.eraseFromParent();
7159 return true;
7160 }
7161
7162 B.buildInstr(AMDGPU::S_TRAP)
7163 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
7164 MI.eraseFromParent();
7165 return true;
7166}
7167
7170 MachineIRBuilder &B) const {
7171   // If this is a non-HSA path or the trap handler is disabled, report a
7172   // warning accordingly.
7173 if (!ST.isTrapHandlerEnabled() ||
7175 Function &Fn = B.getMF().getFunction();
7177 Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));
7178 } else {
7179 // Insert debug-trap instruction
7180 B.buildInstr(AMDGPU::S_TRAP)
7181 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
7182 }
7183
7184 MI.eraseFromParent();
7185 return true;
7186}
7187
7188 bool AMDGPULegalizerInfo::legalizeBVHIntersectRayIntrinsic(
7189     MachineInstr &MI, MachineIRBuilder &B) const {
7190 MachineRegisterInfo &MRI = *B.getMRI();
7191 const LLT S16 = LLT::scalar(16);
7192 const LLT S32 = LLT::scalar(32);
7193 const LLT V2S16 = LLT::fixed_vector(2, 16);
7194 const LLT V3S32 = LLT::fixed_vector(3, 32);
7195
7196 Register DstReg = MI.getOperand(0).getReg();
7197 Register NodePtr = MI.getOperand(2).getReg();
7198 Register RayExtent = MI.getOperand(3).getReg();
7199 Register RayOrigin = MI.getOperand(4).getReg();
7200 Register RayDir = MI.getOperand(5).getReg();
7201 Register RayInvDir = MI.getOperand(6).getReg();
7202 Register TDescr = MI.getOperand(7).getReg();
7203
7204 if (!ST.hasGFX10_AEncoding()) {
7205 Function &Fn = B.getMF().getFunction();
7207 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7208 return false;
7209 }
7210
7211 const bool IsGFX11 = AMDGPU::isGFX11(ST);
7212 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
7213 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
7214 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
7215 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
7216 const unsigned NumVDataDwords = 4;
7217 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
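   // Address dword count breakdown: node_ptr (1 or 2) + ray_extent (1) +
   // ray_origin (3) + ray_dir (3) + ray_inv_dir (3), where dir and inv_dir
   // share 3 dwords when they are 16-bit (A16).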
7218 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7219 const bool UseNSA =
7220 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7221
7222 const unsigned BaseOpcodes[2][2] = {
7223 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7224 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7225 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7226 int Opcode;
7227 if (UseNSA) {
7228 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7229 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7230 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7231 : AMDGPU::MIMGEncGfx10NSA,
7232 NumVDataDwords, NumVAddrDwords);
7233 } else {
7234 assert(!IsGFX12Plus);
7235 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7236 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7237 : AMDGPU::MIMGEncGfx10Default,
7238 NumVDataDwords, NumVAddrDwords);
7239 }
7240 assert(Opcode != -1);
7241
7243 if (UseNSA && IsGFX11Plus) {
7244 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
7245 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7246 auto Merged = B.buildMergeLikeInstr(
7247 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7248 Ops.push_back(Merged.getReg(0));
7249 };
7250
7251 Ops.push_back(NodePtr);
7252 Ops.push_back(RayExtent);
7253 packLanes(RayOrigin);
7254
7255 if (IsA16) {
7256 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7257 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7258 auto MergedDir = B.buildMergeLikeInstr(
7259 V3S32,
7260 {B.buildBitcast(
7261 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
7262 UnmergeRayDir.getReg(0)}))
7263 .getReg(0),
7264 B.buildBitcast(
7265 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
7266 UnmergeRayDir.getReg(1)}))
7267 .getReg(0),
7268 B.buildBitcast(
7269 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
7270 UnmergeRayDir.getReg(2)}))
7271 .getReg(0)});
7272 Ops.push_back(MergedDir.getReg(0));
7273 } else {
7274 packLanes(RayDir);
7275 packLanes(RayInvDir);
7276 }
7277 } else {
7278 if (Is64) {
7279 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
7280 Ops.push_back(Unmerge.getReg(0));
7281 Ops.push_back(Unmerge.getReg(1));
7282 } else {
7283 Ops.push_back(NodePtr);
7284 }
7285 Ops.push_back(RayExtent);
7286
7287 auto packLanes = [&Ops, &S32, &B](Register Src) {
7288 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7289 Ops.push_back(Unmerge.getReg(0));
7290 Ops.push_back(Unmerge.getReg(1));
7291 Ops.push_back(Unmerge.getReg(2));
7292 };
7293
7294 packLanes(RayOrigin);
7295 if (IsA16) {
7296 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7297 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7298 Register R1 = MRI.createGenericVirtualRegister(S32);
7299 Register R2 = MRI.createGenericVirtualRegister(S32);
7300 Register R3 = MRI.createGenericVirtualRegister(S32);
7301 B.buildMergeLikeInstr(R1,
7302 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7303 B.buildMergeLikeInstr(
7304 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7305 B.buildMergeLikeInstr(
7306 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7307 Ops.push_back(R1);
7308 Ops.push_back(R2);
7309 Ops.push_back(R3);
7310 } else {
7311 packLanes(RayDir);
7312 packLanes(RayInvDir);
7313 }
7314 }
7315
7316 if (!UseNSA) {
7317     // Build a single vector containing all the operands prepared so far.
7318 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
7319 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
7320 Ops.clear();
7321 Ops.push_back(MergedOps);
7322 }
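   // e.g. in the non-NSA case with a 32-bit node pointer and full-precision
   // (non-A16) ray data, the 11 address dwords are merged above into a single
   // <11 x s32> operand.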
7323
7324 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7325 .addDef(DstReg)
7326 .addImm(Opcode);
7327
7328 for (Register R : Ops) {
7329 MIB.addUse(R);
7330 }
7331
7332 MIB.addUse(TDescr)
7333 .addImm(IsA16 ? 1 : 0)
7334 .cloneMemRefs(MI);
7335
7336 MI.eraseFromParent();
7337 return true;
7338}
7339
7340 bool AMDGPULegalizerInfo::legalizeBVHDualOrBVH8IntersectRayIntrinsic(
7341     MachineInstr &MI, MachineIRBuilder &B) const {
7342 const LLT S32 = LLT::scalar(32);
7343 const LLT V2S32 = LLT::fixed_vector(2, 32);
7344
7345 Register DstReg = MI.getOperand(0).getReg();
7346 Register DstOrigin = MI.getOperand(1).getReg();
7347 Register DstDir = MI.getOperand(2).getReg();
7348 Register NodePtr = MI.getOperand(4).getReg();
7349 Register RayExtent = MI.getOperand(5).getReg();
7350 Register InstanceMask = MI.getOperand(6).getReg();
7351 Register RayOrigin = MI.getOperand(7).getReg();
7352 Register RayDir = MI.getOperand(8).getReg();
7353 Register Offsets = MI.getOperand(9).getReg();
7354 Register TDescr = MI.getOperand(10).getReg();
7355
7356 if (!ST.hasBVHDualAndBVH8Insts()) {
7357 Function &Fn = B.getMF().getFunction();
7359 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7360 return false;
7361 }
7362
7363 bool IsBVH8 = cast<GIntrinsic>(MI).getIntrinsicID() ==
7364 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7365 const unsigned NumVDataDwords = 10;
7366 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7367 int Opcode = AMDGPU::getMIMGOpcode(
7368 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7369 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7370 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
7371 assert(Opcode != -1);
7372
7373 auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
7374 V2S32, {RayExtent, B.buildAnyExt(S32, InstanceMask)});
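   // The 32-bit ray extent and the any-extended instance mask are packed into
   // a single <2 x s32> operand.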
7375
7376 B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7377 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7378 .addDef(DstReg)
7379 .addDef(DstOrigin)
7380 .addDef(DstDir)
7381 .addImm(Opcode)
7382 .addUse(NodePtr)
7383 .addUse(RayExtentInstanceMaskVec.getReg(0))
7384 .addUse(RayOrigin)
7385 .addUse(RayDir)
7386 .addUse(Offsets)
7387 .addUse(TDescr)
7388 .cloneMemRefs(MI);
7389
7390 MI.eraseFromParent();
7391 return true;
7392}
7393
7394 bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
7395                                             MachineIRBuilder &B) const {
7396 const SITargetLowering *TLI = ST.getTargetLowering();
7398 Register DstReg = MI.getOperand(0).getReg();
7399 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7400 MI.eraseFromParent();
7401 return true;
7402}
7403
7404 bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
7405                                          MachineIRBuilder &B) const {
7406 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
7407 if (!ST.hasArchitectedSGPRs())
7408 return false;
7409 LLT S32 = LLT::scalar(32);
7410 Register DstReg = MI.getOperand(0).getReg();
7411 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
7412 auto LSB = B.buildConstant(S32, 25);
7413 auto Width = B.buildConstant(S32, 5);
7414 B.buildUbfx(DstReg, TTMP8, LSB, Width);
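   // i.e. DstReg = (TTMP8 >> 25) & 0x1f, the 5-bit wave ID field.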
7415 MI.eraseFromParent();
7416 return true;
7417}
7418
7419static constexpr unsigned FPEnvModeBitField =
7421
7422static constexpr unsigned FPEnvTrapBitField =
7424
7427 MachineIRBuilder &B) const {
7428 Register Src = MI.getOperand(0).getReg();
7429 if (MRI.getType(Src) != S64)
7430 return false;
7431
7432 auto ModeReg =
7433 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7434 /*HasSideEffects=*/true, /*isConvergent=*/false)
7435 .addImm(FPEnvModeBitField);
7436 auto TrapReg =
7437 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7438 /*HasSideEffects=*/true, /*isConvergent=*/false)
7439 .addImm(FPEnvTrapBitField);
7440 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
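   // The mode register bits form the low 32 bits and the trap bits the high
   // 32 bits of the s64 FP environment value.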
7441 MI.eraseFromParent();
7442 return true;
7443}
7444
7447 MachineIRBuilder &B) const {
7448 Register Src = MI.getOperand(0).getReg();
7449 if (MRI.getType(Src) != S64)
7450 return false;
7451
7452 auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
7453 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7454 /*HasSideEffects=*/true, /*isConvergent=*/false)
7455 .addImm(static_cast<int16_t>(FPEnvModeBitField))
7456 .addReg(Unmerge.getReg(0));
7457 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7458 /*HasSideEffects=*/true, /*isConvergent=*/false)
7459 .addImm(static_cast<int16_t>(FPEnvTrapBitField))
7460 .addReg(Unmerge.getReg(1));
7461 MI.eraseFromParent();
7462 return true;
7463}
7464
7465 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
7466                                             MachineInstr &MI) const {
7467 MachineIRBuilder &B = Helper.MIRBuilder;
7468 MachineRegisterInfo &MRI = *B.getMRI();
7469
7470   // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
7471 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
7472 switch (IntrID) {
7473 case Intrinsic::amdgcn_if:
7474 case Intrinsic::amdgcn_else: {
7475 MachineInstr *Br = nullptr;
7476 MachineBasicBlock *UncondBrTarget = nullptr;
7477 bool Negated = false;
7478 if (MachineInstr *BrCond =
7479 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7480 const SIRegisterInfo *TRI
7481 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7482
7483 Register Def = MI.getOperand(1).getReg();
7484 Register Use = MI.getOperand(3).getReg();
7485
7486 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7487
7488 if (Negated)
7489 std::swap(CondBrTarget, UncondBrTarget);
7490
7491 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7492 if (IntrID == Intrinsic::amdgcn_if) {
7493 B.buildInstr(AMDGPU::SI_IF)
7494 .addDef(Def)
7495 .addUse(Use)
7496 .addMBB(UncondBrTarget);
7497 } else {
7498 B.buildInstr(AMDGPU::SI_ELSE)
7499 .addDef(Def)
7500 .addUse(Use)
7501 .addMBB(UncondBrTarget);
7502 }
7503
7504 if (Br) {
7505 Br->getOperand(0).setMBB(CondBrTarget);
7506 } else {
7507 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
7508 // since we're swapping branch targets it needs to be reinserted.
7509 // FIXME: IRTranslator should probably not do this
7510 B.buildBr(*CondBrTarget);
7511 }
7512
7513 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
7514 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
7515 MI.eraseFromParent();
7516 BrCond->eraseFromParent();
7517 return true;
7518 }
7519
7520 return false;
7521 }
7522 case Intrinsic::amdgcn_loop: {
7523 MachineInstr *Br = nullptr;
7524 MachineBasicBlock *UncondBrTarget = nullptr;
7525 bool Negated = false;
7526 if (MachineInstr *BrCond =
7527 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7528 const SIRegisterInfo *TRI
7529 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7530
7531 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7532 Register Reg = MI.getOperand(2).getReg();
7533
7534 if (Negated)
7535 std::swap(CondBrTarget, UncondBrTarget);
7536
7537 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7538 B.buildInstr(AMDGPU::SI_LOOP)
7539 .addUse(Reg)
7540 .addMBB(UncondBrTarget);
7541
7542 if (Br)
7543 Br->getOperand(0).setMBB(CondBrTarget);
7544 else
7545 B.buildBr(*CondBrTarget);
7546
7547 MI.eraseFromParent();
7548 BrCond->eraseFromParent();
7549 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
7550 return true;
7551 }
7552
7553 return false;
7554 }
7555 case Intrinsic::amdgcn_addrspacecast_nonnull:
7556 return legalizeAddrSpaceCast(MI, MRI, B);
7557 case Intrinsic::amdgcn_make_buffer_rsrc:
7559 case Intrinsic::amdgcn_kernarg_segment_ptr:
7560 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
7561 // This only makes sense to call in a kernel, so just lower to null.
7562 B.buildConstant(MI.getOperand(0).getReg(), 0);
7563 MI.eraseFromParent();
7564 return true;
7565 }
7566
7569 case Intrinsic::amdgcn_implicitarg_ptr:
7570 return legalizeImplicitArgPtr(MI, MRI, B);
7571 case Intrinsic::amdgcn_workitem_id_x:
7572 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
7574 case Intrinsic::amdgcn_workitem_id_y:
7575 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
7577 case Intrinsic::amdgcn_workitem_id_z:
7578 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
7580 case Intrinsic::amdgcn_workgroup_id_x:
7583 case Intrinsic::amdgcn_workgroup_id_y:
7586 case Intrinsic::amdgcn_workgroup_id_z:
7589 case Intrinsic::amdgcn_wave_id:
7590 return legalizeWaveID(MI, B);
7591 case Intrinsic::amdgcn_lds_kernel_id:
7594 case Intrinsic::amdgcn_dispatch_ptr:
7597 case Intrinsic::amdgcn_queue_ptr:
7600 case Intrinsic::amdgcn_implicit_buffer_ptr:
7603 case Intrinsic::amdgcn_dispatch_id:
7606 case Intrinsic::r600_read_ngroups_x:
7607 // TODO: Emit error for hsa
7610 case Intrinsic::r600_read_ngroups_y:
7613 case Intrinsic::r600_read_ngroups_z:
7616 case Intrinsic::r600_read_local_size_x:
7617 // TODO: Could insert G_ASSERT_ZEXT from s16
7619 case Intrinsic::r600_read_local_size_y:
7620 // TODO: Could insert G_ASSERT_ZEXT from s16
7622 // TODO: Could insert G_ASSERT_ZEXT from s16
7623 case Intrinsic::r600_read_local_size_z:
7626 case Intrinsic::amdgcn_fdiv_fast:
7627 return legalizeFDIVFastIntrin(MI, MRI, B);
7628 case Intrinsic::amdgcn_is_shared:
7630 case Intrinsic::amdgcn_is_private:
7632 case Intrinsic::amdgcn_wavefrontsize: {
7633 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
7634 MI.eraseFromParent();
7635 return true;
7636 }
7637 case Intrinsic::amdgcn_s_buffer_load:
7638 return legalizeSBufferLoad(Helper, MI);
7639 case Intrinsic::amdgcn_raw_buffer_store:
7640 case Intrinsic::amdgcn_raw_ptr_buffer_store:
7641 case Intrinsic::amdgcn_struct_buffer_store:
7642 case Intrinsic::amdgcn_struct_ptr_buffer_store:
7643 return legalizeBufferStore(MI, Helper, false, false);
7644 case Intrinsic::amdgcn_raw_buffer_store_format:
7645 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
7646 case Intrinsic::amdgcn_struct_buffer_store_format:
7647 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
7648 return legalizeBufferStore(MI, Helper, false, true);
7649 case Intrinsic::amdgcn_raw_tbuffer_store:
7650 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
7651 case Intrinsic::amdgcn_struct_tbuffer_store:
7652 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
7653 return legalizeBufferStore(MI, Helper, true, true);
7654 case Intrinsic::amdgcn_raw_buffer_load:
7655 case Intrinsic::amdgcn_raw_ptr_buffer_load:
7656 case Intrinsic::amdgcn_raw_atomic_buffer_load:
7657 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
7658 case Intrinsic::amdgcn_struct_buffer_load:
7659 case Intrinsic::amdgcn_struct_ptr_buffer_load:
7660 case Intrinsic::amdgcn_struct_atomic_buffer_load:
7661 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
7662 return legalizeBufferLoad(MI, Helper, false, false);
7663 case Intrinsic::amdgcn_raw_buffer_load_format:
7664 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
7665 case Intrinsic::amdgcn_struct_buffer_load_format:
7666 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
7667 return legalizeBufferLoad(MI, Helper, true, false);
7668 case Intrinsic::amdgcn_raw_tbuffer_load:
7669 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
7670 case Intrinsic::amdgcn_struct_tbuffer_load:
7671 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
7672 return legalizeBufferLoad(MI, Helper, true, true);
7673 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
7674 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
7675 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
7676 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
7677 case Intrinsic::amdgcn_raw_buffer_atomic_add:
7678 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
7679 case Intrinsic::amdgcn_struct_buffer_atomic_add:
7680 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
7681 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
7682 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
7683 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
7684 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
7685 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
7686 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
7687 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
7688 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
7689 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
7690 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
7691 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
7692 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
7693 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
7694 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
7695 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
7696 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
7697 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
7698 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
7699 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
7700 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
7701 case Intrinsic::amdgcn_raw_buffer_atomic_and:
7702 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
7703 case Intrinsic::amdgcn_struct_buffer_atomic_and:
7704 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
7705 case Intrinsic::amdgcn_raw_buffer_atomic_or:
7706 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
7707 case Intrinsic::amdgcn_struct_buffer_atomic_or:
7708 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
7709 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
7710 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
7711 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
7712 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
7713 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
7714 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
7715 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
7716 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
7717 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
7718 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
7719 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
7720 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
7721 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
7722 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
7723 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
7724 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
7725 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
7726 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
7727 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
7728 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
7729 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
7730 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
7731 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
7732 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
7733 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
7734 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
7735 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
7736 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
7737 return legalizeBufferAtomic(MI, B, IntrID);
7738 case Intrinsic::amdgcn_rsq_clamp:
7740 case Intrinsic::amdgcn_image_bvh_intersect_ray:
7742 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
7743 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
7745 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
7746 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
7747 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
7748 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
7749 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
7750 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
7751 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
7752 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
7753 Register Index = MI.getOperand(5).getReg();
7754 LLT S64 = LLT::scalar(64);
7755 if (MRI.getType(Index) != S64)
7756 MI.getOperand(5).setReg(B.buildAnyExt(S64, Index).getReg(0));
7757 return true;
7758 }
7759 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
7760 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
7761 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
7762 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
7763 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
7764 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
7765 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
7766 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
7767 Register Index = MI.getOperand(5).getReg();
7768 LLT S32 = LLT::scalar(32);
7769 if (MRI.getType(Index) != S32)
7770 MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
7771 return true;
7772 }
7773 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
7774 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
7775 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
7776 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
7777 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
7778 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
7779 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
7780 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
7781 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
7782 Register Index = MI.getOperand(7).getReg();
7783 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
7784 ? LLT::scalar(64)
7785 : LLT::scalar(32);
7786 if (MRI.getType(Index) != IdxTy)
7787 MI.getOperand(7).setReg(B.buildAnyExt(IdxTy, Index).getReg(0));
7788 return true;
7789 }
7790
7791 case Intrinsic::amdgcn_fmed3: {
7792 GISelChangeObserver &Observer = Helper.Observer;
7793
7794     // FIXME: This is to work around the inability of tablegen match combiners
7795     // to match intrinsics in patterns.
7796 Observer.changingInstr(MI);
7797 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
7798 MI.removeOperand(1);
7799 Observer.changedInstr(MI);
7800 return true;
7801 }
7802 case Intrinsic::amdgcn_readlane:
7803 case Intrinsic::amdgcn_writelane:
7804 case Intrinsic::amdgcn_readfirstlane:
7805 case Intrinsic::amdgcn_permlane16:
7806 case Intrinsic::amdgcn_permlanex16:
7807 case Intrinsic::amdgcn_permlane64:
7808 case Intrinsic::amdgcn_set_inactive:
7809 case Intrinsic::amdgcn_set_inactive_chain_arg:
7810 case Intrinsic::amdgcn_mov_dpp8:
7811 case Intrinsic::amdgcn_update_dpp:
7812 return legalizeLaneOp(Helper, MI, IntrID);
7813 case Intrinsic::amdgcn_s_buffer_prefetch_data:
7814 return legalizeSBufferPrefetch(Helper, MI);
7815 case Intrinsic::amdgcn_dead: {
7816 // TODO: Use poison instead of undef
7817 for (const MachineOperand &Def : MI.defs())
7818 B.buildUndef(Def);
7819 MI.eraseFromParent();
7820 return true;
7821 }
7822 default: {
7823 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
7825 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
7826 return true;
7827 }
7828 }
7829
7830 return true;
7831}
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
unsigned Intr
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
static LLT getBufferRsrcScalarType(const LLT Ty)
static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST, unsigned TypeIdx)
static cl::opt< bool > EnableNewLegality("amdgpu-global-isel-new-legality", cl::desc("Use GlobalISel desired legality, rather than try to use" "rules compatible with selection patterns"), cl::init(false), cl::ReallyHidden)
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, unsigned Flags)
constexpr std::initializer_list< LLT > AllVectors
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
static LegalityPredicate isSmallOddVector(unsigned TypeIdx)
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
constexpr LLT F64
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size)
constexpr LLT V2S8
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags)
constexpr LLT V4S128
constexpr LLT S16
constexpr LLT S1
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const LLT MemTy)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
constexpr LLT S1024
static constexpr unsigned FPEnvModeBitField
constexpr LLT V7S64
static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx)
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
static bool isRegisterVectorElementType(LLT EltTy)
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
static LegalityPredicate isWideVec16(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllScalarTypes
static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx)
constexpr LLT V2S16
constexpr LLT V8S16
constexpr LLT V9S32
constexpr std::initializer_list< LLT > AllS32Vectors
constexpr LLT S224
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx)
constexpr LLT S512
constexpr LLT MaxScalar
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B)
Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is the form in which the valu...
constexpr LLT V11S32
static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty)
constexpr LLT V6S64
constexpr LLT V2S64
static std::pair< Register, Register > emitReciprocalU64(MachineIRBuilder &B, Register Val)
static LLT getBitcastRegisterType(const LLT Ty)
static LLT getBufferRsrcRegisterType(const LLT Ty)
constexpr LLT S32
constexpr LLT V2F16
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
constexpr LLT V8S32
constexpr LLT V2BF16
constexpr LLT S192
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, MachineRegisterInfo &MRI, unsigned Idx)
Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial type of the operand idx an...
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C)
static constexpr unsigned SPDenormModeBitField
constexpr LLT F32
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, bool IsLoad, bool IsAtomic)
constexpr LLT V6S32
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
constexpr LLT S160
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
constexpr LLT V4S16
constexpr LLT V2S128
constexpr LLT V10S16
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
constexpr LLT V4S32
constexpr LLT V3S32
constexpr LLT V6S16
constexpr std::initializer_list< LLT > AllS64Vectors
constexpr LLT S256
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
constexpr LLT V4S64
static constexpr unsigned FPEnvTrapBitField
constexpr LLT V10S32
constexpr LLT V16S32
static constexpr unsigned MaxRegisterSize
constexpr LLT V7S32
constexpr LLT S96
constexpr LLT V12S16
constexpr LLT V16S64
static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size)
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static bool hasBufferRsrcWorkaround(const LLT Ty)
constexpr LLT V32S32
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
constexpr LLT S64
constexpr std::initializer_list< LLT > AllS16Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
constexpr LLT V16S16
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
constexpr LLT V5S32
constexpr LLT V5S64
constexpr LLT V3S64
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
constexpr LLT V8S64
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
constexpr LLT V2S32
static bool isRegisterVectorType(LLT Ty)
constexpr LLT V12S32
constexpr LLT S128
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
constexpr LLT S8
static bool isRegisterType(const GCNSubtarget &ST, LLT Ty)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static Error unsupported(const char *Str, const Triple &T)
Definition: MachO.cpp:71
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
IRTranslator LLVM IR MI
static LVOptions Options
Definition: LVOptions.cpp:25
Interface for Targets to specify which operations they can successfully select and how the others sho...
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
#define R2(n)
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
ppc ctr loops verify
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition: SHA256.cpp:34
#define FP_DENORM_FLUSH_NONE
Definition: SIDefines.h:1234
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static constexpr int Concat[]
Value * RHS
Value * LHS
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeBVHIntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferStore(MachineInstr &MI, LegalizerHelper &Helper, bool IsTyped, bool IsFormat) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSBufferPrefetch(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBVHDualOrBVH8IntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferLoad(MachineInstr &MI, LegalizerHelper &Helper, bool IsFormat, bool IsTyped) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, bool IsFormat) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool hasMadMacF32Insts() const
bool hasCvtPkF16F32Inst() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasVOP3PInsts() const
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition: APFloat.h:1158
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition: APFloat.h:1138
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:1098
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:147
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:707
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:684
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:682
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:702
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:705
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:686
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:703
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:685
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition: InstrTypes.h:687
@ ICMP_EQ
equal
Definition: InstrTypes.h:699
@ ICMP_NE
not equal
Definition: InstrTypes.h:700
This is the shared class of boolean and integer constants.
Definition: Constants.h:87
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:169
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:312
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:359
bool hasA16() const
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:522
bool hasArchitectedSGPRs() const
bool hasPrivEnabledTrap2NopBug() const
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:308
bool hasScalarSubwordLoads() const
Definition: GCNSubtarget.h:500
bool hasMadF16() const
bool supportsGetDoorbellID() const
Definition: GCNSubtarget.h:506
bool hasMad64_32() const
Definition: GCNSubtarget.h:796
bool hasBVHDualAndBVH8Insts() const
bool hasGloballyAddressableScratch() const
bool has64BitLiterals() const
TrapHandlerAbi getTrapHandlerAbi() const
Definition: GCNSubtarget.h:502
bool hasIntClamp() const
Definition: GCNSubtarget.h:400
bool hasGFX10_AEncoding() const
const SITargetLowering * getTargetLowering() const override
Definition: GCNSubtarget.h:316
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:420
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:652
unsigned getNSAThreshold(const MachineFunction &MF) const
bool hasScalarSMulU64() const
Definition: GCNSubtarget.h:785
bool hasNSAEncoding() const
bool hasG16() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasScalarDwordx3Loads() const
bool hasVectorMulU64() const
Generation getGeneration() const
Definition: GCNSubtarget.h:356
bool hasScalarAddSub64() const
Definition: GCNSubtarget.h:783
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:787
bool hasDPALU_DPP() const
bool isWave64() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:779
bool hasFractBug() const
Definition: GCNSubtarget.h:438
bool hasPartialNSAEncoding() const
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
Simple wrapper observer that takes several observers, and calls each one for each event.
KnownBits getKnownBits(Register R)
bool hasExternalLinkage() const
Definition: GlobalValue.h:513
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:663
Type * getValueType() const
Definition: GlobalValue.h:298
static constexpr LLT float64()
Get a 64-bit IEEE double value.
Definition: LowLevelType.h:95
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:265
constexpr bool isScalar() const
Definition: LowLevelType.h:147
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
Definition: LowLevelType.h:212
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
Definition: LowLevelType.h:65
constexpr bool isPointerVector() const
Definition: LowLevelType.h:153
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:43
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
Definition: LowLevelType.h:160
constexpr bool isVector() const
Definition: LowLevelType.h:149
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
Definition: LowLevelType.h:58
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:191
constexpr bool isPointer() const
Definition: LowLevelType.h:150
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
Definition: LowLevelType.h:278
constexpr ElementCount getElementCount() const
Definition: LowLevelType.h:184
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
Definition: LowLevelType.h:219
static constexpr LLT float16()
Get a 16-bit IEEE half value.
Definition: LowLevelType.h:85
constexpr unsigned getAddressSpace() const
Definition: LowLevelType.h:271
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelType.h:101
constexpr bool isPointerOrPointerVector() const
Definition: LowLevelType.h:154
constexpr LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
Definition: LowLevelType.h:228
constexpr LLT getScalarType() const
Definition: LowLevelType.h:206
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
Definition: LowLevelType.h:125
static constexpr LLT float32()
Get a 32-bit IEEE float value.
Definition: LowLevelType.h:90
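The LLT helpers listed above are easiest to see side by side. An illustrative, self-contained sketch; the concrete types and the include path are assumptions of recent LLVM.

#include "llvm/CodeGenTypes/LowLevelType.h"
#include <cassert>
using namespace llvm;

// Constructing and inspecting a few low-level types.
static void lltExamples() {
  const LLT S32 = LLT::scalar(32);            // 32-bit "bag of bits"
  const LLT V2S16 = LLT::fixed_vector(2, 16); // <2 x s16>
  const LLT P3 = LLT::pointer(3, 32);         // 32-bit pointer in addrspace(3)

  assert(V2S16.isVector() && V2S16.getNumElements() == 2);
  assert(V2S16.getScalarSizeInBits() == 16);
  assert(P3.isPointer() && P3.getAddressSpace() == 3);

  // Derive a related type: same element count, wider elements.
  const LLT V2S32 = V2S16.changeElementType(S32); // <2 x s32>
  assert(V2S32.getSizeInBits().getFixedValue() == 64);
}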
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
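These combinators are meant to be chained off getActionDefinitionsBuilder inside a LegalizerInfo constructor. A hedged sketch of that shape; the opcode, types, and bounds are placeholders, not a restatement of the AMDGPU rules.

#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;

// Hypothetical rule set showing how the builder calls compose.
struct ExampleLegalizerInfo : public LegalizerInfo {
  ExampleLegalizerInfo() {
    const LLT S16 = LLT::scalar(16);
    const LLT S32 = LLT::scalar(32);
    const LLT S64 = LLT::scalar(64);
    const LLT V2S16 = LLT::fixed_vector(2, 16);

    getActionDefinitionsBuilder(TargetOpcode::G_ADD)
        .legalFor({S32, S64, V2S16})          // directly selectable forms
        .clampMaxNumElementsStrict(0, S16, 2) // keep s16 vectors at <= 2 elts
        .widenScalarToNextPow2(0, 32)         // s1/s8/s16 -> s32, s48 -> s64
        .clampScalar(0, S32, S64);            // bound the scalar range

    getLegacyLegalizerInfo().computeTables();
  }
};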
LLVM_ABI LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
LLVM_ABI void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
GISelValueTracking * getValueTracking() const
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
LLVM_ABI void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from CastTy.
LLVM_ABI LegalizeResult lowerFMad(MachineInstr &MI)
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
LLVM_ABI void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's type to WideTy and truncating it back with the TruncOpcode, replacing the vreg of the operand with the vreg of the defined value.
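A custom legalization typically receives a LegalizerHelper and uses exactly these members. A minimal sketch; the function name and the choice to defer to lowerFMad() are assumptions for illustration.

#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/MachineInstr.h"
using namespace llvm;

// Hypothetical custom hook: expand a G_FMAD via the generic helper and
// report success in the form the legalizer expects.
static bool legalizeFMadExample(MachineInstr &MI, LegalizerHelper &Helper) {
  // The helper rewrites MI through Helper.MIRBuilder and notifies
  // Helper.Observer of every instruction it touches.
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}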
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: MCRegister.h:64
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
Definition: MachineInstr.h:72
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:587
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:359
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:595
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
Definition: ArrayRef.h:303
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:392
MutableArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition: ArrayRef.h:417
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:107
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:74
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of a s_trap 2 instructions for hardware (namely,...
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
size_t size() const
Definition: SmallVector.h:79
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
Definition: SmallVector.h:574
void truncate(size_type N)
Like resize, but requires that N is less than size().
Definition: SmallVector.h:645
void resize(size_type N)
Definition: SmallVector.h:639
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
int64_t getImm() const
Register getReg() const
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save and restore.
TargetOptions Options
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:322
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:203
self_iterator getIterator()
Definition: ilist_node.h:134
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
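The address-space enumerators above are what gets paired with LLT::pointer when pointer types are described. A sketch of that pairing; the bit widths follow the usual AMDGPU convention but are an assumption of this example, and the wider buffer pointer variants are deliberately folded into the default case for simplicity.

#include "AMDGPU.h" // backend-local header providing the AMDGPUAS enumerators
#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;

// Hypothetical mapping from address space to a pointer LLT.
static LLT examplePointerType(unsigned AS) {
  switch (AS) {
  case AMDGPUAS::LOCAL_ADDRESS:
  case AMDGPUAS::REGION_ADDRESS:
  case AMDGPUAS::PRIVATE_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    return LLT::pointer(AS, 32); // LDS/GDS/scratch and 32-bit constant
  default:
    return LLT::pointer(AS, 64); // flat, global, constant, ...
  }
}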
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isGFX11Plus(const MCSubtargetInfo &STI)
bool isGFX1250(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
@ MaxID
The highest possible ID. Must be some 2^k - 1.
Definition: CallingConv.h:291
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
LLVM_ABI LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the given size.
LLVM_ABI LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LLVM_ABI LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LLVM_ABI LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LLVM_ABI LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than the second type index.
LLVM_ABI LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than the second type index.
LLVM_ABI LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LLVM_ABI LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LLVM_ABI LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the given size.
LLVM_ABI LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size)
True if the total bitwidth of the specified type index is Size bits.
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LLVM_ABI LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LLVM_ABI LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LLVM_ABI LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LLVM_ABI LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LLVM_ABI LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
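These free-function predicates and mutations are the glue between a condition and the type change it requests. A hedged sketch pairing a few of them on a made-up rule set; the opcode and thresholds are placeholders.

#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;
using namespace LegalityPredicates;
using namespace LegalizeMutations;

// Hypothetical rules: scalarize vectors, widen narrow scalars, lower the rest.
static void addExampleRules(LegalizerInfo &LI) {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  LI.getActionDefinitionsBuilder(TargetOpcode::G_ABS)
      .legalFor({S32, S64})
      // Break vectors back down into scalar operations.
      .fewerElementsIf(isVector(0), scalarize(0))
      // Widen anything narrower than 32 bits to the next power of two.
      .widenScalarIf(scalarNarrowerThan(0, 32),
                     widenScalarOrEltToNextPow2(0, 32))
      // Whatever is left gets the generic expansion.
      .lower();
}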
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ ReallyHidden
Definition: CommandLine.h:139
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
constexpr double inv_pi
Definition: MathExtras.h:54
constexpr double ln2
Definition: MathExtras.h:49
constexpr double ln10
Definition: MathExtras.h:50
constexpr float log2ef
Definition: MathExtras.h:66
constexpr double log2e
Definition: MathExtras.h:51
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition: Utils.cpp:916
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition: MathExtras.h:349
@ Offset
Definition: DWP.cpp:477
LLVM_ABI Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
Definition: Utils.cpp:2029
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
Definition: Utils.cpp:651
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:307
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition: Utils.cpp:459
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
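BuildMI and the MachineInstrBuilder chaining methods listed earlier (addImm, addMBB, addGlobalAddress) are how target instructions are materialized directly. A minimal, hypothetical use; the opcode parameter stands in for a target move-immediate instruction.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;

// Appends "DestReg = <MovImmOpc> 42" at the end of MBB.
static void emitMoveImm(MachineBasicBlock &MBB, const TargetInstrInfo &TII,
                        unsigned MovImmOpc, Register DestReg,
                        const DebugLoc &DL) {
  BuildMI(&MBB, DL, TII.get(MovImmOpc), DestReg)
      .addImm(42);
}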
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
LLVM_ABI const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
void * PointerTy
Definition: GenericValue.h:21
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:270
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:293
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:390
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT that fits in int64_t, returns it.
Definition: Utils.cpp:314
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:288
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:399
@ Mul
Product of integers.
@ FMul
Product of floats.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
LLVM_ABI void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
Definition: Utils.cpp:1718
@ DS_Warning
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT, returns its APInt value and def register.
Definition: Utils.cpp:433
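The two constant-matching utilities above are usually combined into a small predicate. A hypothetical sketch; Reg and MRI are assumed to come from the surrounding pass.

#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;

static bool equalsConstant(Register Reg, const MachineRegisterInfo &MRI,
                           int64_t Expected) {
  // Fast path: the vreg is directly defined by a G_CONSTANT.
  if (std::optional<int64_t> Imm = getIConstantVRegSExtVal(Reg, MRI))
    return *Imm == Expected;
  // Otherwise look through copies and extensions to find one.
  if (auto ValAndVReg = getIConstantVRegValWithLookThrough(Reg, MRI))
    return ValAndVReg->Value.getSExtValue() == Expected;
  return false;
}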
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1916
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:280
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition: MathExtras.h:378
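The power-of-two helpers above have subtly different contracts; a few worked values make the differences concrete (the numbers are just illustrations).

#include "llvm/Support/MathExtras.h"
#include <cassert>
using namespace llvm;

static void mathHelperExamples() {
  assert(PowerOf2Ceil(32) == 32);  // greater-or-equal semantics
  assert(NextPowerOf2(32) == 64);  // strictly-greater semantics
  assert(Log2_32_Ceil(48) == 6);   // 48 rounds up to 2^6 = 64
  assert(divideCeil(70, 32) == 3); // 70 bits need three 32-bit pieces
  assert(isPowerOf2_32(64) && !isPowerOf2_32(48));
}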
@ Enable
Enable colors.
std::function< bool(const LegalityQuery &)> LegalityPredicate
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:858
#define N
static constexpr uint64_t encode(Fields... Values)
static LLVM_ABI const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:266
static LLVM_ABI const fltSemantics & IEEEdouble() LLVM_READNONE
Definition: APFloat.cpp:267
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environment.
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
Definition: KnownBits.h:80
The LegalityQuery object bundles together all the information that's needed to decide whether a given operation is legal or not.
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
ArrayRef< LLT > Types
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions.
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs per IEEE 754-2008.
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
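The denormal-mode fields above are typically consumed as simple comparisons against the canonical modes. A hedged sketch; the function name and the policy it encodes are assumptions of this example.

#include "llvm/ADT/FloatingPointMode.h"
using namespace llvm;

// Does this f32 denormal setting allow flushing? getIEEE() preserves
// denormals; getPreserveSign() or a Dynamic mode gives no such guarantee.
static bool mayFlushF32Denormals(DenormalMode FP32Denormals) {
  return FP32Denormals != DenormalMode::getIEEE();
}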