1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the MachineLegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUTargetMachine.h"
22#include "SIInstrInfo.h"
24#include "SIRegisterInfo.h"
26#include "llvm/ADT/ScopeExit.h"
35#include "llvm/IR/IntrinsicsAMDGPU.h"
36#include "llvm/IR/IntrinsicsR600.h"
37
38#define DEBUG_TYPE "amdgpu-legalinfo"
39
40using namespace llvm;
41using namespace LegalizeActions;
42using namespace LegalizeMutations;
43using namespace LegalityPredicates;
44using namespace MIPatternMatch;
45
46// Hack until load/store selection patterns support any tuple of legal types.
48 "amdgpu-global-isel-new-legality",
49 cl::desc("Use GlobalISel desired legality, rather than try to use "
50 "rules compatible with selection patterns"),
51 cl::init(false),
53
54static constexpr unsigned MaxRegisterSize = 1024;
55
56// Round the number of elements up to the next power of two.
58 unsigned NElts = Ty.getNumElements();
59 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
60 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
61}
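// For example (illustrative), a <3 x s16> input (NElts = 3) is rounded up to
// <4 x s16>, and <5 x s32> becomes <8 x s32>; this assumes a fixed-length
// vector input, since the helper only changes the element count.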
62
63// Round the scalar size in bits up to the next power of two.
65 unsigned Bits = Ty.getSizeInBits();
66 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
67 return LLT::scalar(Pow2Bits);
68}
69
70/// \returns true if this is an odd-sized vector which should be widened by adding
71/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
72/// excludes s1 vectors, which should always be scalarized.
73static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
74 return [=](const LegalityQuery &Query) {
75 const LLT Ty = Query.Types[TypeIdx];
76 if (!Ty.isVector())
77 return false;
78
79 const LLT EltTy = Ty.getElementType();
80 const unsigned EltSize = EltTy.getSizeInBits();
81 return Ty.getNumElements() % 2 != 0 &&
82 EltSize > 1 && EltSize < 32 &&
83 Ty.getSizeInBits() % 32 != 0;
84 };
85}
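// Illustrative cases for the predicate above: <3 x s16> (odd element count,
// 16-bit elements, 48 bits total) matches and will be widened to <4 x s16>;
// <3 x s32> does not match because its elements are already 32 bits wide,
// and <3 x s1> is excluded so that s1 vectors are scalarized instead.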
86
87static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
88 return [=](const LegalityQuery &Query) {
89 const LLT Ty = Query.Types[TypeIdx];
90 return Ty.getSizeInBits() % 32 == 0;
91 };
92}
93
94static LegalityPredicate isWideVec16(unsigned TypeIdx) {
95 return [=](const LegalityQuery &Query) {
96 const LLT Ty = Query.Types[TypeIdx];
97 const LLT EltTy = Ty.getScalarType();
98 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
99 };
100}
101
102static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
103 return [=](const LegalityQuery &Query) {
104 const LLT Ty = Query.Types[TypeIdx];
105 const LLT EltTy = Ty.getElementType();
106 return std::pair(TypeIdx,
107 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
108 };
109}
110
112 return [=](const LegalityQuery &Query) {
113 const LLT Ty = Query.Types[TypeIdx];
114 const LLT EltTy = Ty.getElementType();
115 unsigned Size = Ty.getSizeInBits();
116 unsigned Pieces = (Size + 63) / 64;
117 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
118 return std::pair(TypeIdx, LLT::scalarOrVector(
119 ElementCount::getFixed(NewNumElts), EltTy));
120 };
121}
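// Illustrative example of the mutation above: a <6 x s32> value (192 bits)
// gives Pieces = 3 and NewNumElts = (6 + 1) / 3 = 2, so the type is reduced
// to <2 x s32> (64 bits) and the operation is split accordingly.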
122
123// Increase the number of vector elements so that the total size reaches the
124// next multiple of 32 bits.
125static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
126 return [=](const LegalityQuery &Query) {
127 const LLT Ty = Query.Types[TypeIdx];
128
129 const LLT EltTy = Ty.getElementType();
130 const int Size = Ty.getSizeInBits();
131 const int EltSize = EltTy.getSizeInBits();
132 const int NextMul32 = (Size + 31) / 32;
133
134 assert(EltSize < 32);
135
136 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
137 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
138 };
139}
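// Illustrative example of the mutation above: <3 x s16> (48 bits) rounds up
// to the next multiple of 32 bits (64), so NewNumElts = (64 + 15) / 16 = 4
// and the resulting type is <4 x s16>.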
140
141// Retrieves the scalar type that's the same size as the mem desc
143 return [=](const LegalityQuery &Query) {
144 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
145 return std::make_pair(TypeIdx, LLT::scalar(MemSize));
146 };
147}
148
149// Increase the number of vector elements to reach the next legal RegClass.
151 return [=](const LegalityQuery &Query) {
152 const LLT Ty = Query.Types[TypeIdx];
153 const unsigned NumElts = Ty.getNumElements();
154 const unsigned EltSize = Ty.getElementType().getSizeInBits();
155 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
156
157 assert(EltSize == 32 || EltSize == 64);
158 assert(Ty.getSizeInBits() < MaxRegisterSize);
159
160 unsigned NewNumElts;
161 // Find the nearest legal RegClass that is larger than the current type.
162 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
163 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
164 break;
165 }
166 return std::pair(TypeIdx,
167 LLT::fixed_vector(NewNumElts, Ty.getElementType()));
168 };
169}
170
172 if (!Ty.isVector())
173 return LLT::scalar(128);
174 const ElementCount NumElems = Ty.getElementCount();
175 return LLT::vector(NumElems, LLT::scalar(128));
176}
177
179 if (!Ty.isVector())
180 return LLT::fixed_vector(4, LLT::scalar(32));
181 const unsigned NumElems = Ty.getElementCount().getFixedValue();
182 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
183}
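// Illustrative mapping used by the buffer-resource helpers above: a single
// p8 resource (128 bits) is represented as <4 x s32> and bitcast through
// s128, while a vector such as <2 x p8> maps to <8 x s32> / <2 x s128>.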
184
186 const unsigned Size = Ty.getSizeInBits();
187
188 if (Size <= 32) {
189 // <2 x s8> -> s16
190 // <4 x s8> -> s32
191 return LLT::scalar(Size);
192 }
193
195}
196
197static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
198 return [=](const LegalityQuery &Query) {
199 const LLT Ty = Query.Types[TypeIdx];
200 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
201 };
202}
203
205 return [=](const LegalityQuery &Query) {
206 const LLT Ty = Query.Types[TypeIdx];
207 unsigned Size = Ty.getSizeInBits();
208 assert(Size % 32 == 0);
209 return std::pair(
211 };
212}
213
214static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
215 return [=](const LegalityQuery &Query) {
216 const LLT QueryTy = Query.Types[TypeIdx];
217 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
218 };
219}
220
221static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
222 return [=](const LegalityQuery &Query) {
223 const LLT QueryTy = Query.Types[TypeIdx];
224 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
225 };
226}
227
228static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
229 return [=](const LegalityQuery &Query) {
230 const LLT QueryTy = Query.Types[TypeIdx];
231 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
232 };
233}
234
235static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size) {
236 return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
238}
239
241 const int EltSize = EltTy.getSizeInBits();
242 return EltSize == 16 || EltSize % 32 == 0;
243}
244
245static bool isRegisterVectorType(LLT Ty) {
246 const int EltSize = Ty.getElementType().getSizeInBits();
247 return EltSize == 32 || EltSize == 64 ||
248 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
249 EltSize == 128 || EltSize == 256;
250}
251
252// TODO: replace all uses of isRegisterType with isRegisterClassType
253static bool isRegisterType(const GCNSubtarget &ST, LLT Ty) {
254 if (!isRegisterSize(ST, Ty.getSizeInBits()))
255 return false;
256
257 if (Ty.isVector())
258 return isRegisterVectorType(Ty);
259
260 return true;
261}
262
263// Any combination of 32 or 64-bit elements up to the maximum register size, and
264// multiples of v2s16.
266 unsigned TypeIdx) {
267 return [=, &ST](const LegalityQuery &Query) {
268 return isRegisterType(ST, Query.Types[TypeIdx]);
269 };
270}
271
272// RegisterType that doesn't have a corresponding RegClass.
273// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
274// should be removed.
276 unsigned TypeIdx) {
277 return [=, &ST](const LegalityQuery &Query) {
278 LLT Ty = Query.Types[TypeIdx];
279 return isRegisterType(ST, Ty) &&
280 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
281 };
282}
283
284static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
285 return [=](const LegalityQuery &Query) {
286 const LLT QueryTy = Query.Types[TypeIdx];
287 if (!QueryTy.isVector())
288 return false;
289 const LLT EltTy = QueryTy.getElementType();
290 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
291 };
292}
293
294constexpr LLT S1 = LLT::scalar(1);
295constexpr LLT S8 = LLT::scalar(8);
296constexpr LLT S16 = LLT::scalar(16);
297constexpr LLT S32 = LLT::scalar(32);
298constexpr LLT F32 = LLT::float32();
299constexpr LLT S64 = LLT::scalar(64);
300constexpr LLT F64 = LLT::float64();
301constexpr LLT S96 = LLT::scalar(96);
302constexpr LLT S128 = LLT::scalar(128);
303constexpr LLT S160 = LLT::scalar(160);
304constexpr LLT S192 = LLT::scalar(192);
305constexpr LLT S224 = LLT::scalar(224);
306constexpr LLT S256 = LLT::scalar(256);
307constexpr LLT S512 = LLT::scalar(512);
308constexpr LLT S1024 = LLT::scalar(1024);
310
311constexpr LLT V2S8 = LLT::fixed_vector(2, 8);
312constexpr LLT V2S16 = LLT::fixed_vector(2, 16);
313constexpr LLT V4S16 = LLT::fixed_vector(4, 16);
314constexpr LLT V6S16 = LLT::fixed_vector(6, 16);
315constexpr LLT V8S16 = LLT::fixed_vector(8, 16);
316constexpr LLT V10S16 = LLT::fixed_vector(10, 16);
317constexpr LLT V12S16 = LLT::fixed_vector(12, 16);
318constexpr LLT V16S16 = LLT::fixed_vector(16, 16);
319
321constexpr LLT V2BF16 = V2F16; // FIXME
322
323constexpr LLT V2S32 = LLT::fixed_vector(2, 32);
324constexpr LLT V3S32 = LLT::fixed_vector(3, 32);
325constexpr LLT V4S32 = LLT::fixed_vector(4, 32);
326constexpr LLT V5S32 = LLT::fixed_vector(5, 32);
327constexpr LLT V6S32 = LLT::fixed_vector(6, 32);
328constexpr LLT V7S32 = LLT::fixed_vector(7, 32);
329constexpr LLT V8S32 = LLT::fixed_vector(8, 32);
330constexpr LLT V9S32 = LLT::fixed_vector(9, 32);
331constexpr LLT V10S32 = LLT::fixed_vector(10, 32);
332constexpr LLT V11S32 = LLT::fixed_vector(11, 32);
333constexpr LLT V12S32 = LLT::fixed_vector(12, 32);
334constexpr LLT V16S32 = LLT::fixed_vector(16, 32);
335constexpr LLT V32S32 = LLT::fixed_vector(32, 32);
336
337constexpr LLT V2S64 = LLT::fixed_vector(2, 64);
338constexpr LLT V3S64 = LLT::fixed_vector(3, 64);
339constexpr LLT V4S64 = LLT::fixed_vector(4, 64);
340constexpr LLT V5S64 = LLT::fixed_vector(5, 64);
341constexpr LLT V6S64 = LLT::fixed_vector(6, 64);
342constexpr LLT V7S64 = LLT::fixed_vector(7, 64);
343constexpr LLT V8S64 = LLT::fixed_vector(8, 64);
344constexpr LLT V16S64 = LLT::fixed_vector(16, 64);
345
346constexpr LLT V2S128 = LLT::fixed_vector(2, 128);
347constexpr LLT V4S128 = LLT::fixed_vector(4, 128);
348
349constexpr std::initializer_list<LLT> AllScalarTypes = {
351
352constexpr std::initializer_list<LLT> AllS16Vectors{
354
355constexpr std::initializer_list<LLT> AllS32Vectors = {
358
359constexpr std::initializer_list<LLT> AllS64Vectors = {
361
367
368// Checks whether a type is in the list of legal register types.
369static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
370 if (Ty.isPointerOrPointerVector())
371 Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
372
375 (ST.useRealTrue16Insts() && Ty == S16) ||
377}
378
380 unsigned TypeIdx) {
381 return [&ST, TypeIdx](const LegalityQuery &Query) {
382 return isRegisterClassType(ST, Query.Types[TypeIdx]);
383 };
384}
385
386// If we have a truncating store or an extending load with a data size larger
387// than 32 bits, we need to reduce it to a 32-bit type.
389 return [=](const LegalityQuery &Query) {
390 const LLT Ty = Query.Types[TypeIdx];
391 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
392 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
393 };
394}
395
396// If we have a truncating store or an extending load with a data size larger
397// than 32 bits and the memory size is a power of two.
399 return [=](const LegalityQuery &Query) {
400 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
401 return isWideScalarExtLoadTruncStore(TypeIdx)(Query) &&
402 isPowerOf2_64(MemSize);
403 };
404}
405
406// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
407// handle some operations by just promoting the register during
408// selection. There are also d16 loads on GFX9+ which preserve the high bits.
409static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
410 bool IsLoad, bool IsAtomic) {
411 switch (AS) {
413 // FIXME: Private element size.
414 return ST.enableFlatScratch() ? 128 : 32;
416 return ST.useDS128() ? 128 : 64;
421 // Treat constant and global as identical. SMRD loads are sometimes usable for
422 // global loads (ideally constant address space should be eliminated)
423 // depending on the context. Legality cannot be context dependent, but
424 // RegBankSelect can split the load as necessary depending on the pointer
425 // register bank/uniformity and if the memory is invariant or not written in a
426 // kernel.
427 return IsLoad ? 512 : 128;
428 default:
429 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
430 // if they may alias scratch depending on the subtarget. This needs to be
431 // moved to custom handling to use addressMayBeAccessedAsPrivate
432 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
433 }
434}
435
436static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
437 const LegalityQuery &Query) {
438 const LLT Ty = Query.Types[0];
439
440 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
441 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
442
443 unsigned RegSize = Ty.getSizeInBits();
444 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
445 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
446 unsigned AS = Query.Types[1].getAddressSpace();
447
448 // All of these need to be custom lowered to cast the pointer operand.
450 return false;
451
452 // Do not handle extending vector loads.
453 if (Ty.isVector() && MemSize != RegSize)
454 return false;
455
456 // TODO: We should be able to widen loads if the alignment is high enough, but
457 // we also need to modify the memory access size.
458#if 0
459 // Accept widening loads based on alignment.
460 if (IsLoad && MemSize < Size)
461 MemSize = std::max(MemSize, Align);
462#endif
463
464 // Only 1-byte and 2-byte to 32-bit extloads are valid.
465 if (MemSize != RegSize && RegSize != 32)
466 return false;
467
468 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
469 Query.MMODescrs[0].Ordering !=
471 return false;
472
473 switch (MemSize) {
474 case 8:
475 case 16:
476 case 32:
477 case 64:
478 case 128:
479 break;
480 case 96:
481 if (!ST.hasDwordx3LoadStores())
482 return false;
483 break;
484 case 256:
485 case 512:
486 // These may contextually need to be broken down.
487 break;
488 default:
489 return false;
490 }
491
492 assert(RegSize >= MemSize);
493
494 if (AlignBits < MemSize) {
495 const SITargetLowering *TLI = ST.getTargetLowering();
496 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
497 Align(AlignBits / 8)))
498 return false;
499 }
500
501 return true;
502}
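// Illustrative consequences of the checks above (alignment and address-space
// limits permitting): an extending load from s16 memory into an s32 register
// is accepted, an extending load from s16 memory directly into s64 is
// rejected (MemSize != RegSize and RegSize != 32), and a 96-bit access is
// only allowed when the subtarget has dwordx3 load/stores.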
503
504// The newer buffer intrinsic forms take their resource arguments as
505// pointers in address space 8, aka s128 values. However, in order to not break
506// SelectionDAG, the underlying operations have to continue to take v4i32
507// arguments. Therefore, we convert resource pointers - or vectors of them -
508// to integer values here.
509static bool hasBufferRsrcWorkaround(const LLT Ty) {
510 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
511 return true;
512 if (Ty.isVector()) {
513 const LLT ElemTy = Ty.getElementType();
514 return hasBufferRsrcWorkaround(ElemTy);
515 }
516 return false;
517}
518
519// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128, etc., so
520// work around this. Eventually it should ignore the type for loads and only care
521// about the size. Return true in cases where we will workaround this for now by
522// bitcasting.
523static bool loadStoreBitcastWorkaround(const LLT Ty) {
525 return false;
526
527 const unsigned Size = Ty.getSizeInBits();
528 if (Ty.isPointerVector())
529 return true;
530 if (Size <= 64)
531 return false;
532 // Address space 8 pointers get their own workaround.
534 return false;
535 if (!Ty.isVector())
536 return true;
537
538 unsigned EltSize = Ty.getScalarSizeInBits();
539 return EltSize != 32 && EltSize != 64;
540}
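// Illustrative examples for the workaround above: <6 x s16> (96 bits with
// 16-bit elements) returns true and is bitcast before selection, while
// <3 x s32> and <2 x s64> return false because their 32/64-bit elements are
// handled directly.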
541
542static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
543 const LLT Ty = Query.Types[0];
544 return isRegisterType(ST, Ty) && isLoadStoreSizeLegal(ST, Query) &&
546}
547
548/// Return true if a load or store of the type should be lowered with a bitcast
549/// to a different type.
550static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
551 const LLT MemTy) {
552 const unsigned MemSizeInBits = MemTy.getSizeInBits();
553 const unsigned Size = Ty.getSizeInBits();
554 if (Size != MemSizeInBits)
555 return Size <= 32 && Ty.isVector();
556
558 return true;
559
560 // Don't try to handle bitcasting vector ext loads for now.
561 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
562 (Size <= 32 || isRegisterSize(ST, Size)) &&
563 !isRegisterVectorElementType(Ty.getElementType());
564}
565
566/// Return true if we should legalize a load by widening an odd-sized memory
567/// access up to the alignment. Note that in this case the memory access itself
568/// changes, not the size of the result register.
569static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
570 uint64_t AlignInBits, unsigned AddrSpace,
571 unsigned Opcode) {
572 unsigned SizeInBits = MemoryTy.getSizeInBits();
573 // We don't want to widen cases that are naturally legal.
574 if (isPowerOf2_32(SizeInBits))
575 return false;
576
577 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
578 // end up widening these for a scalar load during RegBankSelect, if we don't
579 // have 96-bit scalar loads.
580 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
581 return false;
582
583 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
584 return false;
585
586 // A load is known dereferenceable up to the alignment, so it's legal to widen
587 // to it.
588 //
589 // TODO: Could check dereferenceable for less aligned cases.
590 unsigned RoundedSize = NextPowerOf2(SizeInBits);
591 if (AlignInBits < RoundedSize)
592 return false;
593
594 // Do not widen if it would introduce a slow unaligned load.
595 const SITargetLowering *TLI = ST.getTargetLowering();
596 unsigned Fast = 0;
598 RoundedSize, AddrSpace, Align(AlignInBits / 8),
600 Fast;
601}
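// Illustrative example: a 96-bit load known to be 128-bit aligned, on a
// subtarget without dwordx3 load/stores, is widened to the next power of two
// (128 bits), provided the target lowering reports the widened access as
// fast enough.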
602
603static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
604 unsigned Opcode) {
605 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
606 return false;
607
608 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
609 Query.MMODescrs[0].AlignInBits,
610 Query.Types[1].getAddressSpace(), Opcode);
611}
612
613/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
614/// type of the operand `Idx` and then to transform it to a `p8` via bitcasts
615/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
617 MachineRegisterInfo &MRI, unsigned Idx) {
618 MachineOperand &MO = MI.getOperand(Idx);
619
620 const LLT PointerTy = MRI.getType(MO.getReg());
621
622 // Paranoidly prevent us from doing this multiple times.
624 return PointerTy;
625
626 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
627 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
628 if (!PointerTy.isVector()) {
629 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
630 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
631 const LLT S32 = LLT::scalar(32);
632
633 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
634 std::array<Register, 4> VectorElems;
635 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
636 for (unsigned I = 0; I < NumParts; ++I)
637 VectorElems[I] =
638 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
639 B.buildMergeValues(MO, VectorElems);
640 MO.setReg(VectorReg);
641 return VectorTy;
642 }
643 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
644 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
645 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
646 B.buildIntToPtr(MO, Scalar);
647 MO.setReg(BitcastReg);
648
649 return VectorTy;
650}
651
652/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
653/// the form the value must be in so that it can be passed to the low-level
654/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
655/// needed in order to account for the fact that we can't define a register
656/// class for s128 without breaking SelectionDAG.
658 MachineRegisterInfo &MRI = *B.getMRI();
659 const LLT PointerTy = MRI.getType(Pointer);
660 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
661 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
662
663 if (!PointerTy.isVector()) {
664 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
665 SmallVector<Register, 4> PointerParts;
666 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
667 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
668 for (unsigned I = 0; I < NumParts; ++I)
669 PointerParts.push_back(Unmerged.getReg(I));
670 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
671 }
672 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
673 return B.buildBitcast(VectorTy, Scalar).getReg(0);
674}
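// Sketch of the scalar case above (illustrative generic MIR, not taken from
// a test): the p8 resource is unmerged into four s32 parts and rebuilt as a
// vector, roughly:
//   %a:_(s32), %b:_(s32), %c:_(s32), %d:_(s32) = G_UNMERGE_VALUES %rsrc(p8)
//   %vec:_(<4 x s32>) = G_BUILD_VECTOR %a, %b, %c, %d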
675
677 unsigned Idx) {
678 MachineOperand &MO = MI.getOperand(Idx);
679
680 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
681 // Paranoidly prevent us from doing this multiple times.
683 return;
685}
686
688 const GCNTargetMachine &TM)
689 : ST(ST_) {
690 using namespace TargetOpcode;
691
692 auto GetAddrSpacePtr = [&TM](unsigned AS) {
693 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
694 };
695
696 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
697 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
698 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
699 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
700 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
701 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
702 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
703 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
704 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
705 const LLT BufferStridedPtr =
706 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
707
708 const LLT CodePtr = FlatPtr;
709
710 const std::initializer_list<LLT> AddrSpaces64 = {
711 GlobalPtr, ConstantPtr, FlatPtr
712 };
713
714 const std::initializer_list<LLT> AddrSpaces32 = {
715 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
716 };
717
718 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
719
720 const std::initializer_list<LLT> FPTypesBase = {
721 S32, S64
722 };
723
724 const std::initializer_list<LLT> FPTypes16 = {
725 S32, S64, S16
726 };
727
728 const std::initializer_list<LLT> FPTypesPK16 = {
729 S32, S64, S16, V2S16
730 };
731
732 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
733
734 // s1 for VCC branches, s32 for SCC branches.
736
737 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
738 // elements for v3s16
741 .legalFor(AllS32Vectors)
743 .legalFor(AddrSpaces64)
744 .legalFor(AddrSpaces32)
745 .legalFor(AddrSpaces128)
746 .legalIf(isPointer(0))
747 .clampScalar(0, S16, S256)
749 .clampMaxNumElements(0, S32, 16)
751 .scalarize(0);
752
753 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
754 // Full set of gfx9 features.
755 if (ST.hasScalarAddSub64()) {
756 getActionDefinitionsBuilder({G_ADD, G_SUB})
757 .legalFor({S64, S32, S16, V2S16})
758 .clampMaxNumElementsStrict(0, S16, 2)
759 .scalarize(0)
760 .minScalar(0, S16)
762 .maxScalar(0, S32);
763 } else {
764 getActionDefinitionsBuilder({G_ADD, G_SUB})
765 .legalFor({S32, S16, V2S16})
766 .clampMaxNumElementsStrict(0, S16, 2)
767 .scalarize(0)
768 .minScalar(0, S16)
770 .maxScalar(0, S32);
771 }
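  // Net effect of the G_ADD/G_SUB rules above (illustrative): s8 operations
  // are widened to s16 by minScalar, scalars wider than 32 bits are narrowed
  // to s32 (s64 stays legal only when the subtarget has scalar 64-bit
  // add/sub), and 16-bit vectors longer than <2 x s16> are split into
  // <2 x s16> pieces.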
772
773 if (ST.hasScalarSMulU64()) {
775 .legalFor({S64, S32, S16, V2S16})
776 .clampMaxNumElementsStrict(0, S16, 2)
777 .scalarize(0)
778 .minScalar(0, S16)
780 .custom();
781 } else {
783 .legalFor({S32, S16, V2S16})
784 .clampMaxNumElementsStrict(0, S16, 2)
785 .scalarize(0)
786 .minScalar(0, S16)
788 .custom();
789 }
790 assert(ST.hasMad64_32());
791
792 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
793 .legalFor({S32, S16, V2S16}) // Clamp modifier
794 .minScalarOrElt(0, S16)
796 .scalarize(0)
798 .lower();
799 } else if (ST.has16BitInsts()) {
800 getActionDefinitionsBuilder({G_ADD, G_SUB})
801 .legalFor({S32, S16})
802 .minScalar(0, S16)
804 .maxScalar(0, S32)
805 .scalarize(0);
806
808 .legalFor({S32, S16})
809 .scalarize(0)
810 .minScalar(0, S16)
812 .custom();
813 assert(ST.hasMad64_32());
814
815 // Technically the saturating operations require clamp bit support, but this
816 // was introduced at the same time as 16-bit operations.
817 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
818 .legalFor({S32, S16}) // Clamp modifier
819 .minScalar(0, S16)
820 .scalarize(0)
822 .lower();
823
824 // We're just lowering this, but it helps get a better result to try to
825 // coerce to the desired type first.
826 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
827 .minScalar(0, S16)
828 .scalarize(0)
829 .lower();
830 } else {
831 getActionDefinitionsBuilder({G_ADD, G_SUB})
832 .legalFor({S32})
833 .widenScalarToNextMultipleOf(0, 32)
834 .clampScalar(0, S32, S32)
835 .scalarize(0);
836
837 auto &Mul = getActionDefinitionsBuilder(G_MUL)
838 .legalFor({S32})
839 .scalarize(0)
840 .minScalar(0, S32)
842
843 if (ST.hasMad64_32())
844 Mul.custom();
845 else
846 Mul.maxScalar(0, S32);
847
848 if (ST.hasIntClamp()) {
849 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
850 .legalFor({S32}) // Clamp modifier.
851 .scalarize(0)
853 .lower();
854 } else {
855 // Clamp bit support was added in VI, along with 16-bit operations.
856 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
857 .minScalar(0, S32)
858 .scalarize(0)
859 .lower();
860 }
861
862 // FIXME: DAG expansion gets better results. The widening uses the smaller
863 // range values and goes for the min/max lowering directly.
864 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
865 .minScalar(0, S32)
866 .scalarize(0)
867 .lower();
868 }
869
871 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
872 .customFor({S32, S64})
873 .clampScalar(0, S32, S64)
875 .scalarize(0);
876
877 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
878 .legalFor({S32})
879 .maxScalar(0, S32);
880
881 if (ST.hasVOP3PInsts()) {
882 Mulh
883 .clampMaxNumElements(0, S8, 2)
884 .lowerFor({V2S8});
885 }
886
887 Mulh
888 .scalarize(0)
889 .lower();
890
891 // Report legal for any types we can handle anywhere. For the cases only legal
892 // on the SALU, RegBankSelect will be able to re-legalize.
893 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
894 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
895 .clampScalar(0, S32, S64)
901 .scalarize(0);
902
904 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
905 .legalFor({{S32, S1}, {S32, S32}})
906 .clampScalar(0, S32, S32)
907 .scalarize(0);
908
910 // Don't worry about the size constraint.
912 .lower();
913
915 .legalFor({S1, S32, S64, S16, GlobalPtr,
916 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
917 .legalIf(isPointer(0))
918 .clampScalar(0, S32, S64)
920
921 getActionDefinitionsBuilder(G_FCONSTANT)
922 .legalFor({S32, S64, S16})
923 .clampScalar(0, S16, S64);
924
925 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
926 .legalIf(isRegisterClassType(ST, 0))
927 // s1 and s16 are special cases because they have legal operations on
928 // them, but don't really occupy registers in the normal way.
929 .legalFor({S1, S16})
930 .clampNumElements(0, V16S32, V32S32)
934 .clampMaxNumElements(0, S32, 16);
935
936 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
937
938 // If the amount is divergent, we have to do a wave reduction to get the
939 // maximum value, so this is expanded during RegBankSelect.
940 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
941 .legalFor({{PrivatePtr, S32}});
942
943 getActionDefinitionsBuilder(G_STACKSAVE)
944 .customFor({PrivatePtr});
945 getActionDefinitionsBuilder(G_STACKRESTORE)
946 .legalFor({PrivatePtr});
947
948 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
949
950 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
951 .customIf(typeIsNot(0, PrivatePtr));
952
953 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
954
955 auto &FPOpActions = getActionDefinitionsBuilder(
956 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
957 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
958 .legalFor({S32, S64});
959 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
960 .customFor({S32, S64});
961 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
962 .customFor({S32, S64});
963
964 if (ST.has16BitInsts()) {
965 if (ST.hasVOP3PInsts())
966 FPOpActions.legalFor({S16, V2S16});
967 else
968 FPOpActions.legalFor({S16});
969
970 TrigActions.customFor({S16});
971 FDIVActions.customFor({S16});
972 }
973
974 if (ST.hasPackedFP32Ops()) {
975 FPOpActions.legalFor({V2S32});
976 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
977 }
978
979 auto &MinNumMaxNum = getActionDefinitionsBuilder(
980 {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM, G_FMINNUM_IEEE,
981 G_FMAXNUM_IEEE});
982
983 if (ST.hasVOP3PInsts()) {
984 MinNumMaxNum.customFor(FPTypesPK16)
985 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
986 .clampMaxNumElements(0, S16, 2)
987 .clampScalar(0, S16, S64)
988 .scalarize(0);
989 } else if (ST.has16BitInsts()) {
990 MinNumMaxNum.customFor(FPTypes16)
991 .clampScalar(0, S16, S64)
992 .scalarize(0);
993 } else {
994 MinNumMaxNum.customFor(FPTypesBase)
995 .clampScalar(0, S32, S64)
996 .scalarize(0);
997 }
998
999 if (ST.hasVOP3PInsts())
1000 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
1001
1002 FPOpActions
1003 .scalarize(0)
1004 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1005
1006 TrigActions
1007 .scalarize(0)
1008 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1009
1010 FDIVActions
1011 .scalarize(0)
1012 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1013
1014 getActionDefinitionsBuilder({G_FNEG, G_FABS})
1015 .legalFor(FPTypesPK16)
1017 .scalarize(0)
1018 .clampScalar(0, S16, S64);
1019
1020 if (ST.has16BitInsts()) {
1022 .legalFor({S16})
1023 .customFor({S32, S64})
1024 .scalarize(0)
1025 .unsupported();
1027 .legalFor({S32, S64, S16})
1028 .scalarize(0)
1029 .clampScalar(0, S16, S64);
1030
1031 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1032 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
1033 .scalarize(0)
1034 .maxScalarIf(typeIs(0, S16), 1, S16)
1035 .clampScalar(1, S32, S32)
1036 .lower();
1037
1039 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1040 .scalarize(0)
1041 .lower();
1042 } else {
1044 .customFor({S32, S64, S16})
1045 .scalarize(0)
1046 .unsupported();
1047
1048
1049 if (ST.hasFractBug()) {
1051 .customFor({S64})
1052 .legalFor({S32, S64})
1053 .scalarize(0)
1054 .clampScalar(0, S32, S64);
1055 } else {
1057 .legalFor({S32, S64})
1058 .scalarize(0)
1059 .clampScalar(0, S32, S64);
1060 }
1061
1062 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1063 .legalFor({{S32, S32}, {S64, S32}})
1064 .scalarize(0)
1065 .clampScalar(0, S32, S64)
1066 .clampScalar(1, S32, S32)
1067 .lower();
1068
1070 .customFor({{S32, S32}, {S64, S32}})
1071 .scalarize(0)
1072 .minScalar(0, S32)
1073 .clampScalar(1, S32, S32)
1074 .lower();
1075 }
1076
1077 auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC);
1078 if (ST.hasCvtPkF16F32Inst()) {
1079 FPTruncActions.legalFor({{S32, S64}, {S16, S32}, {V2S16, V2S32}})
1080 .clampMaxNumElements(0, S16, 2);
1081 } else {
1082 FPTruncActions.legalFor({{S32, S64}, {S16, S32}});
1083 }
1084 FPTruncActions.scalarize(0).lower();
1085
1087 .legalFor({{S64, S32}, {S32, S16}})
1088 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1089 .scalarize(0);
1090
1091 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1092 if (ST.has16BitInsts()) {
1093 FSubActions
1094 // Use actual fsub instruction
1095 .legalFor({S32, S16})
1096 // Must use fadd + fneg
1097 .lowerFor({S64, V2S16});
1098 } else {
1099 FSubActions
1100 // Use actual fsub instruction
1101 .legalFor({S32})
1102 // Must use fadd + fneg
1103 .lowerFor({S64, S16, V2S16});
1104 }
1105
1106 FSubActions
1107 .scalarize(0)
1108 .clampScalar(0, S32, S64);
1109
1110 // Whether this is legal depends on the floating point mode for the function.
1111 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1112 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1113 FMad.customFor({S32, S16});
1114 else if (ST.hasMadMacF32Insts())
1115 FMad.customFor({S32});
1116 else if (ST.hasMadF16())
1117 FMad.customFor({S16});
1118 FMad.scalarize(0)
1119 .lower();
1120
1121 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1122 if (ST.has16BitInsts()) {
1123 FRem.customFor({S16, S32, S64});
1124 } else {
1125 FRem.minScalar(0, S32)
1126 .customFor({S32, S64});
1127 }
1128 FRem.scalarize(0);
1129
1130 // TODO: Do we need to clamp maximum bitwidth?
1132 .legalIf(isScalar(0))
1133 .legalFor({{V2S16, V2S32}})
1134 .clampMaxNumElements(0, S16, 2)
1135 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1136 // situations (like an invalid implicit use), we don't want to infinite loop
1137 // in the legalizer.
1139 .alwaysLegal();
1140
1141 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1142 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1143 {S32, S1}, {S64, S1}, {S16, S1}})
1144 .scalarize(0)
1145 .clampScalar(0, S32, S64)
1146 .widenScalarToNextPow2(1, 32);
1147
1148 // TODO: Split s1->s64 during regbankselect for VALU.
1149 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1150 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1151 .lowerIf(typeIs(1, S1))
1152 .customFor({{S32, S64}, {S64, S64}});
1153 if (ST.has16BitInsts())
1154 IToFP.legalFor({{S16, S16}});
1155 IToFP.clampScalar(1, S32, S64)
1156 .minScalar(0, S32)
1157 .scalarize(0)
1159
1160 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1161 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1162 .customFor({{S64, S32}, {S64, S64}})
1163 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1164 if (ST.has16BitInsts())
1165 FPToI.legalFor({{S16, S16}});
1166 else
1167 FPToI.minScalar(1, S32);
1168
1169 FPToI.minScalar(0, S32)
1170 .widenScalarToNextPow2(0, 32)
1171 .scalarize(0)
1172 .lower();
1173
1174 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1175 .clampScalar(0, S16, S64)
1176 .scalarize(0)
1177 .lower();
1178
1179 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1180 .legalFor({S16, S32})
1181 .scalarize(0)
1182 .lower();
1183
1184 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1185 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1186 .scalarize(0)
1187 .lower();
1188
1189 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1190 .clampScalar(0, S16, S64)
1191 .scalarize(0)
1192 .lower();
1193
1194 if (ST.has16BitInsts()) {
1196 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1197 .legalFor({S16, S32, S64})
1198 .clampScalar(0, S16, S64)
1199 .scalarize(0);
1200 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1202 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1203 .legalFor({S32, S64})
1204 .clampScalar(0, S32, S64)
1205 .scalarize(0);
1206 } else {
1208 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1209 .legalFor({S32})
1210 .customFor({S64})
1211 .clampScalar(0, S32, S64)
1212 .scalarize(0);
1213 }
1214
1216 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1217 .legalIf(all(isPointer(0), sameSize(0, 1)))
1218 .scalarize(0)
1219 .scalarSameSizeAs(1, 0);
1220
1222 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1223 .scalarSameSizeAs(1, 0)
1224 .scalarize(0);
1225
1226 auto &CmpBuilder =
1228 // The compare output type differs based on the register bank of the output,
1229 // so make both s1 and s32 legal.
1230 //
1231 // Scalar compares producing output in scc will be promoted to s32, as that
1232 // is the allocatable register type that will be needed for the copy from
1233 // scc. This will be promoted during RegBankSelect, and we assume something
1234 // before that won't try to use s32 result types.
1235 //
1236 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1237 // bank.
1239 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1240 .legalForCartesianProduct(
1241 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1242 if (ST.has16BitInsts()) {
1243 CmpBuilder.legalFor({{S1, S16}});
1244 }
1245
1246 CmpBuilder
1248 .clampScalar(1, S32, S64)
1249 .scalarize(0)
1250 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1251
1252 auto &FCmpBuilder =
1254 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1255
1256 if (ST.hasSALUFloatInsts())
1257 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1258
1259 FCmpBuilder
1261 .clampScalar(1, S32, S64)
1262 .scalarize(0);
1263
1264 // FIXME: fpow has a selection pattern that should move to custom lowering.
1265 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1266 if (ST.has16BitInsts())
1267 ExpOps.customFor({{S32}, {S16}});
1268 else
1269 ExpOps.customFor({S32});
1270 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1271 .scalarize(0);
1272
1274 .clampScalar(0, MinScalarFPTy, S32)
1275 .lower();
1276
1277 auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
1278 Log2Ops.customFor({S32});
1279 if (ST.has16BitInsts())
1280 Log2Ops.legalFor({S16});
1281 else
1282 Log2Ops.customFor({S16});
1283 Log2Ops.scalarize(0)
1284 .lower();
1285
1286 auto &LogOps =
1287 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1288 LogOps.customFor({S32, S16});
1289 LogOps.clampScalar(0, MinScalarFPTy, S32)
1290 .scalarize(0);
1291
1292 // The 64-bit versions produce 32-bit results, but only on the SALU.
1294 .legalFor({{S32, S32}, {S32, S64}})
1295 .clampScalar(0, S32, S32)
1296 .widenScalarToNextPow2(1, 32)
1297 .clampScalar(1, S32, S64)
1298 .scalarize(0)
1299 .widenScalarToNextPow2(0, 32);
1300
1301 // If no 16-bit instruction is available, lower into different instructions.
1302 if (ST.has16BitInsts())
1303 getActionDefinitionsBuilder(G_IS_FPCLASS)
1304 .legalForCartesianProduct({S1}, FPTypes16)
1305 .widenScalarToNextPow2(1)
1306 .scalarize(0)
1307 .lower();
1308 else
1309 getActionDefinitionsBuilder(G_IS_FPCLASS)
1310 .legalForCartesianProduct({S1}, FPTypesBase)
1311 .lowerFor({S1, S16})
1312 .widenScalarToNextPow2(1)
1313 .scalarize(0)
1314 .lower();
1315
1316 // The hardware instructions return a different result on 0 than the generic
1317 // instructions expect. The hardware produces -1, but these produce the
1318 // bitwidth.
1319 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1320 .scalarize(0)
1321 .clampScalar(0, S32, S32)
1322 .clampScalar(1, S32, S64)
1323 .widenScalarToNextPow2(0, 32)
1324 .widenScalarToNextPow2(1, 32)
1325 .custom();
1326
1327 // The 64-bit versions produce 32-bit results, but only on the SALU.
1328 getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
1329 .legalFor({{S32, S32}, {S32, S64}})
1330 .customIf(scalarNarrowerThan(1, 32))
1331 .clampScalar(0, S32, S32)
1332 .clampScalar(1, S32, S64)
1333 .scalarize(0)
1334 .widenScalarToNextPow2(0, 32)
1335 .widenScalarToNextPow2(1, 32);
1336
1337 getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
1338 .legalFor({{S32, S32}, {S32, S64}})
1339 .clampScalar(0, S32, S32)
1340 .clampScalar(1, S32, S64)
1341 .scalarize(0)
1342 .widenScalarToNextPow2(0, 32)
1343 .widenScalarToNextPow2(1, 32);
1344
1345 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1346 // RegBankSelect.
1347 getActionDefinitionsBuilder(G_BITREVERSE)
1348 .legalFor({S32, S64})
1349 .clampScalar(0, S32, S64)
1350 .scalarize(0)
1352
1353 if (ST.has16BitInsts()) {
1355 .legalFor({S16, S32, V2S16})
1356 .clampMaxNumElementsStrict(0, S16, 2)
1357 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1358 // narrowScalar limitation.
1360 .clampScalar(0, S16, S32)
1361 .scalarize(0);
1362
1363 if (ST.hasVOP3PInsts()) {
1365 .legalFor({S32, S16, V2S16})
1366 .clampMaxNumElements(0, S16, 2)
1367 .minScalar(0, S16)
1369 .scalarize(0)
1370 .lower();
1371 if (ST.hasIntMinMax64()) {
1372 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1373 .legalFor({S32, S16, S64, V2S16})
1374 .clampMaxNumElements(0, S16, 2)
1375 .minScalar(0, S16)
1377 .scalarize(0)
1378 .lower();
1379 } else {
1380 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1381 .legalFor({S32, S16, V2S16})
1382 .clampMaxNumElements(0, S16, 2)
1383 .minScalar(0, S16)
1385 .scalarize(0)
1386 .lower();
1387 }
1388 } else {
1389 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1390 .legalFor({S32, S16})
1391 .widenScalarToNextPow2(0)
1392 .minScalar(0, S16)
1393 .scalarize(0)
1394 .lower();
1395 }
1396 } else {
1397 // TODO: Should have same legality without v_perm_b32
1399 .legalFor({S32})
1400 .lowerIf(scalarNarrowerThan(0, 32))
1401 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1402 // narrowScalar limitation.
1404 .maxScalar(0, S32)
1405 .scalarize(0)
1406 .lower();
1407
1408 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1409 .legalFor({S32})
1410 .minScalar(0, S32)
1412 .scalarize(0)
1413 .lower();
1414 }
1415
1416 getActionDefinitionsBuilder(G_INTTOPTR)
1417 // List the common cases
1418 .legalForCartesianProduct(AddrSpaces64, {S64})
1419 .legalForCartesianProduct(AddrSpaces32, {S32})
1420 .scalarize(0)
1421 // Accept any address space as long as the size matches
1422 .legalIf(sameSize(0, 1))
1424 [](const LegalityQuery &Query) {
1425 return std::pair(
1426 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1427 })
1428 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1429 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1430 });
1431
1432 getActionDefinitionsBuilder(G_PTRTOINT)
1433 // List the common cases
1434 .legalForCartesianProduct(AddrSpaces64, {S64})
1435 .legalForCartesianProduct(AddrSpaces32, {S32})
1436 .scalarize(0)
1437 // Accept any address space as long as the size matches
1438 .legalIf(sameSize(0, 1))
1440 [](const LegalityQuery &Query) {
1441 return std::pair(
1442 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1443 })
1444 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1445 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1446 });
1447
1448 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1449 .scalarize(0)
1450 .custom();
1451
1452 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1453 bool IsLoad) -> bool {
1454 const LLT DstTy = Query.Types[0];
1455
1456 // Split vector extloads.
1457 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1458
1459 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1460 return true;
1461
1462 const LLT PtrTy = Query.Types[1];
1463 unsigned AS = PtrTy.getAddressSpace();
1464 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1465 Query.MMODescrs[0].Ordering !=
1467 return true;
1468
1469 // Catch weird sized loads that don't evenly divide into the access sizes
1470 // TODO: May be able to widen depending on alignment etc.
1471 unsigned NumRegs = (MemSize + 31) / 32;
1472 if (NumRegs == 3) {
1473 if (!ST.hasDwordx3LoadStores())
1474 return true;
1475 } else {
1476 // If the alignment allows, these should have been widened.
1477 if (!isPowerOf2_32(NumRegs))
1478 return true;
1479 }
1480
1481 return false;
1482 };
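  // Illustrative behaviour of the predicate above: vector extloads whose
  // register size exceeds the memory size are split, accesses wider than the
  // per-address-space maximum are split, and a 3-dword (96-bit) access is
  // split on subtargets without dwordx3 load/stores.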
1483
1484 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1485 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1486 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1487
1488 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1489 // LDS
1490 // TODO: Unsupported flat for SI.
1491
1492 for (unsigned Op : {G_LOAD, G_STORE}) {
1493 const bool IsStore = Op == G_STORE;
1494
1495 auto &Actions = getActionDefinitionsBuilder(Op);
1496 // Explicitly list some common cases.
1497 // TODO: Does this help compile time at all?
1498 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1499 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1500 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1501 {S64, GlobalPtr, S64, GlobalAlign32},
1502 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1503 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1504 {S32, GlobalPtr, S8, GlobalAlign8},
1505 {S32, GlobalPtr, S16, GlobalAlign16},
1506
1507 {S32, LocalPtr, S32, 32},
1508 {S64, LocalPtr, S64, 32},
1509 {V2S32, LocalPtr, V2S32, 32},
1510 {S32, LocalPtr, S8, 8},
1511 {S32, LocalPtr, S16, 16},
1512 {V2S16, LocalPtr, S32, 32},
1513
1514 {S32, PrivatePtr, S32, 32},
1515 {S32, PrivatePtr, S8, 8},
1516 {S32, PrivatePtr, S16, 16},
1517 {V2S16, PrivatePtr, S32, 32},
1518
1519 {S32, ConstantPtr, S32, GlobalAlign32},
1520 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1521 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1522 {S64, ConstantPtr, S64, GlobalAlign32},
1523 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1524 Actions.legalIf(
1525 [=](const LegalityQuery &Query) -> bool {
1526 return isLoadStoreLegal(ST, Query);
1527 });
1528
1529 // The custom pointers (fat pointers, buffer resources) don't work with load
1530 // and store at this level. Fat pointers should have been lowered to
1531 // intrinsics before the translation to MIR.
1532 Actions.unsupportedIf(
1533 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1534
1535 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1536 // ptrtoint. This is needed to account for the fact that we can't have i128
1537 // as a register class for SelectionDAG reasons.
1538 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1539 return hasBufferRsrcWorkaround(Query.Types[0]);
1540 });
1541
1542 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1543 // 64-bits.
1544 //
1545 // TODO: Should generalize bitcast action into coerce, which will also cover
1546 // inserting addrspacecasts.
1547 Actions.customIf(typeIs(1, Constant32Ptr));
1548
1549 // Turn any illegal element vectors into something easier to deal
1550 // with. These will ultimately produce 32-bit scalar shifts to extract the
1551 // parts anyway.
1552 //
1553 // For odd 16-bit element vectors, prefer to split those into pieces with
1554 // 16-bit vector parts.
1555 Actions.bitcastIf(
1556 [=](const LegalityQuery &Query) -> bool {
1557 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1558 Query.MMODescrs[0].MemoryTy);
1559 }, bitcastToRegisterType(0));
1560
1561 if (!IsStore) {
1562 // Widen suitably aligned loads by loading extra bytes. The standard
1563 // legalization actions can't properly express widening memory operands.
1564 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1565 return shouldWidenLoad(ST, Query, G_LOAD);
1566 });
1567 }
1568
1569 // FIXME: load/store narrowing should be moved to lower action
1570 Actions
1571 .narrowScalarIf(
1572 [=](const LegalityQuery &Query) -> bool {
1573 return !Query.Types[0].isVector() &&
1574 needToSplitMemOp(Query, Op == G_LOAD);
1575 },
1576 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1577 const LLT DstTy = Query.Types[0];
1578 const LLT PtrTy = Query.Types[1];
1579
1580 const unsigned DstSize = DstTy.getSizeInBits();
1581 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1582
1583 // Split extloads.
1584 if (DstSize > MemSize)
1585 return std::pair(0, LLT::scalar(MemSize));
1586
1587 unsigned MaxSize = maxSizeForAddrSpace(
1588 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1589 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1590 if (MemSize > MaxSize)
1591 return std::pair(0, LLT::scalar(MaxSize));
1592
1593 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1594 return std::pair(0, LLT::scalar(Align));
1595 })
1596 .fewerElementsIf(
1597 [=](const LegalityQuery &Query) -> bool {
1598 return Query.Types[0].isVector() &&
1599 needToSplitMemOp(Query, Op == G_LOAD);
1600 },
1601 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1602 const LLT DstTy = Query.Types[0];
1603 const LLT PtrTy = Query.Types[1];
1604
1605 LLT EltTy = DstTy.getElementType();
1606 unsigned MaxSize = maxSizeForAddrSpace(
1607 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1608 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1609
1610 // FIXME: Handle widened to power of 2 results better. This ends
1611 // up scalarizing.
1612 // FIXME: 3 element stores scalarized on SI
1613
1614 // Split if it's too large for the address space.
1615 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1616 if (MemSize > MaxSize) {
1617 unsigned NumElts = DstTy.getNumElements();
1618 unsigned EltSize = EltTy.getSizeInBits();
1619
1620 if (MaxSize % EltSize == 0) {
1621 return std::pair(
1623 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1624 }
1625
1626 unsigned NumPieces = MemSize / MaxSize;
1627
1628 // FIXME: Refine when odd breakdowns handled
1629 // The scalars will need to be re-legalized.
1630 if (NumPieces == 1 || NumPieces >= NumElts ||
1631 NumElts % NumPieces != 0)
1632 return std::pair(0, EltTy);
1633
1634 return std::pair(0,
1635 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1636 }
1637
1638 // FIXME: We could probably handle weird extending loads better.
1639 if (DstTy.getSizeInBits() > MemSize)
1640 return std::pair(0, EltTy);
1641
1642 unsigned EltSize = EltTy.getSizeInBits();
1643 unsigned DstSize = DstTy.getSizeInBits();
1644 if (!isPowerOf2_32(DstSize)) {
1645 // We're probably decomposing an odd sized store. Try to split
1646 // to the widest type. TODO: Account for alignment. As-is it
1647 // should be OK, since the new parts will be further legalized.
1648 unsigned FloorSize = llvm::bit_floor(DstSize);
1649 return std::pair(
1651 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1652 }
1653
1654 // May need relegalization for the scalars.
1655 return std::pair(0, EltTy);
1656 })
1657 .minScalar(0, S32)
1658 .narrowScalarIf(isTruncStoreToSizePowerOf2(0),
1660 .widenScalarToNextPow2(0)
1661 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1662 .lower();
1663 }
1664
1665 // FIXME: Unaligned accesses not lowered.
1666 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1667 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1668 {S32, GlobalPtr, S16, 2 * 8},
1669 {S32, LocalPtr, S8, 8},
1670 {S32, LocalPtr, S16, 16},
1671 {S32, PrivatePtr, S8, 8},
1672 {S32, PrivatePtr, S16, 16},
1673 {S32, ConstantPtr, S8, 8},
1674 {S32, ConstantPtr, S16, 2 * 8}})
1675 .legalIf(
1676 [=](const LegalityQuery &Query) -> bool {
1677 return isLoadStoreLegal(ST, Query);
1678 });
1679
1680 if (ST.hasFlatAddressSpace()) {
1681 ExtLoads.legalForTypesWithMemDesc(
1682 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1683 }
1684
1685 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1686 // 64-bits.
1687 //
1688 // TODO: Should generalize bitcast action into coerce, which will also cover
1689 // inserting addrspacecasts.
1690 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1691
1692 ExtLoads.clampScalar(0, S32, S32)
1694 .lower();
1695
1696 auto &Atomics = getActionDefinitionsBuilder(
1697 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1698 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1699 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1700 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1701 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1702 {S64, GlobalPtr}, {S64, LocalPtr},
1703 {S32, RegionPtr}, {S64, RegionPtr}});
1704 if (ST.hasFlatAddressSpace()) {
1705 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1706 }
1707
1708 // TODO: v2bf16 operations, and fat buffer pointer support.
1709 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1710 if (ST.hasLDSFPAtomicAddF32()) {
1711 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1712 if (ST.hasLdsAtomicAddF64())
1713 Atomic.legalFor({{S64, LocalPtr}});
1714 if (ST.hasAtomicDsPkAdd16Insts())
1715 Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1716 }
1717 if (ST.hasAtomicFaddInsts())
1718 Atomic.legalFor({{S32, GlobalPtr}});
1719 if (ST.hasFlatAtomicFaddF32Inst())
1720 Atomic.legalFor({{S32, FlatPtr}});
1721
1722 if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
1723 // These are legal with some caveats, and should have undergone expansion in
1724 // the IR in most situations
1725 // TODO: Move atomic expansion into legalizer
1726 Atomic.legalFor({
1727 {S32, GlobalPtr},
1728 {S64, GlobalPtr},
1729 {S64, FlatPtr}
1730 });
1731 }
1732
1733 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1734 ST.hasAtomicBufferGlobalPkAddF16Insts())
1735 Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1736 if (ST.hasAtomicGlobalPkAddBF16Inst())
1737 Atomic.legalFor({{V2BF16, GlobalPtr}});
1738 if (ST.hasAtomicFlatPkAdd16Insts())
1739 Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1740
1741
1742 // Most of the legalization work here is done by AtomicExpand. We could
1743 // probably use a simpler legality rule that just assumes anything is OK.
1744 auto &AtomicFMinFMax =
1745 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1746 .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1747
1748 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1749 AtomicFMinFMax.legalFor({{F32, GlobalPtr},{F32, BufferFatPtr}});
1750 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1751 AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
1752 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1753 AtomicFMinFMax.legalFor({F32, FlatPtr});
1754 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1755 AtomicFMinFMax.legalFor({F64, FlatPtr});
1756
1757 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
1758 // demarshalling.
1759 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1760 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1761 {S32, FlatPtr}, {S64, FlatPtr}})
1762 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1763 {S32, RegionPtr}, {S64, RegionPtr}});
1764 // TODO: Pointer types, any 32-bit or 64-bit vector
1765
1766 // Condition should be s32 for scalar, s1 for vector.
1769 LocalPtr, FlatPtr, PrivatePtr,
1770 LLT::fixed_vector(2, LocalPtr),
1771 LLT::fixed_vector(2, PrivatePtr)},
1772 {S1, S32})
1773 .clampScalar(0, S16, S64)
1774 .scalarize(1)
1777 .clampMaxNumElements(0, S32, 2)
1778 .clampMaxNumElements(0, LocalPtr, 2)
1779 .clampMaxNumElements(0, PrivatePtr, 2)
1780 .scalarize(0)
1782 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1783
1784 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1785 // be more flexible with the shift amount type.
1786 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1787 .legalFor({{S32, S32}, {S64, S32}});
1788 if (ST.has16BitInsts()) {
1789 if (ST.hasVOP3PInsts()) {
1790 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1791 .clampMaxNumElements(0, S16, 2);
1792 } else
1793 Shifts.legalFor({{S16, S16}});
1794
1795 // TODO: Support 16-bit shift amounts for all types
1796 Shifts.widenScalarIf(
1797 [=](const LegalityQuery &Query) {
1798 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1799 // 32-bit amount.
1800 const LLT ValTy = Query.Types[0];
1801 const LLT AmountTy = Query.Types[1];
1802 return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&
1803 AmountTy.getSizeInBits() < 16;
1804 }, changeTo(1, S16));
1805 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1806 Shifts.clampScalar(1, S32, S32);
1807 Shifts.widenScalarToNextPow2(0, 16);
1808 Shifts.clampScalar(0, S16, S64);
1809
1810 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1811 .minScalar(0, S16)
1812 .scalarize(0)
1813 .lower();
1814 } else {
1815 // Make sure we legalize the shift amount type first, as the general
1816 // expansion for the shifted type will produce much worse code if it hasn't
1817 // been truncated already.
1818 Shifts.clampScalar(1, S32, S32);
1819 Shifts.widenScalarToNextPow2(0, 32);
1820 Shifts.clampScalar(0, S32, S64);
1821
1822 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1823 .minScalar(0, S32)
1824 .scalarize(0)
1825 .lower();
1826 }
1827 Shifts.scalarize(0);
1828
1829 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1830 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1831 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1832 unsigned IdxTypeIdx = 2;
1833
1834 getActionDefinitionsBuilder(Op)
1835 .customIf([=](const LegalityQuery &Query) {
1836 const LLT EltTy = Query.Types[EltTypeIdx];
1837 const LLT VecTy = Query.Types[VecTypeIdx];
1838 const LLT IdxTy = Query.Types[IdxTypeIdx];
1839 const unsigned EltSize = EltTy.getSizeInBits();
1840 const bool isLegalVecType =
1842 // Address space 8 pointers are 128-bit wide values, but the logic
1843 // below will try to bitcast them to 2N x s64, which will fail.
1844 // Therefore, as an intermediate step, wrap extracts/insertions by
1845 // ptrtoint-ing the vector and scalar arguments (or inttoptr-ing the
1846 // extraction result) in order to produce a vector operation that can
1847 // be handled by the logic below.
1848 if (EltTy.isPointer() && EltSize > 64)
1849 return true;
1850 return (EltSize == 32 || EltSize == 64) &&
1851 VecTy.getSizeInBits() % 32 == 0 &&
1852 VecTy.getSizeInBits() <= MaxRegisterSize &&
1853 IdxTy.getSizeInBits() == 32 &&
1854 isLegalVecType;
1855 })
1856 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1857 scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1858 bitcastToVectorElement32(VecTypeIdx))
1859 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1860 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1861 scalarOrEltWiderThan(VecTypeIdx, 64)),
1862 [=](const LegalityQuery &Query) {
1863 // For > 64-bit element types, try to turn this into a
1864 // 64-bit element vector since we may be able to do better
1865 // indexing if this is scalar. If not, fall back to 32.
1866 const LLT EltTy = Query.Types[EltTypeIdx];
1867 const LLT VecTy = Query.Types[VecTypeIdx];
1868 const unsigned DstEltSize = EltTy.getSizeInBits();
1869 const unsigned VecSize = VecTy.getSizeInBits();
1870
1871 const unsigned TargetEltSize =
1872 DstEltSize % 64 == 0 ? 64 : 32;
1873 return std::pair(VecTypeIdx,
1874 LLT::fixed_vector(VecSize / TargetEltSize,
1875 TargetEltSize));
1876 })
1877 .clampScalar(EltTypeIdx, S32, S64)
1878 .clampScalar(VecTypeIdx, S32, S64)
1879 .clampScalar(IdxTypeIdx, S32, S32)
1880 .clampMaxNumElements(VecTypeIdx, S32, 32)
1881 // TODO: Clamp elements for 64-bit vectors?
1882 .moreElementsIf(isIllegalRegisterType(ST, VecTypeIdx),
1884 // It should only be necessary with variable indexes.
1885 // As a last resort, lower to the stack
1886 .lower();
1887 }
1888
1889 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1890 .unsupportedIf([=](const LegalityQuery &Query) {
1891 const LLT &EltTy = Query.Types[1].getElementType();
1892 return Query.Types[0] != EltTy;
1893 });
1894
1895 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1896 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1897 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1898
1899 // FIXME: Doesn't handle extract of illegal sizes.
1900 getActionDefinitionsBuilder(Op)
1901 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1902 .lowerIf([=](const LegalityQuery &Query) {
1903 // Sub-vector (or single element) insert and extract.
1904 // TODO: verify immediate offset here since lower only works with
1905 // whole elements.
1906 const LLT BigTy = Query.Types[BigTyIdx];
1907 return BigTy.isVector();
1908 })
1909 // FIXME: Multiples of 16 should not be legal.
1910 .legalIf([=](const LegalityQuery &Query) {
1911 const LLT BigTy = Query.Types[BigTyIdx];
1912 const LLT LitTy = Query.Types[LitTyIdx];
1913 return (BigTy.getSizeInBits() % 32 == 0) &&
1914 (LitTy.getSizeInBits() % 16 == 0);
1915 })
1916 .widenScalarIf(
1917 [=](const LegalityQuery &Query) {
1918 const LLT BigTy = Query.Types[BigTyIdx];
1919 return (BigTy.getScalarSizeInBits() < 16);
1920 },
1922 .widenScalarIf(
1923 [=](const LegalityQuery &Query) {
1924 const LLT LitTy = Query.Types[LitTyIdx];
1925 return (LitTy.getScalarSizeInBits() < 16);
1926 },
1928 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1929 .widenScalarToNextPow2(BigTyIdx, 32);
1930
1931 }
1932
1933 auto &BuildVector =
1934 getActionDefinitionsBuilder(G_BUILD_VECTOR)
1936 .legalForCartesianProduct(AllS64Vectors, {S64})
1937 .clampNumElements(0, V16S32, V32S32)
1942
1943 if (ST.hasScalarPackInsts()) {
1944 BuildVector
1945 // FIXME: Should probably widen s1 vectors straight to s32
1946 .minScalarOrElt(0, S16)
1947 .minScalar(1, S16);
1948
1949 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1950 .legalFor({V2S16, S32})
1951 .lower();
1952 } else {
1953 BuildVector.customFor({V2S16, S16});
1954 BuildVector.minScalarOrElt(0, S32);
1955
1956 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1957 .customFor({V2S16, S32})
1958 .lower();
1959 }
1960
1961 BuildVector.legalIf(isRegisterType(ST, 0));
1962
1963 // FIXME: Clamp maximum size
1964 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1965 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
1966 .clampMaxNumElements(0, S32, 32)
1967 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1968 .clampMaxNumElements(0, S16, 64);
1969
1970 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1971
1972 // Merge/Unmerge
1973 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1974 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1975 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1976
1977 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1978 const LLT Ty = Query.Types[TypeIdx];
1979 if (Ty.isVector()) {
1980 const LLT &EltTy = Ty.getElementType();
1981 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1982 return true;
1984 return true;
1985 }
1986 return false;
1987 };
1988
1989 auto &Builder =
1990 getActionDefinitionsBuilder(Op)
1991 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
1992 .lowerFor({{S16, V2S16}})
1993 .lowerIf([=](const LegalityQuery &Query) {
1994 const LLT BigTy = Query.Types[BigTyIdx];
1995 return BigTy.getSizeInBits() == 32;
1996 })
1997 // Try to widen to s16 first for small types.
1998 // TODO: Only do this on targets with legal s16 shifts
1999 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
2000 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
2002 oneMoreElement(BigTyIdx))
2004 elementTypeIs(1, S16)),
2005 changeTo(1, V2S16))
2006 // Clamp the little scalar to s8-s256 and make it a power of 2. It's
2007 // not worth considering the multiples of 64 since 2*192 and 2*384
2008 // are not valid.
2009 .clampScalar(LitTyIdx, S32, S512)
2010 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
2011 // Break up vectors with weird elements into scalars
2012 .fewerElementsIf(
2013 [=](const LegalityQuery &Query) {
2014 return notValidElt(Query, LitTyIdx);
2015 },
2016 scalarize(0))
2017 .fewerElementsIf(
2018 [=](const LegalityQuery &Query) {
2019 return notValidElt(Query, BigTyIdx);
2020 },
2021 scalarize(1))
2022 .clampScalar(BigTyIdx, S32, MaxScalar);
2023
2024 if (Op == G_MERGE_VALUES) {
2025 Builder.widenScalarIf(
2026 // TODO: Use 16-bit shifts if legal for 8-bit values?
2027 [=](const LegalityQuery &Query) {
2028 const LLT Ty = Query.Types[LitTyIdx];
2029 return Ty.getSizeInBits() < 32;
2030 },
2031 changeTo(LitTyIdx, S32));
2032 }
2033
2034 Builder.widenScalarIf(
2035 [=](const LegalityQuery &Query) {
2036 const LLT Ty = Query.Types[BigTyIdx];
2037 return Ty.getSizeInBits() % 16 != 0;
2038 },
2039 [=](const LegalityQuery &Query) {
2040 // Pick the next power of 2 or, for sizes over 128 bits, the next
2041 // multiple of 64, whichever is smaller.
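  // e.g. an 88-bit type widens to s128 (next power of 2), while a 300-bit
  // type widens to s320 (next multiple of 64) rather than s512.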
2042 const LLT &Ty = Query.Types[BigTyIdx];
2043 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
2044 if (NewSizeInBits >= 256) {
2045 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
2046 if (RoundedTo < NewSizeInBits)
2047 NewSizeInBits = RoundedTo;
2048 }
2049 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
2050 })
2051 // Any vectors left are the wrong size. Scalarize them.
2052 .scalarize(0)
2053 .scalarize(1);
2054 }
2055
2056 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
2057 // RegBankSelect.
2058 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
2059 .legalFor({{S32}, {S64}})
2060 .clampScalar(0, S32, S64);
2061
2062 if (ST.hasVOP3PInsts()) {
2063 SextInReg.lowerFor({{V2S16}})
2064 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
2065 // get more vector shift opportunities, since we'll get those when
2066 // expanded.
2067 .clampMaxNumElementsStrict(0, S16, 2);
2068 } else if (ST.has16BitInsts()) {
2069 SextInReg.lowerFor({{S32}, {S64}, {S16}});
2070 } else {
2071 // Prefer to promote to s32 before lowering if we don't have 16-bit
2072 // shifts. This avoids a lot of intermediate truncate and extend operations.
2073 SextInReg.lowerFor({{S32}, {S64}});
2074 }
2075
2076 SextInReg
2077 .scalarize(0)
2078 .clampScalar(0, S32, S64)
2079 .lower();
2080
2081 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2082 .scalarize(0)
2083 .lower();
2084
2085 auto &FSHRActionDefs = getActionDefinitionsBuilder(G_FSHR);
2086 FSHRActionDefs.legalFor({{S32, S32}})
2087 .clampMaxNumElementsStrict(0, S16, 2);
2088 if (ST.hasVOP3PInsts())
2089 FSHRActionDefs.lowerFor({{V2S16, V2S16}});
2090 FSHRActionDefs.scalarize(0).lower();
2091
2092 if (ST.hasVOP3PInsts()) {
2094 .lowerFor({{V2S16, V2S16}})
2095 .clampMaxNumElementsStrict(0, S16, 2)
2096 .scalarize(0)
2097 .lower();
2098 } else {
2100 .scalarize(0)
2101 .lower();
2102 }
2103
2104 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2105 .legalFor({S64});
2106
2107 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
2108
2110 .alwaysLegal();
2111
2112 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2113 .scalarize(0)
2114 .minScalar(0, S32)
2115 .lower();
2116
2117 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2118 .legalFor({{S32, S32}, {S64, S32}})
2119 .clampScalar(1, S32, S32)
2120 .clampScalar(0, S32, S64)
2122 .scalarize(0);
2123
2124 getActionDefinitionsBuilder(
2125 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2126 G_FCOPYSIGN,
2127
2128 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2129 G_READ_REGISTER, G_WRITE_REGISTER,
2130
2131 G_SADDO, G_SSUBO})
2132 .lower();
2133
2134 if (ST.hasIEEEMinimumMaximumInsts()) {
2135 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2136 .legalFor(FPTypesPK16)
2137 .clampMaxNumElements(0, S16, 2)
2138 .scalarize(0);
2139 } else {
2140 // TODO: Implement
2141 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
2142 }
2143
2144 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2145 .lower();
2146
2147 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2148
2149 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2150 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2151 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2152 .unsupported();
2153
2155
2156 getActionDefinitionsBuilder(
2157 {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2158 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2159 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2160 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2161 .legalFor(AllVectors)
2162 .scalarize(1)
2163 .lower();
2164
2166 verify(*ST.getInstrInfo());
2167}
2168
2171 LostDebugLocObserver &LocObserver) const {
2172 MachineIRBuilder &B = Helper.MIRBuilder;
2173 MachineRegisterInfo &MRI = *B.getMRI();
2174
2175 switch (MI.getOpcode()) {
2176 case TargetOpcode::G_ADDRSPACE_CAST:
2177 return legalizeAddrSpaceCast(MI, MRI, B);
2178 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2179 return legalizeFroundeven(MI, MRI, B);
2180 case TargetOpcode::G_FCEIL:
2181 return legalizeFceil(MI, MRI, B);
2182 case TargetOpcode::G_FREM:
2183 return legalizeFrem(MI, MRI, B);
2184 case TargetOpcode::G_INTRINSIC_TRUNC:
2185 return legalizeIntrinsicTrunc(MI, MRI, B);
2186 case TargetOpcode::G_SITOFP:
2187 return legalizeITOFP(MI, MRI, B, true);
2188 case TargetOpcode::G_UITOFP:
2189 return legalizeITOFP(MI, MRI, B, false);
2190 case TargetOpcode::G_FPTOSI:
2191 return legalizeFPTOI(MI, MRI, B, true);
2192 case TargetOpcode::G_FPTOUI:
2193 return legalizeFPTOI(MI, MRI, B, false);
2194 case TargetOpcode::G_FMINNUM:
2195 case TargetOpcode::G_FMAXNUM:
2196 case TargetOpcode::G_FMINIMUMNUM:
2197 case TargetOpcode::G_FMAXIMUMNUM:
2198 case TargetOpcode::G_FMINNUM_IEEE:
2199 case TargetOpcode::G_FMAXNUM_IEEE:
2200 return legalizeMinNumMaxNum(Helper, MI);
2201 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2202 return legalizeExtractVectorElt(MI, MRI, B);
2203 case TargetOpcode::G_INSERT_VECTOR_ELT:
2204 return legalizeInsertVectorElt(MI, MRI, B);
2205 case TargetOpcode::G_FSIN:
2206 case TargetOpcode::G_FCOS:
2207 return legalizeSinCos(MI, MRI, B);
2208 case TargetOpcode::G_GLOBAL_VALUE:
2209 return legalizeGlobalValue(MI, MRI, B);
2210 case TargetOpcode::G_LOAD:
2211 case TargetOpcode::G_SEXTLOAD:
2212 case TargetOpcode::G_ZEXTLOAD:
2213 return legalizeLoad(Helper, MI);
2214 case TargetOpcode::G_STORE:
2215 return legalizeStore(Helper, MI);
2216 case TargetOpcode::G_FMAD:
2217 return legalizeFMad(MI, MRI, B);
2218 case TargetOpcode::G_FDIV:
2219 return legalizeFDIV(MI, MRI, B);
2220 case TargetOpcode::G_FFREXP:
2221 return legalizeFFREXP(MI, MRI, B);
2222 case TargetOpcode::G_FSQRT:
2223 return legalizeFSQRT(MI, MRI, B);
2224 case TargetOpcode::G_UDIV:
2225 case TargetOpcode::G_UREM:
2226 case TargetOpcode::G_UDIVREM:
2227 return legalizeUnsignedDIV_REM(MI, MRI, B);
2228 case TargetOpcode::G_SDIV:
2229 case TargetOpcode::G_SREM:
2230 case TargetOpcode::G_SDIVREM:
2231 return legalizeSignedDIV_REM(MI, MRI, B);
2232 case TargetOpcode::G_ATOMIC_CMPXCHG:
2233 return legalizeAtomicCmpXChg(MI, MRI, B);
2234 case TargetOpcode::G_FLOG2:
2235 return legalizeFlog2(MI, B);
2236 case TargetOpcode::G_FLOG:
2237 case TargetOpcode::G_FLOG10:
2238 return legalizeFlogCommon(MI, B);
2239 case TargetOpcode::G_FEXP2:
2240 return legalizeFExp2(MI, B);
2241 case TargetOpcode::G_FEXP:
2242 case TargetOpcode::G_FEXP10:
2243 return legalizeFExp(MI, B);
2244 case TargetOpcode::G_FPOW:
2245 return legalizeFPow(MI, B);
2246 case TargetOpcode::G_FFLOOR:
2247 return legalizeFFloor(MI, MRI, B);
2248 case TargetOpcode::G_BUILD_VECTOR:
2249 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2250 return legalizeBuildVector(MI, MRI, B);
2251 case TargetOpcode::G_MUL:
2252 return legalizeMul(Helper, MI);
2253 case TargetOpcode::G_CTLZ:
2254 case TargetOpcode::G_CTTZ:
2255 return legalizeCTLZ_CTTZ(MI, MRI, B);
2256 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2257 return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
2258 case TargetOpcode::G_STACKSAVE:
2259 return legalizeStackSave(MI, B);
2260 case TargetOpcode::G_GET_FPENV:
2261 return legalizeGetFPEnv(MI, MRI, B);
2262 case TargetOpcode::G_SET_FPENV:
2263 return legalizeSetFPEnv(MI, MRI, B);
2264 case TargetOpcode::G_TRAP:
2265 return legalizeTrap(MI, MRI, B);
2266 case TargetOpcode::G_DEBUGTRAP:
2267 return legalizeDebugTrap(MI, MRI, B);
2268 default:
2269 return false;
2270 }
2271
2272 llvm_unreachable("expected switch to return");
2273}
2274
2276 unsigned AS,
2278 MachineIRBuilder &B) const {
2279 MachineFunction &MF = B.getMF();
2280 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2281 const LLT S32 = LLT::scalar(32);
2282 const LLT S64 = LLT::scalar(64);
2283
2285
2286 if (ST.hasApertureRegs()) {
2287 // Note: this register is somewhat broken. When used as a 32-bit operand,
2288 // it only returns zeroes. The real value is in the upper 32 bits.
2289 // Thus, we must extract the high 32 bits.
2290 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2291 ? AMDGPU::SRC_SHARED_BASE
2292 : AMDGPU::SRC_PRIVATE_BASE;
2293 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
2294 !ST.hasGloballyAddressableScratch()) &&
2295 "Cannot use src_private_base with globally addressable scratch!");
2296 // FIXME: It would be more natural to emit a COPY here, but then copy
2297 // coalescing would kick in and it would think it's okay to use the "HI"
2298 // subregister (instead of extracting the HI 32 bits) which is an artificial
2299 // (unusable) register.
2300 // Register TableGen definitions would need an overhaul to get rid of the
2301 // artificial "HI" aperture registers and prevent this kind of issue from
2302 // happening.
2303 Register Dst = MRI.createGenericVirtualRegister(S64);
2304 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2305 B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
2306 return B.buildUnmerge(S32, Dst).getReg(1);
2307 }
2308
2309 // TODO: can we be smarter about machine pointer info?
2311 Register LoadAddr = MRI.createGenericVirtualRegister(
2313 // For code object version 5, private_base and shared_base are passed through
2314 // implicit kernargs.
2321 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2322
2323 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2325
2326 if (!loadInputValue(KernargPtrReg, B,
2328 return Register();
2329
2331 PtrInfo,
2335
2336 // Pointer address
2337 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
2338 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2339 // Load address
2340 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2341 }
2342
2343 Register QueuePtr = MRI.createGenericVirtualRegister(
2345
2347 return Register();
2348
2349 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2350 // private_segment_aperture_base_hi.
2351 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2352
2354 PtrInfo,
2357 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2358
2359 B.buildObjectPtrOffset(
2360 LoadAddr, QueuePtr,
2361 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2362 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2363}
2364
2365/// Return true if the value is a known valid address, such that a null check is
2366/// not necessary.
2368 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2369 MachineInstr *Def = MRI.getVRegDef(Val);
2370 switch (Def->getOpcode()) {
2371 case AMDGPU::G_FRAME_INDEX:
2372 case AMDGPU::G_GLOBAL_VALUE:
2373 case AMDGPU::G_BLOCK_ADDR:
2374 return true;
2375 case AMDGPU::G_CONSTANT: {
2376 const ConstantInt *CI = Def->getOperand(1).getCImm();
2377 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2378 }
2379 default:
2380 return false;
2381 }
2382
2383 return false;
2384}
2385
2388 MachineIRBuilder &B) const {
2389 MachineFunction &MF = B.getMF();
2390
2391 // MI can either be a G_ADDRSPACE_CAST or a
2392 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2393 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2394 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2395 Intrinsic::amdgcn_addrspacecast_nonnull));
2396
2397 const LLT S32 = LLT::scalar(32);
2398 Register Dst = MI.getOperand(0).getReg();
2399 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2400 : MI.getOperand(1).getReg();
2401 LLT DstTy = MRI.getType(Dst);
2402 LLT SrcTy = MRI.getType(Src);
2403 unsigned DestAS = DstTy.getAddressSpace();
2404 unsigned SrcAS = SrcTy.getAddressSpace();
2405
2406 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2407 // vector element.
2408 assert(!DstTy.isVector());
2409
2410 const AMDGPUTargetMachine &TM
2411 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2412
2413 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2414 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2415 return true;
2416 }
2417
2418 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2419 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2420 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2421 auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
2422 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2423 ST.hasGloballyAddressableScratch()) {
2424 // flat -> private with globally addressable scratch: subtract
2425 // src_flat_scratch_base_lo.
2426 const LLT S32 = LLT::scalar(32);
2427 Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
2428 Register FlatScratchBaseLo =
2429 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
2430 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
2431 .getReg(0);
2432 MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
2433 Register Sub = B.buildSub(S32, SrcLo, FlatScratchBaseLo).getReg(0);
2434 return B.buildIntToPtr(Dst, Sub).getReg(0);
2435 }
2436
2437 // Extract low 32-bits of the pointer.
2438 return B.buildExtract(Dst, Src, 0).getReg(0);
2439 };
2440
2441 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2442 // G_ADDRSPACE_CAST we need to guess.
2443 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2444 castFlatToLocalOrPrivate(Dst);
2445 MI.eraseFromParent();
2446 return true;
2447 }
2448
2449 unsigned NullVal = TM.getNullPointerValue(DestAS);
2450
2451 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2452 auto FlatNull = B.buildConstant(SrcTy, 0);
2453
2454 // Extract low 32-bits of the pointer.
2455 auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
2456
2457 auto CmpRes =
2458 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2459 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2460
2461 MI.eraseFromParent();
2462 return true;
2463 }
2464
2465 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2466 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2467 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2468 auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
2469 // Coerce the type of the low half of the result so we can use
2470 // merge_values.
2471 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2472
2473 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
2474 ST.hasGloballyAddressableScratch()) {
2475 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
2476 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
2477 Register AllOnes = B.buildConstant(S32, -1).getReg(0);
2478 Register ThreadID = B.buildConstant(S32, 0).getReg(0);
2479 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
2480 .addUse(AllOnes)
2481 .addUse(ThreadID)
2482 .getReg(0);
2483 if (ST.isWave64()) {
2484 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
2485 .addUse(AllOnes)
2486 .addUse(ThreadID)
2487 .getReg(0);
2488 }
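  // The TID lands at bit 52 (wave32) or 51 (wave64) of the 64-bit address;
  // since SrcHi provides bits [63:32], the shift within the high dword is
  // 57 - 32 - log2(wavesize), i.e. 20 for wave32 and 19 for wave64.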
2489 Register ShAmt =
2490 B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
2491 Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
2492 Register CvtPtr =
2493 B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
2494 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
2495 // 64-bit hi:lo value.
2496 Register FlatScratchBase =
2497 B.buildInstr(AMDGPU::S_MOV_B64, {S64},
2498 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
2499 .getReg(0);
2500 MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
2501 return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
2502 }
2503
2504 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2505 if (!ApertureReg.isValid())
2506 return false;
2507
2508 // TODO: Should we allow mismatched types but matching sizes in merges to
2509 // avoid the ptrtoint?
2510 return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
2511 };
2512
2513 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2514 // G_ADDRSPACE_CAST we need to guess.
2515 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2516 castLocalOrPrivateToFlat(Dst);
2517 MI.eraseFromParent();
2518 return true;
2519 }
2520
2521 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2522
2523 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2524 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2525
2526 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2527 SegmentNull.getReg(0));
2528
2529 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2530
2531 MI.eraseFromParent();
2532 return true;
2533 }
2534
2535 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2536 SrcTy.getSizeInBits() == 64) {
2537 // Truncate.
2538 B.buildExtract(Dst, Src, 0);
2539 MI.eraseFromParent();
2540 return true;
2541 }
2542
2543 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2544 DstTy.getSizeInBits() == 64) {
2546 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2547 auto PtrLo = B.buildPtrToInt(S32, Src);
2548 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2549 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2550 MI.eraseFromParent();
2551 return true;
2552 }
2553
2554 // Invalid casts are poison.
2555 // TODO: Should return poison
2556 B.buildUndef(Dst);
2557 MI.eraseFromParent();
2558 return true;
2559}
2560
2563 MachineIRBuilder &B) const {
2564 Register Src = MI.getOperand(1).getReg();
2565 LLT Ty = MRI.getType(Src);
2566 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2567
2568 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2569 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
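  // Adding and then subtracting 2^52 with the sign of Src rounds to the
  // nearest integer (ties to even) because f64 has a 52-bit fraction; inputs
  // with |Src| > 0x1.fffffffffffffp+51 are already integral and are passed
  // through unchanged by the select below.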
2570
2571 auto C1 = B.buildFConstant(Ty, C1Val);
2572 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2573
2574 // TODO: Should this propagate fast-math-flags?
2575 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2576 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2577
2578 auto C2 = B.buildFConstant(Ty, C2Val);
2579 auto Fabs = B.buildFAbs(Ty, Src);
2580
2581 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2582 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2583 MI.eraseFromParent();
2584 return true;
2585}
2586
2589 MachineIRBuilder &B) const {
2590
2591 const LLT S1 = LLT::scalar(1);
2592 const LLT S64 = LLT::scalar(64);
2593
2594 Register Src = MI.getOperand(1).getReg();
2595 assert(MRI.getType(Src) == S64);
2596
2597 // result = trunc(src)
2598 // if (src > 0.0 && src != result)
2599 // result += 1.0
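  // e.g. src = 2.5: trunc = 2.0, both conditions hold, so the result is 3.0;
  // src = -2.5: trunc = -2.0, src is not positive, so -2.0 is kept.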
2600
2601 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2602
2603 const auto Zero = B.buildFConstant(S64, 0.0);
2604 const auto One = B.buildFConstant(S64, 1.0);
2605 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2606 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2607 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2608 auto Add = B.buildSelect(S64, And, One, Zero);
2609
2610 // TODO: Should this propagate fast-math-flags?
2611 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2612 MI.eraseFromParent();
2613 return true;
2614}
2615
2618 MachineIRBuilder &B) const {
2619 Register DstReg = MI.getOperand(0).getReg();
2620 Register Src0Reg = MI.getOperand(1).getReg();
2621 Register Src1Reg = MI.getOperand(2).getReg();
2622 auto Flags = MI.getFlags();
2623 LLT Ty = MRI.getType(DstReg);
2624
2625 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2626 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2627 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2628 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2629 MI.eraseFromParent();
2630 return true;
2631}
2632
2635 const unsigned FractBits = 52;
2636 const unsigned ExpBits = 11;
2637 LLT S32 = LLT::scalar(32);
2638
2639 auto Const0 = B.buildConstant(S32, FractBits - 32);
2640 auto Const1 = B.buildConstant(S32, ExpBits);
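  // In the high dword of an IEEE-754 double the biased exponent occupies bits
  // [30:20], so extract ExpBits = 11 bits at offset FractBits - 32 = 20 and
  // subtract the exponent bias of 1023.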
2641
2642 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2643 .addUse(Hi)
2644 .addUse(Const0.getReg(0))
2645 .addUse(Const1.getReg(0));
2646
2647 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2648}
2649
2652 MachineIRBuilder &B) const {
2653 const LLT S1 = LLT::scalar(1);
2654 const LLT S32 = LLT::scalar(32);
2655 const LLT S64 = LLT::scalar(64);
2656
2657 Register Src = MI.getOperand(1).getReg();
2658 assert(MRI.getType(Src) == S64);
2659
2660 // TODO: Should this use extract since the low half is unused?
2661 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2662 Register Hi = Unmerge.getReg(1);
2663
2664 // Extract the upper half, since this is where we will find the sign and
2665 // exponent.
2666 auto Exp = extractF64Exponent(Hi, B);
2667
2668 const unsigned FractBits = 52;
2669
2670 // Extract the sign bit.
2671 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2672 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2673
2674 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2675
2676 const auto Zero32 = B.buildConstant(S32, 0);
2677
2678 // Extend back to 64-bits.
2679 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2680
2681 auto Shr = B.buildAShr(S64, FractMask, Exp);
2682 auto Not = B.buildNot(S64, Shr);
2683 auto Tmp0 = B.buildAnd(S64, Src, Not);
2684 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2685
2686 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2687 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2688
2689 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2690 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2691 MI.eraseFromParent();
2692 return true;
2693}
2694
2697 MachineIRBuilder &B, bool Signed) const {
2698
2699 Register Dst = MI.getOperand(0).getReg();
2700 Register Src = MI.getOperand(1).getReg();
2701
2702 const LLT S64 = LLT::scalar(64);
2703 const LLT S32 = LLT::scalar(32);
2704
2705 assert(MRI.getType(Src) == S64);
2706
2707 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2708 auto ThirtyTwo = B.buildConstant(S32, 32);
2709
2710 if (MRI.getType(Dst) == S64) {
2711 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2712 : B.buildUITOFP(S64, Unmerge.getReg(1));
2713
2714 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2715 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2716
2717 // TODO: Should this propagate fast-math-flags?
2718 B.buildFAdd(Dst, LdExp, CvtLo);
2719 MI.eraseFromParent();
2720 return true;
2721 }
2722
2723 assert(MRI.getType(Dst) == S32);
2724
2725 auto One = B.buildConstant(S32, 1);
2726
2727 MachineInstrBuilder ShAmt;
2728 if (Signed) {
2729 auto ThirtyOne = B.buildConstant(S32, 31);
2730 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2731 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2732 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2733 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2734 .addUse(Unmerge.getReg(1));
2735 auto LS2 = B.buildSub(S32, LS, One);
2736 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2737 } else
2738 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2739 auto Norm = B.buildShl(S64, Src, ShAmt);
2740 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2741 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2742 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2743 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2744 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2745 B.buildFLdexp(Dst, FVal, Scale);
2746 MI.eraseFromParent();
2747 return true;
2748}
2749
2750// TODO: Copied from DAG implementation. Verify logic and document how this
2751// actually works.
2755 bool Signed) const {
2756
2757 Register Dst = MI.getOperand(0).getReg();
2758 Register Src = MI.getOperand(1).getReg();
2759
2760 const LLT S64 = LLT::scalar(64);
2761 const LLT S32 = LLT::scalar(32);
2762
2763 const LLT SrcLT = MRI.getType(Src);
2764 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2765
2766 unsigned Flags = MI.getFlags();
2767
2768 // The basic idea of converting a floating point number into a pair of 32-bit
2769 // integers is illustrated as follows:
2770 //
2771 // tf := trunc(val);
2772 // hif := floor(tf * 2^-32);
2773 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2774 // hi := fptoi(hif);
2775 // lo := fptoi(lof);
2776 //
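  // e.g. val = 2^33 + 5: tf = 8589934597.0, hif = floor(tf * 2^-32) = 2.0,
  // lof = tf - 2.0 * 2^32 = 5.0, so hi = 2 and lo = 5 reassemble the
  // original value.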
2777 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2779 if (Signed && SrcLT == S32) {
2780 // However, a 32-bit floating point number has only a 23-bit mantissa,
2781 // which is not enough to hold all the significant bits of `lof` if val is
2782 // negative. To avoid the loss of precision, we need to take the absolute
2783 // value after truncating and flip the result back based on the original
2784 // signedness.
2785 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2786 Trunc = B.buildFAbs(S32, Trunc, Flags);
2787 }
2788 MachineInstrBuilder K0, K1;
2789 if (SrcLT == S64) {
2790 K0 = B.buildFConstant(
2791 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2792 K1 = B.buildFConstant(
2793 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2794 } else {
2795 K0 = B.buildFConstant(
2796 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2797 K1 = B.buildFConstant(
2798 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2799 }
2800
2801 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2802 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2803 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2804
2805 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2806 : B.buildFPTOUI(S32, FloorMul);
2807 auto Lo = B.buildFPTOUI(S32, Fma);
2808
2809 if (Signed && SrcLT == S32) {
2810 // Flip the result based on the signedness, which is either all 0s or 1s.
2811 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2812 // r := xor({lo, hi}, sign) - sign;
2813 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2814 Sign);
2815 } else
2816 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2817 MI.eraseFromParent();
2818
2819 return true;
2820}
2821
2823 MachineInstr &MI) const {
2824 MachineFunction &MF = Helper.MIRBuilder.getMF();
2826
2827 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2828 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2829
2830 // With ieee_mode disabled, the instructions have the correct behavior
2831 // already for G_FMINIMUMNUM/G_FMAXIMUMNUM.
2832 //
2833 // FIXME: G_FMINNUM/G_FMAXNUM should match the behavior with ieee_mode
2834 // enabled.
2835 if (!MFI->getMode().IEEE) {
2836 if (MI.getOpcode() == AMDGPU::G_FMINIMUMNUM ||
2837 MI.getOpcode() == AMDGPU::G_FMAXIMUMNUM)
2838 return true;
2839
2840 return !IsIEEEOp;
2841 }
2842
2843 if (IsIEEEOp)
2844 return true;
2845
2847}
2848
2851 MachineIRBuilder &B) const {
2852 // TODO: Should move some of this into LegalizerHelper.
2853
2854 // TODO: Promote dynamic indexing of s16 to s32
2855
2856 Register Dst = MI.getOperand(0).getReg();
2857 Register Vec = MI.getOperand(1).getReg();
2858
2859 LLT VecTy = MRI.getType(Vec);
2860 LLT EltTy = VecTy.getElementType();
2861 assert(EltTy == MRI.getType(Dst));
2862
2863 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2864 // but we can't go directly to that logic because you can't bitcast a vector
2865 // of pointers to a vector of integers. Therefore, introduce an intermediate
2866 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2867 // drive the legalization forward.
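  // For example, a <4 x p8> vector of 128-bit buffer resources is treated as
  // <4 x s128>: ptrtoint the vector, extract an s128 element, and inttoptr
  // the result back to p8.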
2868 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2869 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2870 LLT IntVecTy = VecTy.changeElementType(IntTy);
2871
2872 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2873 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2874 B.buildIntToPtr(Dst, IntElt);
2875
2876 MI.eraseFromParent();
2877 return true;
2878 }
2879
2880 // FIXME: Artifact combiner probably should have replaced the truncated
2881 // constant before this, so we shouldn't need
2882 // getIConstantVRegValWithLookThrough.
2883 std::optional<ValueAndVReg> MaybeIdxVal =
2884 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2885 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2886 return true;
2887 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2888
2889 if (IdxVal < VecTy.getNumElements()) {
2890 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2891 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2892 } else {
2893 B.buildUndef(Dst);
2894 }
2895
2896 MI.eraseFromParent();
2897 return true;
2898}
2899
2902 MachineIRBuilder &B) const {
2903 // TODO: Should move some of this into LegalizerHelper.
2904
2905 // TODO: Promote dynamic indexing of s16 to s32
2906
2907 Register Dst = MI.getOperand(0).getReg();
2908 Register Vec = MI.getOperand(1).getReg();
2909 Register Ins = MI.getOperand(2).getReg();
2910
2911 LLT VecTy = MRI.getType(Vec);
2912 LLT EltTy = VecTy.getElementType();
2913 assert(EltTy == MRI.getType(Ins));
2914
2915 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2916 // but we can't go directly to that logic because you can't bitcast a vector
2917 // of pointers to a vector of integers. Therefore, make the pointer vector
2918 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2919 // new value, and then inttoptr the result vector back. This will then allow
2920 // the rest of legalization to take over.
2921 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2922 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2923 LLT IntVecTy = VecTy.changeElementType(IntTy);
2924
2925 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2926 auto IntIns = B.buildPtrToInt(IntTy, Ins);
2927 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2928 MI.getOperand(3));
2929 B.buildIntToPtr(Dst, IntVecDest);
2930 MI.eraseFromParent();
2931 return true;
2932 }
2933
2934 // FIXME: Artifact combiner probably should have replaced the truncated
2935 // constant before this, so we shouldn't need
2936 // getIConstantVRegValWithLookThrough.
2937 std::optional<ValueAndVReg> MaybeIdxVal =
2938 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2939 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2940 return true;
2941
2942 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2943
2944 unsigned NumElts = VecTy.getNumElements();
2945 if (IdxVal < NumElts) {
2947 for (unsigned i = 0; i < NumElts; ++i)
2948 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2949 B.buildUnmerge(SrcRegs, Vec);
2950
2951 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2952 B.buildMergeLikeInstr(Dst, SrcRegs);
2953 } else {
2954 B.buildUndef(Dst);
2955 }
2956
2957 MI.eraseFromParent();
2958 return true;
2959}
2960
2963 MachineIRBuilder &B) const {
2964
2965 Register DstReg = MI.getOperand(0).getReg();
2966 Register SrcReg = MI.getOperand(1).getReg();
2967 LLT Ty = MRI.getType(DstReg);
2968 unsigned Flags = MI.getFlags();
2969
2970 Register TrigVal;
2971 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2972 if (ST.hasTrigReducedRange()) {
2973 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2974 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
2975 .addUse(MulVal.getReg(0))
2976 .setMIFlags(Flags)
2977 .getReg(0);
2978 } else
2979 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2980
2981 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2982 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2983 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
2984 .addUse(TrigVal)
2985 .setMIFlags(Flags);
2986 MI.eraseFromParent();
2987 return true;
2988}
2989
2992 const GlobalValue *GV,
2993 int64_t Offset,
2994 unsigned GAFlags) const {
2995 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2996 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2997 // to the following code sequence:
2998 //
2999 // For constant address space:
3000 // s_getpc_b64 s[0:1]
3001 // s_add_u32 s0, s0, $symbol
3002 // s_addc_u32 s1, s1, 0
3003 //
3004 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3005 // a fixup or relocation is emitted to replace $symbol with a literal
3006 // constant, which is a pc-relative offset from the encoding of the $symbol
3007 // operand to the global variable.
3008 //
3009 // For global address space:
3010 // s_getpc_b64 s[0:1]
3011 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
3012 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
3013 //
3014 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3015 // fixups or relocations are emitted to replace $symbol@*@lo and
3016 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
3017 // which is a 64-bit pc-relative offset from the encoding of the $symbol
3018 // operand to the global variable.
3019
3021
3022 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
3023 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
3024
3025 if (ST.has64BitLiterals()) {
3026 assert(GAFlags != SIInstrInfo::MO_NONE);
3027
3029 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
3030 MIB.addGlobalAddress(GV, Offset, GAFlags + 2);
3031 } else {
3033 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);
3034
3035 MIB.addGlobalAddress(GV, Offset, GAFlags);
3036 if (GAFlags == SIInstrInfo::MO_NONE)
3037 MIB.addImm(0);
3038 else
3039 MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
3040 }
3041
3042 if (!B.getMRI()->getRegClassOrNull(PCReg))
3043 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
3044
3045 if (PtrTy.getSizeInBits() == 32)
3046 B.buildExtract(DstReg, PCReg, 0);
3047 return true;
3048}
3049
3050// Emit an ABS32_LO / ABS32_HI relocation stub.
3052 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
3053 MachineRegisterInfo &MRI) const {
3054 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
3055
3056 if (RequiresHighHalf && ST.has64BitLiterals()) {
3057 if (!MRI.getRegClassOrNull(DstReg))
3058 MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
3059 B.buildInstr(AMDGPU::S_MOV_B64)
3060 .addDef(DstReg)
3061 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS64);
3062 return;
3063 }
3064
3065 LLT S32 = LLT::scalar(32);
3066
3067 // Use the destination directly, if and only if we store the lower address
3068 // part only and we don't have a register class being set.
3069 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
3070 ? DstReg
3071 : MRI.createGenericVirtualRegister(S32);
3072
3073 if (!MRI.getRegClassOrNull(AddrLo))
3074 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
3075
3076 // Write the lower half.
3077 B.buildInstr(AMDGPU::S_MOV_B32)
3078 .addDef(AddrLo)
3079 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
3080
3081 // If required, write the upper half as well.
3082 if (RequiresHighHalf) {
3083 assert(PtrTy.getSizeInBits() == 64 &&
3084 "Must provide a 64-bit pointer type!");
3085
3086 Register AddrHi = MRI.createGenericVirtualRegister(S32);
3087 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
3088
3089 B.buildInstr(AMDGPU::S_MOV_B32)
3090 .addDef(AddrHi)
3091 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
3092
3093 // Use the destination directly, if and only if we don't have a register
3094 // class being set.
3095 Register AddrDst = !MRI.getRegClassOrNull(DstReg)
3096 ? DstReg
3097 : MRI.createGenericVirtualRegister(LLT::scalar(64));
3098
3099 if (!MRI.getRegClassOrNull(AddrDst))
3100 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
3101
3102 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
3103
3104 // If we created a new register for the destination, cast the result into
3105 // the final output.
3106 if (AddrDst != DstReg)
3107 B.buildCast(DstReg, AddrDst);
3108 } else if (AddrLo != DstReg) {
3109 // If we created a new register for the destination, cast the result into
3110 // the final output.
3111 B.buildCast(DstReg, AddrLo);
3112 }
3113}
3114
3117 MachineIRBuilder &B) const {
3118 Register DstReg = MI.getOperand(0).getReg();
3119 LLT Ty = MRI.getType(DstReg);
3120 unsigned AS = Ty.getAddressSpace();
3121
3122 const GlobalValue *GV = MI.getOperand(1).getGlobal();
3123 MachineFunction &MF = B.getMF();
3125
3127 if (!MFI->isModuleEntryFunction() &&
3128 GV->getName() != "llvm.amdgcn.module.lds" &&
3130 const Function &Fn = MF.getFunction();
3132 Fn, "local memory global used by non-kernel function",
3133 MI.getDebugLoc(), DS_Warning));
3134
3135 // We currently don't have a way to correctly allocate LDS objects that
3136 // aren't directly associated with a kernel. We do force inlining of
3137 // functions that use local objects. However, if these dead functions are
3138 // not eliminated, we don't want a compile time error. Just emit a warning
3139 // and a trap, since there should be no callable path here.
3140 B.buildTrap();
3141 B.buildUndef(DstReg);
3142 MI.eraseFromParent();
3143 return true;
3144 }
3145
3146 // TODO: We could emit code to handle the initialization somewhere.
3147 // We ignore the initializer for now and legalize it to allow selection.
3148 // The initializer will be diagnosed as an error during assembly emission anyway.
3149 const SITargetLowering *TLI = ST.getTargetLowering();
3150 if (!TLI->shouldUseLDSConstAddress(GV)) {
3151 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
3152 return true; // Leave in place;
3153 }
3154
3155 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
3156 Type *Ty = GV->getValueType();
3157 // HIP uses an unsized array `extern __shared__ T s[]` or similar
3158 // zero-sized type in other languages to declare the dynamic shared
3159 // memory whose size is not known at compile time. Such arrays are
3160 // allocated by the runtime and placed directly after the statically
3161 // allocated ones. They all share the same offset.
3162 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
3163 // Adjust alignment for that dynamic shared memory array.
3165 LLT S32 = LLT::scalar(32);
3166 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
3167 B.buildIntToPtr(DstReg, Sz);
3168 MI.eraseFromParent();
3169 return true;
3170 }
3171 }
3172
3173 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
3174 *cast<GlobalVariable>(GV)));
3175 MI.eraseFromParent();
3176 return true;
3177 }
3178
3179 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3180 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
3181 MI.eraseFromParent();
3182 return true;
3183 }
3184
3185 const SITargetLowering *TLI = ST.getTargetLowering();
3186
3187 if (TLI->shouldEmitFixup(GV)) {
3188 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
3189 MI.eraseFromParent();
3190 return true;
3191 }
3192
3193 if (TLI->shouldEmitPCReloc(GV)) {
3194 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
3195 MI.eraseFromParent();
3196 return true;
3197 }
3198
3200 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
3201
3202 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3207 LoadTy, Align(8));
3208
3209 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3210
3211 if (Ty.getSizeInBits() == 32) {
3212 // Truncate if this is a 32-bit constant address.
3213 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3214 B.buildExtract(DstReg, Load, 0);
3215 } else
3216 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3217
3218 MI.eraseFromParent();
3219 return true;
3220}
3221
3223 if (Ty.isVector())
3224 return Ty.changeElementCount(
3225 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
3226 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
3227}
3228
3230 MachineInstr &MI) const {
3231 MachineIRBuilder &B = Helper.MIRBuilder;
3232 MachineRegisterInfo &MRI = *B.getMRI();
3233 GISelChangeObserver &Observer = Helper.Observer;
3234
3235 Register PtrReg = MI.getOperand(1).getReg();
3236 LLT PtrTy = MRI.getType(PtrReg);
3237 unsigned AddrSpace = PtrTy.getAddressSpace();
3238
3239 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3241 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3242 Observer.changingInstr(MI);
3243 MI.getOperand(1).setReg(Cast.getReg(0));
3244 Observer.changedInstr(MI);
3245 return true;
3246 }
3247
3248 if (MI.getOpcode() != AMDGPU::G_LOAD)
3249 return false;
3250
3251 Register ValReg = MI.getOperand(0).getReg();
3252 LLT ValTy = MRI.getType(ValReg);
3253
3254 if (hasBufferRsrcWorkaround(ValTy)) {
3255 Observer.changingInstr(MI);
3257 Observer.changedInstr(MI);
3258 return true;
3259 }
3260
3261 MachineMemOperand *MMO = *MI.memoperands_begin();
3262 const unsigned ValSize = ValTy.getSizeInBits();
3263 const LLT MemTy = MMO->getMemoryType();
3264 const Align MemAlign = MMO->getAlign();
3265 const unsigned MemSize = MemTy.getSizeInBits();
3266 const uint64_t AlignInBits = 8 * MemAlign.value();
3267
3268 // Widen non-power-of-2 loads to the alignment if needed
3269 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3270 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3271
3272 // This was already the correct extending load result type, so just adjust
3273 // the memory type.
3274 if (WideMemSize == ValSize) {
3275 MachineFunction &MF = B.getMF();
3276
3277 MachineMemOperand *WideMMO =
3278 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3279 Observer.changingInstr(MI);
3280 MI.setMemRefs(MF, {WideMMO});
3281 Observer.changedInstr(MI);
3282 return true;
3283 }
3284
3285 // Don't bother handling an edge case that should probably never be produced.
3286 if (ValSize > WideMemSize)
3287 return false;
3288
3289 LLT WideTy = widenToNextPowerOf2(ValTy);
3290
3291 Register WideLoad;
3292 if (!WideTy.isVector()) {
3293 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3294 B.buildTrunc(ValReg, WideLoad).getReg(0);
3295 } else {
3296 // Extract the subvector.
3297
3298 if (isRegisterType(ST, ValTy)) {
3299 // If this is a case where G_EXTRACT is legal, use it.
3300 // (e.g. <3 x s32> -> <4 x s32>)
3301 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3302 B.buildExtract(ValReg, WideLoad, 0);
3303 } else {
3304 // For cases where the widened type isn't a nice register value, unmerge
3305 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3306 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3307 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3308 }
3309 }
3310
3311 MI.eraseFromParent();
3312 return true;
3313 }
3314
3315 return false;
3316}
3317
3319 MachineInstr &MI) const {
3320 MachineIRBuilder &B = Helper.MIRBuilder;
3321 MachineRegisterInfo &MRI = *B.getMRI();
3322 GISelChangeObserver &Observer = Helper.Observer;
3323
3324 Register DataReg = MI.getOperand(0).getReg();
3325 LLT DataTy = MRI.getType(DataReg);
3326
3327 if (hasBufferRsrcWorkaround(DataTy)) {
3328 Observer.changingInstr(MI);
3330 Observer.changedInstr(MI);
3331 return true;
3332 }
3333 return false;
3334}
3335
3338 MachineIRBuilder &B) const {
3339 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3340 assert(Ty.isScalar());
3341
3342 MachineFunction &MF = B.getMF();
3344
3345 // TODO: Always legal with future ftz flag.
3346 // FIXME: Do we need just output?
3347 if (Ty == LLT::float32() &&
3349 return true;
3350 if (Ty == LLT::float16() &&
3352 return true;
3353
3354 MachineIRBuilder HelperBuilder(MI);
3355 GISelObserverWrapper DummyObserver;
3356 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3357 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3358}
3359
3362 Register DstReg = MI.getOperand(0).getReg();
3363 Register PtrReg = MI.getOperand(1).getReg();
3364 Register CmpVal = MI.getOperand(2).getReg();
3365 Register NewVal = MI.getOperand(3).getReg();
3366
3367 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3368 "this should not have been custom lowered");
3369
3370 LLT ValTy = MRI.getType(CmpVal);
3371 LLT VecTy = LLT::fixed_vector(2, ValTy);
3372
3373 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3374
3375 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3376 .addDef(DstReg)
3377 .addUse(PtrReg)
3378 .addUse(PackedVal)
3379 .setMemRefs(MI.memoperands());
3380
3381 MI.eraseFromParent();
3382 return true;
3383}
3384
3385/// Return true if it's known that \p Src can never be an f32 denormal value.
3387 Register Src) {
3388 const MachineInstr *DefMI = MRI.getVRegDef(Src);
3389 switch (DefMI->getOpcode()) {
3390 case TargetOpcode::G_INTRINSIC: {
3392 case Intrinsic::amdgcn_frexp_mant:
3393 return true;
3394 default:
3395 break;
3396 }
3397
3398 break;
3399 }
3400 case TargetOpcode::G_FFREXP: {
3401 if (DefMI->getOperand(0).getReg() == Src)
3402 return true;
3403 break;
3404 }
3405 case TargetOpcode::G_FPEXT: {
3406 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3407 }
3408 default:
3409 return false;
3410 }
3411
3412 return false;
3413}
3414
3415static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3416 return Flags & MachineInstr::FmAfn;
3417}
3418
3420 unsigned Flags) {
3421 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3424}
3425
3426std::pair<Register, Register>
3428 unsigned Flags) const {
3429 if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3430 return {};
3431
3432 const LLT F32 = LLT::scalar(32);
3433 auto SmallestNormal = B.buildFConstant(
3435 auto IsLtSmallestNormal =
3436 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3437
3438 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3439 auto One = B.buildFConstant(F32, 1.0);
3440 auto ScaleFactor =
3441 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3442 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3443
3444 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3445}
3446
3448 MachineIRBuilder &B) const {
3449 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3450 // If we have to handle denormals, scale up the input and adjust the result.
3451
3452 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3453 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
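  // Since log2(x * 2^32) = log2(x) + 32, subtracting 32 from the result for
  // the scaled input recovers log2 of the original (denormal) value.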
3454
3455 Register Dst = MI.getOperand(0).getReg();
3456 Register Src = MI.getOperand(1).getReg();
3457 LLT Ty = B.getMRI()->getType(Dst);
3458 unsigned Flags = MI.getFlags();
3459
3460 if (Ty == LLT::scalar(16)) {
3461 const LLT F32 = LLT::scalar(32);
3462 // Nothing in half is a denormal when promoted to f32.
3463 auto Ext = B.buildFPExt(F32, Src, Flags);
3464 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3465 .addUse(Ext.getReg(0))
3466 .setMIFlags(Flags);
3467 B.buildFPTrunc(Dst, Log2, Flags);
3468 MI.eraseFromParent();
3469 return true;
3470 }
3471
3472 assert(Ty == LLT::scalar(32));
3473
3474 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3475 if (!ScaledInput) {
3476 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3477 .addUse(Src)
3478 .setMIFlags(Flags);
3479 MI.eraseFromParent();
3480 return true;
3481 }
3482
3483 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3484 .addUse(ScaledInput)
3485 .setMIFlags(Flags);
3486
3487 auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3488 auto Zero = B.buildFConstant(Ty, 0.0);
3489 auto ResultOffset =
3490 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3491 B.buildFSub(Dst, Log2, ResultOffset, Flags);
3492
3493 MI.eraseFromParent();
3494 return true;
3495}
3496
3498 Register Z, unsigned Flags) {
3499 auto FMul = B.buildFMul(Ty, X, Y, Flags);
3500 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3501}
3502
3504 MachineIRBuilder &B) const {
3505 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3506 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3507
3508 MachineRegisterInfo &MRI = *B.getMRI();
3509 Register Dst = MI.getOperand(0).getReg();
3510 Register X = MI.getOperand(1).getReg();
3511 unsigned Flags = MI.getFlags();
3512 const LLT Ty = MRI.getType(X);
3513 MachineFunction &MF = B.getMF();
3514
3515 const LLT F32 = LLT::scalar(32);
3516 const LLT F16 = LLT::scalar(16);
3517
3518 const AMDGPUTargetMachine &TM =
3519 static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3520
3521 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn)) {
3522 if (Ty == F16 && !ST.has16BitInsts()) {
3523 Register LogVal = MRI.createGenericVirtualRegister(F32);
3524 auto PromoteSrc = B.buildFPExt(F32, X);
3525 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3526 B.buildFPTrunc(Dst, LogVal);
3527 } else {
3528 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3529 }
3530
3531 MI.eraseFromParent();
3532 return true;
3533 }
3534
3535 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3536 if (ScaledInput)
3537 X = ScaledInput;
3538
3539 auto Y =
3540 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3541
3542 Register R;
3543 if (ST.hasFastFMAF32()) {
3544 // c+cc are ln(2)/ln(10) to more than 49 bits
3545 const float c_log10 = 0x1.344134p-2f;
3546 const float cc_log10 = 0x1.09f79ep-26f;
3547
3548 // c + cc is ln(2) to more than 49 bits
3549 const float c_log = 0x1.62e42ep-1f;
3550 const float cc_log = 0x1.efa39ep-25f;
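// For reference: the chain below is a standard compensated multiply. R = Y*C
// loses low-order bits; FMA0 = fma(Y, C, -R) recovers exactly that rounding
// error, FMA1 folds in the small correction constant CC, and the final add
// restores the lost bits, so Y*(C + CC) is evaluated to well beyond f32
// precision.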
3551
3552 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3553 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3554
3555 R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
3556 auto NegR = B.buildFNeg(Ty, R, Flags);
3557 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
3558 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
3559 R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3560 } else {
3561 // ch+ct is ln(2)/ln(10) to more than 36 bits
3562 const float ch_log10 = 0x1.344000p-2f;
3563 const float ct_log10 = 0x1.3509f6p-18f;
3564
3565 // ch + ct is ln(2) to more than 36 bits
3566 const float ch_log = 0x1.62e000p-1f;
3567 const float ct_log = 0x1.0bfbe8p-15f;
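// For reference: masking Y with 0xfffff000 splits it into a head YH with a
// short mantissa, so YH*CH is (nearly) exact, and a tail YT = Y - YH. The
// getMad chain then accumulates the small cross terms first and adds the
// dominant YH*CH product last.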
3568
3569 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3570 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3571
3572 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3573 auto YH = B.buildAnd(Ty, Y, MaskConst);
3574 auto YT = B.buildFSub(Ty, Y, YH, Flags);
3575 auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
3576
3577 Register Mad0 =
3578 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
3579 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
3580 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
3581 }
3582
3583 const bool IsFiniteOnly =
3584 (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3585 (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3586
3587 if (!IsFiniteOnly) {
3588 // Expand isfinite(x) => fabs(x) < inf
3589 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3590 auto Fabs = B.buildFAbs(Ty, Y);
3591 auto IsFinite =
3592 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3593 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3594 }
3595
3596 if (ScaledInput) {
3597 auto Zero = B.buildFConstant(Ty, 0.0);
3598 auto ShiftK =
3599 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3600 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3601 B.buildFSub(Dst, R, Shift, Flags);
3602 } else {
3603 B.buildCopy(Dst, R);
3604 }
3605
3606 MI.eraseFromParent();
3607 return true;
3608}
3609
3610 bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3611 Register Src, bool IsLog10,
3612 unsigned Flags) const {
3613 const double Log2BaseInverted =
3614 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
3615
3616 LLT Ty = B.getMRI()->getType(Dst);
3617
3618 if (Ty == LLT::scalar(32)) {
3619 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3620 if (ScaledInput) {
3621 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3622 .addUse(ScaledInput)
3623 .setMIFlags(Flags);
3624 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3625 auto Zero = B.buildFConstant(Ty, 0.0);
3626 auto ResultOffset =
3627 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3628 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3629
3630 if (ST.hasFastFMAF32())
3631 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3632 else {
3633 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3634 B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3635 }
3636
3637 return true;
3638 }
3639 }
3640
3641 auto Log2Operand = Ty == LLT::scalar(16)
3642 ? B.buildFLog2(Ty, Src, Flags)
3643 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3644 .addUse(Src)
3645 .setMIFlags(Flags);
3646 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3647 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3648 return true;
3649}
3650
3651 bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3652 MachineIRBuilder &B) const {
3653 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3654 // If we have to handle denormals, scale up the input and adjust the result.
3655
3656 Register Dst = MI.getOperand(0).getReg();
3657 Register Src = MI.getOperand(1).getReg();
3658 unsigned Flags = MI.getFlags();
3659 LLT Ty = B.getMRI()->getType(Dst);
3660 const LLT F16 = LLT::scalar(16);
3661 const LLT F32 = LLT::scalar(32);
3662
3663 if (Ty == F16) {
3664 // Nothing in half is a denormal when promoted to f32.
3665 auto Ext = B.buildFPExt(F32, Src, Flags);
3666 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3667 .addUse(Ext.getReg(0))
3668 .setMIFlags(Flags);
3669 B.buildFPTrunc(Dst, Log2, Flags);
3670 MI.eraseFromParent();
3671 return true;
3672 }
3673
3674 assert(Ty == F32);
3675
3676 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3677 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3678 .addUse(Src)
3679 .setMIFlags(Flags);
3680 MI.eraseFromParent();
3681 return true;
3682 }
3683
3684 // bool needs_scaling = x < -0x1.f80000p+6f;
3685 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3686
3687 // -nextafter(128.0, -1)
3688 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3689 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3690 RangeCheckConst, Flags);
3691
3692 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3693 auto Zero = B.buildFConstant(Ty, 0.0);
3694 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3695 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3696
3697 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3698 .addUse(AddInput.getReg(0))
3699 .setMIFlags(Flags);
3700
3701 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3702 auto One = B.buildFConstant(Ty, 1.0);
3703 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3704 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3705 MI.eraseFromParent();
3706 return true;
3707}
3708
3709 bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3710 Register X, unsigned Flags) const {
3711 LLT Ty = B.getMRI()->getType(Dst);
3712 LLT F32 = LLT::scalar(32);
3713
3714 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3715 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3716 auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
3717
3718 if (Ty == F32) {
3719 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3720 .addUse(Mul.getReg(0))
3721 .setMIFlags(Flags);
3722 } else {
3723 B.buildFExp2(Dst, Mul.getReg(0), Flags);
3724 }
3725
3726 return true;
3727 }
3728
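// The threshold below is just under ln(2^-126) (about -87.34): past it,
// exp(x) would be an f32 denormal. A +64 bias is added before the multiply
// by log2(e), and the result is rescaled by e^-64 (0x1.969d48p-93f) whenever
// the bias was applied.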
3729 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3730 auto NeedsScaling =
3731 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3732 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3733 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3734 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3735
3736 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3737 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3738
3739 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3740 .addUse(ExpInput.getReg(0))
3741 .setMIFlags(Flags);
3742
3743 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3744 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3745 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3746 return true;
3747}
3748
3749 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3750 MachineIRBuilder &B) const {
3751 Register Dst = MI.getOperand(0).getReg();
3752 Register X = MI.getOperand(1).getReg();
3753 const unsigned Flags = MI.getFlags();
3754 MachineFunction &MF = B.getMF();
3755 MachineRegisterInfo &MRI = *B.getMRI();
3756 LLT Ty = MRI.getType(Dst);
3757 const LLT F16 = LLT::scalar(16);
3758 const LLT F32 = LLT::scalar(32);
3759 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3760
3761 if (Ty == F16) {
3762 // v_exp_f16 (fmul x, log2e)
3763 if (allowApproxFunc(MF, Flags)) {
3764 // TODO: Does this really require fast?
3765 legalizeFExpUnsafe(B, Dst, X, Flags);
3766 MI.eraseFromParent();
3767 return true;
3768 }
3769
3770 // exp(f16 x) ->
3771 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3772
3773 // Nothing in half is a denormal when promoted to f32.
3774 auto Ext = B.buildFPExt(F32, X, Flags);
3775 Register Lowered = MRI.createGenericVirtualRegister(F32);
3776 legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
3777 B.buildFPTrunc(Dst, Lowered, Flags);
3778 MI.eraseFromParent();
3779 return true;
3780 }
3781
3782 assert(Ty == F32);
3783
3784 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3785 // library behavior. Also, is known-not-daz source sufficient?
3786 if (allowApproxFunc(MF, Flags)) {
3787 legalizeFExpUnsafe(B, Dst, X, Flags);
3788 MI.eraseFromParent();
3789 return true;
3790 }
3791
3792 // Algorithm:
3793 //
3794 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3795 //
3796 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3797 // n = 64*m + j, 0 <= j < 64
3798 //
3799 // e^x = 2^((64*m + j + f)/64)
3800 // = (2^m) * (2^(j/64)) * 2^(f/64)
3801 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3802 //
3803 // f = x*(64/ln(2)) - n
3804 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3805 //
3806 // e^x = (2^m) * (2^(j/64)) * e^r
3807 //
3808 // (2^(j/64)) is precomputed
3809 //
3810 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3811 // e^r = 1 + q
3812 //
3813 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3814 //
3815 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
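// In the code below, PH + PL is a double-float approximation of x*log2(e)
// (x*log2(10) for G_FEXP10). With n = roundeven(PH), the result is assembled
// as ldexp(exp2((PH - n) + PL), n) and then clamped for underflow/overflow.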
3816 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3817 Register PH, PL;
3818
3819 if (ST.hasFastFMAF32()) {
3820 const float c_exp = numbers::log2ef;
3821 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3822 const float c_exp10 = 0x1.a934f0p+1f;
3823 const float cc_exp10 = 0x1.2f346ep-24f;
3824
3825 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3826 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
3827 auto NegPH = B.buildFNeg(Ty, PH, Flags);
3828 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
3829
3830 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3831 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
3832 } else {
3833 const float ch_exp = 0x1.714000p+0f;
3834 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3835
3836 const float ch_exp10 = 0x1.a92000p+1f;
3837 const float cl_exp10 = 0x1.4f0978p-11f;
3838
3839 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3840 auto XH = B.buildAnd(Ty, X, MaskConst);
3841 auto XL = B.buildFSub(Ty, X, XH, Flags);
3842
3843 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3844 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
3845
3846 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3847 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
3848
3849 Register Mad0 =
3850 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
3851 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3852 }
3853
3854 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
3855
3856 // It is unsafe to contract this fsub into the PH multiply.
3857 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
3858 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
3859 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
3860
3861 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3862 .addUse(A.getReg(0))
3863 .setMIFlags(Flags);
3864 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
3865
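// For reference, the clamp thresholds are approximately ln(2^-149) = -103.28
// and log10(2^-149) = -44.85 for underflow (below the smallest f32 denormal),
// and ln(FLT_MAX) = 88.72 and log10(FLT_MAX) = 38.53 for the overflow check
// further down.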
3866 auto UnderflowCheckConst =
3867 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3868 auto Zero = B.buildFConstant(Ty, 0.0);
3869 auto Underflow =
3870 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
3871
3872 R = B.buildSelect(Ty, Underflow, Zero, R);
3873
3874 const auto &Options = MF.getTarget().Options;
3875
3876 if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3877 auto OverflowCheckConst =
3878 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3879
3880 auto Overflow =
3881 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
3882 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3883 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
3884 }
3885
3886 B.buildCopy(Dst, R);
3887 MI.eraseFromParent();
3888 return true;
3889}
3890
3891 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3892 MachineIRBuilder &B) const {
3893 Register Dst = MI.getOperand(0).getReg();
3894 Register Src0 = MI.getOperand(1).getReg();
3895 Register Src1 = MI.getOperand(2).getReg();
3896 unsigned Flags = MI.getFlags();
3897 LLT Ty = B.getMRI()->getType(Dst);
3898 const LLT F16 = LLT::float16();
3899 const LLT F32 = LLT::float32();
3900
3901 if (Ty == F32) {
3902 auto Log = B.buildFLog2(F32, Src0, Flags);
3903 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3904 .addUse(Log.getReg(0))
3905 .addUse(Src1)
3906 .setMIFlags(Flags);
3907 B.buildFExp2(Dst, Mul, Flags);
3908 } else if (Ty == F16) {
3909 // There's no f16 fmul_legacy, so we need to convert for it.
3910 auto Log = B.buildFLog2(F16, Src0, Flags);
3911 auto Ext0 = B.buildFPExt(F32, Log, Flags);
3912 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
3913 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3914 .addUse(Ext0.getReg(0))
3915 .addUse(Ext1.getReg(0))
3916 .setMIFlags(Flags);
3917 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
3918 } else
3919 return false;
3920
3921 MI.eraseFromParent();
3922 return true;
3923}
3924
3925// Find a source register, ignoring any possible source modifiers.
3926 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3927 Register ModSrc = OrigSrc;
3928 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
3929 ModSrc = SrcFNeg->getOperand(1).getReg();
3930 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3931 ModSrc = SrcFAbs->getOperand(1).getReg();
3932 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3933 ModSrc = SrcFAbs->getOperand(1).getReg();
3934 return ModSrc;
3935}
3936
3937 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3938 MachineRegisterInfo &MRI,
3939 MachineIRBuilder &B) const {
3940
3941 const LLT S1 = LLT::scalar(1);
3942 const LLT F64 = LLT::float64();
3943 Register Dst = MI.getOperand(0).getReg();
3944 Register OrigSrc = MI.getOperand(1).getReg();
3945 unsigned Flags = MI.getFlags();
3946 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
3947 "this should not have been custom lowered");
3948
3949 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3950 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3951 // efficient way to implement it is using V_FRACT_F64. The workaround for the
3952 // V_FRACT bug is:
3953 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3954 //
3955 // Convert floor(x) to (x - fract(x))
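// For example, with x = -0.5: fract(-0.5) = -0.5 - floor(-0.5) = 0.5, so
// x - fract(x) = -1.0 = floor(-0.5). The 0x3fefffffffffffff constant below is
// the largest double less than 1.0, used as the clamp value in the workaround.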
3956
3957 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
3958 .addUse(OrigSrc)
3959 .setMIFlags(Flags);
3960
3961 // Give source modifier matching some assistance before obscuring a foldable
3962 // pattern.
3963
3964 // TODO: Can we avoid the neg on the fract? The input sign to fract
3965 // shouldn't matter.
3966 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3967
3968 auto Const =
3969 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
3970
3971 Register Min = MRI.createGenericVirtualRegister(F64);
3972
3973 // We don't need to concern ourselves with the snan handling difference, so
3974 // use the one which will directly select.
3975 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3976 if (MFI->getMode().IEEE)
3977 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
3978 else
3979 B.buildFMinNum(Min, Fract, Const, Flags);
3980
3981 Register CorrectedFract = Min;
3982 if (!MI.getFlag(MachineInstr::FmNoNans)) {
3983 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
3984 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
3985 }
3986
3987 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
3988 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
3989
3990 MI.eraseFromParent();
3991 return true;
3992}
3993
3994// Turn an illegal packed v2s16 build vector into bit operations.
3995// TODO: This should probably be a bitcast action in LegalizerHelper.
3996 bool AMDGPULegalizerInfo::legalizeBuildVector(
3997 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3998 Register Dst = MI.getOperand(0).getReg();
3999 const LLT S32 = LLT::scalar(32);
4000 const LLT S16 = LLT::scalar(16);
4001 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
4002
4003 Register Src0 = MI.getOperand(1).getReg();
4004 Register Src1 = MI.getOperand(2).getReg();
4005
4006 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
4007 assert(MRI.getType(Src0) == S32);
4008 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
4009 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
4010 }
4011
4012 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
4013 B.buildBitcast(Dst, Merge);
4014
4015 MI.eraseFromParent();
4016 return true;
4017}
4018
4019// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
4020//
4021// Source and accumulation registers must all be 32-bits.
4022//
4023// TODO: When the multiply is uniform, we should produce a code sequence
4024// that is better suited to instruction selection on the SALU. Instead of
4025// the outer loop going over parts of the result, the outer loop should go
4026// over parts of one of the factors. This should result in instruction
4027// selection that makes full use of S_ADDC_U32 instructions.
4028 void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
4029 MutableArrayRef<Register> Accum,
4030 ArrayRef<Register> Src0,
4031 ArrayRef<Register> Src1,
4032 bool UsePartialMad64_32,
4033 bool SeparateOddAlignedProducts) const {
4034 // Use (possibly empty) vectors of S1 registers to represent the set of
4035 // carries from one pair of positions to the next.
4036 using Carry = SmallVector<Register, 2>;
4037
4038 MachineIRBuilder &B = Helper.MIRBuilder;
4039 GISelValueTracking &VT = *Helper.getValueTracking();
4040
4041 const LLT S1 = LLT::scalar(1);
4042 const LLT S32 = LLT::scalar(32);
4043 const LLT S64 = LLT::scalar(64);
4044
4045 Register Zero32;
4046 Register Zero64;
4047
4048 auto getZero32 = [&]() -> Register {
4049 if (!Zero32)
4050 Zero32 = B.buildConstant(S32, 0).getReg(0);
4051 return Zero32;
4052 };
4053 auto getZero64 = [&]() -> Register {
4054 if (!Zero64)
4055 Zero64 = B.buildConstant(S64, 0).getReg(0);
4056 return Zero64;
4057 };
4058
4059 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
4060 for (unsigned i = 0; i < Src0.size(); ++i) {
4061 Src0KnownZeros.push_back(VT.getKnownBits(Src0[i]).isZero());
4062 Src1KnownZeros.push_back(VT.getKnownBits(Src1[i]).isZero());
4063 }
4064
4065 // Merge the given carries into the 32-bit LocalAccum, which is modified
4066 // in-place.
4067 //
4068 // Returns the carry-out, which is a single S1 register or null.
4069 auto mergeCarry =
4070 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
4071 if (CarryIn.empty())
4072 return Register();
4073
4074 bool HaveCarryOut = true;
4075 Register CarryAccum;
4076 if (CarryIn.size() == 1) {
4077 if (!LocalAccum) {
4078 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4079 return Register();
4080 }
4081
4082 CarryAccum = getZero32();
4083 } else {
4084 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4085 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
4086 CarryAccum =
4087 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
4088 .getReg(0);
4089 }
4090
4091 if (!LocalAccum) {
4092 LocalAccum = getZero32();
4093 HaveCarryOut = false;
4094 }
4095 }
4096
4097 auto Add =
4098 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
4099 LocalAccum = Add.getReg(0);
4100 return HaveCarryOut ? Add.getReg(1) : Register();
4101 };
4102
4103 // Build a multiply-add chain to compute
4104 //
4105 // LocalAccum + (partial products at DstIndex)
4106 // + (opportunistic subset of CarryIn)
4107 //
4108 // LocalAccum is an array of one or two 32-bit registers that are updated
4109 // in-place. The incoming registers may be null.
4110 //
4111 // In some edge cases, carry-ins can be consumed "for free". In that case,
4112 // the consumed carry bits are removed from CarryIn in-place.
4113 auto buildMadChain =
4114 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
4115 -> Carry {
4116 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
4117 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
4118
4119 Carry CarryOut;
4120 unsigned j0 = 0;
4121
4122 // Use plain 32-bit multiplication for the most significant part of the
4123 // result by default.
4124 if (LocalAccum.size() == 1 &&
4125 (!UsePartialMad64_32 || !CarryIn.empty())) {
4126 do {
4127 // Skip multiplication if one of the operands is 0
4128 unsigned j1 = DstIndex - j0;
4129 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4130 ++j0;
4131 continue;
4132 }
4133 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
4134 if (!LocalAccum[0] || VT.getKnownBits(LocalAccum[0]).isZero()) {
4135 LocalAccum[0] = Mul.getReg(0);
4136 } else {
4137 if (CarryIn.empty()) {
4138 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
4139 } else {
4140 LocalAccum[0] =
4141 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
4142 .getReg(0);
4143 CarryIn.pop_back();
4144 }
4145 }
4146 ++j0;
4147 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4148 }
4149
4150 // Build full 64-bit multiplies.
4151 if (j0 <= DstIndex) {
4152 bool HaveSmallAccum = false;
4153 Register Tmp;
4154
4155 if (LocalAccum[0]) {
4156 if (LocalAccum.size() == 1) {
4157 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
4158 HaveSmallAccum = true;
4159 } else if (LocalAccum[1]) {
4160 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
4161 HaveSmallAccum = false;
4162 } else {
4163 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
4164 HaveSmallAccum = true;
4165 }
4166 } else {
4167 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4168 Tmp = getZero64();
4169 HaveSmallAccum = true;
4170 }
4171
4172 do {
4173 unsigned j1 = DstIndex - j0;
4174 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4175 ++j0;
4176 continue;
4177 }
4178 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
4179 {Src0[j0], Src1[j1], Tmp});
4180 Tmp = Mad.getReg(0);
4181 if (!HaveSmallAccum)
4182 CarryOut.push_back(Mad.getReg(1));
4183 HaveSmallAccum = false;
4184
4185 ++j0;
4186 } while (j0 <= DstIndex);
4187
4188 auto Unmerge = B.buildUnmerge(S32, Tmp);
4189 LocalAccum[0] = Unmerge.getReg(0);
4190 if (LocalAccum.size() > 1)
4191 LocalAccum[1] = Unmerge.getReg(1);
4192 }
4193
4194 return CarryOut;
4195 };
4196
4197 // Outer multiply loop, iterating over destination parts from least
4198 // significant to most significant parts.
4199 //
4200 // The columns of the following diagram correspond to the destination parts
4201 // affected by one iteration of the outer loop (ignoring boundary
4202 // conditions).
4203 //
4204 // Dest index relative to 2 * i: 1 0 -1
4205 // ------
4206 // Carries from previous iteration: e o
4207 // Even-aligned partial product sum: E E .
4208 // Odd-aligned partial product sum: O O
4209 //
4210 // 'o' is OddCarry, 'e' is EvenCarry.
4211 // EE and OO are computed from partial products via buildMadChain and use
4212 // accumulation where possible and appropriate.
4213 //
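// Illustrative walk-through for a four-part (128-bit) Accum: i = 0 forms the
// even-aligned products at offset 0 into Accum[0..1]; i = 1 forms offset 2
// into Accum[2..3] and offset 1 into Accum[1..2]; i = 2 only forms the
// odd-aligned products at offset 3 into Accum[3], while the carries from
// earlier columns are merged as the loop advances.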
4214 Register SeparateOddCarry;
4215 Carry EvenCarry;
4216 Carry OddCarry;
4217
4218 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4219 Carry OddCarryIn = std::move(OddCarry);
4220 Carry EvenCarryIn = std::move(EvenCarry);
4221 OddCarry.clear();
4222 EvenCarry.clear();
4223
4224 // Partial products at offset 2 * i.
4225 if (2 * i < Accum.size()) {
4226 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4227 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4228 }
4229
4230 // Partial products at offset 2 * i - 1.
4231 if (i > 0) {
4232 if (!SeparateOddAlignedProducts) {
4233 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4234 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4235 } else {
4236 bool IsHighest = 2 * i >= Accum.size();
4237 Register SeparateOddOut[2];
4238 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4239 .take_front(IsHighest ? 1 : 2);
4240 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4241
4243
4244 if (i == 1) {
4245 if (!IsHighest)
4246 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4247 else
4248 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4249 } else {
4250 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4251 SeparateOddCarry);
4252 }
4253 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4254
4255 if (!IsHighest) {
4256 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4257 Lo->getOperand(1).getReg());
4258 Accum[2 * i] = Hi.getReg(0);
4259 SeparateOddCarry = Hi.getReg(1);
4260 }
4261 }
4262 }
4263
4264 // Add in the carries from the previous iteration
4265 if (i > 0) {
4266 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4267 EvenCarryIn.push_back(CarryOut);
4268
4269 if (2 * i < Accum.size()) {
4270 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4271 OddCarry.push_back(CarryOut);
4272 }
4273 }
4274 }
4275}
4276
4277// Custom narrowing of wide multiplies using wide multiply-add instructions.
4278//
4279// TODO: If the multiply is followed by an addition, we should attempt to
4280// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4281 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4282 MachineInstr &MI) const {
4283 assert(ST.hasMad64_32());
4284 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4285
4286 MachineIRBuilder &B = Helper.MIRBuilder;
4287 MachineRegisterInfo &MRI = *B.getMRI();
4288
4289 Register DstReg = MI.getOperand(0).getReg();
4290 Register Src0 = MI.getOperand(1).getReg();
4291 Register Src1 = MI.getOperand(2).getReg();
4292
4293 LLT Ty = MRI.getType(DstReg);
4294 assert(Ty.isScalar());
4295
4296 unsigned Size = Ty.getSizeInBits();
4297 if (ST.hasVectorMulU64() && Size == 64)
4298 return true;
4299
4300 unsigned NumParts = Size / 32;
4301 assert((Size % 32) == 0);
4302 assert(NumParts >= 2);
4303
4304 // Whether to use MAD_64_32 for partial products whose high half is
4305 // discarded. This avoids some ADD instructions but risks false dependency
4306 // stalls on some subtargets in some cases.
4307 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4308
4309 // Whether to compute odd-aligned partial products separately. This is
4310 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4311 // in an even-aligned VGPR.
4312 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4313
4314 LLT S32 = LLT::scalar(32);
4315 SmallVector<Register, 2> Src0Parts, Src1Parts;
4316 for (unsigned i = 0; i < NumParts; ++i) {
4317 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
4318 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
4319 }
4320 B.buildUnmerge(Src0Parts, Src0);
4321 B.buildUnmerge(Src1Parts, Src1);
4322
4323 SmallVector<Register, 2> AccumRegs(NumParts);
4324 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4325 SeparateOddAlignedProducts);
4326
4327 B.buildMergeLikeInstr(DstReg, AccumRegs);
4328 MI.eraseFromParent();
4329 return true;
4330}
4331
4332// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4333// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4334// case with a single min instruction instead of a compare+select.
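// The FFBH/FFBL operations return all ones (-1) for a zero input, so the
// unsigned min against the source bit width below yields the defined
// G_CTLZ/G_CTTZ result for zero without an explicit compare+select.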
4335 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4336 MachineRegisterInfo &MRI,
4337 MachineIRBuilder &B) const {
4338 Register Dst = MI.getOperand(0).getReg();
4339 Register Src = MI.getOperand(1).getReg();
4340 LLT DstTy = MRI.getType(Dst);
4341 LLT SrcTy = MRI.getType(Src);
4342
4343 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4344 ? AMDGPU::G_AMDGPU_FFBH_U32
4345 : AMDGPU::G_AMDGPU_FFBL_B32;
4346 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4347 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4348
4349 MI.eraseFromParent();
4350 return true;
4351}
4352
4353 bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
4354 MachineRegisterInfo &MRI,
4355 MachineIRBuilder &B) const {
4356 Register Dst = MI.getOperand(0).getReg();
4357 Register Src = MI.getOperand(1).getReg();
4358 LLT SrcTy = MRI.getType(Src);
4359 TypeSize NumBits = SrcTy.getSizeInBits();
4360
4361 assert(NumBits < 32u);
4362
4363 auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
4364 auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4365 auto Shift = B.buildShl(S32, Extend, ShiftAmt);
4366 auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4367 B.buildTrunc(Dst, Ctlz);
4368 MI.eraseFromParent();
4369 return true;
4370}
4371
4372// Check that this is a G_XOR x, -1
4373static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4374 if (MI.getOpcode() != TargetOpcode::G_XOR)
4375 return false;
4376 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4377 return ConstVal == -1;
4378}
4379
4380// Return the use branch instruction, otherwise null if the usage is invalid.
4381static MachineInstr *
4382 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4383 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4384 Register CondDef = MI.getOperand(0).getReg();
4385 if (!MRI.hasOneNonDBGUse(CondDef))
4386 return nullptr;
4387
4388 MachineBasicBlock *Parent = MI.getParent();
4389 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4390
4391 if (isNot(MRI, *UseMI)) {
4392 Register NegatedCond = UseMI->getOperand(0).getReg();
4393 if (!MRI.hasOneNonDBGUse(NegatedCond))
4394 return nullptr;
4395
4396 // We're deleting the def of this value, so we need to remove it.
4397 eraseInstr(*UseMI, MRI);
4398
4399 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4400 Negated = true;
4401 }
4402
4403 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4404 return nullptr;
4405
4406 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4407 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4408 if (Next == Parent->end()) {
4409 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4410 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4411 return nullptr;
4412 UncondBrTarget = &*NextMBB;
4413 } else {
4414 if (Next->getOpcode() != AMDGPU::G_BR)
4415 return nullptr;
4416 Br = &*Next;
4417 UncondBrTarget = Br->getOperand(0).getMBB();
4418 }
4419
4420 return UseMI;
4421}
4422
4423 void AMDGPULegalizerInfo::buildLoadInputValue(Register DstReg,
4424 MachineIRBuilder &B,
4425 const ArgDescriptor *Arg,
4426 const TargetRegisterClass *ArgRC,
4427 LLT ArgTy) const {
4428 MCRegister SrcReg = Arg->getRegister();
4429 assert(SrcReg.isPhysical() && "Physical register expected");
4430 assert(DstReg.isVirtual() && "Virtual register expected");
4431
4432 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4433 *ArgRC, B.getDebugLoc(), ArgTy);
4434 if (Arg->isMasked()) {
4435 // TODO: Should we try to emit this once in the entry block?
4436 const LLT S32 = LLT::scalar(32);
4437 const unsigned Mask = Arg->getMask();
4438 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4439
4440 Register AndMaskSrc = LiveIn;
4441
4442 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4443 // 0.
4444 if (Shift != 0) {
4445 auto ShiftAmt = B.buildConstant(S32, Shift);
4446 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4447 }
4448
4449 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4450 } else {
4451 B.buildCopy(DstReg, LiveIn);
4452 }
4453}
4454
4459 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
4460 Register DstReg = MI.getOperand(0).getReg();
4461 if (!ST.hasClusters()) {
4462 if (!loadInputValue(DstReg, B, WorkGroupIdPV))
4463 return false;
4464 MI.eraseFromParent();
4465 return true;
4466 }
4467
4468 // Clusters are supported. Return the global position in the grid. If clusters
4469 // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.
4470
4471 // WorkGroupIdXYZ = ClusterId == 0 ?
4472 // ClusterIdXYZ :
4473 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
4474 MachineRegisterInfo &MRI = *B.getMRI();
4475 const LLT S32 = LLT::scalar(32);
4476 Register ClusterIdXYZ = MRI.createGenericVirtualRegister(S32);
4477 Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(S32);
4478 Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(S32);
4479 if (!loadInputValue(ClusterIdXYZ, B, WorkGroupIdPV) ||
4480 !loadInputValue(ClusterWorkGroupIdXYZ, B, ClusterWorkGroupIdPV) ||
4481 !loadInputValue(ClusterMaxIdXYZ, B, ClusterMaxIdPV))
4482 return false;
4483
4484 auto One = B.buildConstant(S32, 1);
4485 auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One);
4486 auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ,
4487 B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ));
4488
4489 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4490
4491 switch (MFI->getClusterDims().getKind()) {
4494 B.buildCopy(DstReg, GlobalIdXYZ);
4495 MI.eraseFromParent();
4496 return true;
4497 }
4499 B.buildCopy(DstReg, ClusterIdXYZ);
4500 MI.eraseFromParent();
4501 return true;
4502 }
4504 using namespace AMDGPU::Hwreg;
4505 unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
4506 Register ClusterId = MRI.createGenericVirtualRegister(S32);
4507 MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
4508 B.buildInstr(AMDGPU::S_GETREG_B32_const)
4509 .addDef(ClusterId)
4510 .addImm(ClusterIdField);
4511 auto Zero = B.buildConstant(S32, 0);
4512 auto NoClusters =
4513 B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1), ClusterId, Zero);
4514 B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
4515 MI.eraseFromParent();
4516 return true;
4517 }
4518 }
4519
4520 llvm_unreachable("nothing should reach here");
4521}
4522
4523 bool AMDGPULegalizerInfo::loadInputValue(
4524 Register DstReg, MachineIRBuilder &B,
4525 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4526 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4527 const ArgDescriptor *Arg = nullptr;
4528 const TargetRegisterClass *ArgRC;
4529 LLT ArgTy;
4530
4531 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4532 const ArgDescriptor WorkGroupIDX =
4533 ArgDescriptor::createRegister(AMDGPU::TTMP9);
4534 // If GridZ is not programmed in an entry function then the hardware will set
4535 // it to all zeros, so there is no need to mask the GridY value in the low
4536 // order bits.
4537 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4538 AMDGPU::TTMP7,
4539 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4540 const ArgDescriptor WorkGroupIDZ =
4541 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4542 const ArgDescriptor ClusterWorkGroupIDX =
4543 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
4544 const ArgDescriptor ClusterWorkGroupIDY =
4545 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
4546 const ArgDescriptor ClusterWorkGroupIDZ =
4547 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
4548 const ArgDescriptor ClusterWorkGroupMaxIDX =
4549 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
4550 const ArgDescriptor ClusterWorkGroupMaxIDY =
4551 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
4552 const ArgDescriptor ClusterWorkGroupMaxIDZ =
4553 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
4554 const ArgDescriptor ClusterWorkGroupMaxFlatID =
4555 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
4556
4557 auto LoadConstant = [&](unsigned N) {
4558 B.buildConstant(DstReg, N);
4559 return true;
4560 };
4561
4562 if (ST.hasArchitectedSGPRs() &&
4564 AMDGPU::ClusterDimsAttr ClusterDims = MFI->getClusterDims();
4565 bool HasFixedDims = ClusterDims.isFixedDims();
4566
4567 switch (ArgType) {
4569 Arg = &WorkGroupIDX;
4570 ArgRC = &AMDGPU::SReg_32RegClass;
4571 ArgTy = LLT::scalar(32);
4572 break;
4574 Arg = &WorkGroupIDY;
4575 ArgRC = &AMDGPU::SReg_32RegClass;
4576 ArgTy = LLT::scalar(32);
4577 break;
4579 Arg = &WorkGroupIDZ;
4580 ArgRC = &AMDGPU::SReg_32RegClass;
4581 ArgTy = LLT::scalar(32);
4582 break;
4584 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
4585 return LoadConstant(0);
4586 Arg = &ClusterWorkGroupIDX;
4587 ArgRC = &AMDGPU::SReg_32RegClass;
4588 ArgTy = LLT::scalar(32);
4589 break;
4591 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
4592 return LoadConstant(0);
4593 Arg = &ClusterWorkGroupIDY;
4594 ArgRC = &AMDGPU::SReg_32RegClass;
4595 ArgTy = LLT::scalar(32);
4596 break;
4598 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
4599 return LoadConstant(0);
4600 Arg = &ClusterWorkGroupIDZ;
4601 ArgRC = &AMDGPU::SReg_32RegClass;
4602 ArgTy = LLT::scalar(32);
4603 break;
4605 if (HasFixedDims)
4606 return LoadConstant(ClusterDims.getDims()[0] - 1);
4607 Arg = &ClusterWorkGroupMaxIDX;
4608 ArgRC = &AMDGPU::SReg_32RegClass;
4609 ArgTy = LLT::scalar(32);
4610 break;
4612 if (HasFixedDims)
4613 return LoadConstant(ClusterDims.getDims()[1] - 1);
4614 Arg = &ClusterWorkGroupMaxIDY;
4615 ArgRC = &AMDGPU::SReg_32RegClass;
4616 ArgTy = LLT::scalar(32);
4617 break;
4619 if (HasFixedDims)
4620 return LoadConstant(ClusterDims.getDims()[2] - 1);
4621 Arg = &ClusterWorkGroupMaxIDZ;
4622 ArgRC = &AMDGPU::SReg_32RegClass;
4623 ArgTy = LLT::scalar(32);
4624 break;
4626 Arg = &ClusterWorkGroupMaxFlatID;
4627 ArgRC = &AMDGPU::SReg_32RegClass;
4628 ArgTy = LLT::scalar(32);
4629 break;
4630 default:
4631 break;
4632 }
4633 }
4634
4635 if (!Arg)
4636 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4637
4638 if (!Arg) {
4639 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4640 // The intrinsic may appear when we have a 0 sized kernarg segment, in
4641 // which case the pointer argument may be missing and we use null.
4642 return LoadConstant(0);
4643 }
4644
4645 // It's undefined behavior if a function marked with the amdgpu-no-*
4646 // attributes uses the corresponding intrinsic.
4647 B.buildUndef(DstReg);
4648 return true;
4649 }
4650
4651 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4652 return false; // TODO: Handle these
4653 buildLoadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4654 return true;
4655}
4656
4657 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4658 MachineInstr &MI, MachineIRBuilder &B,
4659 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4660 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4661 return false;
4662
4663 MI.eraseFromParent();
4664 return true;
4665}
4666
4667 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4668 int64_t C) {
4669 B.buildConstant(MI.getOperand(0).getReg(), C);
4670 MI.eraseFromParent();
4671 return true;
4672}
4673
4674 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4675 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4676 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4677 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4678 if (MaxID == 0)
4679 return replaceWithConstant(B, MI, 0);
4680
4681 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4682 const ArgDescriptor *Arg;
4683 const TargetRegisterClass *ArgRC;
4684 LLT ArgTy;
4685 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4686
4687 Register DstReg = MI.getOperand(0).getReg();
4688 if (!Arg) {
4689 // It's undefined behavior if a function marked with the amdgpu-no-*
4690 // attributes uses the corresponding intrinsic.
4691 B.buildUndef(DstReg);
4692 MI.eraseFromParent();
4693 return true;
4694 }
4695
4696 if (Arg->isMasked()) {
4697 // Don't bother inserting AssertZext for packed IDs since we're emitting the
4698 // masking operations anyway.
4699 //
4700 // TODO: We could assert the top bit is 0 for the source copy.
4701 if (!loadInputValue(DstReg, B, ArgType))
4702 return false;
4703 } else {
4704 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
4705 if (!loadInputValue(TmpReg, B, ArgType))
4706 return false;
4707 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
4708 }
4709
4710 MI.eraseFromParent();
4711 return true;
4712}
4713
4714 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4715 int64_t Offset) const {
4716 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
4717 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4718
4719 // TODO: If we passed in the base kernel offset we could have a better
4720 // alignment than 4, but we don't really need it.
4721 if (!loadInputValue(KernArgReg, B,
4722 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4723 llvm_unreachable("failed to find kernarg segment ptr");
4724
4725 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
4726 return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);
4727}
4728
4729/// Legalize a value that's loaded from kernel arguments. This is only used by
4730/// legacy intrinsics.
4731 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4732 MachineIRBuilder &B,
4733 uint64_t Offset,
4734 Align Alignment) const {
4735 Register DstReg = MI.getOperand(0).getReg();
4736
4737 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4738 "unexpected kernarg parameter type");
4739
4740 Register Ptr = getKernargParameterPtr(B, Offset);
4741 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4742 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
4743 MachineMemOperand::MODereferenceable |
4744 MachineMemOperand::MOInvariant);
4745 MI.eraseFromParent();
4746 return true;
4747}
4748
4749 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4750 MachineRegisterInfo &MRI,
4751 MachineIRBuilder &B) const {
4752 Register Dst = MI.getOperand(0).getReg();
4753 LLT DstTy = MRI.getType(Dst);
4754 LLT S16 = LLT::scalar(16);
4755 LLT S32 = LLT::scalar(32);
4756 LLT S64 = LLT::scalar(64);
4757
4758 if (DstTy == S16)
4759 return legalizeFDIV16(MI, MRI, B);
4760 if (DstTy == S32)
4761 return legalizeFDIV32(MI, MRI, B);
4762 if (DstTy == S64)
4763 return legalizeFDIV64(MI, MRI, B);
4764
4765 return false;
4766}
4767
4768 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4769 Register DstDivReg,
4770 Register DstRemReg,
4771 Register X,
4772 Register Y) const {
4773 const LLT S1 = LLT::scalar(1);
4774 const LLT S32 = LLT::scalar(32);
4775
4776 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4777 // algorithm used here.
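// In short: an initial estimate z of 2^32/y comes from v_rcp_iflag_f32 scaled
// by roughly 2^32 (0x4f7ffffe), refined by one Newton-Raphson step in integer
// arithmetic (z += umulh(z, -y*z)); then q = umulh(x, z) and r = x - q*y,
// followed by at most two conditional corrections to get the exact results.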
4778
4779 // Initial estimate of inv(y).
4780 auto FloatY = B.buildUITOFP(S32, Y);
4781 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
4782 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
4783 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
4784 auto Z = B.buildFPTOUI(S32, ScaledY);
4785
4786 // One round of UNR.
4787 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
4788 auto NegYZ = B.buildMul(S32, NegY, Z);
4789 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
4790
4791 // Quotient/remainder estimate.
4792 auto Q = B.buildUMulH(S32, X, Z);
4793 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
4794
4795 // First quotient/remainder refinement.
4796 auto One = B.buildConstant(S32, 1);
4797 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4798 if (DstDivReg)
4799 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
4800 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
4801
4802 // Second quotient/remainder refinement.
4803 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4804 if (DstDivReg)
4805 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
4806
4807 if (DstRemReg)
4808 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
4809}
4810
4811// Build integer reciprocal sequence around V_RCP_IFLAG_F32
4812//
4813// Return lo, hi of result
4814//
4815// %cvt.lo = G_UITOFP Val.lo
4816// %cvt.hi = G_UITOFP Val.hi
4817// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4818// %rcp = G_AMDGPU_RCP_IFLAG %mad
4819// %mul1 = G_FMUL %rcp, 0x5f7ffffc
4820// %mul2 = G_FMUL %mul1, 2**(-32)
4821// %trunc = G_INTRINSIC_TRUNC %mul2
4822// %mad2 = G_FMAD %trunc, -(2**32), %mul1
4823// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
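// For reference, the sequence approximates 2^64 / Val: Val is converted to
// float as hi*2^32 + lo, its reciprocal is scaled by roughly 2^64
// (0x5f7ffffc), and the product is split back into a high half (%trunc) and a
// low half (%mad2) to seed the 64-bit Newton-Raphson division below.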
4824static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4825 Register Val) {
4826 const LLT S32 = LLT::scalar(32);
4827 auto Unmerge = B.buildUnmerge(S32, Val);
4828
4829 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
4830 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
4831
4832 auto Mad = B.buildFMAD(
4833 S32, CvtHi, // 2**32
4834 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
4835
4836 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
4837 auto Mul1 = B.buildFMul(
4838 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
4839
4840 // 2**(-32)
4841 auto Mul2 = B.buildFMul(
4842 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
4843 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
4844
4845 // -(2**32)
4846 auto Mad2 = B.buildFMAD(
4847 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
4848 Mul1);
4849
4850 auto ResultLo = B.buildFPTOUI(S32, Mad2);
4851 auto ResultHi = B.buildFPTOUI(S32, Trunc);
4852
4853 return {ResultLo.getReg(0), ResultHi.getReg(0)};
4854}
4855
4856 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4857 Register DstDivReg,
4858 Register DstRemReg,
4859 Register Numer,
4860 Register Denom) const {
4861 const LLT S32 = LLT::scalar(32);
4862 const LLT S64 = LLT::scalar(64);
4863 const LLT S1 = LLT::scalar(1);
4864 Register RcpLo, RcpHi;
4865
4866 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
4867
4868 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
4869
4870 auto Zero64 = B.buildConstant(S64, 0);
4871 auto NegDenom = B.buildSub(S64, Zero64, Denom);
4872
4873 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
4874 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
4875
4876 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
4877 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
4878 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
4879
4880 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
4881 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4882 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
4883
4884 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
4885 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
4886 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
4887 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
4888 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
4889
4890 auto Zero32 = B.buildConstant(S32, 0);
4891 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
4892 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
4893 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
4894
4895 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
4896 Register NumerLo = UnmergeNumer.getReg(0);
4897 Register NumerHi = UnmergeNumer.getReg(1);
4898
4899 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
4900 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
4901 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
4902 Register Mul3_Lo = UnmergeMul3.getReg(0);
4903 Register Mul3_Hi = UnmergeMul3.getReg(1);
4904 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
4905 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
4906 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
4907 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
4908
4909 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
4910 Register DenomLo = UnmergeDenom.getReg(0);
4911 Register DenomHi = UnmergeDenom.getReg(1);
4912
4913 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
4914 auto C1 = B.buildSExt(S32, CmpHi);
4915
4916 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
4917 auto C2 = B.buildSExt(S32, CmpLo);
4918
4919 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
4920 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
4921
4922 // TODO: Here and below portions of the code can be enclosed into if/endif.
4923 // Currently control flow is unconditional and we have 4 selects after
4924 // potential endif to substitute PHIs.
4925
4926 // if C3 != 0 ...
4927 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
4928 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
4929 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
4930 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
4931
4932 auto One64 = B.buildConstant(S64, 1);
4933 auto Add3 = B.buildAdd(S64, MulHi3, One64);
4934
4935 auto C4 =
4936 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
4937 auto C5 =
4938 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
4939 auto C6 = B.buildSelect(
4940 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
4941
4942 // if (C6 != 0)
4943 auto Add4 = B.buildAdd(S64, Add3, One64);
4944 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
4945
4946 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
4947 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
4948 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
4949
4950 // endif C6
4951 // endif C3
4952
4953 if (DstDivReg) {
4954 auto Sel1 = B.buildSelect(
4955 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
4956 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4957 Sel1, MulHi3);
4958 }
4959
4960 if (DstRemReg) {
4961 auto Sel2 = B.buildSelect(
4962 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
4963 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4964 Sel2, Sub1);
4965 }
4966}
4967
4968 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
4969 MachineRegisterInfo &MRI,
4970 MachineIRBuilder &B) const {
4971 Register DstDivReg, DstRemReg;
4972 switch (MI.getOpcode()) {
4973 default:
4974 llvm_unreachable("Unexpected opcode!");
4975 case AMDGPU::G_UDIV: {
4976 DstDivReg = MI.getOperand(0).getReg();
4977 break;
4978 }
4979 case AMDGPU::G_UREM: {
4980 DstRemReg = MI.getOperand(0).getReg();
4981 break;
4982 }
4983 case AMDGPU::G_UDIVREM: {
4984 DstDivReg = MI.getOperand(0).getReg();
4985 DstRemReg = MI.getOperand(1).getReg();
4986 break;
4987 }
4988 }
4989
4990 const LLT S64 = LLT::scalar(64);
4991 const LLT S32 = LLT::scalar(32);
4992 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4993 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
4994 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4995 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4996
4997 if (Ty == S32)
4998 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
4999 else if (Ty == S64)
5000 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
5001 else
5002 return false;
5003
5004 MI.eraseFromParent();
5005 return true;
5006}
5007
5008 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
5009 MachineRegisterInfo &MRI,
5010 MachineIRBuilder &B) const {
5011 const LLT S64 = LLT::scalar(64);
5012 const LLT S32 = LLT::scalar(32);
5013
5014 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5015 if (Ty != S32 && Ty != S64)
5016 return false;
5017
5018 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5019 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
5020 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
5021
5022 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
5023 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
5024 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
5025
5026 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
5027 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
5028
5029 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
5030 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
5031
5032 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
5033 switch (MI.getOpcode()) {
5034 default:
5035 llvm_unreachable("Unexpected opcode!");
5036 case AMDGPU::G_SDIV: {
5037 DstDivReg = MI.getOperand(0).getReg();
5038 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5039 break;
5040 }
5041 case AMDGPU::G_SREM: {
5042 DstRemReg = MI.getOperand(0).getReg();
5043 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5044 break;
5045 }
5046 case AMDGPU::G_SDIVREM: {
5047 DstDivReg = MI.getOperand(0).getReg();
5048 DstRemReg = MI.getOperand(1).getReg();
5049 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5050 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5051 break;
5052 }
5053 }
5054
5055 if (Ty == S32)
5056 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5057 else
5058 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5059
5060 if (DstDivReg) {
5061 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
5062 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
5063 B.buildSub(DstDivReg, SignXor, Sign);
5064 }
5065
5066 if (DstRemReg) {
5067 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
5068 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
5069 B.buildSub(DstRemReg, SignXor, Sign);
5070 }
5071
5072 MI.eraseFromParent();
5073 return true;
5074}
5075
5076 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
5077 MachineRegisterInfo &MRI,
5078 MachineIRBuilder &B) const {
5079 Register Res = MI.getOperand(0).getReg();
5080 Register LHS = MI.getOperand(1).getReg();
5081 Register RHS = MI.getOperand(2).getReg();
5082 uint16_t Flags = MI.getFlags();
5083 LLT ResTy = MRI.getType(Res);
5084
5085 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5086
5087 if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) {
5088 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
5089 return false;
5090
5091 // v_rcp_f32 and v_rsq_f32 do not support denormals and, according to
5092 // the CI documentation, have a worst case error of 1 ulp.
5093 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
5094 // use it as long as we aren't trying to use denormals.
5095 //
5096 // v_rcp_f16 and v_rsq_f16 DO support denormals, with a worst case error of 0.51 ulp.
5097
5098 // 1 / x -> RCP(x)
5099 if (CLHS->isExactlyValue(1.0)) {
5100 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5101 .addUse(RHS)
5102 .setMIFlags(Flags);
5103
5104 MI.eraseFromParent();
5105 return true;
5106 }
5107
5108 // -1 / x -> RCP( FNEG(x) )
5109 if (CLHS->isExactlyValue(-1.0)) {
5110 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
5111 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5112 .addUse(FNeg.getReg(0))
5113 .setMIFlags(Flags);
5114
5115 MI.eraseFromParent();
5116 return true;
5117 }
5118 }
5119
5120 // For f16 require afn or arcp.
5121 // For f32 require afn.
5122 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
5123 !MI.getFlag(MachineInstr::FmArcp)))
5124 return false;
5125
5126 // x / y -> x * (1.0 / y)
5127 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5128 .addUse(RHS)
5129 .setMIFlags(Flags);
5130 B.buildFMul(Res, LHS, RCP, Flags);
5131
5132 MI.eraseFromParent();
5133 return true;
5134}
5135
5136 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
5137 MachineRegisterInfo &MRI,
5138 MachineIRBuilder &B) const {
5139 Register Res = MI.getOperand(0).getReg();
5140 Register X = MI.getOperand(1).getReg();
5141 Register Y = MI.getOperand(2).getReg();
5142 uint16_t Flags = MI.getFlags();
5143 LLT ResTy = MRI.getType(Res);
5144
5145 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5146
5147 if (!AllowInaccurateRcp)
5148 return false;
5149
5150 auto NegY = B.buildFNeg(ResTy, Y);
5151 auto One = B.buildFConstant(ResTy, 1.0);
5152
5153 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5154 .addUse(Y)
5155 .setMIFlags(Flags);
5156
5157 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
5158 R = B.buildFMA(ResTy, Tmp0, R, R);
5159
5160 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
5161 R = B.buildFMA(ResTy, Tmp1, R, R);
5162
5163 auto Ret = B.buildFMul(ResTy, X, R);
5164 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
5165
5166 B.buildFMA(Res, Tmp2, R, Ret);
5167 MI.eraseFromParent();
5168 return true;
5169}
5170
5171 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
5172 MachineRegisterInfo &MRI,
5173 MachineIRBuilder &B) const {
5174 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5175 return true;
5176
5177 Register Res = MI.getOperand(0).getReg();
5178 Register LHS = MI.getOperand(1).getReg();
5179 Register RHS = MI.getOperand(2).getReg();
5180
5181 uint16_t Flags = MI.getFlags();
5182
5183 LLT S16 = LLT::scalar(16);
5184 LLT S32 = LLT::scalar(32);
5185
5186 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
5187 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
5188 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
5189 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
5190 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5191 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
5192 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5193 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
5194 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
5195 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
5196 // q16.u = opx(V_CVT_F16_F32, q32.u);
5197 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
5198
5199 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
5200 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
5201 auto NegRHSExt = B.buildFNeg(S32, RHSExt);
5202 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5203 .addUse(RHSExt.getReg(0))
5204 .setMIFlags(Flags);
5205 auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
5206 MachineInstrBuilder Err;
5207 if (ST.hasMadMacF32Insts()) {
5208 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5209 Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
5210 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5211 } else {
5212 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5213 Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
5214 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5215 }
5216 auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
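// Masking with 0xff800000 keeps only the sign and exponent of the correction
// term, i.e. it is rounded to a signed power of two before being added back
// into the quotient.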
5217 Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
5218 Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
5219 auto RDst = B.buildFPTrunc(S16, Quot, Flags);
5220 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5221 .addUse(RDst.getReg(0))
5222 .addUse(RHS)
5223 .addUse(LHS)
5224 .setMIFlags(Flags);
5225
5226 MI.eraseFromParent();
5227 return true;
5228}
5229
5230 static constexpr unsigned SPDenormModeBitField =
5231 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 4, 2);
5232
5233// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
5234// to enable denorm mode. When 'Enable' is false, disable denorm mode.
5235 static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
5236 const GCNSubtarget &ST,
5237 SIModeRegisterDefaults Mode) {
5238 // Set SP denorm mode to this value.
5239 unsigned SPDenormMode =
5240 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
5241
5242 if (ST.hasDenormModeInst()) {
5243 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
5244 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
5245
5246 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
5247 B.buildInstr(AMDGPU::S_DENORM_MODE)
5248 .addImm(NewDenormModeValue);
5249
5250 } else {
5251 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
5252 .addImm(SPDenormMode)
5253 .addImm(SPDenormModeBitField);
5254 }
5255}
5256
5257 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
5258 MachineRegisterInfo &MRI,
5259 MachineIRBuilder &B) const {
5260 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5261 return true;
5262
5263 Register Res = MI.getOperand(0).getReg();
5264 Register LHS = MI.getOperand(1).getReg();
5265 Register RHS = MI.getOperand(2).getReg();
5266 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5267 SIModeRegisterDefaults Mode = MFI->getMode();
5268
5269 uint16_t Flags = MI.getFlags();
5270
5271 LLT S32 = LLT::scalar(32);
5272 LLT S1 = LLT::scalar(1);
5273
5274 auto One = B.buildFConstant(S32, 1.0f);
5275
5276 auto DenominatorScaled =
5277 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5278 .addUse(LHS)
5279 .addUse(RHS)
5280 .addImm(0)
5281 .setMIFlags(Flags);
5282 auto NumeratorScaled =
5283 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5284 .addUse(LHS)
5285 .addUse(RHS)
5286 .addImm(1)
5287 .setMIFlags(Flags);
5288
5289 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5290 .addUse(DenominatorScaled.getReg(0))
5291 .setMIFlags(Flags);
5292 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
5293
5294 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
5295 const bool HasDynamicDenormals =
5296 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
5297 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
5298
5299 Register SavedSPDenormMode;
5300 if (!PreservesDenormals) {
5301 if (HasDynamicDenormals) {
5302 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5303 B.buildInstr(AMDGPU::S_GETREG_B32)
5304 .addDef(SavedSPDenormMode)
5305 .addImm(SPDenormModeBitField);
5306 }
5307 toggleSPDenormMode(true, B, ST, Mode);
5308 }
5309
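 // One Newton-Raphson step refines the approximate reciprocal (Fma0/Fma1);
 // the quotient is then formed and corrected once against the scaled
 // numerator (Mul, Fma2, Fma3), and the final residual (Fma4) feeds div_fmas,
 // with div_fixup handling special cases and undoing the scaling.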
5310 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
5311 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5312 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
5313 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
5314 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
5315 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5316
5317 if (!PreservesDenormals) {
5318 if (HasDynamicDenormals) {
5319 assert(SavedSPDenormMode);
5320 B.buildInstr(AMDGPU::S_SETREG_B32)
5321 .addReg(SavedSPDenormMode)
5322 .addImm(SPDenormModeBitField);
5323 } else
5324 toggleSPDenormMode(false, B, ST, Mode);
5325 }
5326
5327 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
5328 .addUse(Fma4.getReg(0))
5329 .addUse(Fma1.getReg(0))
5330 .addUse(Fma3.getReg(0))
5331 .addUse(NumeratorScaled.getReg(1))
5332 .setMIFlags(Flags);
5333
5334 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5335 .addUse(Fmas.getReg(0))
5336 .addUse(RHS)
5337 .addUse(LHS)
5338 .setMIFlags(Flags);
5339
5340 MI.eraseFromParent();
5341 return true;
5342}
5343
5344bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
5345 MachineRegisterInfo &MRI,
5346 MachineIRBuilder &B) const {
5347 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5348 return true;
5349
5350 Register Res = MI.getOperand(0).getReg();
5351 Register LHS = MI.getOperand(1).getReg();
5352 Register RHS = MI.getOperand(2).getReg();
5353
5354 uint16_t Flags = MI.getFlags();
5355
5356 LLT S64 = LLT::scalar(64);
5357 LLT S1 = LLT::scalar(1);
5358
5359 auto One = B.buildFConstant(S64, 1.0);
5360
5361 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5362 .addUse(LHS)
5363 .addUse(RHS)
5364 .addImm(0)
5365 .setMIFlags(Flags);
5366
5367 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
5368
5369 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
5370 .addUse(DivScale0.getReg(0))
5371 .setMIFlags(Flags);
5372
5373 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
5374 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
5375 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
5376
5377 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5378 .addUse(LHS)
5379 .addUse(RHS)
5380 .addImm(1)
5381 .setMIFlags(Flags);
5382
5383 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
5384 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
5385 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
5386
5387 Register Scale;
5388 if (!ST.hasUsableDivScaleConditionOutput()) {
5389 // Workaround a hardware bug on SI where the condition output from div_scale
5390 // is not usable.
5391
5392 LLT S32 = LLT::scalar(32);
5393
5394 auto NumUnmerge = B.buildUnmerge(S32, LHS);
5395 auto DenUnmerge = B.buildUnmerge(S32, RHS);
5396 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5397 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5398
5399 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
5400 Scale1Unmerge.getReg(1));
5401 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
5402 Scale0Unmerge.getReg(1));
5403 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
5404 } else {
5405 Scale = DivScale1.getReg(1);
5406 }
5407
5408 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5409 .addUse(Fma4.getReg(0))
5410 .addUse(Fma3.getReg(0))
5411 .addUse(Mul.getReg(0))
5412 .addUse(Scale)
5413 .setMIFlags(Flags);
5414
5415 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5416 .addUse(Fmas.getReg(0))
5417 .addUse(RHS)
5418 .addUse(LHS)
5419 .setMIFlags(Flags);
5420
5421 MI.eraseFromParent();
5422 return true;
5423}
5424
5425bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
5426 MachineRegisterInfo &MRI,
5427 MachineIRBuilder &B) const {
5428 Register Res0 = MI.getOperand(0).getReg();
5429 Register Res1 = MI.getOperand(1).getReg();
5430 Register Val = MI.getOperand(2).getReg();
5431 uint16_t Flags = MI.getFlags();
5432
5433 LLT Ty = MRI.getType(Res0);
5434 LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5435
5436 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5437 .addUse(Val)
5438 .setMIFlags(Flags);
5439 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5440 .addUse(Val)
5441 .setMIFlags(Flags);
5442
5443 if (ST.hasFractBug()) {
5444 auto Fabs = B.buildFAbs(Ty, Val);
5445 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5446 auto IsFinite =
5447 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5448 auto Zero = B.buildConstant(InstrExpTy, 0);
5449 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5450 Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5451 }
5452
5453 B.buildCopy(Res0, Mant);
5454 B.buildSExtOrTrunc(Res1, Exp);
5455
5456 MI.eraseFromParent();
5457 return true;
5458}
5459
5460bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5461 MachineRegisterInfo &MRI,
5462 MachineIRBuilder &B) const {
5463 Register Res = MI.getOperand(0).getReg();
5464 Register LHS = MI.getOperand(2).getReg();
5465 Register RHS = MI.getOperand(3).getReg();
5466 uint16_t Flags = MI.getFlags();
5467
5468 LLT S32 = LLT::scalar(32);
5469 LLT S1 = LLT::scalar(1);
5470
5471 auto Abs = B.buildFAbs(S32, RHS, Flags);
5472 const APFloat C0Val(1.0f);
5473
5474 auto C0 = B.buildFConstant(S32, 0x1p+96f);
5475 auto C1 = B.buildFConstant(S32, 0x1p-32f);
5476 auto C2 = B.buildFConstant(S32, 1.0f);
5477
5478 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5479 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5480
5481 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5482
5483 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5484 .addUse(Mul0.getReg(0))
5485 .setMIFlags(Flags);
5486
5487 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5488
5489 B.buildFMul(Res, Sel, Mul1, Flags);
5490
5491 MI.eraseFromParent();
5492 return true;
5493}
5494
5495bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5496 MachineRegisterInfo &MRI,
5497 MachineIRBuilder &B) const {
5498 // Bypass the correct expansion that a standard promotion through G_FSQRT
5499 // would get. The f32 op is accurate enough for the f16 case.
5500 unsigned Flags = MI.getFlags();
5501 assert(!ST.has16BitInsts());
5502 const LLT F32 = LLT::scalar(32);
5503 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5504 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5505 .addUse(Ext.getReg(0))
5506 .setMIFlags(Flags);
5507 B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
5508 MI.eraseFromParent();
5509 return true;
5510}
5511
5512bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
5513 MachineRegisterInfo &MRI,
5514 MachineIRBuilder &B) const {
5515 MachineFunction &MF = B.getMF();
5516 Register Dst = MI.getOperand(0).getReg();
5517 Register X = MI.getOperand(1).getReg();
5518 const unsigned Flags = MI.getFlags();
5519 const LLT S1 = LLT::scalar(1);
5520 const LLT F32 = LLT::scalar(32);
5521 const LLT I32 = LLT::scalar(32);
5522
5523 if (allowApproxFunc(MF, Flags)) {
5524 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5525 .addUse(X)
5526 .setMIFlags(Flags);
5527 MI.eraseFromParent();
5528 return true;
5529 }
5530
5531 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5532 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5533 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5534 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5535 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5536
5537 Register SqrtS = MRI.createGenericVirtualRegister(F32);
5538 if (needsDenormHandlingF32(MF, X, Flags)) {
5539 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5540 .addUse(SqrtX.getReg(0))
5541 .setMIFlags(Flags);
5542
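 // Roughly: refine the hardware sqrt result by at most one ulp, by computing
 // residuals against the adjacent representable values (integer -1/+1 on the
 // bit pattern) and selecting the neighbor when the residual's sign shows the
 // initial result was too high or too low.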
5543 auto NegOne = B.buildConstant(I32, -1);
5544 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5545
5546 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5547 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5548
5549 auto PosOne = B.buildConstant(I32, 1);
5550 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5551
5552 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5553 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5554
5555 auto Zero = B.buildFConstant(F32, 0.0f);
5556 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5557
5558 SqrtS =
5559 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5560
5561 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5562 SqrtS =
5563 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5564 } else {
5565 auto SqrtR =
5566 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5567 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5568
5569 auto Half = B.buildFConstant(F32, 0.5f);
5570 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5571 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5572 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5573 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5574 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5575 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5576 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5577 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5578 }
5579
5580 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5581
5582 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5583
5584 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5585
5586 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5587 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5588
5589 MI.eraseFromParent();
5590 return true;
5591}
5592
5593bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
5594 MachineRegisterInfo &MRI,
5595 MachineIRBuilder &B) const {
5596 // For the double type, the SQRT and RSQ instructions don't have the required
5597 // precision, so we apply Goldschmidt's algorithm to improve the result:
5598 //
5599 // y0 = rsq(x)
5600 // g0 = x * y0
5601 // h0 = 0.5 * y0
5602 //
5603 // r0 = 0.5 - h0 * g0
5604 // g1 = g0 * r0 + g0
5605 // h1 = h0 * r0 + h0
5606 //
5607 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5608 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
5609 // h2 = h1 * r1 + h1
5610 //
5611 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5612 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
5613 //
5614 // sqrt(x) = g3
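 //
 // Written as plain scalar math (an illustrative sketch only, with a
 // hypothetical rsq() standing in for V_RSQ_F64):
 //   y0 = rsq(x); g = x * y0; h = 0.5 * y0;
 //   r = fma(-h, g, 0.5); g = fma(g, r, g); h = fma(h, r, h);
 //   d = fma(-g, g, x);   g = fma(d, h, g);   // first correction
 //   d = fma(-g, g, x);   g = fma(d, h, g);   // second correction, g ~ sqrt(x)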
5615
5616 const LLT S1 = LLT::scalar(1);
5617 const LLT S32 = LLT::scalar(32);
5618 const LLT F64 = LLT::scalar(64);
5619
5620 Register Dst = MI.getOperand(0).getReg();
5621 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5622
5623 Register X = MI.getOperand(1).getReg();
5624 unsigned Flags = MI.getFlags();
5625
5626 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
5627
5628 auto ZeroInt = B.buildConstant(S32, 0);
5629 auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
5630
5631 // Scale up input if it is too small.
5632 auto ScaleUpFactor = B.buildConstant(S32, 256);
5633 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
5634 auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
5635
5636 auto SqrtY =
5637 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));
5638
5639 auto Half = B.buildFConstant(F64, 0.5);
5640 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
5641 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
5642
5643 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
5644 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
5645
5646 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
5647 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
5648
5649 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
5650 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
5651
5652 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
5653
5654 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
5655 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
5656
5657 auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
5658
5659 // Scale down the result.
5660 auto ScaleDownFactor = B.buildConstant(S32, -128);
5661 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
5662 SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
5663
5664 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
5665 // with finite only or nsz because rsq(+/-0) = +/-inf
5666
5667 // TODO: Check for DAZ and expand to subnormals
5668 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5669
5670 // If x is +INF, +0, or -0, use its original value
5671 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
5672
5673 MI.eraseFromParent();
5674 return true;
5675}
5676
5677bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
5678 MachineRegisterInfo &MRI,
5679 MachineIRBuilder &B) const {
5680 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5681 if (Ty == LLT::scalar(32))
5682 return legalizeFSQRTF32(MI, MRI, B);
5683 if (Ty == LLT::scalar(64))
5684 return legalizeFSQRTF64(MI, MRI, B);
5685 if (Ty == LLT::scalar(16))
5686 return legalizeFSQRTF16(MI, MRI, B);
5687 return false;
5688}
5689
5690// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5691// FIXME: Why do we handle this one but not other removed instructions?
5692//
5693// Reciprocal square root. The clamp prevents infinite results, clamping
5694// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
5695// +-max_float.
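// Equivalently: result = max(min(rsq(x), +max_float), -max_float), using the
// IEEE or non-IEEE min/max forms depending on the function's FP mode.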
5696bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
5697 MachineRegisterInfo &MRI,
5698 MachineIRBuilder &B) const {
5699 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5700 return true;
5701
5702 Register Dst = MI.getOperand(0).getReg();
5703 Register Src = MI.getOperand(2).getReg();
5704 auto Flags = MI.getFlags();
5705
5706 LLT Ty = MRI.getType(Dst);
5707
5708 const fltSemantics *FltSemantics;
5709 if (Ty == LLT::scalar(32))
5710 FltSemantics = &APFloat::IEEEsingle();
5711 else if (Ty == LLT::scalar(64))
5712 FltSemantics = &APFloat::IEEEdouble();
5713 else
5714 return false;
5715
5716 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
5717 .addUse(Src)
5718 .setMIFlags(Flags);
5719
5720 // We don't need to concern ourselves with the snan handling difference, since
5721 // the rsq has already quieted (or not) the input; use the form that will directly select.
5722 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5723 const bool UseIEEE = MFI->getMode().IEEE;
5724
5725 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
5726 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
5727 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
5728
5729 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
5730
5731 if (UseIEEE)
5732 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
5733 else
5734 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
5735 MI.eraseFromParent();
5736 return true;
5737}
5738
5739// TODO: Fix pointer type handling
5740bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
5741 MachineInstr &MI,
5742 Intrinsic::ID IID) const {
5743
5744 MachineIRBuilder &B = Helper.MIRBuilder;
5745 MachineRegisterInfo &MRI = *B.getMRI();
5746
5747 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5748 IID == Intrinsic::amdgcn_permlanex16;
5749 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
5750 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
5751
5752 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
5753 Register Src2, LLT VT) -> Register {
5754 auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
5755 switch (IID) {
5756 case Intrinsic::amdgcn_readfirstlane:
5757 case Intrinsic::amdgcn_permlane64:
5758 return LaneOp.getReg(0);
5759 case Intrinsic::amdgcn_readlane:
5760 case Intrinsic::amdgcn_set_inactive:
5761 case Intrinsic::amdgcn_set_inactive_chain_arg:
5762 return LaneOp.addUse(Src1).getReg(0);
5763 case Intrinsic::amdgcn_writelane:
5764 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
5765 case Intrinsic::amdgcn_permlane16:
5766 case Intrinsic::amdgcn_permlanex16: {
5767 Register Src3 = MI.getOperand(5).getReg();
5768 int64_t Src4 = MI.getOperand(6).getImm();
5769 int64_t Src5 = MI.getOperand(7).getImm();
5770 return LaneOp.addUse(Src1)
5771 .addUse(Src2)
5772 .addUse(Src3)
5773 .addImm(Src4)
5774 .addImm(Src5)
5775 .getReg(0);
5776 }
5777 case Intrinsic::amdgcn_mov_dpp8:
5778 return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
5779 case Intrinsic::amdgcn_update_dpp:
5780 return LaneOp.addUse(Src1)
5781 .addImm(MI.getOperand(4).getImm())
5782 .addImm(MI.getOperand(5).getImm())
5783 .addImm(MI.getOperand(6).getImm())
5784 .addImm(MI.getOperand(7).getImm())
5785 .getReg(0);
5786 default:
5787 llvm_unreachable("unhandled lane op");
5788 }
5789 };
5790
5791 Register DstReg = MI.getOperand(0).getReg();
5792 Register Src0 = MI.getOperand(2).getReg();
5793 Register Src1, Src2;
5794 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
5795 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
5796 Src1 = MI.getOperand(3).getReg();
5797 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
5798 Src2 = MI.getOperand(4).getReg();
5799 }
5800 }
5801
5802 LLT Ty = MRI.getType(DstReg);
5803 unsigned Size = Ty.getSizeInBits();
5804
5805 unsigned SplitSize = 32;
5806 if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
5807 ST.hasDPALU_DPP() &&
5808 AMDGPU::isLegalDPALU_DPPControl(ST, MI.getOperand(4).getImm()))
5809 SplitSize = 64;
5810
5811 if (Size == SplitSize) {
5812 // Already legal
5813 return true;
5814 }
5815
5816 if (Size < 32) {
5817 Src0 = B.buildAnyExt(S32, Src0).getReg(0);
5818
5819 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5820 Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
5821
5822 if (IID == Intrinsic::amdgcn_writelane)
5823 Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
5824
5825 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
5826 B.buildTrunc(DstReg, LaneOpDst);
5827 MI.eraseFromParent();
5828 return true;
5829 }
5830
5831 if (Size % SplitSize != 0)
5832 return false;
5833
5834 LLT PartialResTy = LLT::scalar(SplitSize);
5835 bool NeedsBitcast = false;
5836 if (Ty.isVector()) {
5837 LLT EltTy = Ty.getElementType();
5838 unsigned EltSize = EltTy.getSizeInBits();
5839 if (EltSize == SplitSize) {
5840 PartialResTy = EltTy;
5841 } else if (EltSize == 16 || EltSize == 32) {
5842 unsigned NElem = SplitSize / EltSize;
5843 PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem));
5844 } else {
5845 // Handle all other cases via S32/S64 pieces
5846 NeedsBitcast = true;
5847 }
5848 }
5849
5850 SmallVector<Register, 4> PartialRes;
5851 unsigned NumParts = Size / SplitSize;
5852 MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
5853 MachineInstrBuilder Src1Parts, Src2Parts;
5854
5855 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5856 Src1Parts = B.buildUnmerge(PartialResTy, Src1);
5857
5858 if (IID == Intrinsic::amdgcn_writelane)
5859 Src2Parts = B.buildUnmerge(PartialResTy, Src2);
5860
5861 for (unsigned i = 0; i < NumParts; ++i) {
5862 Src0 = Src0Parts.getReg(i);
5863
5864 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5865 Src1 = Src1Parts.getReg(i);
5866
5867 if (IID == Intrinsic::amdgcn_writelane)
5868 Src2 = Src2Parts.getReg(i);
5869
5870 PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
5871 }
5872
5873 if (NeedsBitcast)
5874 B.buildBitcast(DstReg, B.buildMergeLikeInstr(
5875 LLT::scalar(Ty.getSizeInBits()), PartialRes));
5876 else
5877 B.buildMergeLikeInstr(DstReg, PartialRes);
5878
5879 MI.eraseFromParent();
5880 return true;
5881}
5882
5883bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
5884 MachineRegisterInfo &MRI,
5885 MachineIRBuilder &B) const {
5886 uint64_t Offset =
5887 ST.getTargetLowering()->getImplicitParameterOffset(
5888 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
5889 LLT DstTy = MRI.getType(DstReg);
5890 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
5891
5892 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
5893 if (!loadInputValue(KernargPtrReg, B,
5894 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5895 return false;
5896
5897 B.buildObjectPtrOffset(DstReg, KernargPtrReg,
5898 B.buildConstant(IdxTy, Offset).getReg(0));
5899 return true;
5900}
5901
5902/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
5903/// bits of the pointer and replace them with the stride argument, then
5904/// merge_values everything together. In the common case of a raw buffer (the
5905/// stride component is 0), we can just AND off the upper half.
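/// For example, for a pointer P and stride S, the resulting descriptor words
/// are { P[31:0], (S << 16) | P[47:32], NumRecords, Flags }.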
5906bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
5907 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5908 Register Result = MI.getOperand(0).getReg();
5909 Register Pointer = MI.getOperand(2).getReg();
5910 Register Stride = MI.getOperand(3).getReg();
5911 Register NumRecords = MI.getOperand(4).getReg();
5912 Register Flags = MI.getOperand(5).getReg();
5913
5914 LLT S32 = LLT::scalar(32);
5915
5916 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5917 auto Unmerge = B.buildUnmerge(S32, Pointer);
5918 Register LowHalf = Unmerge.getReg(0);
5919 Register HighHalf = Unmerge.getReg(1);
5920
5921 auto AndMask = B.buildConstant(S32, 0x0000ffff);
5922 auto Masked = B.buildAnd(S32, HighHalf, AndMask);
5923
5924 MachineInstrBuilder NewHighHalf = Masked;
5925 std::optional<ValueAndVReg> StrideConst =
5926 getIConstantVRegValWithLookThrough(Stride, *B.getMRI());
5927 if (!StrideConst || !StrideConst->Value.isZero()) {
5928 MachineInstrBuilder ShiftedStride;
5929 if (StrideConst) {
5930 uint32_t StrideVal = StrideConst->Value.getZExtValue();
5931 uint32_t ShiftedStrideVal = StrideVal << 16;
5932 ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
5933 } else {
5934 auto ExtStride = B.buildAnyExt(S32, Stride);
5935 auto ShiftConst = B.buildConstant(S32, 16);
5936 ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
5937 }
5938 NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
5939 }
5940 Register NewHighHalfReg = NewHighHalf.getReg(0);
5941 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
5942 MI.eraseFromParent();
5943 return true;
5944}
5945
5946bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
5947 MachineRegisterInfo &MRI,
5948 MachineIRBuilder &B) const {
5949 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5950 if (!MFI->isEntryFunction()) {
5951 return legalizePreloadedArgIntrin(MI, MRI, B,
5952 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
5953 }
5954
5955 Register DstReg = MI.getOperand(0).getReg();
5956 if (!getImplicitArgPtr(DstReg, MRI, B))
5957 return false;
5958
5959 MI.eraseFromParent();
5960 return true;
5961}
5962
5963bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
5964 MachineRegisterInfo &MRI,
5965 MachineIRBuilder &B) const {
5966 Function &F = B.getMF().getFunction();
5967 std::optional<uint32_t> KnownSize =
5968 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
5969 if (KnownSize.has_value())
5970 B.buildConstant(DstReg, *KnownSize);
5971 return false;
5972}
5973
5974bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
5975 MachineRegisterInfo &MRI,
5976 MachineIRBuilder &B) const {
5977
5978 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5979 if (!MFI->isEntryFunction()) {
5980 return legalizePreloadedArgIntrin(MI, MRI, B,
5981 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5982 }
5983
5984 Register DstReg = MI.getOperand(0).getReg();
5985 if (!getLDSKernelId(DstReg, MRI, B))
5986 return false;
5987
5988 MI.eraseFromParent();
5989 return true;
5990}
5991
5992bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
5993 MachineRegisterInfo &MRI,
5994 MachineIRBuilder &B,
5995 unsigned AddrSpace) const {
5996 const LLT S32 = LLT::scalar(32);
5997 auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());
5998 Register Hi32 = Unmerge.getReg(1);
5999
6000 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS &&
6001 ST.hasGloballyAddressableScratch()) {
6002 Register FlatScratchBaseHi =
6003 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
6004 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
6005 .getReg(0);
6006 MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
6007 // Test bits 63..58 against the aperture address.
6008 Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);
6009 B.buildICmp(ICmpInst::ICMP_ULT, MI.getOperand(0), XOR,
6010 B.buildConstant(S32, 1u << 26));
6011 } else {
6012 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
6013 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
6014 }
6015 MI.eraseFromParent();
6016 return true;
6017}
6018
6019// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
6020// offset (the offset that is included in bounds checking and swizzling, to be
6021// split between the instruction's voffset and immoffset fields) and soffset
6022// (the offset that is excluded from bounds checking and swizzling, to go in
6023// the instruction's soffset field). This function takes the first kind of
6024// offset and figures out how to split it between voffset and immoffset.
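// For example, assuming a 12-bit immoffset field (MaxImm = 4095), a constant
// offset of 4100 is split into a voffset contribution of 4096 and an
// immoffset of 4.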
6025std::pair<Register, unsigned>
6026AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
6027 Register OrigOffset) const {
6028 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
6029 Register BaseReg;
6030 unsigned ImmOffset;
6031 const LLT S32 = LLT::scalar(32);
6032 MachineRegisterInfo &MRI = *B.getMRI();
6033
6034 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
6035 // being added, so we can only safely match a 32-bit addition with no unsigned
6036 // overflow.
6037 bool CheckNUW = AMDGPU::isGFX1250(ST);
6038 std::tie(BaseReg, ImmOffset) = AMDGPU::getBaseWithConstantOffset(
6039 MRI, OrigOffset, /*KnownBits=*/nullptr, CheckNUW);
6040
6041 // If BaseReg is a pointer, convert it to int.
6042 if (MRI.getType(BaseReg).isPointer())
6043 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
6044
6045 // If the immediate value is too big for the immoffset field, put only bits
6046 // that would normally fit in the immoffset field. The remaining value that
6047 // is copied/added for the voffset field is a large power of 2, and it
6048 // stands more chance of being CSEd with the copy/add for another similar
6049 // load/store.
6050 // However, do not do that rounding down if that is a negative
6051 // number, as it appears to be illegal to have a negative offset in the
6052 // vgpr, even if adding the immediate offset makes it positive.
6053 unsigned Overflow = ImmOffset & ~MaxImm;
6054 ImmOffset -= Overflow;
6055 if ((int32_t)Overflow < 0) {
6056 Overflow += ImmOffset;
6057 ImmOffset = 0;
6058 }
6059
6060 if (Overflow != 0) {
6061 if (!BaseReg) {
6062 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
6063 } else {
6064 auto OverflowVal = B.buildConstant(S32, Overflow);
6065 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
6066 }
6067 }
6068
6069 if (!BaseReg)
6070 BaseReg = B.buildConstant(S32, 0).getReg(0);
6071
6072 return std::pair(BaseReg, ImmOffset);
6073}
6074
6075/// Handle register layout difference for f16 images for some subtargets.
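/// With unpacked D16 (ST.hasUnpackedD16VMem()), each 16-bit element is
/// any-extended into its own 32-bit register; otherwise elements stay packed
/// two per dword, with <3 x s16> padded to <4 x s16>.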
6076Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
6077 MachineRegisterInfo &MRI,
6078 Register Reg,
6079 bool ImageStore) const {
6080 const LLT S16 = LLT::scalar(16);
6081 const LLT S32 = LLT::scalar(32);
6082 LLT StoreVT = MRI.getType(Reg);
6083 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
6084
6085 if (ST.hasUnpackedD16VMem()) {
6086 auto Unmerge = B.buildUnmerge(S16, Reg);
6087
6088 SmallVector<Register, 4> WideRegs;
6089 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6090 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
6091
6092 int NumElts = StoreVT.getNumElements();
6093
6094 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
6095 .getReg(0);
6096 }
6097
6098 if (ImageStore && ST.hasImageStoreD16Bug()) {
6099 if (StoreVT.getNumElements() == 2) {
6100 SmallVector<Register, 4> PackedRegs;
6101 Reg = B.buildBitcast(S32, Reg).getReg(0);
6102 PackedRegs.push_back(Reg);
6103 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
6104 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
6105 .getReg(0);
6106 }
6107
6108 if (StoreVT.getNumElements() == 3) {
6109 SmallVector<Register, 4> PackedRegs;
6110 auto Unmerge = B.buildUnmerge(S16, Reg);
6111 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6112 PackedRegs.push_back(Unmerge.getReg(I));
6113 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
6114 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
6115 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
6116 }
6117
6118 if (StoreVT.getNumElements() == 4) {
6119 SmallVector<Register, 4> PackedRegs;
6120 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
6121 auto Unmerge = B.buildUnmerge(S32, Reg);
6122 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6123 PackedRegs.push_back(Unmerge.getReg(I));
6124 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
6125 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
6126 .getReg(0);
6127 }
6128
6129 llvm_unreachable("invalid data type");
6130 }
6131
6132 if (StoreVT == LLT::fixed_vector(3, S16)) {
6133 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
6134 .getReg(0);
6135 }
6136 return Reg;
6137}
6138
6139Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B,
6140 Register VData, LLT MemTy,
6141 bool IsFormat) const {
6142 MachineRegisterInfo *MRI = B.getMRI();
6143 LLT Ty = MRI->getType(VData);
6144
6145 const LLT S16 = LLT::scalar(16);
6146
6147 // Fixup buffer resources themselves needing to be v4i128.
6148 if (hasBufferRsrcWorkaround(Ty))
6149 return castBufferRsrcToV4I32(VData, B);
6150
6151 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6152 Ty = getBitcastRegisterType(Ty);
6153 VData = B.buildBitcast(Ty, VData).getReg(0);
6154 }
6155 // Fixup illegal register types for i8 stores.
6156 if (Ty == LLT::scalar(8) || Ty == S16) {
6157 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
6158 return AnyExt;
6159 }
6160
6161 if (Ty.isVector()) {
6162 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
6163 if (IsFormat)
6164 return handleD16VData(B, *MRI, VData);
6165 }
6166 }
6167
6168 return VData;
6169}
6170
6171bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
6172 LegalizerHelper &Helper,
6173 bool IsTyped,
6174 bool IsFormat) const {
6175 MachineIRBuilder &B = Helper.MIRBuilder;
6176 MachineRegisterInfo &MRI = *B.getMRI();
6177
6178 Register VData = MI.getOperand(1).getReg();
6179 LLT Ty = MRI.getType(VData);
6180 LLT EltTy = Ty.getScalarType();
6181 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6182 const LLT S32 = LLT::scalar(32);
6183
6184 MachineMemOperand *MMO = *MI.memoperands_begin();
6185 const int MemSize = MMO->getSize().getValue();
6186 LLT MemTy = MMO->getMemoryType();
6187
6188 VData = fixStoreSourceType(B, VData, MemTy, IsFormat);
6189
6191 Register RSrc = MI.getOperand(2).getReg();
6192
6193 unsigned ImmOffset;
6194
6195 // The typed intrinsics add an immediate after the registers.
6196 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6197
6198 // The struct intrinsic variants add one additional operand over raw.
6199 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6200 Register VIndex;
6201 int OpOffset = 0;
6202 if (HasVIndex) {
6203 VIndex = MI.getOperand(3).getReg();
6204 OpOffset = 1;
6205 } else {
6206 VIndex = B.buildConstant(S32, 0).getReg(0);
6207 }
6208
6209 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6210 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6211
6212 unsigned Format = 0;
6213 if (IsTyped) {
6214 Format = MI.getOperand(5 + OpOffset).getImm();
6215 ++OpOffset;
6216 }
6217
6218 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6219
6220 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6221
6222 unsigned Opc;
6223 if (IsTyped) {
6224 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
6225 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
6226 } else if (IsFormat) {
6227 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
6228 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
6229 } else {
6230 switch (MemSize) {
6231 case 1:
6232 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
6233 break;
6234 case 2:
6235 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
6236 break;
6237 default:
6238 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
6239 break;
6240 }
6241 }
6242
6243 auto MIB = B.buildInstr(Opc)
6244 .addUse(VData) // vdata
6245 .addUse(RSrc) // rsrc
6246 .addUse(VIndex) // vindex
6247 .addUse(VOffset) // voffset
6248 .addUse(SOffset) // soffset
6249 .addImm(ImmOffset); // offset(imm)
6250
6251 if (IsTyped)
6252 MIB.addImm(Format);
6253
6254 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6255 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6256 .addMemOperand(MMO);
6257
6258 MI.eraseFromParent();
6259 return true;
6260}
6261
6262static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
6263 Register VIndex, Register VOffset, Register SOffset,
6264 unsigned ImmOffset, unsigned Format,
6265 unsigned AuxiliaryData, MachineMemOperand *MMO,
6266 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
6267 auto MIB = B.buildInstr(Opc)
6268 .addDef(LoadDstReg) // vdata
6269 .addUse(RSrc) // rsrc
6270 .addUse(VIndex) // vindex
6271 .addUse(VOffset) // voffset
6272 .addUse(SOffset) // soffset
6273 .addImm(ImmOffset); // offset(imm)
6274
6275 if (IsTyped)
6276 MIB.addImm(Format);
6277
6278 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6279 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6280 .addMemOperand(MMO);
6281}
6282
6283bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
6284 LegalizerHelper &Helper,
6285 bool IsFormat,
6286 bool IsTyped) const {
6287 MachineIRBuilder &B = Helper.MIRBuilder;
6288 MachineRegisterInfo &MRI = *B.getMRI();
6289 GISelChangeObserver &Observer = Helper.Observer;
6290
6291 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
6292 MachineMemOperand *MMO = *MI.memoperands_begin();
6293 const LLT MemTy = MMO->getMemoryType();
6294 const LLT S32 = LLT::scalar(32);
6295
6296 Register Dst = MI.getOperand(0).getReg();
6297
6298 Register StatusDst;
6299 int OpOffset = 0;
6300 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
6301 bool IsTFE = MI.getNumExplicitDefs() == 2;
6302 if (IsTFE) {
6303 StatusDst = MI.getOperand(1).getReg();
6304 ++OpOffset;
6305 }
6306
6307 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
6308 Register RSrc = MI.getOperand(2 + OpOffset).getReg();
6309
6310 // The typed intrinsics add an immediate after the registers.
6311 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6312
6313 // The struct intrinsic variants add one additional operand over raw.
6314 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
6315 Register VIndex;
6316 if (HasVIndex) {
6317 VIndex = MI.getOperand(3 + OpOffset).getReg();
6318 ++OpOffset;
6319 } else {
6320 VIndex = B.buildConstant(S32, 0).getReg(0);
6321 }
6322
6323 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6324 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6325
6326 unsigned Format = 0;
6327 if (IsTyped) {
6328 Format = MI.getOperand(5 + OpOffset).getImm();
6329 ++OpOffset;
6330 }
6331
6332 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6333 unsigned ImmOffset;
6334
6335 LLT Ty = MRI.getType(Dst);
6336 // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
6337 // logic doesn't have to handle that case.
6338 if (hasBufferRsrcWorkaround(Ty)) {
6339 Observer.changingInstr(MI);
6340 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
6341 Observer.changedInstr(MI);
6342 Dst = MI.getOperand(0).getReg();
6343 B.setInsertPt(B.getMBB(), MI);
6344 }
6345 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6346 Ty = getBitcastRegisterType(Ty);
6347 Observer.changingInstr(MI);
6348 Helper.bitcastDst(MI, Ty, 0);
6349 Observer.changedInstr(MI);
6350 Dst = MI.getOperand(0).getReg();
6351 B.setInsertPt(B.getMBB(), MI);
6352 }
6353
6354 LLT EltTy = Ty.getScalarType();
6355 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6356 const bool Unpacked = ST.hasUnpackedD16VMem();
6357
6358 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6359
6360 unsigned Opc;
6361
6362 // TODO: Support TFE for typed and narrow loads.
6363 if (IsTyped) {
6364 if (IsTFE)
6365 return false;
6366 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6367 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6368 } else if (IsFormat) {
6369 if (IsD16) {
6370 if (IsTFE)
6371 return false;
6372 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6373 } else {
6374 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6375 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6376 }
6377 } else {
6378 switch (MemTy.getSizeInBits()) {
6379 case 8:
6380 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6381 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6382 break;
6383 case 16:
6384 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6385 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6386 break;
6387 default:
6388 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6389 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6390 break;
6391 }
6392 }
6393
6394 if (IsTFE) {
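 // TFE loads return the data dwords plus one extra status dword, which is
 // split back out into StatusDst below.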
6395 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
6396 unsigned NumLoadDWords = NumValueDWords + 1;
6397 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
6398 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
6399 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6400 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6401 if (MemTy.getSizeInBits() < 32) {
6402 Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
6403 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6404 B.buildTrunc(Dst, ExtDst);
6405 } else if (NumValueDWords == 1) {
6406 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6407 } else {
6408 SmallVector<Register, 5> LoadElts;
6409 for (unsigned I = 0; I != NumValueDWords; ++I)
6410 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
6411 LoadElts.push_back(StatusDst);
6412 B.buildUnmerge(LoadElts, LoadDstReg);
6413 LoadElts.truncate(NumValueDWords);
6414 B.buildMergeLikeInstr(Dst, LoadElts);
6415 }
6416 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6417 (IsD16 && !Ty.isVector())) {
6418 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
6419 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6420 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6421 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6422 B.buildTrunc(Dst, LoadDstReg);
6423 } else if (Unpacked && IsD16 && Ty.isVector()) {
6424 LLT UnpackedTy = Ty.changeElementSize(32);
6425 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6426 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6427 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6428 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6429 // FIXME: G_TRUNC should work, but legalization currently fails
6430 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
6432 SmallVector<Register, 4> Repack;
6433 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
6434 B.buildMergeLikeInstr(Dst, Repack);
6435 } else {
6436 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6437 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6438 }
6439
6440 MI.eraseFromParent();
6441 return true;
6442}
6443
6444static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6445 switch (IntrID) {
6446 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6447 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6448 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6449 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6450 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6451 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6452 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6453 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6454 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6455 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6456 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6457 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6458 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6459 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6460 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6461 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6462 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6463 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6464 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6465 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6466 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6467 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6468 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6469 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6470 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6471 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6472 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6473 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6474 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6475 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6476 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6477 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6478 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6479 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6480 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6481 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6482 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6483 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6484 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6485 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6486 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6487 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6488 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6489 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6490 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6491 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6492 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6493 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6494 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6495 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6496 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6497 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6498 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6499 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6500 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6501 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6502 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6503 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6504 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6505 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6506 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6507 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6508 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6509 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6510 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6511 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6512 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6513 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6514 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6515 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6516 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6517 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6518 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6519 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6520 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6521 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6522 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6523 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6524 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6525 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6526 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6527 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6528 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6529 default:
6530 llvm_unreachable("unhandled atomic opcode");
6531 }
6532}
6533
6534bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
6535 MachineIRBuilder &B,
6536 Intrinsic::ID IID) const {
6537 const bool IsCmpSwap =
6538 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6539 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6540 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6541 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6542
6543 Register Dst = MI.getOperand(0).getReg();
6544 // Since we don't have 128-bit atomics, we don't need to handle the case of
6545 // p8 arguments to the atomic itself.
6546 Register VData = MI.getOperand(2).getReg();
6547
6548 Register CmpVal;
6549 int OpOffset = 0;
6550
6551 if (IsCmpSwap) {
6552 CmpVal = MI.getOperand(3).getReg();
6553 ++OpOffset;
6554 }
6555
6556 castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
6557 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
6558 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6559
6560 // The struct intrinsic variants add one additional operand over raw.
6561 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6562 Register VIndex;
6563 if (HasVIndex) {
6564 VIndex = MI.getOperand(4 + OpOffset).getReg();
6565 ++OpOffset;
6566 } else {
6567 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
6568 }
6569
6570 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
6571 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
6572 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
6573
6574 MachineMemOperand *MMO = *MI.memoperands_begin();
6575
6576 unsigned ImmOffset;
6577 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6578
6579 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
6580 .addDef(Dst)
6581 .addUse(VData); // vdata
6582
6583 if (IsCmpSwap)
6584 MIB.addReg(CmpVal);
6585
6586 MIB.addUse(RSrc) // rsrc
6587 .addUse(VIndex) // vindex
6588 .addUse(VOffset) // voffset
6589 .addUse(SOffset) // soffset
6590 .addImm(ImmOffset) // offset(imm)
6591 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6592 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6593 .addMemOperand(MMO);
6594
6595 MI.eraseFromParent();
6596 return true;
6597}
6598
6599/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
6600/// vector with s16 typed elements.
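/// For example, three s16 coordinates (x, y, z) become the two dwords
/// <x, y> and <z, undef>.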
6601static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
6602 SmallVectorImpl<Register> &PackedAddrs,
6603 unsigned ArgOffset,
6604 const AMDGPU::ImageDimIntrinsicInfo *Intr,
6605 bool IsA16, bool IsG16) {
6606 const LLT S16 = LLT::scalar(16);
6607 const LLT V2S16 = LLT::fixed_vector(2, 16);
6608 auto EndIdx = Intr->VAddrEnd;
6609
6610 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6611 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6612 if (!SrcOp.isReg())
6613 continue; // _L to _LZ may have eliminated this.
6614
6615 Register AddrReg = SrcOp.getReg();
6616
6617 if ((I < Intr->GradientStart) ||
6618 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6619 (I >= Intr->CoordStart && !IsA16)) {
6620 if ((I < Intr->GradientStart) && IsA16 &&
6621 (B.getMRI()->getType(AddrReg) == S16)) {
6622 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6623 // Special handling of bias when A16 is on. Bias is of type half but
6624 // occupies a full 32-bit register.
6625 PackedAddrs.push_back(
6626 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6627 .getReg(0));
6628 } else {
6629 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6630 "Bias needs to be converted to 16 bit in A16 mode");
6631 // Handle any gradient or coordinate operands that should not be packed
6632 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
6633 PackedAddrs.push_back(AddrReg);
6634 }
6635 } else {
6636 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6637 // derivatives dx/dh and dx/dv are packed with undef.
6638 if (((I + 1) >= EndIdx) ||
6639 ((Intr->NumGradients / 2) % 2 == 1 &&
6640 (I == static_cast<unsigned>(Intr->GradientStart +
6641 (Intr->NumGradients / 2) - 1) ||
6642 I == static_cast<unsigned>(Intr->GradientStart +
6643 Intr->NumGradients - 1))) ||
6644 // Check for _L to _LZ optimization
6645 !MI.getOperand(ArgOffset + I + 1).isReg()) {
6646 PackedAddrs.push_back(
6647 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6648 .getReg(0));
6649 } else {
6650 PackedAddrs.push_back(
6651 B.buildBuildVector(
6652 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
6653 .getReg(0));
6654 ++I;
6655 }
6656 }
6657 }
6658}
6659
6660/// Convert from separate vaddr components to a single vector address register,
6661/// and replace the remaining operands with $noreg.
6662static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
6663 int DimIdx, int NumVAddrs) {
6664 const LLT S32 = LLT::scalar(32);
6665 (void)S32;
6666 SmallVector<Register, 8> AddrRegs;
6667 for (int I = 0; I != NumVAddrs; ++I) {
6668 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6669 if (SrcOp.isReg()) {
6670 AddrRegs.push_back(SrcOp.getReg());
6671 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6672 }
6673 }
6674
6675 int NumAddrRegs = AddrRegs.size();
6676 if (NumAddrRegs != 1) {
6677 auto VAddr =
6678 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
6679 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
6680 }
6681
6682 for (int I = 1; I != NumVAddrs; ++I) {
6683 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6684 if (SrcOp.isReg())
6685 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
6686 }
6687}
6688
6689/// Rewrite image intrinsics to use register layouts expected by the subtarget.
6690///
6691/// Depending on the subtarget, load/store with 16-bit element data need to be
6692/// rewritten to use the low half of 32-bit registers, or directly use a packed
6693/// layout. 16-bit addresses should also sometimes be packed into 32-bit
6694/// registers.
6695///
6696/// We don't want to directly select image instructions just yet, but also want
6697/// to expose all register repacking to the legalizer/combiners. We also don't
6698/// want a selected instruction entering RegBankSelect. In order to avoid
6699/// defining a multitude of intermediate image instructions, directly hack on
6700/// the intrinsic's arguments. In cases like a16 addresses, this requires
6701/// padding now unnecessary arguments with $noreg.
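/// For example, an A16 sample with three s16 coordinates has them repacked
/// into two v2s16 vaddr operands, and the now-unused trailing vaddr operands
/// are replaced with $noreg.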
6702bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
6703 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
6704 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
6705
6706 const MachineFunction &MF = *MI.getMF();
6707 const unsigned NumDefs = MI.getNumExplicitDefs();
6708 const unsigned ArgOffset = NumDefs + 1;
6709 bool IsTFE = NumDefs == 2;
6710 // We are only processing the operands of d16 image operations on subtargets
6711 // that use the unpacked register layout, or need to repack the TFE result.
6712
6713 // TODO: Do we need to guard against already legalized intrinsics?
6714 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6715 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
6716
6717 MachineRegisterInfo *MRI = B.getMRI();
6718 const LLT S32 = LLT::scalar(32);
6719 const LLT S16 = LLT::scalar(16);
6720 const LLT V2S16 = LLT::fixed_vector(2, 16);
6721
6722 unsigned DMask = 0;
6723 Register VData;
6724 LLT Ty;
6725
6726 if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
6727 VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
6728 Ty = MRI->getType(VData);
6729 }
6730
6731 const bool IsAtomicPacked16Bit =
6732 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6733 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6734
6735 // Check for 16 bit addresses and pack if true.
6736 LLT GradTy =
6737 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
6738 LLT AddrTy =
6739 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
6740 const bool IsG16 =
6741 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6742 const bool IsA16 = AddrTy == S16;
6743 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
6744
6745 int DMaskLanes = 0;
6746 if (!BaseOpcode->Atomic) {
6747 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
6748 if (BaseOpcode->Gather4) {
6749 DMaskLanes = 4;
6750 } else if (DMask != 0) {
6751 DMaskLanes = llvm::popcount(DMask);
6752 } else if (!IsTFE && !BaseOpcode->Store) {
6753 // If dmask is 0, this is a no-op load. This can be eliminated.
6754 B.buildUndef(MI.getOperand(0));
6755 MI.eraseFromParent();
6756 return true;
6757 }
6758 }
6759
6760 Observer.changingInstr(MI);
6761 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
6762
6763 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6764 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6765 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6766 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6767 unsigned NewOpcode = LoadOpcode;
6768 if (BaseOpcode->Store)
6769 NewOpcode = StoreOpcode;
6770 else if (BaseOpcode->NoReturn)
6771 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
6772
6773 // Track that we legalized this
6774 MI.setDesc(B.getTII().get(NewOpcode));
6775
6776 // Expecting to get an error flag since TFC is on and dmask is 0. Force
6777 // dmask to be at least 1, otherwise the instruction will fail.
6778 if (IsTFE && DMask == 0) {
6779 DMask = 0x1;
6780 DMaskLanes = 1;
6781 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
6782 }
6783
6784 if (BaseOpcode->Atomic) {
6785 Register VData0 = MI.getOperand(2).getReg();
6786 LLT Ty = MRI->getType(VData0);
6787
6788 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6789 if (Ty.isVector() && !IsAtomicPacked16Bit)
6790 return false;
6791
6792 if (BaseOpcode->AtomicX2) {
6793 Register VData1 = MI.getOperand(3).getReg();
6794 // The two values are packed in one register.
6795 LLT PackedTy = LLT::fixed_vector(2, Ty);
6796 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
6797 MI.getOperand(2).setReg(Concat.getReg(0));
6798 MI.getOperand(3).setReg(AMDGPU::NoRegister);
6799 }
6800 }
6801
6802 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6803
6804 // Rewrite the addressing register layout before doing anything else.
6805 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6806 // 16 bit gradients are supported, but are tied to the A16 control
6807 // so both gradients and addresses must be 16 bit
6808 return false;
6809 }
6810
6811 if (IsA16 && !ST.hasA16()) {
6812 // A16 not supported
6813 return false;
6814 }
6815
6816 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
6817 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
6818
6819 if (IsA16 || IsG16) {
6820 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6821 // instructions expect VGPR_32
6822 SmallVector<Register, 4> PackedRegs;
6823
6824 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
6825
6826 // See also below in the non-a16 branch
6827 const bool UseNSA = ST.hasNSAEncoding() &&
6828 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
6829 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
6830 const bool UsePartialNSA =
6831 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
6832
6833 if (UsePartialNSA) {
6834 // Pack registers that would go over NSAMaxSize into last VAddr register
6835 LLT PackedAddrTy =
6836 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
6837 auto Concat = B.buildConcatVectors(
6838 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
6839 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
6840 PackedRegs.resize(NSAMaxSize);
6841 } else if (!UseNSA && PackedRegs.size() > 1) {
6842 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
6843 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
6844 PackedRegs[0] = Concat.getReg(0);
6845 PackedRegs.resize(1);
6846 }
6847
6848 const unsigned NumPacked = PackedRegs.size();
6849 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6850 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6851 if (!SrcOp.isReg()) {
6852 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
6853 continue;
6854 }
6855
6856 assert(SrcOp.getReg() != AMDGPU::NoRegister);
6857
6858 if (I - Intr->VAddrStart < NumPacked)
6859 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
6860 else
6861 SrcOp.setReg(AMDGPU::NoRegister);
6862 }
6863 } else {
6864 // If the register allocator cannot place the address registers contiguously
6865 // without introducing moves, then using the non-sequential address encoding
6866 // is always preferable, since it saves VALU instructions and is usually a
6867 // wash in terms of code size or even better.
6868 //
6869 // However, we currently have no way of hinting to the register allocator
6870 // that MIMG addresses should be placed contiguously when it is possible to
6871 // do so, so force non-NSA for the common 2-address case as a heuristic.
6872 //
6873 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
6874 // allocation when possible.
6875 //
6876 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
6877 // set of the remaining addresses.
6878 const bool UseNSA = ST.hasNSAEncoding() &&
6879 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
6880 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
6881 const bool UsePartialNSA =
6882 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
6883
6884 if (UsePartialNSA) {
6885 convertImageAddrToPacked(B, MI,
6886 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
6887 Intr->NumVAddrs - NSAMaxSize + 1);
6888 } else if (!UseNSA && Intr->NumVAddrs > 1) {
6889 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
6890 Intr->NumVAddrs);
6891 }
6892 }
6893
6894 int Flags = 0;
6895 if (IsA16)
6896 Flags |= 1;
6897 if (IsG16)
6898 Flags |= 2;
6899 MI.addOperand(MachineOperand::CreateImm(Flags));
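// Illustrative note (not part of the upstream source): the immediate added
// above encodes A16 in bit 0 and G16 in bit 1, so an A16-only image op gets
// Flags == 1, a G16-only op gets 2, and an op using both gets 3.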
6900
6901 if (BaseOpcode->NoReturn) { // No TFE for stores?
6902 // TODO: Handle dmask trim
6903 if (!Ty.isVector() || !IsD16)
6904 return true;
6905
6906 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
6907 if (RepackedReg != VData) {
6908 MI.getOperand(1).setReg(RepackedReg);
6909 }
6910
6911 return true;
6912 }
6913
6914 Register DstReg = MI.getOperand(0).getReg();
6915 const LLT EltTy = Ty.getScalarType();
6916 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
6917
6918 // Confirm that the return type is large enough for the dmask specified
6919 if (NumElts < DMaskLanes)
6920 return false;
6921
6922 if (NumElts > 4 || DMaskLanes > 4)
6923 return false;
6924
6925 // Image atomic instructions use DMask to specify how many bits the
6926 // input/output data will have: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
6927 // DMaskLanes defaults to '0' for image atomics.
6928 // We must make sure that atomic variants (especially packed ones) are not
6929 // truncated from v2s16 or v4s16 to s16.
6930 //
6931 // changeElementCount will be needed for image loads, where Ty is always scalar.
6932 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6933 const LLT AdjustedTy =
6934 DMaskLanes == 0
6935 ? Ty
6936 : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
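// Illustrative example (dmask value assumed, not part of the upstream source):
// a load declared as <4 x s32> whose dmask enables three channels has
// DMaskLanes == 3, so AdjustedTy becomes <3 x s32>; an image atomic keeps
// DMaskLanes == 0 and Ty is left untouched.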
6937
6938 // The raw dword aligned data component of the load. The only legal cases
6939 // where this matters should be when using the packed D16 format, for
6940 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
6941 LLT RoundedTy;
6942
6943 // S32 vector to cover all data, plus TFE result element.
6944 LLT TFETy;
6945
6946 // Register type to use for each loaded component. Will be S32 or V2S16.
6947 LLT RegTy;
6948
6949 if (IsD16 && ST.hasUnpackedD16VMem()) {
6950 RoundedTy =
6951 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
6952 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
6953 RegTy = S32;
6954 } else {
6955 unsigned EltSize = EltTy.getSizeInBits();
6956 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
6957 unsigned RoundedSize = 32 * RoundedElts;
6958 RoundedTy = LLT::scalarOrVector(
6959 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
6960 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
6961 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
6962 }
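// Worked example (values assumed, not part of the upstream source): a D16 load
// returning <3 x s16> with TFE on a packed-D16 subtarget has a 48-bit
// AdjustedTy, so RoundedElts == 2, RoundedSize == 64, RoundedTy == <4 x s16>,
// TFETy == <3 x s32> (two data dwords plus the status dword), and RegTy == S32
// because IsTFE is set. On an unpacked-D16 subtarget the same load instead
// gets RoundedTy == <3 x s32> and TFETy == <4 x s32>.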
6963
6964 // The return type does not need adjustment.
6965 // TODO: Should we change s16 case to s32 or <2 x s16>?
6966 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
6967 return true;
6968
6969 Register Dst1Reg;
6970
6971 // Insert after the instruction.
6972 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
6973
6974 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
6975 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
6976 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
6977 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
6978
6979 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
6980
6981 MI.getOperand(0).setReg(NewResultReg);
6982
6983 // In the IR, TFE is supposed to be used with a 2 element struct return
6984 // type. The instruction really returns these two values in one contiguous
6985 // register, with one additional dword beyond the loaded data. Rewrite the
6986 // return type to use a single register result.
6987
6988 if (IsTFE) {
6989 Dst1Reg = MI.getOperand(1).getReg();
6990 if (MRI->getType(Dst1Reg) != S32)
6991 return false;
6992
6993 // TODO: Make sure the TFE operand bit is set.
6994 MI.removeOperand(1);
6995
6996 // Handle the easy case that requires no repack instructions.
6997 if (Ty == S32) {
6998 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
6999 return true;
7000 }
7001 }
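// Illustrative note (not part of the upstream source): for a plain s32 load
// with TFE, LoadResultTy is <2 x s32>, and the unmerge above splits it
// directly into the data dword (DstReg) and the status dword (Dst1Reg), so no
// further repacking is needed.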
7002
7003 // Now figure out how to copy the new result register back into the old
7004 // result.
7005 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
7006
7007 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7008
7009 if (ResultNumRegs == 1) {
7010 assert(!IsTFE);
7011 ResultRegs[0] = NewResultReg;
7012 } else {
7013 // We have to repack into a new vector of some kind.
7014 for (int I = 0; I != NumDataRegs; ++I)
7015 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
7016 B.buildUnmerge(ResultRegs, NewResultReg);
7017
7018 // Drop the final TFE element to get the data part. The TFE result is
7019 // directly written to the right place already.
7020 if (IsTFE)
7021 ResultRegs.resize(NumDataRegs);
7022 }
7023
7024 // For an s16 scalar result, we form an s32 result with a truncate regardless
7025 // of packed vs. unpacked.
7026 if (IsD16 && !Ty.isVector()) {
7027 B.buildTrunc(DstReg, ResultRegs[0]);
7028 return true;
7029 }
7030
7031 // Avoid a build/concat_vector of 1 entry.
7032 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7033 B.buildBitcast(DstReg, ResultRegs[0]);
7034 return true;
7035 }
7036
7037 assert(Ty.isVector());
7038
7039 if (IsD16) {
7040 // For packed D16 results with TFE enabled, all the data components are
7041 // S32. Cast back to the expected type.
7042 //
7043 // TODO: We don't really need to load s32 elements. We would only need one
7044 // cast for the TFE result if a multiple of v2s16 was used.
7045 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
7046 for (Register &Reg : ResultRegs)
7047 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
7048 } else if (ST.hasUnpackedD16VMem()) {
7049 for (Register &Reg : ResultRegs)
7050 Reg = B.buildTrunc(S16, Reg).getReg(0);
7051 }
7052 }
7053
7054 auto padWithUndef = [&](LLT Ty, int NumElts) {
7055 if (NumElts == 0)
7056 return;
7057 Register Undef = B.buildUndef(Ty).getReg(0);
7058 for (int I = 0; I != NumElts; ++I)
7059 ResultRegs.push_back(Undef);
7060 };
7061
7062 // Pad out any elements eliminated due to the dmask.
7063 LLT ResTy = MRI->getType(ResultRegs[0]);
7064 if (!ResTy.isVector()) {
7065 padWithUndef(ResTy, NumElts - ResultRegs.size());
7066 B.buildBuildVector(DstReg, ResultRegs);
7067 return true;
7068 }
7069
7070 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
7071 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
7072
7073 // Deal with the one annoying legal case.
7074 const LLT V3S16 = LLT::fixed_vector(3, 16);
7075 if (Ty == V3S16) {
7076 if (IsTFE) {
7077 if (ResultRegs.size() == 1) {
7078 NewResultReg = ResultRegs[0];
7079 } else if (ResultRegs.size() == 2) {
7080 LLT V4S16 = LLT::fixed_vector(4, 16);
7081 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
7082 } else {
7083 return false;
7084 }
7085 }
7086
7087 if (MRI->getType(DstReg).getNumElements() <
7088 MRI->getType(NewResultReg).getNumElements()) {
7089 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
7090 } else {
7091 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
7092 }
7093 return true;
7094 }
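// Illustrative trace (values assumed, not part of the upstream source): for a
// packed-D16 <3 x s16> load with TFE, ResultRegs holds two v2s16 data pieces;
// the block above concatenates them to <4 x s16> and then trims back to the
// <3 x s16> destination with buildDeleteTrailingVectorElements.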
7095
7096 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
7097 B.buildConcatVectors(DstReg, ResultRegs);
7098 return true;
7099}
7100
7101 bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
7102 MachineInstr &MI) const {
7103 MachineIRBuilder &B = Helper.MIRBuilder;
7104 GISelChangeObserver &Observer = Helper.Observer;
7105
7106 Register OrigDst = MI.getOperand(0).getReg();
7107 Register Dst;
7108 LLT Ty = B.getMRI()->getType(OrigDst);
7109 unsigned Size = Ty.getSizeInBits();
7110 MachineFunction &MF = B.getMF();
7111 unsigned Opc = 0;
7112 if (Size < 32 && ST.hasScalarSubwordLoads()) {
7113 assert(Size == 8 || Size == 16);
7114 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
7115 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
7116 // The 8-bit and 16-bit scalar buffer load instructions have a 32-bit
7117 // destination register.
7118 Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
7119 } else {
7120 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
7121 Dst = OrigDst;
7122 }
7123
7124 Observer.changingInstr(MI);
7125
7126 // Handle needing to s.buffer.load() a p8 value.
7127 if (hasBufferRsrcWorkaround(Ty)) {
7128 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
7129 B.setInsertPt(B.getMBB(), MI);
7130 }
7131 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
7132 Ty = getBitcastRegisterType(Ty);
7133 Helper.bitcastDst(MI, Ty, 0);
7134 B.setInsertPt(B.getMBB(), MI);
7135 }
7136
7137 // FIXME: We don't really need this intermediate instruction. The intrinsic
7138 // should be fixed to have a memory operand. Since it's readnone, we're not
7139 // allowed to add one.
7140 MI.setDesc(B.getTII().get(Opc));
7141 MI.removeOperand(1); // Remove intrinsic ID
7142
7143 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
7144 const unsigned MemSize = (Size + 7) / 8;
7145 const Align MemAlign = B.getDataLayout().getABITypeAlign(
7146 getTypeForLLT(Ty, MF.getFunction().getContext()));
7147 MachineMemOperand *MMO = MF.getMachineMemOperand(
7148 MachinePointerInfo(),
7149 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
7150 MachineMemOperand::MOInvariant,
7151 MemSize, MemAlign);
7152 MI.addMemOperand(MF, MMO);
7153 if (Dst != OrigDst) {
7154 MI.getOperand(0).setReg(Dst);
7155 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
7156 B.buildTrunc(OrigDst, Dst);
7157 }
7158
7159 // If we don't have 96-bit result scalar loads, widening to 128-bit should
7160 // always be legal. We may need to restore this to a 96-bit result if it turns
7161 // out this needs to be converted to a vector load during RegBankSelect.
7162 if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
7163 if (Ty.isVector())
7164 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
7165 else
7166 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
7167 }
7168
7169 Observer.changedInstr(MI);
7170 return true;
7171}
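// Illustrative examples for the function above (values assumed, not part of
// the upstream source): an s8 s.buffer.load on a subtarget with scalar
// subword loads becomes G_AMDGPU_S_BUFFER_LOAD_UBYTE with a temporary s32
// destination that is truncated back into the original s8 result, while an
// s96 result on a subtarget without scalar dwordx3 loads is widened to s128
// and may be narrowed again during RegBankSelect.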
7172
7174 MachineInstr &MI) const {
7175 MachineIRBuilder &B = Helper.MIRBuilder;
7176 GISelChangeObserver &Observer = Helper.Observer;
7177 Observer.changingInstr(MI);
7178 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7179 MI.removeOperand(0); // Remove intrinsic ID
7180 castBufferRsrcArgToV4I32(MI, B, 0);
7181 Observer.changedInstr(MI);
7182 return true;
7183}
7184
7185// TODO: Move to selection
7186 bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
7187 MachineRegisterInfo &MRI,
7188 MachineIRBuilder &B) const {
7189 if (!ST.isTrapHandlerEnabled() ||
7190 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7191 return legalizeTrapEndpgm(MI, MRI, B);
7192
7193 return ST.supportsGetDoorbellID() ?
7194 legalizeTrapHsaQueuePtr(MI, MRI, B) : legalizeTrapHsa(MI, MRI, B);
7195}
7196
7197 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
7198 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
7199 const DebugLoc &DL = MI.getDebugLoc();
7200 MachineBasicBlock &BB = B.getMBB();
7201 MachineFunction *MF = BB.getParent();
7202
7203 if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
7204 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7205 .addImm(0);
7206 MI.eraseFromParent();
7207 return true;
7208 }
7209
7210 // We need a block split to make the real endpgm a terminator. We also don't
7211 // want to break phis in successor blocks, so we can't just delete to the
7212 // end of the block.
7213 BB.splitAt(MI, false /*UpdateLiveIns*/);
7214 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
7215 MF->push_back(TrapBB);
7216 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7217 .addImm(0);
7218 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
7219 .addMBB(TrapBB);
7220
7221 BB.addSuccessor(TrapBB);
7222 MI.eraseFromParent();
7223 return true;
7224}
7225
7226 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
7227 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
7228 MachineFunction &MF = B.getMF();
7229 const LLT S64 = LLT::scalar(64);
7230
7231 Register SGPR01(AMDGPU::SGPR0_SGPR1);
7232 // For code object version 5, queue_ptr is passed through implicit kernarg.
7233 if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
7234 AMDGPU::AMDHSA_COV5) {
7235 AMDGPUTargetLowering::ImplicitParameter Param =
7236 AMDGPUTargetLowering::QUEUE_PTR;
7237 uint64_t Offset =
7238 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
7239
7240 Register KernargPtrReg = MRI.createGenericVirtualRegister(
7241 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
7242
7243 if (!loadInputValue(KernargPtrReg, B,
7244 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
7245 return false;
7246
7247 // TODO: can we be smarter about machine pointer info?
7248 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
7249 MachineMemOperand *MMO = MF.getMachineMemOperand(
7250 PtrInfo,
7251 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
7252 MachineMemOperand::MOInvariant,
7253 LLT::scalar(64), commonAlignment(Align(64), Offset));
7254
7255 // Pointer address
7256 Register LoadAddr = MRI.createGenericVirtualRegister(
7258 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
7259 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
7260 // Load address
7261 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
7262 B.buildCopy(SGPR01, Temp);
7263 B.buildInstr(AMDGPU::S_TRAP)
7264 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7265 .addReg(SGPR01, RegState::Implicit);
7266 MI.eraseFromParent();
7267 return true;
7268 }
7269
7270 // Pass queue pointer to trap handler as input, and insert trap instruction
7271 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
7272 Register LiveIn =
7273 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
7274 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
7275 return false;
7276
7277 B.buildCopy(SGPR01, LiveIn);
7278 B.buildInstr(AMDGPU::S_TRAP)
7279 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7280 .addReg(SGPR01, RegState::Implicit);
7281
7282 MI.eraseFromParent();
7283 return true;
7284}
7285
7286 bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
7287 MachineRegisterInfo &MRI,
7288 MachineIRBuilder &B) const {
7289 // We need to simulate the 's_trap 2' instruction on targets that run in
7290 // PRIV=1 (where it is treated as a nop).
7291 if (ST.hasPrivEnabledTrap2NopBug()) {
7292 ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
7293 MI.getDebugLoc());
7294 MI.eraseFromParent();
7295 return true;
7296 }
7297
7298 B.buildInstr(AMDGPU::S_TRAP)
7299 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
7300 MI.eraseFromParent();
7301 return true;
7302}
7303
7304 bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
7305 MachineRegisterInfo &MRI,
7306 MachineIRBuilder &B) const {
7307 // If this is a non-HSA path, or the trap handler is disabled, report a
7308 // warning accordingly.
7309 if (!ST.isTrapHandlerEnabled() ||
7310 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7311 Function &Fn = B.getMF().getFunction();
7313 Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));
7314 } else {
7315 // Insert debug-trap instruction
7316 B.buildInstr(AMDGPU::S_TRAP)
7317 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
7318 }
7319
7320 MI.eraseFromParent();
7321 return true;
7322}
7323
7324 bool AMDGPULegalizerInfo::legalizeBVHIntersectRayIntrinsic(
7325 MachineInstr &MI, MachineIRBuilder &B) const {
7326 MachineRegisterInfo &MRI = *B.getMRI();
7327 const LLT S16 = LLT::scalar(16);
7328 const LLT S32 = LLT::scalar(32);
7329 const LLT V2S16 = LLT::fixed_vector(2, 16);
7330 const LLT V3S32 = LLT::fixed_vector(3, 32);
7331
7332 Register DstReg = MI.getOperand(0).getReg();
7333 Register NodePtr = MI.getOperand(2).getReg();
7334 Register RayExtent = MI.getOperand(3).getReg();
7335 Register RayOrigin = MI.getOperand(4).getReg();
7336 Register RayDir = MI.getOperand(5).getReg();
7337 Register RayInvDir = MI.getOperand(6).getReg();
7338 Register TDescr = MI.getOperand(7).getReg();
7339
7340 if (!ST.hasGFX10_AEncoding()) {
7341 Function &Fn = B.getMF().getFunction();
7343 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7344 return false;
7345 }
7346
7347 const bool IsGFX11 = AMDGPU::isGFX11(ST);
7348 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
7349 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
7350 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
7351 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
7352 const unsigned NumVDataDwords = 4;
7353 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7354 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7355 const bool UseNSA =
7356 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
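// Illustrative note (not part of the upstream source): NumVAddrDwords counts
// the flat dword layout of the address operands, e.g. a 64-bit node pointer
// with 16-bit ray directions needs 2 (node) + 1 (extent) + 3 (origin) +
// 3 (packed dir/inv_dir) = 9 dwords, while the full 32-bit form needs
// 2 + 1 + 3 + 3 + 3 = 12 dwords.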
7357
7358 const unsigned BaseOpcodes[2][2] = {
7359 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7360 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7361 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7362 int Opcode;
7363 if (UseNSA) {
7364 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7365 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7366 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7367 : AMDGPU::MIMGEncGfx10NSA,
7368 NumVDataDwords, NumVAddrDwords);
7369 } else {
7370 assert(!IsGFX12Plus);
7371 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7372 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7373 : AMDGPU::MIMGEncGfx10Default,
7374 NumVDataDwords, NumVAddrDwords);
7375 }
7376 assert(Opcode != -1);
7377
7378 SmallVector<Register, 12> Ops;
7379 if (UseNSA && IsGFX11Plus) {
7380 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
7381 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7382 auto Merged = B.buildMergeLikeInstr(
7383 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7384 Ops.push_back(Merged.getReg(0));
7385 };
7386
7387 Ops.push_back(NodePtr);
7388 Ops.push_back(RayExtent);
7389 packLanes(RayOrigin);
7390
7391 if (IsA16) {
7392 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7393 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7394 auto MergedDir = B.buildMergeLikeInstr(
7395 V3S32,
7396 {B.buildBitcast(
7397 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
7398 UnmergeRayDir.getReg(0)}))
7399 .getReg(0),
7400 B.buildBitcast(
7401 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
7402 UnmergeRayDir.getReg(1)}))
7403 .getReg(0),
7404 B.buildBitcast(
7405 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
7406 UnmergeRayDir.getReg(2)}))
7407 .getReg(0)});
7408 Ops.push_back(MergedDir.getReg(0));
7409 } else {
7410 packLanes(RayDir);
7411 packLanes(RayInvDir);
7412 }
7413 } else {
7414 if (Is64) {
7415 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
7416 Ops.push_back(Unmerge.getReg(0));
7417 Ops.push_back(Unmerge.getReg(1));
7418 } else {
7419 Ops.push_back(NodePtr);
7420 }
7421 Ops.push_back(RayExtent);
7422
7423 auto packLanes = [&Ops, &S32, &B](Register Src) {
7424 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7425 Ops.push_back(Unmerge.getReg(0));
7426 Ops.push_back(Unmerge.getReg(1));
7427 Ops.push_back(Unmerge.getReg(2));
7428 };
7429
7430 packLanes(RayOrigin);
7431 if (IsA16) {
7432 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7433 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7434 Register R1 = MRI.createGenericVirtualRegister(S32);
7435 Register R2 = MRI.createGenericVirtualRegister(S32);
7436 Register R3 = MRI.createGenericVirtualRegister(S32);
7437 B.buildMergeLikeInstr(R1,
7438 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7439 B.buildMergeLikeInstr(
7440 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7441 B.buildMergeLikeInstr(
7442 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7443 Ops.push_back(R1);
7444 Ops.push_back(R2);
7445 Ops.push_back(R3);
7446 } else {
7447 packLanes(RayDir);
7448 packLanes(RayInvDir);
7449 }
7450 }
7451
7452 if (!UseNSA) {
7453 // Build a single vector containing all the operands so far prepared.
7454 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
7455 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
7456 Ops.clear();
7457 Ops.push_back(MergedOps);
7458 }
7459
7460 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7461 .addDef(DstReg)
7462 .addImm(Opcode);
7463
7464 for (Register R : Ops) {
7465 MIB.addUse(R);
7466 }
7467
7468 MIB.addUse(TDescr)
7469 .addImm(IsA16 ? 1 : 0)
7470 .cloneMemRefs(MI);
7471
7472 MI.eraseFromParent();
7473 return true;
7474}
7475
7476 bool AMDGPULegalizerInfo::legalizeBVHDualOrBVH8IntersectRayIntrinsic(
7477 MachineInstr &MI, MachineIRBuilder &B) const {
7478 const LLT S32 = LLT::scalar(32);
7479 const LLT V2S32 = LLT::fixed_vector(2, 32);
7480
7481 Register DstReg = MI.getOperand(0).getReg();
7482 Register DstOrigin = MI.getOperand(1).getReg();
7483 Register DstDir = MI.getOperand(2).getReg();
7484 Register NodePtr = MI.getOperand(4).getReg();
7485 Register RayExtent = MI.getOperand(5).getReg();
7486 Register InstanceMask = MI.getOperand(6).getReg();
7487 Register RayOrigin = MI.getOperand(7).getReg();
7488 Register RayDir = MI.getOperand(8).getReg();
7489 Register Offsets = MI.getOperand(9).getReg();
7490 Register TDescr = MI.getOperand(10).getReg();
7491
7492 if (!ST.hasBVHDualAndBVH8Insts()) {
7493 Function &Fn = B.getMF().getFunction();
7495 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7496 return false;
7497 }
7498
7499 bool IsBVH8 = cast<GIntrinsic>(MI).getIntrinsicID() ==
7500 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7501 const unsigned NumVDataDwords = 10;
7502 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7503 int Opcode = AMDGPU::getMIMGOpcode(
7504 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7505 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7506 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
7507 assert(Opcode != -1);
7508
7509 auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
7510 V2S32, {RayExtent, B.buildAnyExt(S32, InstanceMask)});
7511
7512 B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7513 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7514 .addDef(DstReg)
7515 .addDef(DstOrigin)
7516 .addDef(DstDir)
7517 .addImm(Opcode)
7518 .addUse(NodePtr)
7519 .addUse(RayExtentInstanceMaskVec.getReg(0))
7520 .addUse(RayOrigin)
7521 .addUse(RayDir)
7522 .addUse(Offsets)
7523 .addUse(TDescr)
7524 .cloneMemRefs(MI);
7525
7526 MI.eraseFromParent();
7527 return true;
7528}
7529
7530 bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
7531 MachineIRBuilder &B) const {
7532 const SITargetLowering *TLI = ST.getTargetLowering();
7533 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
7534 Register DstReg = MI.getOperand(0).getReg();
7535 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7536 MI.eraseFromParent();
7537 return true;
7538}
7539
7540 bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
7541 MachineIRBuilder &B) const {
7542 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
7543 if (!ST.hasArchitectedSGPRs())
7544 return false;
7545 LLT S32 = LLT::scalar(32);
7546 Register DstReg = MI.getOperand(0).getReg();
7547 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
7548 auto LSB = B.buildConstant(S32, 25);
7549 auto Width = B.buildConstant(S32, 5);
7550 B.buildUbfx(DstReg, TTMP8, LSB, Width);
7551 MI.eraseFromParent();
7552 return true;
7553}
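// Illustrative note (not part of the upstream source): the ubfx above computes
// (TTMP8 >> 25) & 0x1f, i.e. it extracts the 5-bit wave-in-workgroup ID from
// TTMP8[29:25].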
7554
7555 bool AMDGPULegalizerInfo::legalizeConstHwRegRead(MachineInstr &MI,
7556 MachineIRBuilder &B,
7557 AMDGPU::Hwreg::Id HwReg,
7558 unsigned LowBit,
7559 unsigned Width) const {
7560 MachineRegisterInfo &MRI = *B.getMRI();
7561 Register DstReg = MI.getOperand(0).getReg();
7562 if (!MRI.getRegClassOrNull(DstReg))
7563 MRI.setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
7564 B.buildInstr(AMDGPU::S_GETREG_B32_const)
7565 .addDef(DstReg)
7566 .addImm(AMDGPU::Hwreg::HwregEncoding::encode(HwReg, LowBit, Width));
7567 MI.eraseFromParent();
7568 return true;
7569}
7570
7571static constexpr unsigned FPEnvModeBitField =
7573
7574static constexpr unsigned FPEnvTrapBitField =
7576
7577 bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
7578 MachineRegisterInfo &MRI,
7579 MachineIRBuilder &B) const {
7580 Register Src = MI.getOperand(0).getReg();
7581 if (MRI.getType(Src) != S64)
7582 return false;
7583
7584 auto ModeReg =
7585 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7586 /*HasSideEffects=*/true, /*isConvergent=*/false)
7587 .addImm(FPEnvModeBitField);
7588 auto TrapReg =
7589 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7590 /*HasSideEffects=*/true, /*isConvergent=*/false)
7591 .addImm(FPEnvTrapBitField);
7592 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
7593 MI.eraseFromParent();
7594 return true;
7595}
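// Illustrative note (not part of the upstream source): the merge above packs
// the two 32-bit getreg results into the s64 destination, with the mode
// register read in the low half and the trap-status read in the high half.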
7596
7597 bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
7598 MachineRegisterInfo &MRI,
7599 MachineIRBuilder &B) const {
7600 Register Src = MI.getOperand(0).getReg();
7601 if (MRI.getType(Src) != S64)
7602 return false;
7603
7604 auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
7605 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7606 /*HasSideEffects=*/true, /*isConvergent=*/false)
7607 .addImm(static_cast<int16_t>(FPEnvModeBitField))
7608 .addReg(Unmerge.getReg(0));
7609 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7610 /*HasSideEffects=*/true, /*isConvergent=*/false)
7611 .addImm(static_cast<int16_t>(FPEnvTrapBitField))
7612 .addReg(Unmerge.getReg(1));
7613 MI.eraseFromParent();
7614 return true;
7615}
7616
7617 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
7618 MachineInstr &MI) const {
7619 MachineIRBuilder &B = Helper.MIRBuilder;
7620 MachineRegisterInfo &MRI = *B.getMRI();
7621
7622 // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
7623 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
7624 switch (IntrID) {
7625 case Intrinsic::amdgcn_if:
7626 case Intrinsic::amdgcn_else: {
7627 MachineInstr *Br = nullptr;
7628 MachineBasicBlock *UncondBrTarget = nullptr;
7629 bool Negated = false;
7630 if (MachineInstr *BrCond =
7631 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7632 const SIRegisterInfo *TRI
7633 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7634
7635 Register Def = MI.getOperand(1).getReg();
7636 Register Use = MI.getOperand(3).getReg();
7637
7638 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7639
7640 if (Negated)
7641 std::swap(CondBrTarget, UncondBrTarget);
7642
7643 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7644 if (IntrID == Intrinsic::amdgcn_if) {
7645 B.buildInstr(AMDGPU::SI_IF)
7646 .addDef(Def)
7647 .addUse(Use)
7648 .addMBB(UncondBrTarget);
7649 } else {
7650 B.buildInstr(AMDGPU::SI_ELSE)
7651 .addDef(Def)
7652 .addUse(Use)
7653 .addMBB(UncondBrTarget);
7654 }
7655
7656 if (Br) {
7657 Br->getOperand(0).setMBB(CondBrTarget);
7658 } else {
7659 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
7660 // since we're swapping branch targets it needs to be reinserted.
7661 // FIXME: IRTranslator should probably not do this
7662 B.buildBr(*CondBrTarget);
7663 }
7664
7665 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
7666 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
7667 MI.eraseFromParent();
7668 BrCond->eraseFromParent();
7669 return true;
7670 }
7671
7672 return false;
7673 }
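// Illustrative note (not part of the upstream source): SI_IF and SI_ELSE
// update EXEC from the boolean condition and branch to their MBB operand when
// no lanes remain active, which is why the conditional and unconditional
// branch targets are swapped above when the original G_BRCOND was negated.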
7674 case Intrinsic::amdgcn_loop: {
7675 MachineInstr *Br = nullptr;
7676 MachineBasicBlock *UncondBrTarget = nullptr;
7677 bool Negated = false;
7678 if (MachineInstr *BrCond =
7679 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7680 const SIRegisterInfo *TRI
7681 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7682
7683 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7684 Register Reg = MI.getOperand(2).getReg();
7685
7686 if (Negated)
7687 std::swap(CondBrTarget, UncondBrTarget);
7688
7689 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7690 B.buildInstr(AMDGPU::SI_LOOP)
7691 .addUse(Reg)
7692 .addMBB(UncondBrTarget);
7693
7694 if (Br)
7695 Br->getOperand(0).setMBB(CondBrTarget);
7696 else
7697 B.buildBr(*CondBrTarget);
7698
7699 MI.eraseFromParent();
7700 BrCond->eraseFromParent();
7701 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
7702 return true;
7703 }
7704
7705 return false;
7706 }
7707 case Intrinsic::amdgcn_addrspacecast_nonnull:
7708 return legalizeAddrSpaceCast(MI, MRI, B);
7709 case Intrinsic::amdgcn_make_buffer_rsrc:
7710 return legalizePointerAsRsrcIntrin(MI, MRI, B);
7711 case Intrinsic::amdgcn_kernarg_segment_ptr:
7712 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
7713 // This only makes sense to call in a kernel, so just lower to null.
7714 B.buildConstant(MI.getOperand(0).getReg(), 0);
7715 MI.eraseFromParent();
7716 return true;
7717 }
7718
7719 return legalizePreloadedArgIntrin(
7720 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
7721 case Intrinsic::amdgcn_implicitarg_ptr:
7722 return legalizeImplicitArgPtr(MI, MRI, B);
7723 case Intrinsic::amdgcn_workitem_id_x:
7724 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
7725 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
7726 case Intrinsic::amdgcn_workitem_id_y:
7727 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
7728 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
7729 case Intrinsic::amdgcn_workitem_id_z:
7730 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
7731 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
7732 case Intrinsic::amdgcn_workgroup_id_x:
7733 return legalizeWorkGroupId(
7737 case Intrinsic::amdgcn_workgroup_id_y:
7738 return legalizeWorkGroupId(
7742 case Intrinsic::amdgcn_workgroup_id_z:
7743 return legalizeWorkGroupId(
7747 case Intrinsic::amdgcn_cluster_id_x:
7748 return ST.hasClusters() &&
7751 case Intrinsic::amdgcn_cluster_id_y:
7752 return ST.hasClusters() &&
7755 case Intrinsic::amdgcn_cluster_id_z:
7756 return ST.hasClusters() &&
7759 case Intrinsic::amdgcn_cluster_workgroup_id_x:
7760 return ST.hasClusters() &&
7763 case Intrinsic::amdgcn_cluster_workgroup_id_y:
7764 return ST.hasClusters() &&
7767 case Intrinsic::amdgcn_cluster_workgroup_id_z:
7768 return ST.hasClusters() &&
7771 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
7772 return ST.hasClusters() &&
7774 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
7775 return ST.hasClusters() &&
7778 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
7779 return ST.hasClusters() &&
7782 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
7783 return ST.hasClusters() &&
7786 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
7787 return ST.hasClusters() &&
7789 MI, MRI, B,
7791 case Intrinsic::amdgcn_wave_id:
7792 return legalizeWaveID(MI, B);
7793 case Intrinsic::amdgcn_lds_kernel_id:
7794 return legalizePreloadedArgIntrin(MI, MRI, B,
7795 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
7796 case Intrinsic::amdgcn_dispatch_ptr:
7797 return legalizePreloadedArgIntrin(MI, MRI, B,
7798 AMDGPUFunctionArgInfo::DISPATCH_PTR);
7799 case Intrinsic::amdgcn_queue_ptr:
7800 return legalizePreloadedArgIntrin(MI, MRI, B,
7801 AMDGPUFunctionArgInfo::QUEUE_PTR);
7802 case Intrinsic::amdgcn_implicit_buffer_ptr:
7803 return legalizePreloadedArgIntrin(
7804 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
7805 case Intrinsic::amdgcn_dispatch_id:
7806 return legalizePreloadedArgIntrin(MI, MRI, B,
7807 AMDGPUFunctionArgInfo::DISPATCH_ID);
7808 case Intrinsic::r600_read_ngroups_x:
7809 // TODO: Emit error for hsa
7812 case Intrinsic::r600_read_ngroups_y:
7815 case Intrinsic::r600_read_ngroups_z:
7818 case Intrinsic::r600_read_local_size_x:
7819 // TODO: Could insert G_ASSERT_ZEXT from s16
7821 case Intrinsic::r600_read_local_size_y:
7822 // TODO: Could insert G_ASSERT_ZEXT from s16
7824 // TODO: Could insert G_ASSERT_ZEXT from s16
7825 case Intrinsic::r600_read_local_size_z:
7828 case Intrinsic::amdgcn_fdiv_fast:
7829 return legalizeFDIVFastIntrin(MI, MRI, B);
7830 case Intrinsic::amdgcn_is_shared:
7831 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
7832 case Intrinsic::amdgcn_is_private:
7833 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
7834 case Intrinsic::amdgcn_wavefrontsize: {
7835 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
7836 MI.eraseFromParent();
7837 return true;
7838 }
7839 case Intrinsic::amdgcn_s_buffer_load:
7840 return legalizeSBufferLoad(Helper, MI);
7841 case Intrinsic::amdgcn_raw_buffer_store:
7842 case Intrinsic::amdgcn_raw_ptr_buffer_store:
7843 case Intrinsic::amdgcn_struct_buffer_store:
7844 case Intrinsic::amdgcn_struct_ptr_buffer_store:
7845 return legalizeBufferStore(MI, Helper, false, false);
7846 case Intrinsic::amdgcn_raw_buffer_store_format:
7847 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
7848 case Intrinsic::amdgcn_struct_buffer_store_format:
7849 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
7850 return legalizeBufferStore(MI, Helper, false, true);
7851 case Intrinsic::amdgcn_raw_tbuffer_store:
7852 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
7853 case Intrinsic::amdgcn_struct_tbuffer_store:
7854 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
7855 return legalizeBufferStore(MI, Helper, true, true);
7856 case Intrinsic::amdgcn_raw_buffer_load:
7857 case Intrinsic::amdgcn_raw_ptr_buffer_load:
7858 case Intrinsic::amdgcn_raw_atomic_buffer_load:
7859 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
7860 case Intrinsic::amdgcn_struct_buffer_load:
7861 case Intrinsic::amdgcn_struct_ptr_buffer_load:
7862 case Intrinsic::amdgcn_struct_atomic_buffer_load:
7863 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
7864 return legalizeBufferLoad(MI, Helper, false, false);
7865 case Intrinsic::amdgcn_raw_buffer_load_format:
7866 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
7867 case Intrinsic::amdgcn_struct_buffer_load_format:
7868 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
7869 return legalizeBufferLoad(MI, Helper, true, false);
7870 case Intrinsic::amdgcn_raw_tbuffer_load:
7871 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
7872 case Intrinsic::amdgcn_struct_tbuffer_load:
7873 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
7874 return legalizeBufferLoad(MI, Helper, true, true);
7875 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
7876 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
7877 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
7878 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
7879 case Intrinsic::amdgcn_raw_buffer_atomic_add:
7880 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
7881 case Intrinsic::amdgcn_struct_buffer_atomic_add:
7882 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
7883 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
7884 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
7885 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
7886 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
7887 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
7888 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
7889 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
7890 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
7891 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
7892 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
7893 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
7894 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
7895 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
7896 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
7897 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
7898 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
7899 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
7900 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
7901 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
7902 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
7903 case Intrinsic::amdgcn_raw_buffer_atomic_and:
7904 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
7905 case Intrinsic::amdgcn_struct_buffer_atomic_and:
7906 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
7907 case Intrinsic::amdgcn_raw_buffer_atomic_or:
7908 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
7909 case Intrinsic::amdgcn_struct_buffer_atomic_or:
7910 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
7911 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
7912 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
7913 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
7914 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
7915 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
7916 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
7917 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
7918 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
7919 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
7920 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
7921 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
7922 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
7923 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
7924 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
7925 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
7926 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
7927 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
7928 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
7929 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
7930 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
7931 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
7932 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
7933 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
7934 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
7935 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
7936 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
7937 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
7938 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
7939 return legalizeBufferAtomic(MI, B, IntrID);
7940 case Intrinsic::amdgcn_rsq_clamp:
7941 return legalizeRsqClampIntrinsic(MI, MRI, B);
7942 case Intrinsic::amdgcn_image_bvh_intersect_ray:
7943 return legalizeBVHIntersectRayIntrinsic(MI, B);
7944 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
7945 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
7946 return legalizeBVHDualOrBVH8IntersectRayIntrinsic(MI, B);
7947 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
7948 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
7949 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
7950 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
7951 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
7952 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
7953 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
7954 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
7955 Register Index = MI.getOperand(5).getReg();
7956 LLT S64 = LLT::scalar(64);
7957 if (MRI.getType(Index) != S64)
7958 MI.getOperand(5).setReg(B.buildAnyExt(S64, Index).getReg(0));
7959 return true;
7960 }
7961 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
7962 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
7963 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
7964 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
7965 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
7966 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
7967 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
7968 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
7969 Register Index = MI.getOperand(5).getReg();
7970 LLT S32 = LLT::scalar(32);
7971 if (MRI.getType(Index) != S32)
7972 MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
7973 return true;
7974 }
7975 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
7976 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
7977 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
7978 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
7979 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
7980 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
7981 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
7982 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
7983 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
7984 Register Index = MI.getOperand(7).getReg();
7985 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
7986 ? LLT::scalar(64)
7987 : LLT::scalar(32);
7988 if (MRI.getType(Index) != IdxTy)
7989 MI.getOperand(7).setReg(B.buildAnyExt(IdxTy, Index).getReg(0));
7990 return true;
7991 }
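// Illustrative note (not part of the upstream source): these swmmac variants
// differ only in the width expected for the sparsity index operand; the
// 16x16x128_iu8 form takes an s64 index (any-extended above if needed), while
// the other iu variants in this group expect s32.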
7992
7993 case Intrinsic::amdgcn_fmed3: {
7994 GISelChangeObserver &Observer = Helper.Observer;
7995
7996 // FIXME: This is to work around the inability of tablegen match combiners to
7997 // match intrinsics in patterns.
7998 Observer.changingInstr(MI);
7999 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
8000 MI.removeOperand(1);
8001 Observer.changedInstr(MI);
8002 return true;
8003 }
8004 case Intrinsic::amdgcn_readlane:
8005 case Intrinsic::amdgcn_writelane:
8006 case Intrinsic::amdgcn_readfirstlane:
8007 case Intrinsic::amdgcn_permlane16:
8008 case Intrinsic::amdgcn_permlanex16:
8009 case Intrinsic::amdgcn_permlane64:
8010 case Intrinsic::amdgcn_set_inactive:
8011 case Intrinsic::amdgcn_set_inactive_chain_arg:
8012 case Intrinsic::amdgcn_mov_dpp8:
8013 case Intrinsic::amdgcn_update_dpp:
8014 return legalizeLaneOp(Helper, MI, IntrID);
8015 case Intrinsic::amdgcn_s_buffer_prefetch_data:
8016 return legalizeSBufferPrefetch(Helper, MI);
8017 case Intrinsic::amdgcn_dead: {
8018 // TODO: Use poison instead of undef
8019 for (const MachineOperand &Def : MI.defs())
8020 B.buildUndef(Def);
8021 MI.eraseFromParent();
8022 return true;
8023 }
8024 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
8025 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
8026 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
8027 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8028 B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
8029 MI.eraseFromParent();
8030 return true;
8031 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
8032 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
8033 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
8034 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8035 B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
8036 MI.eraseFromParent();
8037 return true;
8038 default: {
8039 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8040 AMDGPU::getImageDimIntrinsicInfo(IntrID))
8041 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
8042 return true;
8043 }
8044 }
8045
8046 return true;
8047}
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
static LLT getBufferRsrcScalarType(const LLT Ty)
static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST, unsigned TypeIdx)
static cl::opt< bool > EnableNewLegality("amdgpu-global-isel-new-legality", cl::desc("Use GlobalISel desired legality, rather than try to use" "rules compatible with selection patterns"), cl::init(false), cl::ReallyHidden)
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, unsigned Flags)
constexpr std::initializer_list< LLT > AllVectors
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
static LegalityPredicate isSmallOddVector(unsigned TypeIdx)
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
constexpr LLT F64
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size)
constexpr LLT V2S8
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags)
constexpr LLT V4S128
constexpr LLT S16
constexpr LLT S1
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const LLT MemTy)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
constexpr LLT S1024
static constexpr unsigned FPEnvModeBitField
constexpr LLT V7S64
static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx)
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
static bool isRegisterVectorElementType(LLT EltTy)
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
static LegalityPredicate isWideVec16(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllScalarTypes
static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx)
constexpr LLT V2S16
constexpr LLT V8S16
constexpr LLT V9S32
constexpr std::initializer_list< LLT > AllS32Vectors
constexpr LLT S224
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx)
constexpr LLT S512
constexpr LLT MaxScalar
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B)
Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is the form in which the valu...
constexpr LLT V11S32
static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty)
constexpr LLT V6S64
constexpr LLT V2S64
static std::pair< Register, Register > emitReciprocalU64(MachineIRBuilder &B, Register Val)
static LLT getBitcastRegisterType(const LLT Ty)
static LLT getBufferRsrcRegisterType(const LLT Ty)
constexpr LLT S32
constexpr LLT V2F16
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
constexpr LLT V8S32
constexpr LLT V2BF16
constexpr LLT S192
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, MachineRegisterInfo &MRI, unsigned Idx)
Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial type of the operand idx an...
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C)
static constexpr unsigned SPDenormModeBitField
constexpr LLT F32
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, bool IsLoad, bool IsAtomic)
constexpr LLT V6S32
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
constexpr LLT S160
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
constexpr LLT V4S16
constexpr LLT V2S128
constexpr LLT V10S16
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
constexpr LLT V4S32
constexpr LLT V3S32
constexpr LLT V6S16
constexpr std::initializer_list< LLT > AllS64Vectors
constexpr LLT S256
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
constexpr LLT V4S64
static constexpr unsigned FPEnvTrapBitField
constexpr LLT V10S32
constexpr LLT V16S32
static constexpr unsigned MaxRegisterSize
constexpr LLT V7S32
constexpr LLT S96
constexpr LLT V12S16
constexpr LLT V16S64
static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size)
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static bool hasBufferRsrcWorkaround(const LLT Ty)
constexpr LLT V32S32
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
constexpr LLT S64
constexpr std::initializer_list< LLT > AllS16Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
constexpr LLT V16S16
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
constexpr LLT V5S32
constexpr LLT V5S64
constexpr LLT V3S64
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
constexpr LLT V8S64
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
constexpr LLT V2S32
static bool isRegisterVectorType(LLT Ty)
constexpr LLT V12S32
constexpr LLT S128
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
constexpr LLT S8
static bool isRegisterType(const GCNSubtarget &ST, LLT Ty)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static Error unsupported(const char *Str, const Triple &T)
Definition MachO.cpp:71
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
@ Enable
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
Interface for Targets to specify which operations they can successfully select and how the others sho...
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
#define R2(n)
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
ppc ctr loops verify
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
#define CH(x, y, z)
Definition SHA256.cpp:34
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1247
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static constexpr int Concat[]
bool legalizeConstHwRegRead(MachineInstr &MI, MachineIRBuilder &B, AMDGPU::Hwreg::Id HwReg, unsigned LowBit, unsigned Width) const
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeBVHIntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferStore(MachineInstr &MI, LegalizerHelper &Helper, bool IsTyped, bool IsFormat) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSBufferPrefetch(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBVHDualOrBVH8IntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferLoad(MachineInstr &MI, LegalizerHelper &Helper, bool IsFormat, bool IsTyped) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkGroupId(MachineInstr &MI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, bool IsFormat) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
const std::array< unsigned, 3 > & getDims() const
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1158
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1138
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1098
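The three APFloat factories above are the usual way to obtain boundary values for a given floating-point semantics. A minimal sketch, assuming IEEE single precision for the values shown in the comments:

#include "llvm/ADT/APFloat.h"
using namespace llvm;

void apFloatBoundsExample() {
  const fltSemantics &Sem = APFloat::IEEEsingle();
  APFloat PosInf  = APFloat::getInf(Sem);                         // +infinity
  APFloat NegMax  = APFloat::getLargest(Sem, /*Negative=*/true);  // -3.40282347e+38
  APFloat MinNorm = APFloat::getSmallestNormalized(Sem);          // 1.17549435e-38
  (void)PosInf; (void)NegMax; (void)MinNorm;
}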
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
@ ICMP_SLT
signed less than
Definition InstrTypes.h:707
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:684
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:682
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:702
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:705
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:686
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:703
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:685
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:687
@ ICMP_NE
not equal
Definition InstrTypes.h:700
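These predicate enumerators are what the G_ICMP/G_FCMP builders consume. A hedged sketch: buildIsNegative is a hypothetical helper (not part of this file), and the builder, register, and type are assumed to be supplied by the caller.

#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/InstrTypes.h"
using namespace llvm;

// Hypothetical helper: materialize an s1 value that is true when Val < 0,
// showing how an ICMP_* predicate is passed to the MIR builder.
static Register buildIsNegative(MachineIRBuilder &B, Register Val, LLT Ty) {
  const LLT S1 = LLT::scalar(1);
  auto Zero = B.buildConstant(Ty, 0);
  return B.buildICmp(CmpInst::ICMP_SLT, S1, Val, Zero).getReg(0);
}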
This is the shared class of boolean and integer constants.
Definition Constants.h:87
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:169
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
Simple wrapper observer that takes several observers, and calls each one for each event.
KnownBits getKnownBits(Register R)
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
static constexpr LLT float64()
Get a 64-bit IEEE double value.
constexpr unsigned getScalarSizeInBits() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
static constexpr LLT float16()
Get a 16-bit IEEE half value.
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr LLT getScalarType() const
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
static constexpr LLT float32()
Get a 32-bit IEEE float value.
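Taken together, the LLT factories and change* helpers above are how scalar, vector, and floating-point machine types are spelled. A small, target-independent sketch (the include path is the one used by recent LLVM):

#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;

void lltExamples() {
  const LLT S32   = LLT::scalar(32);                // plain 32-bit "bag of bits"
  const LLT F16   = LLT::float16();                 // 16-bit IEEE half
  const LLT V2S16 = LLT::fixed_vector(2, 16);       // <2 x s16>
  const LLT V2S32 = V2S16.changeElementSize(32);    // <2 x s32>, same element count
  const LLT V3S16 = V2S16.changeElementCount(ElementCount::getFixed(3)); // <3 x s16>
  (void)S32; (void)F16; (void)V2S32; (void)V3S16;
}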
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI void computeTables()
Compute any ancillary tables needed to quickly decide how an operation should be handled.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & unsupported()
The instruction is unsupported.
LegalizeRuleSet & scalarSameSizeAs(unsigned TypeIdx, unsigned SameSizeIdx)
Change the type TypeIdx to have the same scalar size as type SameSizeIdx.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & bitcastIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
The specified type index is coerced if predicate is true.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & minScalarOrElt(unsigned TypeIdx, const LLT Ty)
Ensure the scalar or element is at least as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & unsupportedFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & lowerFor(std::initializer_list< LLT > Types)
The instruction is lowered when type index 0 is any type in the given list.
LegalizeRuleSet & lowerIf(LegalityPredicate Predicate)
The instruction is lowered if predicate is true.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & unsupportedIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Widen the scalar to the one selected by the mutation if the predicate is true.
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & clampNumElements(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the number of elements for the given vectors to at least MinTy's number of elements and at most...
LegalizeRuleSet & maxScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Conditionally limit the maximum size of the scalar.
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalForCartesianProduct(std::initializer_list< LLT > Types)
The instruction is legal when type indexes 0 and 1 are both in the given list.
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
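The LegalizeRuleSet methods above are meant to be chained from getActionDefinitionsBuilder inside a LegalizerInfo subclass constructor. The following is a hypothetical sketch only; ExampleLegalizerInfo, the opcode, the types, and the bounds are illustrative and are not the rules this file installs.

#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
using namespace llvm;

struct ExampleLegalizerInfo : public LegalizerInfo {
  ExampleLegalizerInfo() {
    using namespace TargetOpcode;
    const LLT S16 = LLT::scalar(16);
    const LLT S32 = LLT::scalar(32);
    const LLT S64 = LLT::scalar(64);
    getActionDefinitionsBuilder(G_ADD)
        .legalFor({S32, S64})      // leave s32/s64 adds alone
        .clampScalar(0, S16, S64)  // force the result type into [s16, s64]
        .widenScalarToNextPow2(0)  // then round odd widths up to a power of two
        .scalarize(0);             // break up any remaining vectors
    getLegacyLegalizerInfo().computeTables();
  }
};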
LLVM_ABI LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
LLVM_ABI void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
GISelValueTracking * getValueTracking() const
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
LLVM_ABI void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LLVM_ABI LegalizeResult lowerFMad(MachineInstr &MI)
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
LLVM_ABI void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
const LegacyLegalizerInfo & getLegacyLegalizerInfo() const
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:33
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition MCRegister.h:64
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.

const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with.
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:303
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:392
Wrapper class representing virtual and physical registers.
Definition Register.h:19
constexpr bool isValid() const
Definition Register.h:107
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:74
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void truncate(size_type N)
Like resize, but requires that N is less than size().
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.stacksave/llvm.stackrestore should save...
TargetOptions Options
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
self_iterator getIterator()
Definition ilist_node.h:134
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
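These enumerators are typically paired with LLT::pointer when describing per-address-space pointer types. A hedged fragment, assuming the AMDGPUAS enumerators above are in scope; the bit widths follow the usual AMDGPU convention (64-bit flat/global, 32-bit local/private) and are only illustrative here:

// Pointer LLTs keyed by address space (widths illustrative).
const LLT GlobalPtr  = LLT::pointer(AMDGPUAS::GLOBAL_ADDRESS, 64);
const LLT FlatPtr    = LLT::pointer(AMDGPUAS::FLAT_ADDRESS, 64);
const LLT LocalPtr   = LLT::pointer(AMDGPUAS::LOCAL_ADDRESS, 32);
const LLT PrivatePtr = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);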
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isGFX11Plus(const MCSubtargetInfo &STI)
bool isGFX1250(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
LLVM_ABI LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LLVM_ABI LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LLVM_ABI LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LLVM_ABI LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than second type index.
LLVM_ABI LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than second type index.
LLVM_ABI LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LLVM_ABI LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LLVM_ABI LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the...
LLVM_ABI LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size)
True if the total bitwidth of the specified type index is Size bits.
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LLVM_ABI LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LLVM_ABI LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LLVM_ABI LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LLVM_ABI LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LLVM_ABI LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
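Predicates and mutations pair up inside conditional rules: the predicate decides whether a rule fires, and the mutation picks the type to move to. A hypothetical fragment in the same constructor context as the earlier LegalizeRuleSet sketch, with the LegalityPredicates and LegalizeMutations namespaces assumed in scope; the opcode and sizes are placeholders.

const LLT S32 = LLT::scalar(32);
getActionDefinitionsBuilder(TargetOpcode::G_AND)
    // Widen anything narrower than 32 bits up to s32.
    .widenScalarIf(scalarNarrowerThan(0, 32), changeTo(0, S32))
    // Split vectors whose elements are wider than 64 bits into scalars.
    .fewerElementsIf(all(isVector(0), scalarOrEltWiderThan(0, 64)),
                     LegalizeMutations::scalarize(0));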
@ Implicit
Not emitted register (e.g. carry, or temporary result).
Invariant opcodes: All instruction sets have these as their low opcodes.
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
Definition MathExtras.h:54
constexpr double ln2
Definition MathExtras.h:49
constexpr double ln10
Definition MathExtras.h:50
constexpr float log2ef
Definition MathExtras.h:66
constexpr double log2e
Definition MathExtras.h:51
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition Utils.cpp:916
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:355
@ Offset
Definition DWP.cpp:477
LLVM_ABI Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
Definition Utils.cpp:2033
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
Definition Utils.cpp:651
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:307
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition Utils.cpp:459
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition ScopeExit.h:59
LLVM_ABI const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:270
void * PointerTy
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:293
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:396
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT that fits in int64_t, returns it.
Definition Utils.cpp:314
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition bit.h:157
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
std::function< bool(const LegalityQuery &)> LegalityPredicate
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405
To bit_cast(const From &from) noexcept
Definition bit.h:90
@ Mul
Product of integers.
@ FMul
Product of floats.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:155
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
Definition Utils.cpp:1720
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1879
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:212
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:208
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:280
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:384
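The bit-math helpers referenced above (Log2_32_Ceil, PowerOf2Ceil, NextPowerOf2, divideCeil, isPowerOf2_32) come up constantly when rounding sizes and element counts. A small sketch, with the expected results noted in comments:

#include "llvm/Support/MathExtras.h"
using namespace llvm;

void mathExtrasExamples() {
  unsigned CeilLog = Log2_32_Ceil(24);   // 5, since 2^5 = 32 >= 24
  uint64_t RoundUp = PowerOf2Ceil(24);   // 32, next power of two >= 24
  uint64_t Strict  = NextPowerOf2(32);   // 64, strictly greater than 32
  uint64_t Chunks  = divideCeil(70, 32); // 3, ceil(70 / 32)
  bool Pow2        = isPowerOf2_32(96);  // false (96 = 32 * 3)
  (void)CeilLog; (void)RoundUp; (void)Strict; (void)Chunks; (void)Pow2;
}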
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853
#define N
static constexpr uint64_t encode(Fields... Values)
static LLVM_ABI const fltSemantics & IEEEsingle() LLVM_READNONE
Definition APFloat.cpp:266
static LLVM_ABI const fltSemantics & IEEEdouble() LLVM_READNONE
Definition APFloat.cpp:267
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:85
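Align and its free helpers above (alignTo, commonAlignment, Log2) cover the usual alignment arithmetic. A minimal sketch; the values in the comments follow directly from the definitions above:

#include "llvm/Support/Alignment.h"
using namespace llvm;

void alignmentExamples() {
  Align A(16);
  uint64_t Padded = alignTo(100, A);        // 112: next multiple of 16 >= 100
  unsigned Shift  = Log2(A);                // 4, since 16 == 1 << 4
  Align Common    = commonAlignment(A, 8);  // Align(8): a 16-aligned base plus 8 is only 8-aligned
  (void)Padded; (void)Shift; (void)Common;
}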
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:80
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
ArrayRef< LLT > Types
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.