1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "AMDGPUTargetMachine.h"
19#include "GCNSubtarget.h"
22#include "SIRegisterInfo.h"
23#include "llvm/ADT/APInt.h"
25#include "llvm/ADT/Statistic.h"
39#include "llvm/IR/IRBuilder.h"
41#include "llvm/IR/IntrinsicsAMDGPU.h"
42#include "llvm/IR/IntrinsicsR600.h"
43#include "llvm/IR/MDBuilder.h"
46#include "llvm/Support/ModRef.h"
48#include <optional>
49
50using namespace llvm;
51using namespace llvm::SDPatternMatch;
52
53#define DEBUG_TYPE "si-lower"
54
55STATISTIC(NumTailCalls, "Number of tail calls");
56
57static cl::opt<bool>
58 DisableLoopAlignment("amdgpu-disable-loop-alignment",
59 cl::desc("Do not align and prefetch loops"),
60 cl::init(false));
61
63 "amdgpu-use-divergent-register-indexing", cl::Hidden,
64 cl::desc("Use indirect register addressing for divergent indexes"),
65 cl::init(false));
66
67// TODO: This option should be removed once we switch to always using PTRADD in
68// the SelectionDAG.
70 "amdgpu-use-sdag-ptradd", cl::Hidden,
71 cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the "
72 "SelectionDAG ISel"),
73 cl::init(false));
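// Illustrative use of the hidden flag defined above (exact invocation may vary):
//   llc -mtriple=amdgcn -amdgpu-use-sdag-ptradd=true input.ll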
74
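// The helper fragments below check whether the function's floating-point mode
// flushes f32 (respectively f64/f16) denormals, i.e. the mode is preserve-sign.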
77 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
78}
79
82 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
83}
84
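// Return the first 32-bit SGPR not yet allocated by the calling-convention
// state; reports llvm_unreachable if every SGPR is already taken.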
85static unsigned findFirstFreeSGPR(CCState &CCInfo) {
86 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
87 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
88 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
89 return AMDGPU::SGPR0 + Reg;
90 }
91 }
92 llvm_unreachable("Cannot allocate sgpr");
93}
94
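// The constructor below registers the register class backing each legal MVT
// and then declares, per opcode and type, whether the operation is Legal,
// Promoted, Expanded, or Custom-lowered on this subtarget.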
96 const GCNSubtarget &STI)
97 : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
98 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
99 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
100
101 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
102 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
103
104 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
105
106 const SIRegisterInfo *TRI = STI.getRegisterInfo();
107 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
108
109 addRegisterClass(MVT::f64, V64RegClass);
110 addRegisterClass(MVT::v2f32, V64RegClass);
111 addRegisterClass(MVT::Untyped, V64RegClass);
112
113 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
114 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
115
116 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
117 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
118
119 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
120 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
121
122 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
123 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
124
125 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
126 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
127
128 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
129 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
130
131 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
132 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
133
134 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
135 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
136
137 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
138 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
139
140 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
141 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
142
143 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
144 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
145
146 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
147 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
148
149 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
150 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
151
152 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
153 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
154
155 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
156 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
157
158 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
159 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
160
161 if (Subtarget->has16BitInsts()) {
162 if (Subtarget->useRealTrue16Insts()) {
163 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
164 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
165 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
166 } else {
167 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
168 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
169 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
170 }
171
 172 // Unless there are also VOP3P operations, no operations on these types are really legal.
173 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
174 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
175 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
176 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
177 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
178 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
179 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
180 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
181 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
182 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
183 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
184 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
185 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
186 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
187 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
188 }
189
190 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
191 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
192
193 computeRegisterProperties(Subtarget->getRegisterInfo());
194
195 // The boolean content concept here is too inflexible. Compares only ever
196 // really produce a 1-bit result. Any copy/extend from these will turn into a
197 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
198 // it's what most targets use.
201
202 // We need to custom lower vector stores from local memory
203 setOperationAction(ISD::LOAD,
204 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
205 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
206 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
207 MVT::i1, MVT::v32i32},
208 Custom);
209
210 setOperationAction(ISD::STORE,
211 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
212 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
213 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
214 MVT::i1, MVT::v32i32},
215 Custom);
216
217 if (isTypeLegal(MVT::bf16)) {
218 for (unsigned Opc :
220 ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
221 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
222 ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
223 ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
224 ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
225 ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
226 ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
227 ISD::SETCC}) {
 228 // FIXME: The promoted-to type shouldn't need to be explicit
229 setOperationAction(Opc, MVT::bf16, Promote);
230 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
231 }
232
234
236 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
237
238 setOperationAction(ISD::FABS, MVT::bf16, Legal);
239 setOperationAction(ISD::FNEG, MVT::bf16, Legal);
241
242 // We only need to custom lower because we can't specify an action for bf16
243 // sources.
246 }
247
248 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
249 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
250 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
251 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
252 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
253 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
254 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
255 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
256 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
257 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
258 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
259 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
260 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
261 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
262 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
263 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
264
265 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
266 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
267 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
268 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
269 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
270 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
271 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
272
273 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
274
278 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
279
280 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
281
283 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
284
286 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
287 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
288
290 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
291 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
292 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
293 Expand);
295 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
296 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
297 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
298 Expand);
299
301 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
302 MVT::v3i16, MVT::v4i16, MVT::Other},
303 Custom);
304
305 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
306 setOperationAction(ISD::BR_CC,
307 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
308
310
312
314 Expand);
315
316#if 0
318#endif
319
320 // We only support LOAD/STORE and vector manipulation ops for vectors
321 // with > 4 elements.
322 for (MVT VT :
323 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
324 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
325 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
326 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
327 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
328 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
329 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
330 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
331 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
332 switch (Op) {
333 case ISD::LOAD:
334 case ISD::STORE:
336 case ISD::BITCAST:
337 case ISD::UNDEF:
341 case ISD::IS_FPCLASS:
342 break;
347 break;
348 default:
350 break;
351 }
352 }
353 }
354
355 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
356
357 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
358 // is expanded to avoid having two separate loops in case the index is a VGPR.
359
360 // Most operations are naturally 32-bit vector operations. We only support
361 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
362 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
364 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
365
367 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
368
370 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
371
373 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
374 }
375
376 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
378 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
379
381 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
382
384 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
385
387 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
388 }
389
390 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
392 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
393
395 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
396
398 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
399
401 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
402 }
403
404 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
406 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
407
409 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
410
412 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
413
415 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
416 }
417
418 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
420 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
421
423 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
424
426 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
427
429 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
430 }
431
433 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
434 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
435 Custom);
436
437 if (Subtarget->hasPkMovB32()) {
438 // TODO: 16-bit element vectors should be legal with even aligned elements.
439 // TODO: Can be legal with wider source types than the result with
440 // subregister extracts.
441 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
442 }
443
445 // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
446 // instead lower to cndmask in SITargetLowering::LowerSELECT().
448 // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
449 // alignbit.
450 setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
451
452 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
453 Custom);
454
455 // Avoid stack access for these.
456 // TODO: Generalize to more vector types.
458 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
459 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
460 Custom);
461
462 // Deal with vec3 vector operations when widened to vec4.
464 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
465
466 // Deal with vec5/6/7 vector operations when widened to vec8.
468 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
469 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
470 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
471 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
472 Custom);
473
474 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
475 // and output demarshalling
476 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
477
478 // We can't return success/failure, only the old value,
479 // let LLVM add the comparison
480 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64},
481 Expand);
482
483 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
484
485 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
486
487 // FIXME: This should be narrowed to i32, but that only happens if i64 is
488 // illegal.
489 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
490 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
491
 492 // This is s_memtime on SI and s_memrealtime on VI.
493 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
494
495 if (Subtarget->hasSMemRealTime() ||
496 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
497 setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
498 setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom);
499
500 if (Subtarget->has16BitInsts()) {
501 setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
502 setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
503 } else {
504 setOperationAction(ISD::FSQRT, MVT::f16, Custom);
505 }
506
507 if (Subtarget->hasMadMacF32Insts())
509
510 if (!Subtarget->hasBFI())
511 // fcopysign can be done in a single instruction with BFI.
512 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
513
514 if (!Subtarget->hasBCNT(32))
516
517 if (!Subtarget->hasBCNT(64))
519
520 if (Subtarget->hasFFBH())
522
523 if (Subtarget->hasFFBL())
525
526 // We only really have 32-bit BFE instructions (and 16-bit on VI).
527 //
528 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
529 // effort to match them now. We want this to be false for i64 cases when the
530 // extraction isn't restricted to the upper or lower half. Ideally we would
531 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
532 // span the midpoint are probably relatively rare, so don't worry about them
533 // for now.
534 if (Subtarget->hasBFE())
536
537 // Clamp modifier on add/sub
538 if (Subtarget->hasIntClamp())
540
541 if (Subtarget->hasAddNoCarry())
542 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
543 Legal);
544
546 {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
547 {MVT::f32, MVT::f64}, Custom);
548
549 // These are really only legal for ieee_mode functions. We should be avoiding
550 // them for functions that don't have ieee_mode enabled, so just say they are
551 // legal.
552 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
553 {MVT::f32, MVT::f64}, Legal);
554
555 if (Subtarget->haveRoundOpsF64())
556 setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64,
557 Legal);
558 else
559 setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
560 MVT::f64, Custom);
561
562 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
563 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
564 Legal);
565 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
566
567 setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
569
570 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
571 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
572
573 // Custom lower these because we can't specify a rule based on an illegal
574 // source bf16.
575 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom);
576 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom);
577
578 if (Subtarget->has16BitInsts()) {
581 MVT::i16, Legal);
582
583 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
584
586 MVT::i16, Expand);
587
591 ISD::CTPOP},
592 MVT::i16, Promote);
593
594 setOperationAction(ISD::LOAD, MVT::i16, Custom);
595
596 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
597
598 setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
599 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
600 setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
601 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
602
606
608
609 // F16 - Constant Actions.
612
613 // F16 - Load/Store Actions.
614 setOperationAction(ISD::LOAD, MVT::f16, Promote);
615 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
616 setOperationAction(ISD::STORE, MVT::f16, Promote);
617 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
618
619 // BF16 - Load/Store Actions.
620 setOperationAction(ISD::LOAD, MVT::bf16, Promote);
621 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
622 setOperationAction(ISD::STORE, MVT::bf16, Promote);
623 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
624
625 // F16 - VOP1 Actions.
627 ISD::FSIN, ISD::FROUND},
628 MVT::f16, Custom);
629
630 // BF16 - VOP1 Actions.
631 if (Subtarget->hasBF16TransInsts())
632 setOperationAction({ISD::FCOS, ISD::FSIN, ISD::FDIV}, MVT::bf16, Custom);
633
636
637 // F16 - VOP2 Actions.
638 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
639 Expand);
640 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);
641 setOperationAction(ISD::FFREXP, MVT::f16, Custom);
643
644 // F16 - VOP3 Actions.
646 if (STI.hasMadF16())
648
649 for (MVT VT :
650 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
651 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
652 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
653 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
654 switch (Op) {
655 case ISD::LOAD:
656 case ISD::STORE:
658 case ISD::BITCAST:
659 case ISD::UNDEF:
664 case ISD::IS_FPCLASS:
665 break;
669 break;
670 default:
672 break;
673 }
674 }
675 }
676
677 // v_perm_b32 can handle either of these.
678 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
680
681 // XXX - Do these do anything? Vector constants turn into build_vector.
682 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
683
684 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
685 Legal);
686
687 setOperationAction(ISD::STORE, MVT::v2i16, Promote);
688 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
689 setOperationAction(ISD::STORE, MVT::v2f16, Promote);
690 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
691
692 setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
693 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
694 setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
695 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
696
697 setOperationAction(ISD::AND, MVT::v2i16, Promote);
698 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
699 setOperationAction(ISD::OR, MVT::v2i16, Promote);
700 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
701 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
702 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
703
704 setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
705 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
706 setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
707 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
708 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
709 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
710
711 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
712 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
713 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
714 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
715 setOperationAction(ISD::STORE, MVT::v4bf16, Promote);
716 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
717
718 setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
719 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
720 setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
721 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
722 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
723 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
724
725 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
726 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
727 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
728 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
729
730 setOperationAction(ISD::STORE, MVT::v8i16, Promote);
731 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
732 setOperationAction(ISD::STORE, MVT::v8f16, Promote);
733 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
734 setOperationAction(ISD::STORE, MVT::v8bf16, Promote);
735 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
736
737 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
738 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
739 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
740 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
741 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
742 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
743
744 setOperationAction(ISD::STORE, MVT::v16i16, Promote);
745 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
746 setOperationAction(ISD::STORE, MVT::v16f16, Promote);
747 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
748 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
749 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
750
751 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
752 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
753 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
754 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
755 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
756 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
757
758 setOperationAction(ISD::STORE, MVT::v32i16, Promote);
759 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
760 setOperationAction(ISD::STORE, MVT::v32f16, Promote);
761 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
762 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
763 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
764
766 MVT::v2i32, Expand);
767 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
768
770 MVT::v4i32, Expand);
771
773 MVT::v8i32, Expand);
774
775 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
776 Subtarget->hasVOP3PInsts() ? Legal : Custom);
777
778 setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
779 // This isn't really legal, but this avoids the legalizer unrolling it (and
780 // allows matching fneg (fabs x) patterns)
781 setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);
782
783 // Can do this in one BFI plus a constant materialize.
785 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
786 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
787 MVT::v32f16, MVT::v32bf16},
788 Custom);
789
791 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
792 MVT::f16, Custom);
793 setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
794
795 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
796 ISD::FMAXIMUMNUM},
797 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
798 Custom);
799
800 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
801 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
802 Expand);
803
804 for (MVT Vec16 :
805 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
806 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
809 Vec16, Custom);
811 }
812 }
813
814 if (Subtarget->hasVOP3PInsts()) {
818 MVT::v2i16, Legal);
819
820 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
821 ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
822 MVT::v2f16, Legal);
823
825 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
826
828 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
829 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
830 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
831 Custom);
832
833 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
834 // Split vector operations.
839 VT, Custom);
840
841 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
842 // Split vector operations.
844 VT, Custom);
845
847 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
848 {MVT::v2f16, MVT::v4f16}, Custom);
849
850 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
851 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
852 Custom);
853
854 if (Subtarget->hasPackedFP32Ops()) {
856 MVT::v2f32, Legal);
858 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
859 Custom);
860 }
861 }
862
863 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);
864
865 if (Subtarget->has16BitInsts()) {
867 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
869 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
870 } else {
871 // Legalization hack.
872 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
873
874 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
875 }
876
878 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
879 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
880 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
881 MVT::v32f16, MVT::v32bf16},
882 Custom);
883
885
886 if (Subtarget->hasVectorMulU64())
888 else if (Subtarget->hasScalarSMulU64())
890
891 if (Subtarget->hasMad64_32())
893
894 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
895 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
896
897 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
898 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
899 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
900 } else {
901 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
902 if (Subtarget->hasMinimum3Maximum3F32())
903 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);
904
905 if (Subtarget->hasMinimum3Maximum3PKF16()) {
906 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal);
907
908 // If only the vector form is available, we need to widen to a vector.
909 if (!Subtarget->hasMinimum3Maximum3F16())
910 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16, Custom);
911 }
912 }
913
914 if (Subtarget->hasVOP3PInsts()) {
915 // We want to break these into v2f16 pieces, not scalarize.
916 setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
917 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
918 Custom);
919 }
920
921 if (Subtarget->hasIntMinMax64())
923 Legal);
924
926 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
927 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
928 MVT::i8},
929 Custom);
930
932 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
933 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
934 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
935 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
936 Custom);
937
939 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
940 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
941 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
942 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
943 Custom);
944
945 setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
947 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
948 setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
949 setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);
950
951 // TODO: Could move this to custom lowering, could benefit from combines on
952 // extract of relevant bits.
953 setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
954
956
957 if (Subtarget->hasBF16ConversionInsts()) {
958 setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
960 }
961
962 if (Subtarget->hasBF16PackedInsts()) {
964 {ISD::FADD, ISD::FMUL, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMA},
965 MVT::v2bf16, Legal);
966 }
967
968 if (Subtarget->hasBF16TransInsts()) {
969 setOperationAction({ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, MVT::bf16, Legal);
970 }
971
972 if (Subtarget->hasCvtPkF16F32Inst()) {
974 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
975 Custom);
976 }
977
979 ISD::PTRADD,
981 ISD::SUB,
983 ISD::MUL,
984 ISD::FADD,
985 ISD::FSUB,
986 ISD::FDIV,
987 ISD::FMUL,
988 ISD::FMINNUM,
989 ISD::FMAXNUM,
990 ISD::FMINNUM_IEEE,
991 ISD::FMAXNUM_IEEE,
992 ISD::FMINIMUM,
993 ISD::FMAXIMUM,
994 ISD::FMINIMUMNUM,
995 ISD::FMAXIMUMNUM,
996 ISD::FMA,
997 ISD::SMIN,
998 ISD::SMAX,
999 ISD::UMIN,
1000 ISD::UMAX,
1001 ISD::SETCC,
1003 ISD::SMIN,
1004 ISD::SMAX,
1005 ISD::UMIN,
1006 ISD::UMAX,
1007 ISD::AND,
1008 ISD::OR,
1009 ISD::XOR,
1010 ISD::SHL,
1011 ISD::SRL,
1012 ISD::SRA,
1013 ISD::FSHR,
1023
1024 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1026
1027 // All memory operations. Some folding on the pointer operand is done to help
1028 // matching the constant offsets in the addressing modes.
1029 setTargetDAGCombine({ISD::LOAD,
1030 ISD::STORE,
1031 ISD::ATOMIC_LOAD,
1032 ISD::ATOMIC_STORE,
1033 ISD::ATOMIC_CMP_SWAP,
1034 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
1035 ISD::ATOMIC_SWAP,
1036 ISD::ATOMIC_LOAD_ADD,
1037 ISD::ATOMIC_LOAD_SUB,
1038 ISD::ATOMIC_LOAD_AND,
1039 ISD::ATOMIC_LOAD_OR,
1040 ISD::ATOMIC_LOAD_XOR,
1041 ISD::ATOMIC_LOAD_NAND,
1042 ISD::ATOMIC_LOAD_MIN,
1043 ISD::ATOMIC_LOAD_MAX,
1044 ISD::ATOMIC_LOAD_UMIN,
1045 ISD::ATOMIC_LOAD_UMAX,
1046 ISD::ATOMIC_LOAD_FADD,
1047 ISD::ATOMIC_LOAD_FMIN,
1048 ISD::ATOMIC_LOAD_FMAX,
1049 ISD::ATOMIC_LOAD_UINC_WRAP,
1050 ISD::ATOMIC_LOAD_UDEC_WRAP,
1053
1054 // FIXME: In other contexts we pretend this is a per-function property.
1056
1058}
1059
1060const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1061
1063 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1064 return RCRegs;
1065}
1066
1067//===----------------------------------------------------------------------===//
1068// TargetLowering queries
1069//===----------------------------------------------------------------------===//
1070
1071// v_mad_mix* support a conversion from f16 to f32.
1072//
 1073 // There is only one special case where this is OK to use when denormals are
 1074 // enabled, and we don't currently handle it.
1075bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1076 EVT DestVT, EVT SrcVT) const {
1077 return DestVT.getScalarType() == MVT::f32 &&
1078 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1079 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1080 SrcVT.getScalarType() == MVT::f16) ||
1081 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1082 SrcVT.getScalarType() == MVT::bf16)) &&
1083 // TODO: This probably only requires no input flushing?
1085}
1086
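// GlobalISel counterpart of the query above: an f16->f32 extension feeding
// G_FMAD/G_FMA can be folded when the subtarget has mad-mix/fma-mix
// instructions and f32 denormals are flushed. Illustrative MIR (assumed
// operand names):
//   %x:_(s32) = G_FPEXT %a:_(s16)
//   %r:_(s32) = G_FMA %x, %y, %z    ; may select to v_fma_mix_f32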
1088 LLT DestTy, LLT SrcTy) const {
1089 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1090 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1091 DestTy.getScalarSizeInBits() == 32 &&
1092 SrcTy.getScalarSizeInBits() == 16 &&
1093 // TODO: This probably only requires no input flushing?
1094 denormalModeIsFlushAllF32(*MI.getMF());
1095}
1096
1098 // SI has some legal vector types, but no legal vector operations. Say no
1099 // shuffles are legal in order to prefer scalarizing some vector operations.
1100 return false;
1101}
1102
1104 CallingConv::ID CC,
1105 EVT VT) const {
1107 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1108
1109 if (VT.isVector()) {
1110 EVT ScalarVT = VT.getScalarType();
1111 unsigned Size = ScalarVT.getSizeInBits();
1112 if (Size == 16) {
1113 if (Subtarget->has16BitInsts()) {
1114 if (VT.isInteger())
1115 return MVT::v2i16;
1116 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1117 }
1118 return VT.isInteger() ? MVT::i32 : MVT::f32;
1119 }
1120
1121 if (Size < 16)
1122 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1123 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1124 }
1125
1126 if (VT.getSizeInBits() > 32)
1127 return MVT::i32;
1128
1129 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1130}
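// Illustrative results of the mapping above for non-kernel calling
// conventions: with 16-bit instructions, v4f16 is passed in v2f16 pieces and
// v4bf16 in i32 pieces; without them, 16-bit elements are widened to i32/f32.
// Scalars wider than 32 bits are passed as i32 pieces.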
1131
1133 CallingConv::ID CC,
1134 EVT VT) const {
1136 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1137
1138 if (VT.isVector()) {
1139 unsigned NumElts = VT.getVectorNumElements();
1140 EVT ScalarVT = VT.getScalarType();
1141 unsigned Size = ScalarVT.getSizeInBits();
1142
1143 // FIXME: Should probably promote 8-bit vectors to i16.
1144 if (Size == 16 && Subtarget->has16BitInsts())
1145 return (NumElts + 1) / 2;
1146
1147 if (Size <= 32)
1148 return NumElts;
1149
1150 if (Size > 32)
1151 return NumElts * ((Size + 31) / 32);
1152 } else if (VT.getSizeInBits() > 32)
1153 return (VT.getSizeInBits() + 31) / 32;
1154
1155 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1156}
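// For example, with 16-bit instructions a v4f16 argument needs (4 + 1) / 2 = 2
// registers, while a v3i64 argument needs 3 * ceil(64 / 32) = 6 registers.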
1157
1159 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1160 unsigned &NumIntermediates, MVT &RegisterVT) const {
1161 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1162 unsigned NumElts = VT.getVectorNumElements();
1163 EVT ScalarVT = VT.getScalarType();
1164 unsigned Size = ScalarVT.getSizeInBits();
1165 // FIXME: We should fix the ABI to be the same on targets without 16-bit
 1166 // support, but unless we can properly handle 3-vectors, it will still be
1167 // inconsistent.
1168 if (Size == 16 && Subtarget->has16BitInsts()) {
1169 if (ScalarVT == MVT::bf16) {
1170 RegisterVT = MVT::i32;
1171 IntermediateVT = MVT::v2bf16;
1172 } else {
1173 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1174 IntermediateVT = RegisterVT;
1175 }
1176 NumIntermediates = (NumElts + 1) / 2;
1177 return NumIntermediates;
1178 }
1179
1180 if (Size == 32) {
1181 RegisterVT = ScalarVT.getSimpleVT();
1182 IntermediateVT = RegisterVT;
1183 NumIntermediates = NumElts;
1184 return NumIntermediates;
1185 }
1186
1187 if (Size < 16 && Subtarget->has16BitInsts()) {
1188 // FIXME: Should probably form v2i16 pieces
1189 RegisterVT = MVT::i16;
1190 IntermediateVT = ScalarVT;
1191 NumIntermediates = NumElts;
1192 return NumIntermediates;
1193 }
1194
1195 if (Size != 16 && Size <= 32) {
1196 RegisterVT = MVT::i32;
1197 IntermediateVT = ScalarVT;
1198 NumIntermediates = NumElts;
1199 return NumIntermediates;
1200 }
1201
1202 if (Size > 32) {
1203 RegisterVT = MVT::i32;
1204 IntermediateVT = RegisterVT;
1205 NumIntermediates = NumElts * ((Size + 31) / 32);
1206 return NumIntermediates;
1207 }
1208 }
1209
1211 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1212}
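// For example, an odd-length 16-bit vector such as v5f16 (with 16-bit
// instructions) is broken into NumIntermediates = (5 + 1) / 2 = 3 pieces with
// IntermediateVT = RegisterVT = v2f16.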
1213
1215 const DataLayout &DL, Type *Ty,
1216 unsigned MaxNumLanes) {
1217 assert(MaxNumLanes != 0);
1218
1219 LLVMContext &Ctx = Ty->getContext();
1220 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1221 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1222 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1223 NumElts);
1224 }
1225
1226 return TLI.getValueType(DL, Ty);
1227}
1228
1229// Peek through TFE struct returns to only use the data size.
1231 const DataLayout &DL, Type *Ty,
1232 unsigned MaxNumLanes) {
1233 auto *ST = dyn_cast<StructType>(Ty);
1234 if (!ST)
1235 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1236
1237 // TFE intrinsics return an aggregate type.
1238 assert(ST->getNumContainedTypes() == 2 &&
1239 ST->getContainedType(1)->isIntegerTy(32));
1240 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1241}
1242
1243/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1244/// in-memory representation. This return value is a custom type because there
1245/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1246/// could cause issues during codegen, these address space 7 pointers will be
1247/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1248/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1249/// for cost modeling, to work. (This also sets us up decently for doing the
1250/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1252 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1253 return MVT::amdgpuBufferFatPointer;
1255 DL.getPointerSizeInBits(AS) == 192)
1256 return MVT::amdgpuBufferStridedPointer;
1258}
1259/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1260/// v8i32 when padding is added.
1261/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1262/// also v8i32 with padding.
1264 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1265 DL.getPointerSizeInBits(AS) == 160) ||
1267 DL.getPointerSizeInBits(AS) == 192))
1268 return MVT::v8i32;
1270}
1271
1272static unsigned getIntrMemWidth(unsigned IntrID) {
1273 switch (IntrID) {
1274 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1275 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1276 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1277 return 8;
1278 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1279 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1280 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1281 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1282 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1283 return 32;
1284 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1285 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1286 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1287 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1288 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1289 return 64;
1290 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1291 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1292 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1293 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1294 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1295 return 128;
1296 default:
1297 llvm_unreachable("Unknown width");
1298 }
1299}
1300
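// Translate the ordering and syncscope operands of the cooperative atomic
// load/store intrinsics into MachineMemOperand flags and a SyncScope::ID.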
1301static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad,
1303 Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2);
1304 unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
1305 switch (AtomicOrderingCABI(Ord)) {
1308 break;
1311 break;
1314 break;
1315 default:
1317 break;
1318 }
1319
1320 Info.flags =
1322 Info.flags |= MOCooperative;
1323
1324 MDNode *ScopeMD = cast<MDNode>(
1325 cast<MetadataAsValue>(CI.getArgOperand(IsLoad ? 2 : 3))->getMetadata());
1326 StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
1327 Info.ssid = CI.getContext().getOrInsertSyncScopeID(Scope);
1328}
1329
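// The hook below describes the memory access performed by each target memory
// intrinsic (value type, pointer operand, and flags such as load, store, or
// volatile) so that a MachineMemOperand can be attached during selection.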
1331 const CallInst &CI,
1332 MachineFunction &MF,
1333 unsigned IntrID) const {
1334 Info.flags = MachineMemOperand::MONone;
1335 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1336 Info.flags |= MachineMemOperand::MOInvariant;
1337 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1339 Info.flags |= getTargetMMOFlags(CI);
1340
1341 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1343 AttributeSet Attr =
1345 MemoryEffects ME = Attr.getMemoryEffects();
1346 if (ME.doesNotAccessMemory())
1347 return false;
1348
1349 // TODO: Should images get their own address space?
1350 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1351
1352 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1353 if (RsrcIntr->IsImage) {
1354 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1356 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1357 Info.align.reset();
1358 }
1359
1360 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1361 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1362 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1363 // We conservatively set the memory operand of a buffer intrinsic to the
1364 // base resource pointer, so that we can access alias information about
1365 // those pointers. Cases like "this points at the same value
1366 // but with a different offset" are handled in
1367 // areMemAccessesTriviallyDisjoint.
1368 Info.ptrVal = RsrcArg;
1369 }
1370
1371 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1372 if (!IsSPrefetch) {
1373 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1374 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1375 Info.flags |= MachineMemOperand::MOVolatile;
1376 }
1377
1379 if (ME.onlyReadsMemory()) {
1380 if (RsrcIntr->IsImage) {
1381 unsigned MaxNumLanes = 4;
1382
1383 if (!BaseOpcode->Gather4) {
1384 // If this isn't a gather, we may have excess loaded elements in the
1385 // IR type. Check the dmask for the real number of elements loaded.
1386 unsigned DMask =
1387 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1388 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1389 }
1390
1391 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1392 CI.getType(), MaxNumLanes);
1393 } else {
1394 Info.memVT =
1396 std::numeric_limits<unsigned>::max());
1397 }
1398
1399 // FIXME: What does alignment mean for an image?
1400 Info.opc = ISD::INTRINSIC_W_CHAIN;
1401 Info.flags |= MachineMemOperand::MOLoad;
1402 } else if (ME.onlyWritesMemory()) {
1403 Info.opc = ISD::INTRINSIC_VOID;
1404
1405 Type *DataTy = CI.getArgOperand(0)->getType();
1406 if (RsrcIntr->IsImage) {
1407 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1408 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1409 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1410 DMaskLanes);
1411 } else
1412 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1413
1414 Info.flags |= MachineMemOperand::MOStore;
1415 } else {
1416 // Atomic, NoReturn Sampler or prefetch
1417 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1419 Info.flags |=
1421
1422 if (!IsSPrefetch)
1423 Info.flags |= MachineMemOperand::MOStore;
1424
1425 switch (IntrID) {
1426 default:
1427 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1428 // Fake memory access type for no return sampler intrinsics
1429 Info.memVT = MVT::i32;
1430 } else {
1431 // XXX - Should this be volatile without known ordering?
1432 Info.flags |= MachineMemOperand::MOVolatile;
1433 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1434 }
1435 break;
1436 case Intrinsic::amdgcn_raw_buffer_load_lds:
1437 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1438 case Intrinsic::amdgcn_struct_buffer_load_lds:
1439 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1440 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1441 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1442 Info.ptrVal = CI.getArgOperand(1);
1443 return true;
1444 }
1445 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1446 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1447 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1448 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1449 Info.memVT =
1451 std::numeric_limits<unsigned>::max());
1452 Info.flags &= ~MachineMemOperand::MOStore;
1453 return true;
1454 }
1455 }
1456 }
1457 return true;
1458 }
1459
1460 switch (IntrID) {
1461 case Intrinsic::amdgcn_ds_ordered_add:
1462 case Intrinsic::amdgcn_ds_ordered_swap: {
1463 Info.opc = ISD::INTRINSIC_W_CHAIN;
1464 Info.memVT = MVT::getVT(CI.getType());
1465 Info.ptrVal = CI.getOperand(0);
1466 Info.align.reset();
1468
1469 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1470 if (!Vol->isZero())
1471 Info.flags |= MachineMemOperand::MOVolatile;
1472
1473 return true;
1474 }
1475 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1476 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1477 Info.opc = ISD::INTRINSIC_W_CHAIN;
1478 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1479 Info.ptrVal = nullptr;
1480 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1482 return true;
1483 }
1484 case Intrinsic::amdgcn_ds_append:
1485 case Intrinsic::amdgcn_ds_consume: {
1486 Info.opc = ISD::INTRINSIC_W_CHAIN;
1487 Info.memVT = MVT::getVT(CI.getType());
1488 Info.ptrVal = CI.getOperand(0);
1489 Info.align.reset();
1491
1492 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1493 if (!Vol->isZero())
1494 Info.flags |= MachineMemOperand::MOVolatile;
1495
1496 return true;
1497 }
1498 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1499 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1500 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1503 Info.memVT = MVT::getVT(CI.getType());
1504 Info.ptrVal = CI.getOperand(0);
1505 Info.memVT = MVT::i64;
1506 Info.size = 8;
1507 Info.align.reset();
1509 return true;
1510 }
1511 case Intrinsic::amdgcn_global_atomic_csub: {
1512 Info.opc = ISD::INTRINSIC_W_CHAIN;
1513 Info.memVT = MVT::getVT(CI.getType());
1514 Info.ptrVal = CI.getOperand(0);
1515 Info.align.reset();
1518 return true;
1519 }
1520 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1521 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1522 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1523 Info.opc = ISD::INTRINSIC_W_CHAIN;
1524 Info.memVT =
1525 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1526 ? CI.getType()
1528 ->getElementType(0)); // XXX: what is correct VT?
1529
1530 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1531 Info.align.reset();
1532 Info.flags |=
1534 return true;
1535 }
1536 case Intrinsic::amdgcn_global_atomic_fmin_num:
1537 case Intrinsic::amdgcn_global_atomic_fmax_num:
1538 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1539 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1540 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1541 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1542 Info.opc = ISD::INTRINSIC_W_CHAIN;
1543 Info.memVT = MVT::getVT(CI.getType());
1544 Info.ptrVal = CI.getOperand(0);
1545 Info.align.reset();
1549 return true;
1550 }
1551 case Intrinsic::amdgcn_flat_load_monitor_b32:
1552 case Intrinsic::amdgcn_flat_load_monitor_b64:
1553 case Intrinsic::amdgcn_flat_load_monitor_b128:
1554 case Intrinsic::amdgcn_global_load_monitor_b32:
1555 case Intrinsic::amdgcn_global_load_monitor_b64:
1556 case Intrinsic::amdgcn_global_load_monitor_b128:
1557 case Intrinsic::amdgcn_cluster_load_b32:
1558 case Intrinsic::amdgcn_cluster_load_b64:
1559 case Intrinsic::amdgcn_cluster_load_b128:
1560 case Intrinsic::amdgcn_ds_load_tr6_b96:
1561 case Intrinsic::amdgcn_ds_load_tr4_b64:
1562 case Intrinsic::amdgcn_ds_load_tr8_b64:
1563 case Intrinsic::amdgcn_ds_load_tr16_b128:
1564 case Intrinsic::amdgcn_global_load_tr6_b96:
1565 case Intrinsic::amdgcn_global_load_tr4_b64:
1566 case Intrinsic::amdgcn_global_load_tr_b64:
1567 case Intrinsic::amdgcn_global_load_tr_b128:
1568 case Intrinsic::amdgcn_ds_read_tr4_b64:
1569 case Intrinsic::amdgcn_ds_read_tr6_b96:
1570 case Intrinsic::amdgcn_ds_read_tr8_b64:
1571 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1572 Info.opc = ISD::INTRINSIC_W_CHAIN;
1573 Info.memVT = MVT::getVT(CI.getType());
1574 Info.ptrVal = CI.getOperand(0);
1575 Info.align.reset();
1576 Info.flags |= MachineMemOperand::MOLoad;
1577 return true;
1578 }
1579 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1580 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1581 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1582 Info.opc = ISD::INTRINSIC_W_CHAIN;
1583 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1584 Info.ptrVal = CI.getOperand(0);
1585 Info.align.reset();
1586 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/true, Info);
1587 return true;
1588 }
1589 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1590 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1591 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1592 Info.opc = ISD::INTRINSIC_VOID;
1593 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1594 Info.ptrVal = CI.getArgOperand(0);
1595 Info.align.reset();
1596 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/false, Info);
1597 return true;
1598 }
1599 case Intrinsic::amdgcn_ds_gws_init:
1600 case Intrinsic::amdgcn_ds_gws_barrier:
1601 case Intrinsic::amdgcn_ds_gws_sema_v:
1602 case Intrinsic::amdgcn_ds_gws_sema_br:
1603 case Intrinsic::amdgcn_ds_gws_sema_p:
1604 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1605 Info.opc = ISD::INTRINSIC_VOID;
1606
1607 const GCNTargetMachine &TM =
1608 static_cast<const GCNTargetMachine &>(getTargetMachine());
1609
1611 Info.ptrVal = MFI->getGWSPSV(TM);
1612
1613 // This is an abstract access, but we need to specify a type and size.
1614 Info.memVT = MVT::i32;
1615 Info.size = 4;
1616 Info.align = Align(4);
1617
1618 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1619 Info.flags |= MachineMemOperand::MOLoad;
1620 else
1621 Info.flags |= MachineMemOperand::MOStore;
1622 return true;
1623 }
1624 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1625 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1626 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1627 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1628 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1629 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1630 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1631 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1632 Info.opc = ISD::INTRINSIC_VOID;
1633 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1634 Info.ptrVal = CI.getArgOperand(1);
1636 return true;
1637 }
1638 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1639 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1640 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1641 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1642 Info.opc = ISD::INTRINSIC_VOID;
1643 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1644 Info.ptrVal = CI.getArgOperand(0);
1646 return true;
1647 }
1648 case Intrinsic::amdgcn_load_to_lds:
1649 case Intrinsic::amdgcn_global_load_lds: {
1650 Info.opc = ISD::INTRINSIC_VOID;
1651 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1652 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1653 Info.ptrVal = CI.getArgOperand(1);
1655 return true;
1656 }
1657 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1658 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1659 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1660 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1661 Info.opc = ISD::INTRINSIC_W_CHAIN;
1662
1663 const GCNTargetMachine &TM =
1664 static_cast<const GCNTargetMachine &>(getTargetMachine());
1665
1667 Info.ptrVal = MFI->getGWSPSV(TM);
1668
1669 // This is an abstract access, but we need to specify a type and size.
1670 Info.memVT = MVT::i32;
1671 Info.size = 4;
1672 Info.align = Align(4);
1673
1675 return true;
1676 }
1677 case Intrinsic::amdgcn_s_prefetch_data:
1678 case Intrinsic::amdgcn_flat_prefetch:
1679 case Intrinsic::amdgcn_global_prefetch: {
1680 Info.opc = ISD::INTRINSIC_VOID;
1681 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1682 Info.ptrVal = CI.getArgOperand(0);
1683 Info.flags |= MachineMemOperand::MOLoad;
1684 return true;
1685 }
1686 default:
1687 return false;
1688 }
1689}
1690
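// Append extra target-specific operands for certain intrinsics before the
// generic intrinsic node is built.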
1692 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1694 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1695 // The DAG's ValueType loses the addrspaces.
1696 // Add them as 2 extra Constant operands "from" and "to".
1697 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1698 unsigned DstAS = I.getType()->getPointerAddressSpace();
1699 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1700 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1701 break;
1702 }
1703 default:
1704 break;
1705 }
1706}
1707
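// Collect the pointer operand (and its access type) of intrinsics that behave
// like memory accesses, so that addressing-mode optimizations can fold
// offsets into them.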
1710 Type *&AccessTy) const {
1711 Value *Ptr = nullptr;
1712 switch (II->getIntrinsicID()) {
1713 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1714 case Intrinsic::amdgcn_cluster_load_b128:
1715 case Intrinsic::amdgcn_cluster_load_b64:
1716 case Intrinsic::amdgcn_cluster_load_b32:
1717 case Intrinsic::amdgcn_ds_append:
1718 case Intrinsic::amdgcn_ds_consume:
1719 case Intrinsic::amdgcn_ds_load_tr8_b64:
1720 case Intrinsic::amdgcn_ds_load_tr16_b128:
1721 case Intrinsic::amdgcn_ds_load_tr4_b64:
1722 case Intrinsic::amdgcn_ds_load_tr6_b96:
1723 case Intrinsic::amdgcn_ds_read_tr4_b64:
1724 case Intrinsic::amdgcn_ds_read_tr6_b96:
1725 case Intrinsic::amdgcn_ds_read_tr8_b64:
1726 case Intrinsic::amdgcn_ds_read_tr16_b64:
1727 case Intrinsic::amdgcn_ds_ordered_add:
1728 case Intrinsic::amdgcn_ds_ordered_swap:
1729 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1730 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1731 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1732 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1733 case Intrinsic::amdgcn_flat_load_monitor_b128:
1734 case Intrinsic::amdgcn_flat_load_monitor_b32:
1735 case Intrinsic::amdgcn_flat_load_monitor_b64:
1736 case Intrinsic::amdgcn_global_atomic_csub:
1737 case Intrinsic::amdgcn_global_atomic_fmax_num:
1738 case Intrinsic::amdgcn_global_atomic_fmin_num:
1739 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1740 case Intrinsic::amdgcn_global_load_monitor_b128:
1741 case Intrinsic::amdgcn_global_load_monitor_b32:
1742 case Intrinsic::amdgcn_global_load_monitor_b64:
1743 case Intrinsic::amdgcn_global_load_tr_b64:
1744 case Intrinsic::amdgcn_global_load_tr_b128:
1745 case Intrinsic::amdgcn_global_load_tr4_b64:
1746 case Intrinsic::amdgcn_global_load_tr6_b96:
1747 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1748 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1749 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1750 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1751 Ptr = II->getArgOperand(0);
1752 break;
1753 case Intrinsic::amdgcn_load_to_lds:
1754 case Intrinsic::amdgcn_global_load_lds:
1755 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1756 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1757 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1758 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1759 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1760 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1761 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1762 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1763 Ptr = II->getArgOperand(1);
1764 break;
1765 default:
1766 return false;
1767 }
1768 AccessTy = II->getType();
1769 Ops.push_back(Ptr);
1770 return true;
1771}
1772
1774 unsigned AddrSpace) const {
1775 if (!Subtarget->hasFlatInstOffsets()) {
1776 // Flat instructions do not have offsets, and only have the register
1777 // address.
1778 return AM.BaseOffs == 0 && AM.Scale == 0;
1779 }
1780
1781 decltype(SIInstrFlags::FLAT) FlatVariant =
1785
1786 return AM.Scale == 0 &&
1787 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1788 AM.BaseOffs, AddrSpace, FlatVariant));
1789}
1790
1792 if (Subtarget->hasFlatGlobalInsts())
1794
1795 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1796 // Assume that we will use FLAT for all global memory accesses
1797 // on VI.
1798 // FIXME: This assumption is currently wrong. On VI we still use
1799 // MUBUF instructions for the r + i addressing mode. As currently
1800 // implemented, the MUBUF instructions only work on buffer < 4GB.
1801 // It may be possible to support > 4GB buffers with MUBUF instructions,
1802 // by setting the stride value in the resource descriptor which would
1803 // increase the size limit to (stride * 4GB). However, this is risky,
1804 // because it has never been validated.
1806 }
1807
1808 return isLegalMUBUFAddressingMode(AM);
1809}
1810
1811bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1812 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1813 // additionally can do r + r + i with addr64. 32-bit has more addressing
1814 // mode options. Depending on the resource constant, it can also do
1815 // (i64 r0) + (i32 r1) * (i14 i).
1816 //
1817 // Private arrays end up using a scratch buffer most of the time, so also
1818 // assume those use MUBUF instructions. Scratch loads / stores are currently
1819 // implemented as mubuf instructions with offen bit set, so they are slightly
1820 // different from the normal addr64.
1821 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1822 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1823 return false;
1824
1825 // FIXME: Since we can split immediate into soffset and immediate offset,
1826 // would it make sense to allow any immediate?
1827
1828 switch (AM.Scale) {
1829 case 0: // r + i or just i, depending on HasBaseReg.
1830 return true;
1831 case 1:
1832 return true; // We have r + r or r + i.
1833 case 2:
1834 if (AM.HasBaseReg) {
1835 // Reject 2 * r + r.
1836 return false;
1837 }
1838
1839 // Allow 2 * r as r + r,
1840 // and 2 * r + i as r + r + i.
1841 return true;
1842 default: // Don't allow n * r
1843 return false;
1844 }
1845}
1846
1848 const AddrMode &AM, Type *Ty,
1849 unsigned AS,
1850 Instruction *I) const {
1851 // No global is ever allowed as a base.
1852 if (AM.BaseGV)
1853 return false;
1854
1855 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1856 return isLegalGlobalAddressingMode(AM);
1857
1858 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1862 // If the offset isn't a multiple of 4, it probably isn't going to be
1863 // correctly aligned.
1864 // FIXME: Can we get the real alignment here?
1865 if (AM.BaseOffs % 4 != 0)
1866 return isLegalMUBUFAddressingMode(AM);
1867
1868 if (!Subtarget->hasScalarSubwordLoads()) {
1869 // There are no SMRD extloads, so if we have to do a small type access we
1870 // will use a MUBUF load.
1871 // FIXME?: We also need to do this if unaligned, but we don't know the
1872 // alignment here.
1873 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1874 return isLegalGlobalAddressingMode(AM);
1875 }
1876
1877 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1878 // SMRD instructions have an 8-bit, dword offset on SI.
1879 if (!isUInt<8>(AM.BaseOffs / 4))
1880 return false;
1881 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1882 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1883 // in 8 bits, it can use a smaller encoding.
1884 if (!isUInt<32>(AM.BaseOffs / 4))
1885 return false;
1886 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1887 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1888 if (!isUInt<20>(AM.BaseOffs))
1889 return false;
1890 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1891 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1892 // for S_BUFFER_* instructions).
1893 if (!isInt<21>(AM.BaseOffs))
1894 return false;
1895 } else {
1896 // On GFX12, all offsets are signed 24-bit in bytes.
1897 if (!isInt<24>(AM.BaseOffs))
1898 return false;
1899 }
1900
1901 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1903 AM.BaseOffs < 0) {
1904 // Scalar (non-buffer) loads can only use a negative offset if
1905 // soffset+offset is non-negative. Since the compiler can only prove that
1906 // in a few special cases, it is safer to claim that negative offsets are
1907 // not supported.
1908 return false;
1909 }
1910
1911 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1912 return true;
1913
1914 if (AM.Scale == 1 && AM.HasBaseReg)
1915 return true;
1916
1917 return false;
1918 }
1919
1920 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1921 return Subtarget->enableFlatScratch()
1923 : isLegalMUBUFAddressingMode(AM);
1924
1925 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1926 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1927 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1928 // field.
1929 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1930 // an 8-bit dword offset but we don't know the alignment here.
1931 if (!isUInt<16>(AM.BaseOffs))
1932 return false;
1933
1934 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1935 return true;
1936
1937 if (AM.Scale == 1 && AM.HasBaseReg)
1938 return true;
1939
1940 return false;
1941 }
1942
1944 // For an unknown address space, this usually means the address is for some
1945 // reason being used for pure arithmetic, and not based on some addressing
1946 // computation. We don't have instructions that compute pointers with any
1947 // addressing modes, so treat them as having no offset, like flat
1948 // instructions.
1950 }
1951
1952 // Assume a user alias of global for unknown address spaces.
1953 return isLegalGlobalAddressingMode(AM);
1954}
1955
1957 const MachineFunction &MF) const {
1959 return (MemVT.getSizeInBits() <= 4 * 32);
1960 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1961 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1962 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1963 }
1965 return (MemVT.getSizeInBits() <= 2 * 32);
1966 return true;
1967}
1968
1970 unsigned Size, unsigned AddrSpace, Align Alignment,
1971 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1972 if (IsFast)
1973 *IsFast = 0;
1974
1975 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1976 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1977 // Check if alignment requirements for ds_read/write instructions are
1978 // disabled.
1979 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1980 return false;
1981
1982 Align RequiredAlignment(
1983 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
1984 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1985 Alignment < RequiredAlignment)
1986 return false;
1987
1988 // Either the alignment requirements are "enabled", or there is an
1989 // unaligned-LDS-access hardware bug even though the alignment requirements
1990 // are "disabled". In either case, we need to check for proper alignment
1991 // requirements.
1992 //
1993 switch (Size) {
1994 case 64:
1995 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1996 // address is negative, then the instruction is incorrectly treated as
1997 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1998 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1999 // load later in the SILoadStoreOptimizer.
2000 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
2001 return false;
2002
2003 // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
2004 // can do a 4-byte-aligned, 8-byte access in a single operation using
2005 // ds_read2/write2_b32 with adjacent offsets.
2006 RequiredAlignment = Align(4);
2007
2008 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2009 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
2010 // ds_write2_b32 depending on the alignment. In either case with either
2011 // alignment there is no faster way of doing this.
2012
2013 // The numbers returned here and below are not additive, it is a 'speed
2014 // rank'. They are just meant to be compared to decide if a certain way
2015 // of lowering an operation is faster than another. For that purpose
2016 // naturally aligned operation gets its bitsize to indicate that "it
2017 // operates with a speed comparable to N-bit wide load". With the full
2018 // alignment ds128 is slower than ds96 for example. If underaligned it
2019 // is comparable to a speed of a single dword access, which would then
2020 // mean 32 < 128 and it is faster to issue a wide load regardless.
2021 // 1 simply means "slow, don't do it": when comparing an aligned load to a
2022 // wider load which will no longer be aligned, the latter is slower.
2023 if (IsFast)
2024 *IsFast = (Alignment >= RequiredAlignment) ? 64
2025 : (Alignment < Align(4)) ? 32
2026 : 1;
2027 return true;
2028 }
2029
2030 break;
2031 case 96:
2032 if (!Subtarget->hasDS96AndDS128())
2033 return false;
2034
2035 // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
2036 // gfx8 and older.
2037
2038 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2039 // Naturally aligned access is fastest. However, also report it as Fast
2040 // if memory is aligned to less than a DWORD. A narrow load or store will
2041 // be as slow as a single ds_read_b96/ds_write_b96, but there will
2042 // be more of them, so overall we pay less of a penalty by issuing a single
2043 // instruction.
2044
2045 // See comment on the values above.
2046 if (IsFast)
2047 *IsFast = (Alignment >= RequiredAlignment) ? 96
2048 : (Alignment < Align(4)) ? 32
2049 : 1;
2050 return true;
2051 }
2052
2053 break;
2054 case 128:
2055 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2056 return false;
2057
2058 // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
2059 // gfx8 and older, but we can do an 8-byte-aligned, 16-byte access in a
2060 // single operation using ds_read2/write2_b64.
2061 RequiredAlignment = Align(8);
2062
2063 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2064 // Naturally aligned access is fastest. However, also report it as Fast
2065 // if memory is aligned to less than a DWORD. A narrow load or store will
2066 // be as slow as a single ds_read_b128/ds_write_b128, but there
2067 // will be more of them, so overall we pay less of a penalty by issuing a
2068 // single instruction.
2069
2070 // See comment on the values above.
2071 if (IsFast)
2072 *IsFast = (Alignment >= RequiredAlignment) ? 128
2073 : (Alignment < Align(4)) ? 32
2074 : 1;
2075 return true;
2076 }
2077
2078 break;
2079 default:
2080 if (Size > 32)
2081 return false;
2082
2083 break;
2084 }
2085
2086 // See comment on the values above.
2087 // Note that we have a single-dword or sub-dword here, so if underaligned
2088 // it is the slowest possible access, hence the returned value is 0.
2089 if (IsFast)
2090 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2091
2092 return Alignment >= RequiredAlignment ||
2093 Subtarget->hasUnalignedDSAccessEnabled();
2094 }
2095
2096 // FIXME: We have to be conservative here and assume that flat operations
2097 // will access scratch. If we had access to the IR function, then we
2098 // could determine if any private memory was used in the function.
2099 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2100 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2101 bool AlignedBy4 = Alignment >= Align(4);
2102 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2103 if (IsFast)
2104 *IsFast = AlignedBy4 ? Size : 1;
2105 return true;
2106 }
2107
2108 if (IsFast)
2109 *IsFast = AlignedBy4;
2110
2111 return AlignedBy4;
2112 }
2113
2114 // So long as they are correct, wide global memory operations perform better
2115 // than multiple smaller memory ops -- even when misaligned
2116 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
2117 if (IsFast)
2118 *IsFast = Size;
2119
2120 return Alignment >= Align(4) ||
2121 Subtarget->hasUnalignedBufferAccessEnabled();
2122 }
2123
2124 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
2125 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
2126 // out-of-bounds behavior, but in the edge case where an access starts
2127 // out-of-bounds and then enters in-bounds, the entire access would be treated
2128 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2129 // natural alignment of buffer accesses.
2130 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2131 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2132 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2133 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2134 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
2135 return false;
2136 }
2137
2138 // Smaller than dword value must be aligned.
2139 if (Size < 32)
2140 return false;
2141
2142 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2143 // byte-address are ignored, thus forcing Dword alignment.
2144 // This applies to private, global, and constant memory.
2145 if (IsFast)
2146 *IsFast = 1;
2147
2148 return Size >= 32 && Alignment >= Align(4);
2149}
2150
2152 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2153 unsigned *IsFast) const {
2155 Alignment, Flags, IsFast);
2156}
2157
2159 LLVMContext &Context, const MemOp &Op,
2160 const AttributeList &FuncAttributes) const {
2161 // FIXME: Should account for address space here.
2162
2163 // The default fallback uses the private pointer size as a guess for a type to
2164 // use. Make sure we switch these to 64-bit accesses.
2165
2166 if (Op.size() >= 16 &&
2167 Op.isDstAligned(Align(4))) // XXX: Should only do for global
2168 return MVT::v4i32;
2169
2170 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2171 return MVT::v2i32;
2172
2173 // Use the default.
2174 return MVT::Other;
2175}
2176
2178 const MemSDNode *MemNode = cast<MemSDNode>(N);
2179 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2180}
2181
2186
2188 unsigned DestAS) const {
2189 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2190 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2191 Subtarget->hasGloballyAddressableScratch()) {
2192 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2193 return false;
2194 }
2195
2196 // Flat -> private/local is a simple truncate.
2197 // Flat -> global is no-op
2198 return true;
2199 }
2200
2201 const GCNTargetMachine &TM =
2202 static_cast<const GCNTargetMachine &>(getTargetMachine());
2203 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2204}
2205
2213
2215 Type *Ty) const {
2216 // FIXME: Could be smarter if called for vector constants.
2217 return true;
2218}
2219
2221 unsigned Index) const {
2223 return false;
2224
2225 // TODO: Add more cases that are cheap.
2226 return Index == 0;
2227}
2228
2229bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2230 // TODO: This should be more aggressive, particularly for 16-bit element
2231 // vectors. However there are some mixed improvements and regressions.
2232 EVT EltTy = VT.getVectorElementType();
2233 return EltTy.getSizeInBits() % 32 == 0;
2234}
2235
2237 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2238 switch (Op) {
2239 case ISD::LOAD:
2240 case ISD::STORE:
2241 return true;
2242 default:
2243 return false;
2244 }
2245 }
2246
2247 // SimplifySetCC uses this function to determine whether or not it should
2248 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2249 if (VT == MVT::i1 && Op == ISD::SETCC)
2250 return false;
2251
2253}
2254
2255SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2256 const SDLoc &SL,
2257 SDValue Chain,
2258 uint64_t Offset) const {
2259 const DataLayout &DL = DAG.getDataLayout();
2263
2264 auto [InputPtrReg, RC, ArgTy] =
2265 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2266
2267 // We may not have the kernarg segment argument if we have no kernel
2268 // arguments.
2269 if (!InputPtrReg)
2270 return DAG.getConstant(Offset, SL, PtrVT);
2271
2273 SDValue BasePtr = DAG.getCopyFromReg(
2274 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2275
2276 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2277}
2278
2279SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2280 const SDLoc &SL) const {
2283 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2284}
2285
2286SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2287 const SDLoc &SL) const {
2288
2290 std::optional<uint32_t> KnownSize =
2292 if (KnownSize.has_value())
2293 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2294 return SDValue();
2295}
2296
2297SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2298 const SDLoc &SL, SDValue Val,
2299 bool Signed,
2300 const ISD::InputArg *Arg) const {
2301 // First, if it is a widened vector, narrow it.
2302 if (VT.isVector() &&
2304 EVT NarrowedVT =
2307 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2308 DAG.getConstant(0, SL, MVT::i32));
2309 }
2310
2311 // Then convert the vector elements or scalar value.
2312 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2313 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2314 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2315 }
2316
2317 if (MemVT.isFloatingPoint())
2318 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2319 else if (Signed)
2320 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2321 else
2322 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2323
2324 return Val;
2325}
2326
2327SDValue SITargetLowering::lowerKernargMemParameter(
2328 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2329 uint64_t Offset, Align Alignment, bool Signed,
2330 const ISD::InputArg *Arg) const {
2331 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2332
2333 // Try to avoid using an extload by loading earlier than the argument address,
2334 // and extracting the relevant bits. The load should hopefully be merged with
2335 // the previous argument.
2336 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2337 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2338 int64_t AlignDownOffset = alignDown(Offset, 4);
2339 int64_t OffsetDiff = Offset - AlignDownOffset;
2340
2341 EVT IntVT = MemVT.changeTypeToInteger();
2342
2343 // TODO: If we passed in the base kernel offset we could have a better
2344 // alignment than 4, but we don't really need it.
2345 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2346 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2349
2350 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2351 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2352
2353 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2354 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2355 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2356
2357 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2358 }
2359
2360 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2361 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2364
2365 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2366 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2367}
2368
2369/// Coerce an argument which was passed in a different ABI type to the original
2370/// expected value type.
2371SDValue SITargetLowering::convertABITypeToValueType(SelectionDAG &DAG,
2372 SDValue Val,
2373 CCValAssign &VA,
2374 const SDLoc &SL) const {
2375 EVT ValVT = VA.getValVT();
2376
2377 // If this is an 8 or 16-bit value, it is really passed promoted
2378 // to 32 bits. Insert an assert[sz]ext to capture this, then
2379 // truncate to the right size.
2380 switch (VA.getLocInfo()) {
2381 case CCValAssign::Full:
2382 return Val;
2383 case CCValAssign::BCvt:
2384 return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);
2385 case CCValAssign::SExt:
2386 Val = DAG.getNode(ISD::AssertSext, SL, VA.getLocVT(), Val,
2387 DAG.getValueType(ValVT));
2388 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2389 case CCValAssign::ZExt:
2390 Val = DAG.getNode(ISD::AssertZext, SL, VA.getLocVT(), Val,
2391 DAG.getValueType(ValVT));
2392 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2393 case CCValAssign::AExt:
2394 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2395 default:
2396 llvm_unreachable("Unknown loc info!");
2397 }
2398}
2399
2400SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2401 CCValAssign &VA, const SDLoc &SL,
2402 SDValue Chain,
2403 const ISD::InputArg &Arg) const {
2404 MachineFunction &MF = DAG.getMachineFunction();
2405 MachineFrameInfo &MFI = MF.getFrameInfo();
2406
2407 if (Arg.Flags.isByVal()) {
2408 unsigned Size = Arg.Flags.getByValSize();
2409 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2410 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2411 }
2412
2413 unsigned ArgOffset = VA.getLocMemOffset();
2414 unsigned ArgSize = VA.getValVT().getStoreSize();
2415
2416 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2417
2418 // Create load nodes to retrieve arguments from the stack.
2419 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2420
2421 // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
2423 MVT MemVT = VA.getValVT();
2424
2425 switch (VA.getLocInfo()) {
2426 default:
2427 break;
2428 case CCValAssign::BCvt:
2429 MemVT = VA.getLocVT();
2430 break;
2431 case CCValAssign::SExt:
2432 ExtType = ISD::SEXTLOAD;
2433 break;
2434 case CCValAssign::ZExt:
2435 ExtType = ISD::ZEXTLOAD;
2436 break;
2437 case CCValAssign::AExt:
2438 ExtType = ISD::EXTLOAD;
2439 break;
2440 }
2441
2442 SDValue ArgValue = DAG.getExtLoad(
2443 ExtType, SL, VA.getLocVT(), Chain, FIN,
2445
2446 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2447 if (ConvertedVal == ArgValue)
2448 return ConvertedVal;
2449
2450 return DAG.getMergeValues({ConvertedVal, ArgValue.getValue(1)}, SL);
2451}
2452
2453SDValue SITargetLowering::lowerWorkGroupId(
2454 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2457 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
2458 if (!Subtarget->hasClusters())
2459 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2460
2461 // Clusters are supported. Return the global position in the grid. If clusters
2462 // are enabled, WorkGroupIdPV returns the cluster ID, not the workgroup ID.
2463
2464 // WorkGroupIdXYZ = ClusterId == 0 ?
2465 // ClusterIdXYZ :
2466 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
2467 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2468 SDLoc SL(ClusterIdXYZ);
2469 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2470 SDValue One = DAG.getConstant(1, SL, VT);
2471 SDValue ClusterSizeXYZ = DAG.getNode(ISD::ADD, SL, VT, ClusterMaxIdXYZ, One);
2472 SDValue ClusterWorkGroupIdXYZ =
2473 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2474 SDValue GlobalIdXYZ =
2475 DAG.getNode(ISD::ADD, SL, VT, ClusterWorkGroupIdXYZ,
2476 DAG.getNode(ISD::MUL, SL, VT, ClusterIdXYZ, ClusterSizeXYZ));
2477
2478 switch (MFI.getClusterDims().getKind()) {
2481 return GlobalIdXYZ;
2483 return ClusterIdXYZ;
2485 using namespace AMDGPU::Hwreg;
2486 SDValue ClusterIdField =
2487 DAG.getTargetConstant(HwregEncoding::encode(ID_IB_STS2, 6, 4), SL, VT);
2488 SDNode *GetReg =
2489 DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2490 SDValue ClusterId(GetReg, 0);
2491 SDValue Zero = DAG.getConstant(0, SL, VT);
2492 return DAG.getNode(ISD::SELECT_CC, SL, VT, ClusterId, Zero, ClusterIdXYZ,
2493 GlobalIdXYZ, DAG.getCondCode(ISD::SETEQ));
2494 }
2495 }
2496
2497 llvm_unreachable("nothing should reach here");
2498}
2499
2500SDValue SITargetLowering::getPreloadedValue(
2501 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2503 const ArgDescriptor *Reg = nullptr;
2504 const TargetRegisterClass *RC;
2505 LLT Ty;
2506
2508 const ArgDescriptor WorkGroupIDX =
2509 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2510 // If GridZ is not programmed in an entry function then the hardware will set
2511 // it to all zeros, so there is no need to mask the GridY value in the low
2512 // order bits.
2513 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2514 AMDGPU::TTMP7,
2515 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2516 const ArgDescriptor WorkGroupIDZ =
2517 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2518 const ArgDescriptor ClusterWorkGroupIDX =
2519 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
2520 const ArgDescriptor ClusterWorkGroupIDY =
2521 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
2522 const ArgDescriptor ClusterWorkGroupIDZ =
2523 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
2524 const ArgDescriptor ClusterWorkGroupMaxIDX =
2525 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
2526 const ArgDescriptor ClusterWorkGroupMaxIDY =
2527 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
2528 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2529 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
2530 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2531 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
2532
2533 auto LoadConstant = [&](unsigned N) {
2534 return DAG.getConstant(N, SDLoc(), VT);
2535 };
2536
2537 if (Subtarget->hasArchitectedSGPRs() &&
2539 AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims();
2540 bool HasFixedDims = ClusterDims.isFixedDims();
2541
2542 switch (PVID) {
2544 Reg = &WorkGroupIDX;
2545 RC = &AMDGPU::SReg_32RegClass;
2546 Ty = LLT::scalar(32);
2547 break;
2549 Reg = &WorkGroupIDY;
2550 RC = &AMDGPU::SReg_32RegClass;
2551 Ty = LLT::scalar(32);
2552 break;
2554 Reg = &WorkGroupIDZ;
2555 RC = &AMDGPU::SReg_32RegClass;
2556 Ty = LLT::scalar(32);
2557 break;
2559 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
2560 return LoadConstant(0);
2561 Reg = &ClusterWorkGroupIDX;
2562 RC = &AMDGPU::SReg_32RegClass;
2563 Ty = LLT::scalar(32);
2564 break;
2566 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
2567 return LoadConstant(0);
2568 Reg = &ClusterWorkGroupIDY;
2569 RC = &AMDGPU::SReg_32RegClass;
2570 Ty = LLT::scalar(32);
2571 break;
2573 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
2574 return LoadConstant(0);
2575 Reg = &ClusterWorkGroupIDZ;
2576 RC = &AMDGPU::SReg_32RegClass;
2577 Ty = LLT::scalar(32);
2578 break;
2580 if (HasFixedDims)
2581 return LoadConstant(ClusterDims.getDims()[0] - 1);
2582 Reg = &ClusterWorkGroupMaxIDX;
2583 RC = &AMDGPU::SReg_32RegClass;
2584 Ty = LLT::scalar(32);
2585 break;
2587 if (HasFixedDims)
2588 return LoadConstant(ClusterDims.getDims()[1] - 1);
2589 Reg = &ClusterWorkGroupMaxIDY;
2590 RC = &AMDGPU::SReg_32RegClass;
2591 Ty = LLT::scalar(32);
2592 break;
2594 if (HasFixedDims)
2595 return LoadConstant(ClusterDims.getDims()[2] - 1);
2596 Reg = &ClusterWorkGroupMaxIDZ;
2597 RC = &AMDGPU::SReg_32RegClass;
2598 Ty = LLT::scalar(32);
2599 break;
2601 Reg = &ClusterWorkGroupMaxFlatID;
2602 RC = &AMDGPU::SReg_32RegClass;
2603 Ty = LLT::scalar(32);
2604 break;
2605 default:
2606 break;
2607 }
2608 }
2609
2610 if (!Reg)
2611 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2612 if (!Reg) {
2614 // It's possible for a kernarg intrinsic call to appear in a kernel with
2615 // no allocated segment, in which case we do not add the user sgpr
2616 // argument, so just return null.
2617 return DAG.getConstant(0, SDLoc(), VT);
2618 }
2619
2620 // It's undefined behavior if a function marked with the amdgpu-no-*
2621 // attributes uses the corresponding intrinsic.
2622 return DAG.getPOISON(VT);
2623 }
2624
2625 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2626}
2627
2629 CallingConv::ID CallConv,
2630 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2631 FunctionType *FType,
2633 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2634 const ISD::InputArg *Arg = &Ins[I];
2635
2636 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2637 "vector type argument should have been split");
2638
2639 // First check if it's a PS input addr.
2640 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2641 PSInputNum <= 15) {
2642 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2643
2644 // Inconveniently only the first part of the split is marked as isSplit,
2645 // so skip to the end. We only want to increment PSInputNum once for the
2646 // entire split argument.
2647 if (Arg->Flags.isSplit()) {
2648 while (!Arg->Flags.isSplitEnd()) {
2649 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2650 "unexpected vector split in ps argument type");
2651 if (!SkipArg)
2652 Splits.push_back(*Arg);
2653 Arg = &Ins[++I];
2654 }
2655 }
2656
2657 if (SkipArg) {
2658 // We can safely skip PS inputs.
2659 Skipped.set(Arg->getOrigArgIndex());
2660 ++PSInputNum;
2661 continue;
2662 }
2663
2664 Info->markPSInputAllocated(PSInputNum);
2665 if (Arg->Used)
2666 Info->markPSInputEnabled(PSInputNum);
2667
2668 ++PSInputNum;
2669 }
2670
2671 Splits.push_back(*Arg);
2672 }
2673}
2674
2675// Allocate special inputs passed in VGPRs.
2677 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2678 SIMachineFunctionInfo &Info) const {
2679 const LLT S32 = LLT::scalar(32);
2681
2682 if (Info.hasWorkItemIDX()) {
2683 Register Reg = AMDGPU::VGPR0;
2684 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2685
2686 CCInfo.AllocateReg(Reg);
2687 unsigned Mask =
2688 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2689 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2690 }
2691
2692 if (Info.hasWorkItemIDY()) {
2693 assert(Info.hasWorkItemIDX());
2694 if (Subtarget->hasPackedTID()) {
2695 Info.setWorkItemIDY(
2696 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2697 } else {
2698 unsigned Reg = AMDGPU::VGPR1;
2699 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2700
2701 CCInfo.AllocateReg(Reg);
2702 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2703 }
2704 }
2705
2706 if (Info.hasWorkItemIDZ()) {
2707 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2708 if (Subtarget->hasPackedTID()) {
2709 Info.setWorkItemIDZ(
2710 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2711 } else {
2712 unsigned Reg = AMDGPU::VGPR2;
2713 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2714
2715 CCInfo.AllocateReg(Reg);
2716 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2717 }
2718 }
2719}
2720
2721 // Try to allocate a VGPR at the end of the argument list, or if no argument
2722 // VGPRs are left, allocate a stack slot.
2723 // If \p Mask is given, it indicates the bitfield position in the register.
2724 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2725static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2726 ArgDescriptor Arg = ArgDescriptor()) {
2727 if (Arg.isSet())
2728 return ArgDescriptor::createArg(Arg, Mask);
2729
2730 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2731 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2732 if (RegIdx == ArgVGPRs.size()) {
2733 // Spill to stack required.
2734 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2735
2736 return ArgDescriptor::createStack(Offset, Mask);
2737 }
2738
2739 unsigned Reg = ArgVGPRs[RegIdx];
2740 Reg = CCInfo.AllocateReg(Reg);
2741 assert(Reg != AMDGPU::NoRegister);
2742
2743 MachineFunction &MF = CCInfo.getMachineFunction();
2744 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2745 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2746 return ArgDescriptor::createRegister(Reg, Mask);
2747}
2748
2750 const TargetRegisterClass *RC,
2751 unsigned NumArgRegs) {
2752 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2753 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2754 if (RegIdx == ArgSGPRs.size())
2755 report_fatal_error("ran out of SGPRs for arguments");
2756
2757 unsigned Reg = ArgSGPRs[RegIdx];
2758 Reg = CCInfo.AllocateReg(Reg);
2759 assert(Reg != AMDGPU::NoRegister);
2760
2761 MachineFunction &MF = CCInfo.getMachineFunction();
2762 MF.addLiveIn(Reg, RC);
2764}
2765
2766 // If this has a fixed position, we should still allocate the register in the
2767// CCInfo state. Technically we could get away with this for values passed
2768// outside of the normal argument range.
2770 const TargetRegisterClass *RC,
2771 MCRegister Reg) {
2772 Reg = CCInfo.AllocateReg(Reg);
2773 assert(Reg != AMDGPU::NoRegister);
2774 MachineFunction &MF = CCInfo.getMachineFunction();
2775 MF.addLiveIn(Reg, RC);
2776}
2777
2778static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2779 if (Arg) {
2780 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2781 Arg.getRegister());
2782 } else
2783 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2784}
2785
2786static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2787 if (Arg) {
2788 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2789 Arg.getRegister());
2790 } else
2791 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2792}
2793
2794/// Allocate implicit function VGPR arguments at the end of allocated user
2795/// arguments.
2797 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2798 SIMachineFunctionInfo &Info) const {
2799 const unsigned Mask = 0x3ff;
2800 ArgDescriptor Arg;
2801
2802 if (Info.hasWorkItemIDX()) {
2803 Arg = allocateVGPR32Input(CCInfo, Mask);
2804 Info.setWorkItemIDX(Arg);
2805 }
2806
2807 if (Info.hasWorkItemIDY()) {
2808 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2809 Info.setWorkItemIDY(Arg);
2810 }
2811
2812 if (Info.hasWorkItemIDZ())
2813 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2814}
2815
2816/// Allocate implicit function VGPR arguments in fixed registers.
2818 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2819 SIMachineFunctionInfo &Info) const {
2820 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2821 if (!Reg)
2822 report_fatal_error("failed to allocate VGPR for implicit arguments");
2823
2824 const unsigned Mask = 0x3ff;
2825 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2826 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2827 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2828}
2829
2831 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2832 SIMachineFunctionInfo &Info) const {
2833 auto &ArgInfo = Info.getArgInfo();
2834 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2835
2836 // TODO: Unify handling with private memory pointers.
2837 if (UserSGPRInfo.hasDispatchPtr())
2838 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2839
2840 if (UserSGPRInfo.hasQueuePtr())
2841 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2842
2843 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2844 // constant offset from the kernarg segment.
2845 if (Info.hasImplicitArgPtr())
2846 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2847
2848 if (UserSGPRInfo.hasDispatchID())
2849 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2850
2851 // flat_scratch_init is not applicable for non-kernel functions.
2852
2853 if (Info.hasWorkGroupIDX())
2854 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2855
2856 if (Info.hasWorkGroupIDY())
2857 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2858
2859 if (Info.hasWorkGroupIDZ())
2860 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2861
2862 if (Info.hasLDSKernelId())
2863 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2864}
2865
2866// Allocate special inputs passed in user SGPRs.
2868 MachineFunction &MF,
2869 const SIRegisterInfo &TRI,
2870 SIMachineFunctionInfo &Info) const {
2871 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2872 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2873 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2874 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2875 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2876 }
2877
2878 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2879 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2880 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2881 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2882 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2883 }
2884
2885 if (UserSGPRInfo.hasDispatchPtr()) {
2886 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2887 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2888 CCInfo.AllocateReg(DispatchPtrReg);
2889 }
2890
2891 if (UserSGPRInfo.hasQueuePtr()) {
2892 Register QueuePtrReg = Info.addQueuePtr(TRI);
2893 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2894 CCInfo.AllocateReg(QueuePtrReg);
2895 }
2896
2897 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2899 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2900 CCInfo.AllocateReg(InputPtrReg);
2901
2902 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2903 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2904 }
2905
2906 if (UserSGPRInfo.hasDispatchID()) {
2907 Register DispatchIDReg = Info.addDispatchID(TRI);
2908 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2909 CCInfo.AllocateReg(DispatchIDReg);
2910 }
2911
2912 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2913 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2914 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2915 CCInfo.AllocateReg(FlatScratchInitReg);
2916 }
2917
2918 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2919 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2920 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2921 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2922 }
2923
2924 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2925 // these from the dispatch pointer.
2926}
2927
2928 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2929// sequential starting from the first argument.
2931 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2933 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2934 Function &F = MF.getFunction();
2935 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2936 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2937 bool InPreloadSequence = true;
2938 unsigned InIdx = 0;
2939 bool AlignedForImplictArgs = false;
2940 unsigned ImplicitArgOffset = 0;
2941 for (auto &Arg : F.args()) {
2942 if (!InPreloadSequence || !Arg.hasInRegAttr())
2943 break;
2944
2945 unsigned ArgIdx = Arg.getArgNo();
2946 // Don't preload non-original args or parts not in the current preload
2947 // sequence.
2948 if (InIdx < Ins.size() &&
2949 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2950 break;
2951
2952 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2953 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2954 InIdx++) {
2955 assert(ArgLocs[ArgIdx].isMemLoc());
2956 auto &ArgLoc = ArgLocs[InIdx];
2957 const Align KernelArgBaseAlign = Align(16);
2958 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2959 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2960 unsigned NumAllocSGPRs =
2961 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2962
2963 // Fix alignment for hidden arguments.
2964 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2965 if (!AlignedForImplictArgs) {
2966 ImplicitArgOffset =
2967 alignTo(LastExplicitArgOffset,
2968 Subtarget->getAlignmentForImplicitArgPtr()) -
2969 LastExplicitArgOffset;
2970 AlignedForImplictArgs = true;
2971 }
2972 ArgOffset += ImplicitArgOffset;
2973 }
2974
2975 // Arg is preloaded into the previous SGPR.
2976 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2977 assert(InIdx >= 1 && "No previous SGPR");
2978 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2979 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2980 continue;
2981 }
2982
2983 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2984 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2985 // Check for free user SGPRs for preloading.
2986 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2987 InPreloadSequence = false;
2988 break;
2989 }
2990
2991 // Preload this argument.
2992 const TargetRegisterClass *RC =
2993 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2994 SmallVectorImpl<MCRegister> *PreloadRegs =
2995 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2996
2997 if (PreloadRegs->size() > 1)
2998 RC = &AMDGPU::SGPR_32RegClass;
2999 for (auto &Reg : *PreloadRegs) {
3000 assert(Reg);
3001 MF.addLiveIn(Reg, RC);
3002 CCInfo.AllocateReg(Reg);
3003 }
3004
3005 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3006 }
3007 }
3008}
3009
3011 const SIRegisterInfo &TRI,
3012 SIMachineFunctionInfo &Info) const {
3013 // Always allocate this last since it is a synthetic preload.
3014 if (Info.hasLDSKernelId()) {
3015 Register Reg = Info.addLDSKernelId();
3016 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3017 CCInfo.AllocateReg(Reg);
3018 }
3019}
3020
3021// Allocate special input registers that are initialized per-wave.
3024 CallingConv::ID CallConv,
3025 bool IsShader) const {
3026 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3027 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
3028 // Note: user SGPRs are handled by the front-end for graphics shaders
3029 // Pad up the used user SGPRs with dead inputs.
3030
3031 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
3032 // before enabling architected SGPRs for workgroup IDs.
3033 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
3034
3035 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3036 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
3037 // rely on it to reach 16 since if we end up having no stack usage, it will
3038 // not really be added.
3039 unsigned NumRequiredSystemSGPRs =
3040 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3041 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3042 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3043 Register Reg = Info.addReservedUserSGPR();
3044 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3045 CCInfo.AllocateReg(Reg);
3046 }
3047 }
3048
3049 if (!HasArchitectedSGPRs) {
3050 if (Info.hasWorkGroupIDX()) {
3051 Register Reg = Info.addWorkGroupIDX();
3052 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3053 CCInfo.AllocateReg(Reg);
3054 }
3055
3056 if (Info.hasWorkGroupIDY()) {
3057 Register Reg = Info.addWorkGroupIDY();
3058 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3059 CCInfo.AllocateReg(Reg);
3060 }
3061
3062 if (Info.hasWorkGroupIDZ()) {
3063 Register Reg = Info.addWorkGroupIDZ();
3064 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3065 CCInfo.AllocateReg(Reg);
3066 }
3067 }
3068
3069 if (Info.hasWorkGroupInfo()) {
3070 Register Reg = Info.addWorkGroupInfo();
3071 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3072 CCInfo.AllocateReg(Reg);
3073 }
3074
3075 if (Info.hasPrivateSegmentWaveByteOffset()) {
3076 // Scratch wave offset passed in system SGPR.
3077 unsigned PrivateSegmentWaveByteOffsetReg;
3078
3079 if (IsShader) {
3080 PrivateSegmentWaveByteOffsetReg =
3081 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3082
3083 // This is true if the scratch wave byte offset doesn't have a fixed
3084 // location.
3085 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3086 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
3087 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3088 }
3089 } else
3090 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3091
3092 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3093 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
3094 }
3095
3096 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
3097 Info.getNumPreloadedSGPRs() >= 16);
3098}
3099
3101 MachineFunction &MF,
3102 const SIRegisterInfo &TRI,
3104 // Now that we've figured out where the scratch register inputs are, see if
3105 // we should reserve the arguments and use them directly.
3106 MachineFrameInfo &MFI = MF.getFrameInfo();
3107 bool HasStackObjects = MFI.hasStackObjects();
3108 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3109
3110 // Record that we know we have non-spill stack objects so we don't need to
3111 // check all stack objects later.
3112 if (HasStackObjects)
3113 Info.setHasNonSpillStackObjects(true);
3114
3115 // Everything live out of a block is spilled with fast regalloc, so it's
3116 // almost certain that spilling will be required.
3117 if (TM.getOptLevel() == CodeGenOptLevel::None)
3118 HasStackObjects = true;
3119
3120 // For now assume stack access is needed in any callee functions, so we need
3121 // the scratch registers to pass in.
3122 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
3123
3124 if (!ST.enableFlatScratch()) {
3125 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
3126 // If we have stack objects, we unquestionably need the private buffer
3127 // resource. For the Code Object V2 ABI, this will be the first 4 user
3128 // SGPR inputs. We can reserve those and use them directly.
3129
3130 Register PrivateSegmentBufferReg =
3132 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3133 } else {
3134 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
3135 // We tentatively reserve the last registers (skipping the last registers
3136 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
3137 // we'll replace these with the ones immediately after those which were
3138 // really allocated. In the prologue, copies will be inserted from the
3139 // argument to these reserved registers.
3140
3141 // Without HSA, relocations are used for the scratch pointer and the
3142 // buffer resource setup is always inserted in the prologue. Scratch wave
3143 // offset is still in an input SGPR.
3144 Info.setScratchRSrcReg(ReservedBufferReg);
3145 }
3146 }
3147
3149
3150 // For entry functions we have to set up the stack pointer if we use it,
3151 // whereas non-entry functions get this "for free". This means there is no
3152 // intrinsic advantage to using S32 over S34 in cases where we do not have
3153 // calls but do need a frame pointer (i.e. if we are requested to have one
3154 // because frame pointer elimination is disabled). To keep things simple we
3155 // only ever use S32 as the call ABI stack pointer, and so using it does not
3156 // imply we need a separate frame pointer.
3157 //
3158 // Try to use s32 as the SP, but move it if it would interfere with input
3159 // arguments. This won't work with calls though.
3160 //
3161 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
3162 // registers.
3163 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
3164 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3165 } else {
3167
3168 if (MFI.hasCalls())
3169 report_fatal_error("call in graphics shader with too many input SGPRs");
3170
3171 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3172 if (!MRI.isLiveIn(Reg)) {
3173 Info.setStackPtrOffsetReg(Reg);
3174 break;
3175 }
3176 }
3177
3178 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3179 report_fatal_error("failed to find register for SP");
3180 }
3181
3182 // hasFP should be accurate for entry functions even before the frame is
3183 // finalized, because it does not rely on the known stack size, only
3184 // properties like whether variable sized objects are present.
3185 if (ST.getFrameLowering()->hasFP(MF)) {
3186 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3187 }
3188}
3189
3192 return !Info->isEntryFunction();
3193}
3194
3196
3198 MachineBasicBlock *Entry,
3199 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
3201
3202 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3203 if (!IStart)
3204 return;
3205
3206 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3207 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
3208 MachineBasicBlock::iterator MBBI = Entry->begin();
3209 for (const MCPhysReg *I = IStart; *I; ++I) {
3210 const TargetRegisterClass *RC = nullptr;
3211 if (AMDGPU::SReg_64RegClass.contains(*I))
3212 RC = &AMDGPU::SGPR_64RegClass;
3213 else if (AMDGPU::SReg_32RegClass.contains(*I))
3214 RC = &AMDGPU::SGPR_32RegClass;
3215 else
3216 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3217
3218 Register NewVR = MRI->createVirtualRegister(RC);
3219 // Create copy from CSR to a virtual register.
3220 Entry->addLiveIn(*I);
3221 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
3222 .addReg(*I);
3223
3224 // Insert the copy-back instructions right before the terminator.
3225 for (auto *Exit : Exits)
3226 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
3227 TII->get(TargetOpcode::COPY), *I)
3228 .addReg(NewVR);
3229 }
3230}
3231
3233 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3234 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3235 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3237
3239 const Function &Fn = MF.getFunction();
3242 bool IsError = false;
3243
3244 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
3246 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3247 IsError = true;
3248 }
3249
3252 BitVector Skipped(Ins.size());
3253 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3254 *DAG.getContext());
3255
3256 bool IsGraphics = AMDGPU::isGraphics(CallConv);
3257 bool IsKernel = AMDGPU::isKernel(CallConv);
3258 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
3259
3260 if (IsGraphics) {
3261 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3262 assert(!UserSGPRInfo.hasDispatchPtr() &&
3263 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3264 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3265 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3266 (void)UserSGPRInfo;
3267 if (!Subtarget->enableFlatScratch())
3268 assert(!UserSGPRInfo.hasFlatScratchInit());
3269 if ((CallConv != CallingConv::AMDGPU_CS &&
3270 CallConv != CallingConv::AMDGPU_Gfx &&
3271 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3272 !Subtarget->hasArchitectedSGPRs())
3273 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3274 !Info->hasWorkGroupIDZ());
3275 }
3276
3277 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3278
3279 if (CallConv == CallingConv::AMDGPU_PS) {
3280 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3281
3282 // At least one interpolation mode must be enabled or else the GPU will
3283 // hang.
3284 //
3285 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3286 // set PSInputAddr, the user wants to enable some bits after the compilation
3287 // based on run-time states. Since we can't know what the final PSInputEna
3288 // will look like, we shouldn't do anything here, and the user should take
3289 // responsibility for the correct programming.
3290 //
3291 // Otherwise, the following restrictions apply:
3292 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3293 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3294 // enabled too.
3295 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3296 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3297 CCInfo.AllocateReg(AMDGPU::VGPR0);
3298 CCInfo.AllocateReg(AMDGPU::VGPR1);
3299 Info->markPSInputAllocated(0);
3300 Info->markPSInputEnabled(0);
3301 }
3302 if (Subtarget->isAmdPalOS()) {
3303 // For isAmdPalOS, the user does not enable some bits after compilation
3304 // based on run-time states; the register values being generated here are
3305 // the final ones set in hardware. Therefore we need to apply the
3306 // workaround to PSInputAddr and PSInputEnable together. (The case where
3307 // a bit is set in PSInputAddr but not PSInputEnable is where the
3308 // frontend set up an input arg for a particular interpolation mode, but
3309 // nothing uses that input arg. Really we should have an earlier pass
3310 // that removes such an arg.)
3311 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3312 if ((PsInputBits & 0x7F) == 0 ||
3313 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3314 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
3315 }
3316 } else if (IsKernel) {
3317 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3318 } else {
3319 Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3320 Ins.end());
3321 }
3322
3323 if (IsKernel)
3324 analyzeFormalArgumentsCompute(CCInfo, Ins);
3325
3326 if (IsEntryFunc) {
3327 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
3328 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
3329 if (IsKernel && Subtarget->hasKernargPreload())
3330 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
3331
3332 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
3333 } else if (!IsGraphics) {
3334 // For the fixed ABI, pass workitem IDs in the last argument register.
3335 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3336
3337 // FIXME: Sink this into allocateSpecialInputSGPRs
3338 if (!Subtarget->enableFlatScratch())
3339 CCInfo.AllocateReg(Info->getScratchRSrcReg());
3340
3341 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3342 }
3343
3344 if (!IsKernel) {
3345 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3346 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3347
3348 // This assumes the registers are allocated by CCInfo in ascending order
3349 // with no gaps.
3350 Info->setNumWaveDispatchSGPRs(
3351 CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3352 Info->setNumWaveDispatchVGPRs(
3353 CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3354 } else if (Info->getNumKernargPreloadedSGPRs()) {
3355 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3356 }
3357
3359
3360 if (IsWholeWaveFunc) {
3362 {MVT::i1, MVT::Other}, Chain);
3363 InVals.push_back(Setup.getValue(0));
3364 Chains.push_back(Setup.getValue(1));
3365 }
3366
3367 // FIXME: This is the minimum kernel argument alignment. We should improve
3368 // this to the maximum alignment of the arguments.
3369 //
3370 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3371 // kern arg offset.
3372 const Align KernelArgBaseAlign = Align(16);
3373
3374 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3375 ++i) {
3376 const ISD::InputArg &Arg = Ins[i];
3377 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3378 InVals.push_back(DAG.getPOISON(Arg.VT));
3379 continue;
3380 }
3381
3382 CCValAssign &VA = ArgLocs[ArgIdx++];
3383 MVT VT = VA.getLocVT();
3384
3385 if (IsEntryFunc && VA.isMemLoc()) {
3386 VT = Ins[i].VT;
3387 EVT MemVT = VA.getLocVT();
3388
3389 const uint64_t Offset = VA.getLocMemOffset();
3390 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
3391
3392 if (Arg.Flags.isByRef()) {
3393 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3394
3395 const GCNTargetMachine &TM =
3396 static_cast<const GCNTargetMachine &>(getTargetMachine());
3397 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3398 Arg.Flags.getPointerAddrSpace())) {
3401 }
3402
3403 InVals.push_back(Ptr);
3404 continue;
3405 }
3406
3407 SDValue NewArg;
3408 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3409 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3410 // In this case the argument is packed into the previous preload SGPR.
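// For example, an i16 argument at kernarg offset 6 shares the SGPR that holds
// offsets 4-7: AlignDownOffset = 4 and OffsetDiff = 2, so the value is
// recovered by shifting the 32-bit copy right by 16 bits and truncating.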
3411 int64_t AlignDownOffset = alignDown(Offset, 4);
3412 int64_t OffsetDiff = Offset - AlignDownOffset;
3413 EVT IntVT = MemVT.changeTypeToInteger();
3414
3415 const SIMachineFunctionInfo *Info =
3418 Register Reg =
3419 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3420
3421 assert(Reg);
3422 Register VReg = MRI.getLiveInVirtReg(Reg);
3423 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3424
3425 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3426 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3427
3428 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3429 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3430 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3431 Ins[i].Flags.isSExt(), &Ins[i]);
3432
3433 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3434 } else {
3435 const SIMachineFunctionInfo *Info =
3438 const SmallVectorImpl<MCRegister> &PreloadRegs =
3439 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3440
3441 SDValue Copy;
3442 if (PreloadRegs.size() == 1) {
3443 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3444 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3445 NewArg = DAG.getCopyFromReg(
3446 Chain, DL, VReg,
3448 TRI->getRegSizeInBits(*RC)));
3449
3450 } else {
3451 // If the kernarg alignment does not match the alignment of the SGPR
3452 // tuple RC that can accommodate this argument, it will be built up
3453 // via copies from the individual SGPRs that the argument was
3454 // preloaded to.
3456 for (auto Reg : PreloadRegs) {
3457 Register VReg = MRI.getLiveInVirtReg(Reg);
3458 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3459 Elts.push_back(Copy);
3460 }
3461 NewArg =
3462 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3463 PreloadRegs.size()),
3464 DL, Elts);
3465 }
3466
3467 // If the argument was preloaded to multiple consecutive 32-bit
3468 // registers because of misalignment between addressable SGPR tuples
3469 // and the argument size, we can still assume, because of kernarg
3470 // segment alignment restrictions, that NewArg's size is the same as
3471 // MemVT's and just do a bitcast. If MemVT is less than 32 bits, we add a
3472 // truncate since we cannot preload to less than a single SGPR and the
3473 // MemVT may be smaller.
3474 EVT MemVTInt =
3476 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3477 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3478
3479 NewArg = DAG.getBitcast(MemVT, NewArg);
3480 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3481 Ins[i].Flags.isSExt(), &Ins[i]);
3482 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3483 }
3484 } else {
3485 // Hidden arguments that are in the kernel signature must be preloaded
3486 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3487 // the argument list and is not preloaded.
3488 if (Arg.isOrigArg()) {
3489 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3490 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3492 *OrigArg->getParent(),
3493 "hidden argument in kernel signature was not preloaded",
3494 DL.getDebugLoc()));
3495 }
3496 }
3497
3498 NewArg =
3499 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3500 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3501 }
3502 Chains.push_back(NewArg.getValue(1));
3503
3504 auto *ParamTy =
3505 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3506 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3507 ParamTy &&
3508 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3509 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3510 // On SI, local pointers are just offsets into LDS, so they always fit in
3511 // 16 bits. On CI and newer they could potentially be
3512 // real pointers, so we can't guarantee their size.
3513 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3514 DAG.getValueType(MVT::i16));
3515 }
3516
3517 InVals.push_back(NewArg);
3518 continue;
3519 }
3520 if (!IsEntryFunc && VA.isMemLoc()) {
3521 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3522 InVals.push_back(Val);
3523 if (!Arg.Flags.isByVal())
3524 Chains.push_back(Val.getValue(1));
3525 continue;
3526 }
3527
3528 assert(VA.isRegLoc() && "Parameter must be in a register!");
3529
3530 Register Reg = VA.getLocReg();
3531 const TargetRegisterClass *RC = nullptr;
3532 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3533 RC = &AMDGPU::VGPR_32RegClass;
3534 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3535 RC = &AMDGPU::SGPR_32RegClass;
3536 else
3537 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3538
3539 Reg = MF.addLiveIn(Reg, RC);
3540 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3541
3542 if (Arg.Flags.isSRet()) {
3543 // The return object should be reasonably addressable.
3544
3545 // FIXME: This helps when the return is a real sret. If it is an
3546 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3547 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3548 unsigned NumBits =
3550 Val = DAG.getNode(
3551 ISD::AssertZext, DL, VT, Val,
3552 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3553 }
3554
3555 Val = convertABITypeToValueType(DAG, Val, VA, DL);
3556 InVals.push_back(Val);
3557 }
3558
3559 // Start adding system SGPRs.
3560 if (IsEntryFunc)
3561 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3562
3563 // DAG.getPass() returns nullptr when using new pass manager.
3564 // TODO: Use DAG.getMFAM() to access analysis result.
3565 if (DAG.getPass()) {
3566 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3567 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3568 }
3569
3570 unsigned StackArgSize = CCInfo.getStackSize();
3571 Info->setBytesInStackArgArea(StackArgSize);
3572
3573 return Chains.empty() ? Chain
3574 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3575}
3576
3577// TODO: If return values can't fit in registers, we should return as many as
3578// possible in registers before passing on stack.
3580 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3581 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3582 const Type *RetTy) const {
3583 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3584 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3585 // for shaders. Vector types should be explicitly handled by CC.
3586 if (AMDGPU::isEntryFunctionCC(CallConv))
3587 return true;
3588
3590 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3591 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3592 return false;
3593
3594 // We must use the stack if return would require unavailable registers.
3595 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3596 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3597 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3598 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3599 return false;
3600
3601 return true;
3602}
3603
3604SDValue
3606 bool isVarArg,
3608 const SmallVectorImpl<SDValue> &OutVals,
3609 const SDLoc &DL, SelectionDAG &DAG) const {
3613
3614 if (AMDGPU::isKernel(CallConv)) {
3615 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3616 OutVals, DL, DAG);
3617 }
3618
3619 bool IsShader = AMDGPU::isShader(CallConv);
3620
3621 Info->setIfReturnsVoid(Outs.empty());
3622 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3623
3624 // CCValAssign - represents the assignment of the return value to a location.
3626
3627 // CCState - Info about the registers and stack slots.
3628 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3629 *DAG.getContext());
3630
3631 // Analyze outgoing return values.
3632 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3633
3634 SDValue Glue;
3636 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3637
3638 SDValue ReadFirstLane =
3639 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3640 // Copy the result values into the output registers.
3641 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3642 ++I, ++RealRVLocIdx) {
3643 CCValAssign &VA = RVLocs[I];
3644 assert(VA.isRegLoc() && "Can only return in registers!");
3645 // TODO: Partially return in registers if return values don't fit.
3646 SDValue Arg = OutVals[RealRVLocIdx];
3647
3648 // Copied from other backends.
3649 switch (VA.getLocInfo()) {
3650 case CCValAssign::Full:
3651 break;
3652 case CCValAssign::BCvt:
3653 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3654 break;
3655 case CCValAssign::SExt:
3656 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3657 break;
3658 case CCValAssign::ZExt:
3659 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3660 break;
3661 case CCValAssign::AExt:
3662 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3663 break;
3664 default:
3665 llvm_unreachable("Unknown loc info!");
3666 }
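// Values returned in SGPRs must be wave-uniform; speculatively insert a
// readfirstlane in case the value was actually produced in a VGPR.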
3667 if (TRI->isSGPRPhysReg(VA.getLocReg()))
3669 ReadFirstLane, Arg);
3670 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3671 Glue = Chain.getValue(1);
3672 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3673 }
3674
3675 // FIXME: Does sret work properly?
3676 if (!Info->isEntryFunction()) {
3677 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3678 const MCPhysReg *I =
3679 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3680 if (I) {
3681 for (; *I; ++I) {
3682 if (AMDGPU::SReg_64RegClass.contains(*I))
3683 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3684 else if (AMDGPU::SReg_32RegClass.contains(*I))
3685 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3686 else
3687 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3688 }
3689 }
3690 }
3691
3692 // Update chain and glue.
3693 RetOps[0] = Chain;
3694 if (Glue.getNode())
3695 RetOps.push_back(Glue);
3696
3697 unsigned Opc = AMDGPUISD::ENDPGM;
3698 if (!IsWaveEnd)
3699 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3700 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3702 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3703}
3704
3706 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3707 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3708 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3709 SDValue ThisVal) const {
3710 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3711
3712 // Assign locations to each value returned by this call.
3714 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3715 *DAG.getContext());
3716 CCInfo.AnalyzeCallResult(Ins, RetCC);
3717
3718 // Copy all of the result registers out of their specified physreg.
3719 for (CCValAssign VA : RVLocs) {
3720 SDValue Val;
3721
3722 if (VA.isRegLoc()) {
3723 Val =
3724 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3725 Chain = Val.getValue(1);
3726 InGlue = Val.getValue(2);
3727 } else if (VA.isMemLoc()) {
3728 report_fatal_error("TODO: return values in memory");
3729 } else
3730 llvm_unreachable("unknown argument location type");
3731
3732 switch (VA.getLocInfo()) {
3733 case CCValAssign::Full:
3734 break;
3735 case CCValAssign::BCvt:
3736 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3737 break;
3738 case CCValAssign::ZExt:
3739 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3740 DAG.getValueType(VA.getValVT()));
3741 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3742 break;
3743 case CCValAssign::SExt:
3744 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3745 DAG.getValueType(VA.getValVT()));
3746 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3747 break;
3748 case CCValAssign::AExt:
3749 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3750 break;
3751 default:
3752 llvm_unreachable("Unknown loc info!");
3753 }
3754
3755 InVals.push_back(Val);
3756 }
3757
3758 return Chain;
3759}
3760
3761// Add code to pass the special inputs required by the features in use,
3762// separate from the explicit user arguments present in the IR.
3764 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3765 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3766 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3767 // If we don't have a call site, this was a call inserted by
3768 // legalization. These can never use special inputs.
3769 if (!CLI.CB)
3770 return;
3771
3772 SelectionDAG &DAG = CLI.DAG;
3773 const SDLoc &DL = CLI.DL;
3774 const Function &F = DAG.getMachineFunction().getFunction();
3775
3776 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3777 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3778
3779 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3781 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3782 // DAG.getPass() returns nullptr when using new pass manager.
3783 // TODO: Use DAG.getMFAM() to access analysis result.
3784 if (DAG.getPass()) {
3785 auto &ArgUsageInfo =
3787 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3788 }
3789 }
3790
3791 // TODO: Unify with private memory register handling. This is complicated by
3792 // the fact that at least in kernels, the input argument is not necessarily
3793 // in the same location as the input.
3794 // clang-format off
3795 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3797 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3798 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3799 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3800 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3801 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3802 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y"},
3803 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z"},
3804 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, "amdgpu-no-lds-kernel-id"},
3805 };
3806 // clang-format on
3807
3808 for (auto [InputID, Attr] : ImplicitAttrs) {
3809 // If the callee does not use the attribute value, skip copying the value.
3810 if (CLI.CB->hasFnAttr(Attr))
3811 continue;
3812
3813 const auto [OutgoingArg, ArgRC, ArgTy] =
3814 CalleeArgInfo->getPreloadedValue(InputID);
3815 if (!OutgoingArg)
3816 continue;
3817
3818 const auto [IncomingArg, IncomingArgRC, Ty] =
3819 CallerArgInfo.getPreloadedValue(InputID);
3820 assert(IncomingArgRC == ArgRC);
3821
3822 // All special arguments are ints for now.
3823 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3824 SDValue InputReg;
3825
3826 if (IncomingArg) {
3827 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3828 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3829 // The implicit arg ptr is special because it doesn't have a corresponding
3830 // input for kernels, and is computed from the kernarg segment pointer.
3831 InputReg = getImplicitArgPtr(DAG, DL);
3832 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3833 std::optional<uint32_t> Id =
3835 if (Id.has_value()) {
3836 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3837 } else {
3838 InputReg = DAG.getPOISON(ArgVT);
3839 }
3840 } else {
3841 // We may have proven the input wasn't needed, even though the ABI
3842 // requires it. We just need to allocate the register appropriately.
3843 InputReg = DAG.getPOISON(ArgVT);
3844 }
3845
3846 if (OutgoingArg->isRegister()) {
3847 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3848 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3849 report_fatal_error("failed to allocate implicit input argument");
3850 } else {
3851 unsigned SpecialArgOffset =
3852 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3853 SDValue ArgStore =
3854 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3855 MemOpChains.push_back(ArgStore);
3856 }
3857 }
3858
3859 // Pack workitem IDs into a single register, or pass them as-is if they are
3860 // already packed.
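// The packed value is X | (Y << 10) | (Z << 20), i.e. 10 bits per workitem ID,
// as produced by the shifts below.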
3861
3862 auto [OutgoingArg, ArgRC, Ty] =
3864 if (!OutgoingArg)
3865 std::tie(OutgoingArg, ArgRC, Ty) =
3867 if (!OutgoingArg)
3868 std::tie(OutgoingArg, ArgRC, Ty) =
3870 if (!OutgoingArg)
3871 return;
3872
3873 const ArgDescriptor *IncomingArgX = std::get<0>(
3875 const ArgDescriptor *IncomingArgY = std::get<0>(
3877 const ArgDescriptor *IncomingArgZ = std::get<0>(
3879
3880 SDValue InputReg;
3881 SDLoc SL;
3882
3883 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3884 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3885 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3886
3887 // If the incoming IDs are not packed, we need to pack them.
3888 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3889 NeedWorkItemIDX) {
3890 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3891 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3892 } else {
3893 InputReg = DAG.getConstant(0, DL, MVT::i32);
3894 }
3895 }
3896
3897 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3898 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3899 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3900 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3901 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3902 InputReg = InputReg.getNode()
3903 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3904 : Y;
3905 }
3906
3907 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3908 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3909 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3910 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3911 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3912 InputReg = InputReg.getNode()
3913 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3914 : Z;
3915 }
3916
3917 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3918 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3919 // We're in a situation where the outgoing function requires the workitem
3920 // ID, but the calling function does not have it (e.g. a graphics function
3921 // calling a C calling convention function). This is illegal, but we need
3922 // to produce something.
3923 InputReg = DAG.getPOISON(MVT::i32);
3924 } else {
3925 // Workitem IDs are already packed; any of the present incoming arguments
3926 // will carry all the required fields.
3927 ArgDescriptor IncomingArg =
3928 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3929 : IncomingArgY ? *IncomingArgY
3930 : *IncomingArgZ,
3931 ~0u);
3932 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3933 }
3934 }
3935
3936 if (OutgoingArg->isRegister()) {
3937 if (InputReg)
3938 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3939
3940 CCInfo.AllocateReg(OutgoingArg->getRegister());
3941 } else {
3942 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3943 if (InputReg) {
3944 SDValue ArgStore =
3945 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3946 MemOpChains.push_back(ArgStore);
3947 }
3948 }
3949}
3950
3952 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3954 const SmallVectorImpl<SDValue> &OutVals,
3955 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3956 if (AMDGPU::isChainCC(CalleeCC))
3957 return true;
3958
3959 if (!AMDGPU::mayTailCallThisCC(CalleeCC))
3960 return false;
3961
3962 // For a divergent call target, we need to do a waterfall loop over the
3963 // possible callees which precludes us from using a simple jump.
3964 if (Callee->isDivergent())
3965 return false;
3966
3968 const Function &CallerF = MF.getFunction();
3969 CallingConv::ID CallerCC = CallerF.getCallingConv();
3971 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3972
3973 // Kernels aren't callable, and don't have a live-in return address, so it
3974 // doesn't make sense to do a tail call with entry functions.
3975 if (!CallerPreserved)
3976 return false;
3977
3978 bool CCMatch = CallerCC == CalleeCC;
3979
3981 if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
3982 return true;
3983 return false;
3984 }
3985
3986 // TODO: Can we handle var args?
3987 if (IsVarArg)
3988 return false;
3989
3990 for (const Argument &Arg : CallerF.args()) {
3991 if (Arg.hasByValAttr())
3992 return false;
3993 }
3994
3995 LLVMContext &Ctx = *DAG.getContext();
3996
3997 // Check that the call results are passed in the same way.
3998 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3999 CCAssignFnForCall(CalleeCC, IsVarArg),
4000 CCAssignFnForCall(CallerCC, IsVarArg)))
4001 return false;
4002
4003 // The callee has to preserve all registers the caller needs to preserve.
4004 if (!CCMatch) {
4005 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4006 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4007 return false;
4008 }
4009
4010 // Nothing more to check if the callee is taking no arguments.
4011 if (Outs.empty())
4012 return true;
4013
4015 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4016
4017 // FIXME: We are not allocating special input registers, so we will be
4018 // deciding based on incorrect register assignments.
4019 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
4020
4021 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4022 // If the stack arguments for this call do not fit into our own save area then
4023 // the call cannot be made tail.
4024 // TODO: Is this really necessary?
4025 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
4026 return false;
4027
4028 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
4029 // FIXME: What about inreg arguments that end up passed in memory?
4030 if (!CCVA.isRegLoc())
4031 continue;
4032
4033 // If we are passing an argument in an SGPR, and the value is divergent,
4034 // this call requires a waterfall loop.
4035 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4036 LLVM_DEBUG(
4037 dbgs() << "Cannot tail call due to divergent outgoing argument in "
4038 << printReg(CCVA.getLocReg(), TRI) << '\n');
4039 return false;
4040 }
4041 }
4042
4043 const MachineRegisterInfo &MRI = MF.getRegInfo();
4044 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
4045}
4046
4048 if (!CI->isTailCall())
4049 return false;
4050
4051 const Function *ParentFn = CI->getParent()->getParent();
4053 return false;
4054 return true;
4055}
4056
4057namespace {
4058// Chain calls have special arguments that we need to handle. These are
4059// tagging along at the end of the arguments list(s), after the SGPR and VGPR
4060// arguments (index 0 and 1 respectively).
4061enum ChainCallArgIdx {
4062 Exec = 2,
4063 Flags,
4064 NumVGPRs,
4065 FallbackExec,
4066 FallbackCallee
4067};
4068} // anonymous namespace
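// For example, when flags bit 0 is set (dynamic VGPR mode), the call site
// supplies three extra operands after the flags: NumVGPRs, FallbackExec and
// FallbackCallee, as checked when lowering the call below.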
4069
4070// The wave scratch offset register is used as the global base pointer.
4072 SmallVectorImpl<SDValue> &InVals) const {
4073 CallingConv::ID CallConv = CLI.CallConv;
4074 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
4075
4076 SelectionDAG &DAG = CLI.DAG;
4077
4078 const SDLoc &DL = CLI.DL;
4079 SDValue Chain = CLI.Chain;
4080 SDValue Callee = CLI.Callee;
4081
4082 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
4083 bool UsesDynamicVGPRs = false;
4084 if (IsChainCallConv) {
4085 // The last arguments should be the value that we need to put in EXEC,
4086 // followed by the flags and any other arguments with special meanings.
4087 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
4088 // we don't treat them like the "real" arguments.
4089 auto RequestedExecIt =
4090 llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
4091 return Arg.OrigArgIndex == 2;
4092 });
4093 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
4094
4095 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
4096 CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
4097 CLI.OutVals.end());
4098 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
4099
4100 assert(CLI.Outs.back().OrigArgIndex < 2 &&
4101 "Haven't popped all the special args");
4102
4103 TargetLowering::ArgListEntry RequestedExecArg =
4104 CLI.Args[ChainCallArgIdx::Exec];
4105 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
4106 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
4107
4108 // Convert constants into TargetConstants, so they become immediate operands
4109 // instead of being selected into S_MOV.
4110 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
4111 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
4112 ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
4113 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
4114 } else
4115 ChainCallSpecialArgs.push_back(Arg.Node);
4116 };
4117
4118 PushNodeOrTargetConstant(RequestedExecArg);
4119
4120 // Process any other special arguments depending on the value of the flags.
4121 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
4122
4123 const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
4124 if (FlagsValue.isZero()) {
4125 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
4126 return lowerUnhandledCall(CLI, InVals,
4127 "no additional args allowed if flags == 0");
4128 } else if (FlagsValue.isOneBitSet(0)) {
4129 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4130 return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
4131 }
4132
4133 if (!Subtarget->isWave32()) {
4134 return lowerUnhandledCall(
4135 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
4136 }
4137
4138 UsesDynamicVGPRs = true;
4139 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
4140 CLI.Args.end(), PushNodeOrTargetConstant);
4141 }
4142 }
4143
4145 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
4147 bool &IsTailCall = CLI.IsTailCall;
4148 bool IsVarArg = CLI.IsVarArg;
4149 bool IsSibCall = false;
4151
4152 if (Callee.isUndef() || isNullConstant(Callee)) {
4153 if (!CLI.IsTailCall) {
4154 for (ISD::InputArg &Arg : CLI.Ins)
4155 InVals.push_back(DAG.getPOISON(Arg.VT));
4156 }
4157
4158 return Chain;
4159 }
4160
4161 if (IsVarArg) {
4162 return lowerUnhandledCall(CLI, InVals,
4163 "unsupported call to variadic function ");
4164 }
4165
4166 if (!CLI.CB)
4167 return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
4168
4169 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
4170 return lowerUnhandledCall(CLI, InVals,
4171 "unsupported required tail call to function ");
4172 }
4173
4174 if (IsTailCall) {
4175 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
4176 Outs, OutVals, Ins, DAG);
4177 if (!IsTailCall &&
4178 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
4179 report_fatal_error("failed to perform tail call elimination on a call "
4180 "site marked musttail or on llvm.amdgcn.cs.chain");
4181 }
4182
4183 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4184
4185 // A sibling call is one where we're under the usual C ABI and not planning
4186 // to change that but can still do a tail call:
4187 if (!TailCallOpt && IsTailCall)
4188 IsSibCall = true;
4189
4190 if (IsTailCall)
4191 ++NumTailCalls;
4192 }
4193
4196 SmallVector<SDValue, 8> MemOpChains;
4197
4198 // Analyze operands of the call, assigning locations to each operand.
4200 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4201 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
4202
4203 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
4205 // With a fixed ABI, allocate fixed registers before user arguments.
4206 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
4207 }
4208
4209 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
4210
4211 // Get a count of how many bytes are to be pushed on the stack.
4212 unsigned NumBytes = CCInfo.getStackSize();
4213
4214 if (IsSibCall) {
4215 // Since we're not changing the ABI to make this a tail call, the memory
4216 // operands are already available in the caller's incoming argument space.
4217 NumBytes = 0;
4218 }
4219
4220 // FPDiff is the byte offset of the call's argument area from the callee's.
4221 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4222 // by this amount for a tail call. In a sibling call it must be 0 because the
4223 // caller will deallocate the entire stack and the callee still expects its
4224 // arguments to begin at SP+0. Completely unused for non-tail calls.
4225 int32_t FPDiff = 0;
4226 MachineFrameInfo &MFI = MF.getFrameInfo();
4227 auto *TRI = Subtarget->getRegisterInfo();
4228
4229 // Adjust the stack pointer for the new arguments...
4230 // These operations are automatically eliminated by the prolog/epilog pass
4231 if (!IsSibCall)
4232 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
4233
4234 if (!IsSibCall || IsChainCallConv) {
4235 if (!Subtarget->enableFlatScratch()) {
4236 SmallVector<SDValue, 4> CopyFromChains;
4237
4238 // In the HSA case, this should be an identity copy.
4239 SDValue ScratchRSrcReg =
4240 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
4241 RegsToPass.emplace_back(IsChainCallConv
4242 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4243 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4244 ScratchRSrcReg);
4245 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
4246 Chain = DAG.getTokenFactor(DL, CopyFromChains);
4247 }
4248 }
4249
4250 const unsigned NumSpecialInputs = RegsToPass.size();
4251
4252 MVT PtrVT = MVT::i32;
4253
4254 // Walk the register/memloc assignments, inserting copies/loads.
4255 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4256 CCValAssign &VA = ArgLocs[i];
4257 SDValue Arg = OutVals[i];
4258
4259 // Promote the value if needed.
4260 switch (VA.getLocInfo()) {
4261 case CCValAssign::Full:
4262 break;
4263 case CCValAssign::BCvt:
4264 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4265 break;
4266 case CCValAssign::ZExt:
4267 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4268 break;
4269 case CCValAssign::SExt:
4270 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
4271 break;
4272 case CCValAssign::AExt:
4273 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
4274 break;
4275 case CCValAssign::FPExt:
4276 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
4277 break;
4278 default:
4279 llvm_unreachable("Unknown loc info!");
4280 }
4281
4282 if (VA.isRegLoc()) {
4283 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
4284 } else {
4285 assert(VA.isMemLoc());
4286
4287 SDValue DstAddr;
4288 MachinePointerInfo DstInfo;
4289
4290 unsigned LocMemOffset = VA.getLocMemOffset();
4291 int32_t Offset = LocMemOffset;
4292
4293 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
4294 MaybeAlign Alignment;
4295
4296 if (IsTailCall) {
4297 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4298 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4299 : VA.getValVT().getStoreSize();
4300
4301 // FIXME: We can have better than the minimum byval required alignment.
4302 Alignment =
4303 Flags.isByVal()
4304 ? Flags.getNonZeroByValAlign()
4305 : commonAlignment(Subtarget->getStackAlignment(), Offset);
4306
4307 Offset = Offset + FPDiff;
4308 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
4309
4310 DstAddr = DAG.getFrameIndex(FI, PtrVT);
4311 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4312
4313 // Make sure any stack arguments overlapping with where we're storing
4314 // are loaded before this eventual operation. Otherwise they'll be
4315 // clobbered.
4316
4317 // FIXME: Why is this really necessary? This seems to just result in a
4318 // lot of code to copy the stack and write them back to the same
4319 // locations, which are supposed to be immutable?
4320 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
4321 } else {
4322 // Stores to the argument stack area are relative to the stack pointer.
4323 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
4324 MVT::i32);
4325 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
4326 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
4327 Alignment =
4328 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
4329 }
4330
4331 if (Outs[i].Flags.isByVal()) {
4332 SDValue SizeNode =
4333 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4334 SDValue Cpy =
4335 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
4336 Outs[i].Flags.getNonZeroByValAlign(),
4337 /*isVol = */ false, /*AlwaysInline = */ true,
4338 /*CI=*/nullptr, std::nullopt, DstInfo,
4340
4341 MemOpChains.push_back(Cpy);
4342 } else {
4343 SDValue Store =
4344 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4345 MemOpChains.push_back(Store);
4346 }
4347 }
4348 }
4349
4350 if (!MemOpChains.empty())
4351 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4352
4353 SDValue ReadFirstLaneID =
4354 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4355
4356 SDValue TokenGlue;
4357 if (CLI.ConvergenceControlToken) {
4358 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4360 }
4361
4362 // Build a sequence of copy-to-reg nodes chained together with token chain
4363 // and flag operands which copy the outgoing args into the appropriate regs.
4364 SDValue InGlue;
4365
4366 unsigned ArgIdx = 0;
4367 for (auto [Reg, Val] : RegsToPass) {
4368 if (ArgIdx++ >= NumSpecialInputs &&
4369 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4370 // For chain calls, the inreg arguments are required to be
4371 // uniform. Speculatively insert a readfirstlane in case we cannot prove
4372 // they are uniform.
4373 //
4374 // For other calls, if an inreg argument is known to be uniform,
4375 // speculatively insert a readfirstlane in case it is in a VGPR.
4376 //
4377 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4378 // value, so let that continue to produce invalid code.
4379
4380 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4381 if (TokenGlue)
4382 ReadfirstlaneArgs.push_back(TokenGlue);
4384 ReadfirstlaneArgs);
4385 }
4386
4387 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
4388 InGlue = Chain.getValue(1);
4389 }
4390
4391 // We don't usually want to end the call-sequence here because we would tidy
4392 // the frame up *after* the call; however, in the ABI-changing tail-call case
4393 // we've carefully laid out the parameters so that when sp is reset they'll be
4394 // in the correct location.
4395 if (IsTailCall && !IsSibCall) {
4396 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
4397 InGlue = Chain.getValue(1);
4398 }
4399
4400 std::vector<SDValue> Ops({Chain});
4401
4402 // Add a redundant copy of the callee global which will not be legalized, as
4403 // we need direct access to the callee later.
4405 const GlobalValue *GV = GSD->getGlobal();
4406 Ops.push_back(Callee);
4407 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
4408 } else {
4409 if (IsTailCall) {
4410 // isEligibleForTailCallOptimization considered whether the call target is
4411 // divergent, but we may still end up with a uniform value in a VGPR.
4412 // Insert a readfirstlane just in case.
4413 SDValue ReadFirstLaneID =
4414 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4415
4416 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4417 if (TokenGlue)
4418 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
4419 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
4420 ReadfirstlaneArgs);
4421 }
4422
4423 Ops.push_back(Callee);
4424 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
4425 }
4426
4427 if (IsTailCall) {
4428 // Each tail call may have to adjust the stack by a different amount, so
4429 // this information must travel along with the operation for eventual
4430 // consumption by emitEpilogue.
4431 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4432 }
4433
4434 if (IsChainCallConv)
4435 llvm::append_range(Ops, ChainCallSpecialArgs);
4436
4437 // Add argument registers to the end of the list so that they are known live
4438 // into the call.
4439 for (auto &[Reg, Val] : RegsToPass)
4440 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
4441
4442 // Add a register mask operand representing the call-preserved registers.
4443 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4444 assert(Mask && "Missing call preserved mask for calling convention");
4445 Ops.push_back(DAG.getRegisterMask(Mask));
4446
4447 if (SDValue Token = CLI.ConvergenceControlToken) {
4449 GlueOps.push_back(Token);
4450 if (InGlue)
4451 GlueOps.push_back(InGlue);
4452
4453 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
4454 MVT::Glue, GlueOps),
4455 0);
4456 }
4457
4458 if (InGlue)
4459 Ops.push_back(InGlue);
4460
4461 // If we're doing a tail call, use a TC_RETURN here rather than an
4462 // actual call instruction.
4463 if (IsTailCall) {
4464 MFI.setHasTailCall();
4465 unsigned OPC = AMDGPUISD::TC_RETURN;
4466 switch (CallConv) {
4469 break;
4472 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4474 break;
4475 }
4476
4477 // If the caller is a whole wave function, we need to use a special opcode
4478 // so we can patch up EXEC.
4479 if (Info->isWholeWaveFunction())
4481
4482 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4483 }
4484
4485 // Returns a chain and a flag for retval copy to use.
4486 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4487 Chain = Call.getValue(0);
4488 InGlue = Call.getValue(1);
4489
4490 uint64_t CalleePopBytes = NumBytes;
4491 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4492 if (!Ins.empty())
4493 InGlue = Chain.getValue(1);
4494
4495 // Handle result values, copying them out of physregs into vregs that we
4496 // return.
4497 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4498 InVals, /*IsThisReturn=*/false, SDValue());
4499}
4500
4501// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4502// except for:
4503// 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4504// 2. Scale size, where scale = wave-reduction(alloca-size) * wave-size
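// For example, a 16-byte per-lane alloca on a wave64 target bumps the stack
// pointer by 16 << 6 = 1024 bytes, since the stack pointer is shared by the
// whole wave.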
4506 SelectionDAG &DAG) const {
4507 const MachineFunction &MF = DAG.getMachineFunction();
4509
4510 SDLoc dl(Op);
4511 EVT VT = Op.getValueType();
4512 SDValue Chain = Op.getOperand(0);
4513 Register SPReg = Info->getStackPtrOffsetReg();
4514
4515 // Chain the dynamic stack allocation so that it doesn't modify the stack
4516 // pointer when other instructions are using the stack.
4517 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4518
4519 SDValue Size = Op.getOperand(1);
4520 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4521 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4522
4523 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4525 "Stack grows upwards for AMDGPU");
4526
4527 Chain = BaseAddr.getValue(1);
4528 Align StackAlign = TFL->getStackAlign();
4529 if (Alignment > StackAlign) {
4530 uint64_t ScaledAlignment = Alignment.value()
4531 << Subtarget->getWavefrontSizeLog2();
4532 uint64_t StackAlignMask = ScaledAlignment - 1;
4533 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4534 DAG.getConstant(StackAlignMask, dl, VT));
4535 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4536 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4537 }
4538
4539 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4540 SDValue NewSP;
4542 // For a constant-sized alloca, scale the alloca size by the wave size
4543 SDValue ScaledSize = DAG.getNode(
4544 ISD::SHL, dl, VT, Size,
4545 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4546 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4547 } else {
4548 // For a dynamically sized alloca, perform a wave-wide reduction to get the
4549 // max of the (divergent) alloca size, then scale it by the wave size
4550 SDValue WaveReduction =
4551 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4552 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4553 Size, DAG.getConstant(0, dl, MVT::i32));
4554 SDValue ScaledSize = DAG.getNode(
4555 ISD::SHL, dl, VT, Size,
4556 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4557 NewSP =
4558 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4559 SDValue ReadFirstLaneID =
4560 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4561 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4562 NewSP);
4563 }
4564
4565 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4566 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4567
4568 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4569}
4570
4572 if (Op.getValueType() != MVT::i32)
4573 return Op; // Defer to cannot select error.
4574
4576 SDLoc SL(Op);
4577
4578 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4579
4580 // Convert from wave uniform to swizzled vector address. This should protect
4581 // from any edge cases where the stacksave result isn't directly used with
4582 // stackrestore.
4583 SDValue VectorAddress =
4584 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4585 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4586}
4587
4589 SelectionDAG &DAG) const {
4590 SDLoc SL(Op);
4591 assert(Op.getValueType() == MVT::i32);
4592
4593 uint32_t BothRoundHwReg =
4595 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4596
4597 SDValue IntrinID =
4598 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4599 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4600 Op.getOperand(0), IntrinID, GetRoundBothImm);
4601
4602 // There are two rounding modes, one for f32 and one for f64/f16. We only
4603 // report in the standard value range if both are the same.
4604 //
4605 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4606 // ties away from zero is not supported, and the other values are rotated by
4607 // 1.
4608 //
4609 // If the two rounding modes are not the same, report a target defined value.
4610
4611 // Mode register rounding mode fields:
4612 //
4613 // [1:0] Single-precision round mode.
4614 // [3:2] Double/Half-precision round mode.
4615 //
4616 // 0 = nearest even, 1 = +infinity, 2 = -infinity, 3 = toward zero.
4617 //
4618 //                 Hardware   Spec
4619 //  Toward-0           3        0
4620 //  Nearest Even       0        1
4621 //  +Inf               1        2
4622 //  -Inf               2        3
4623 //  NearestAway0      N/A       4
4624 //
4625 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4626 // table we can index by the raw hardware mode.
4627 //
4628 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
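// For example, when both fields are round-to-nearest-even the raw mode is 0,
// the table's low nibble is selected, and the result is 1, the standard
// FLT_ROUNDS value for "to nearest" (see the table above).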
4629
4630 SDValue BitTable =
4632
4633 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4634 SDValue RoundModeTimesNumBits =
4635 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4636
4637 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4638 // knew only one mode was demanded.
4639 SDValue TableValue =
4640 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4641 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4642
4643 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4644 SDValue TableEntry =
4645 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4646
4647 // There's a gap between the 4-bit encoded table values and the actual enum
4648 // values, so offset if it's an extended value.
4649 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4650 SDValue IsStandardValue =
4651 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4652 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4653 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4654 TableEntry, EnumOffset);
4655
4656 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4657}
4658
4660 SelectionDAG &DAG) const {
4661 SDLoc SL(Op);
4662
4663 SDValue NewMode = Op.getOperand(1);
4664 assert(NewMode.getValueType() == MVT::i32);
4665
4666 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4667 // hardware MODE.fp_round values.
4668 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4669 uint32_t ClampedVal = std::min(
4670 static_cast<uint32_t>(ConstMode->getZExtValue()),
4672 NewMode = DAG.getConstant(
4673 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4674 } else {
4675 // If we know the input can only be one of the supported standard modes in
4676 // the range 0-3, we can use a simplified mapping to hardware values.
4677 KnownBits KB = DAG.computeKnownBits(NewMode);
4678 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4679 // The supported standard values are 0-3. The extended values start at 8. We
4680 // need to offset by 4 if the value is in the extended range.
4681
4682 if (UseReducedTable) {
4683 // Truncate to the low 32-bits.
4684 SDValue BitTable = DAG.getConstant(
4685 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4686
4687 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4688 SDValue RoundModeTimesNumBits =
4689 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4690
4691 NewMode =
4692 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4693
4694 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4695 // the table extracted bits into inline immediates.
4696 } else {
4697 // table_index = umin(value, value - 4)
4698 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
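// For example, a standard value of 2 keeps index 2 (2 - 4 wraps to a large
// unsigned value, so the umin picks 2), while the first extended value, 8,
// maps to index 4.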
4699 SDValue BitTable =
4701
4702 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4703 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4704 SDValue IndexVal =
4705 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4706
4707 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4708 SDValue RoundModeTimesNumBits =
4709 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4710
4711 SDValue TableValue =
4712 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4713 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4714
4715 // No need to mask out the high bits since the setreg will ignore them
4716 // anyway.
4717 NewMode = TruncTable;
4718 }
4719
4720 // Insert a readfirstlane in case the value is a VGPR. We could do this
4721 // earlier and keep more operations scalar, but that interferes with
4722 // combining the source.
4723 SDValue ReadFirstLaneID =
4724 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4725 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4726 ReadFirstLaneID, NewMode);
4727 }
4728
4729 // N.B. The setreg will be later folded into s_round_mode on supported
4730 // targets.
4731 SDValue IntrinID =
4732 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4733 uint32_t BothRoundHwReg =
4735 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4736
4737 SDValue SetReg =
4738 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4739 IntrinID, RoundBothImm, NewMode);
4740
4741 return SetReg;
4742}
4743
4745 if (Op->isDivergent() &&
4746 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4747 // Cannot do I$ prefetch with divergent pointer.
4748 return SDValue();
4749
4750 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4754 break;
4756 if (Subtarget->hasSafeSmemPrefetch())
4757 break;
4758 [[fallthrough]];
4759 default:
4760 return SDValue();
4761 }
4762
4763 // I$ prefetch
4764 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4765 return SDValue();
4766
4767 return Op;
4768}
4769
4770// Work around DAG legality rules that are based only on the result type.
4772 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4773 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4774 EVT SrcVT = Src.getValueType();
4775
4776 if (SrcVT.getScalarType() != MVT::bf16)
4777 return Op;
4778
4779 SDLoc SL(Op);
4780 SDValue BitCast =
4781 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4782
4783 EVT DstVT = Op.getValueType();
4784 if (IsStrict)
4785 llvm_unreachable("Need STRICT_BF16_TO_FP");
4786
4787 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4788}
4789
4791 SDLoc SL(Op);
4792 if (Op.getValueType() != MVT::i64)
4793 return Op;
4794
4795 uint32_t ModeHwReg =
4797 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4798 uint32_t TrapHwReg =
4800 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4801
4802 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4803 SDValue IntrinID =
4804 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4805 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4806 Op.getOperand(0), IntrinID, ModeHwRegImm);
4807 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4808 Op.getOperand(0), IntrinID, TrapHwRegImm);
4809 SDValue TokenReg =
4810 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4811 GetTrapReg.getValue(1));
4812
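// Pack the two registers into an i64 result: the MODE value ends up in the
// low 32 bits and the trap state in the high 32 bits.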
4813 SDValue CvtPtr =
4814 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4815 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4816
4817 return DAG.getMergeValues({Result, TokenReg}, SL);
4818}
4819
4821 SDLoc SL(Op);
4822 if (Op.getOperand(1).getValueType() != MVT::i64)
4823 return Op;
4824
4825 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4826 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4827 DAG.getConstant(0, SL, MVT::i32));
4828 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4829 DAG.getConstant(1, SL, MVT::i32));
4830
4831 SDValue ReadFirstLaneID =
4832 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4833 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4834 ReadFirstLaneID, NewModeReg);
4835 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4836 ReadFirstLaneID, NewTrapReg);
4837
4838 unsigned ModeHwReg =
4840 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4841 unsigned TrapHwReg =
4843 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4844
4845 SDValue IntrinID =
4846 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4847 SDValue SetModeReg =
4848 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4849 IntrinID, ModeHwRegImm, NewModeReg);
4850 SDValue SetTrapReg =
4851 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4852 IntrinID, TrapHwRegImm, NewTrapReg);
4853 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4854}
4855
4857 const MachineFunction &MF) const {
4858 const Function &Fn = MF.getFunction();
4859
4861 .Case("m0", AMDGPU::M0)
4862 .Case("exec", AMDGPU::EXEC)
4863 .Case("exec_lo", AMDGPU::EXEC_LO)
4864 .Case("exec_hi", AMDGPU::EXEC_HI)
4865 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4866 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4867 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4868 .Default(Register());
4869 if (!Reg)
4870 return Reg;
4871
4872 if (!Subtarget->hasFlatScrRegister() &&
4873 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4874 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
4875 "\" for subtarget."));
4876 }
4877
4878 switch (Reg) {
4879 case AMDGPU::M0:
4880 case AMDGPU::EXEC_LO:
4881 case AMDGPU::EXEC_HI:
4882 case AMDGPU::FLAT_SCR_LO:
4883 case AMDGPU::FLAT_SCR_HI:
4884 if (VT.getSizeInBits() == 32)
4885 return Reg;
4886 break;
4887 case AMDGPU::EXEC:
4888 case AMDGPU::FLAT_SCR:
4889 if (VT.getSizeInBits() == 64)
4890 return Reg;
4891 break;
4892 default:
4893 llvm_unreachable("missing register type checking");
4894 }
4895
4897 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4898}
4899
4900// If kill is not the last instruction, split the block so kill is always a
4901// proper terminator.
4904 MachineBasicBlock *BB) const {
4905 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
4907 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4908 return SplitBB;
4909}
4910
4911// Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4912// \p MI will be the only instruction in the loop body block. Otherwise, it will
4913// be the first instruction in the remainder block.
4914//
4915/// \returns { LoopBody, Remainder }
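// The resulting control flow is:
//
//   MBB -> LoopBB -> RemainderBB
//           ^   |
//           +---+        (LoopBB branches back to itself until done)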
4916static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4918 MachineFunction *MF = MBB.getParent();
4920
4921 // To insert the loop we need to split the block. Move everything after this
4922 // point to a new block, and insert a new empty block between the two.
4924 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4926 ++MBBI;
4927
4928 MF->insert(MBBI, LoopBB);
4929 MF->insert(MBBI, RemainderBB);
4930
4931 LoopBB->addSuccessor(LoopBB);
4932 LoopBB->addSuccessor(RemainderBB);
4933
4934 // Move the rest of the block into a new block.
4935 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4936
4937 if (InstInLoop) {
4938 auto Next = std::next(I);
4939
4940 // Move instruction to loop body.
4941 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4942
4943 // Move the rest of the block.
4944 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4945 } else {
4946 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4947 }
4948
4949 MBB.addSuccessor(LoopBB);
4950
4951 return std::pair(LoopBB, RemainderBB);
4952}
4953
4954/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4956 MachineBasicBlock *MBB = MI.getParent();
4958 auto I = MI.getIterator();
4959 auto E = std::next(I);
4960
4961 // clang-format off
4962 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4963 .addImm(0);
4964 // clang-format on
4965
4966 MIBundleBuilder Bundler(*MBB, I, E);
4967 finalizeBundle(*MBB, Bundler.begin());
4968}
4969
4972 MachineBasicBlock *BB) const {
4973 const DebugLoc &DL = MI.getDebugLoc();
4974
4976
4978
4979 // Apparently kill flags are only valid if the def is in the same block?
4980 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4981 Src->setIsKill(false);
4982
4983 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
4984
4985 MachineBasicBlock::iterator I = LoopBB->end();
4986
4987 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4989
4990 // Clear TRAP_STS.MEM_VIOL
4991 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4992 .addImm(0)
4993 .addImm(EncodedReg);
4994
4996
4997 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4998
4999 // Load and check TRAP_STS.MEM_VIOL
5000 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
5001 .addImm(EncodedReg);
5002
5003 // FIXME: Do we need to use an isel pseudo that may clobber scc?
5004 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5005 .addReg(Reg, RegState::Kill)
5006 .addImm(0);
5007 // clang-format off
5008 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5009 .addMBB(LoopBB);
5010 // clang-format on
5011
5012 return RemainderBB;
5013}
5014
5015// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
5016// wavefront. If the value is uniform and just happens to be in a VGPR, this
5017// will only do one iteration. In the worst case, this will loop 64 times.
5018//
5019// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
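//
// The emitted waterfall loop looks roughly like this (wave64 mnemonics,
// register names purely illustrative):
//
//   loop:
//     v_readfirstlane_b32 s_cur, v_idx
//     v_cmp_eq_u32_e64    s_cond, s_cur, v_idx
//     s_and_saveexec_b64  s_save, s_cond         ; s_save = previous exec
//     <move s_cur (+ Offset) into m0 or an SGPR index register>
//     <the indexed access is inserted at the returned iterator>
//     s_xor_b64           exec, exec, s_save     ; clear the lanes just handled
//     s_cbranch_execnz    loop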
5022 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
5023 const DebugLoc &DL, const MachineOperand &Idx,
5024 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
5025 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
5026 Register &SGPRIdxReg) {
5027
5028 MachineFunction *MF = OrigBB.getParent();
5029 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5030 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5033
5034 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5035 Register PhiExec = MRI.createVirtualRegister(BoolRC);
5036 Register NewExec = MRI.createVirtualRegister(BoolRC);
5037 Register CurrentIdxReg =
5038 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5039 Register CondReg = MRI.createVirtualRegister(BoolRC);
5040
5041 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
5042 .addReg(InitReg)
5043 .addMBB(&OrigBB)
5044 .addReg(ResultReg)
5045 .addMBB(&LoopBB);
5046
5047 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
5048 .addReg(InitSaveExecReg)
5049 .addMBB(&OrigBB)
5050 .addReg(NewExec)
5051 .addMBB(&LoopBB);
5052
5053 // Read the next variant <- also loop target.
5054 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5055 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
5056
5057 // Compare the just read M0 value to all possible Idx values.
5058 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5059 .addReg(CurrentIdxReg)
5060 .addReg(Idx.getReg(), 0, Idx.getSubReg());
5061
5062 // Update EXEC, save the original EXEC value to VCC.
5063 BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec)
5064 .addReg(CondReg, RegState::Kill);
5065
5066 MRI.setSimpleHint(NewExec, CondReg);
5067
5068 if (UseGPRIdxMode) {
5069 if (Offset == 0) {
5070 SGPRIdxReg = CurrentIdxReg;
5071 } else {
5072 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5073 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5074 .addReg(CurrentIdxReg, RegState::Kill)
5075 .addImm(Offset);
5076 }
5077 } else {
5078 // Move index from VCC into M0
5079 if (Offset == 0) {
5080 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5081 .addReg(CurrentIdxReg, RegState::Kill);
5082 } else {
5083 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5084 .addReg(CurrentIdxReg, RegState::Kill)
5085 .addImm(Offset);
5086 }
5087 }
5088
5089 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
5090 MachineInstr *InsertPt =
5091 BuildMI(LoopBB, I, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg)
5092 .addReg(LMC.ExecReg)
5093 .addReg(NewExec);
5094
5095 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
5096 // s_cbranch_scc0?
5097
5098 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
5099 // clang-format off
5100 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5101 .addMBB(&LoopBB);
5102 // clang-format on
5103
5104 return InsertPt->getIterator();
5105}
5106
5107 // This has slightly sub-optimal register allocation when the source vector is
5108 // killed by the read. The register allocator does not understand that the kill
5109 // is per-workitem, so the source is kept live for the whole loop, and we end up
5110 // not reusing a subregister from it, using one more VGPR than necessary. That
5111 // extra register was saved back when this was expanded after register allocation.
5114 unsigned InitResultReg, unsigned PhiReg, int Offset,
5115 bool UseGPRIdxMode, Register &SGPRIdxReg) {
5116 MachineFunction *MF = MBB.getParent();
5117 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5118 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5120 const DebugLoc &DL = MI.getDebugLoc();
5122
5123 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
5124 Register DstReg = MI.getOperand(0).getReg();
5125 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
5126 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
5128
5129 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
5130
5131 // Save the EXEC mask
5132 // clang-format off
5133 BuildMI(MBB, I, DL, TII->get(LMC.MovOpc), SaveExec)
5134 .addReg(LMC.ExecReg);
5135 // clang-format on
5136
5137 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
5138
5139 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5140
5141 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
5142 InitResultReg, DstReg, PhiReg, TmpExec,
5143 Offset, UseGPRIdxMode, SGPRIdxReg);
5144
5145 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
5147 ++MBBI;
5148 MF->insert(MBBI, LandingPad);
5149 LoopBB->removeSuccessor(RemainderBB);
5150 LandingPad->addSuccessor(RemainderBB);
5151 LoopBB->addSuccessor(LandingPad);
5152 MachineBasicBlock::iterator First = LandingPad->begin();
5153 // clang-format off
5154 BuildMI(*LandingPad, First, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
5155 .addReg(SaveExec);
5156 // clang-format on
5157
5158 return InsPt;
5159}
5160
5161// Returns subreg index, offset
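//
// For example, for a 128-bit (4 x 32-bit) super-register, Offset == 2 yields
// {sub2, 0}, while an out-of-range Offset such as 7 is returned unchanged as
// {sub0, 7}, leaving the offset to be applied through the dynamic index
// instead.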
5162static std::pair<unsigned, int>
5164 const TargetRegisterClass *SuperRC, unsigned VecReg,
5165 int Offset) {
5166 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
5167
5168 // Skip out of bounds offsets, or else we would end up using an undefined
5169 // register.
5170 if (Offset >= NumElts || Offset < 0)
5171 return std::pair(AMDGPU::sub0, Offset);
5172
5173 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
5174}
5175
5178 int Offset) {
5179 MachineBasicBlock *MBB = MI.getParent();
5180 const DebugLoc &DL = MI.getDebugLoc();
5182
5183 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5184
5185 assert(Idx->getReg() != AMDGPU::NoRegister);
5186
5187 if (Offset == 0) {
5188 // clang-format off
5189 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5190 .add(*Idx);
5191 // clang-format on
5192 } else {
5193 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5194 .add(*Idx)
5195 .addImm(Offset);
5196 }
5197}
5198
5201 int Offset) {
5202 MachineBasicBlock *MBB = MI.getParent();
5203 const DebugLoc &DL = MI.getDebugLoc();
5205
5206 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5207
5208 if (Offset == 0)
5209 return Idx->getReg();
5210
5211 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5212 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
5213 .add(*Idx)
5214 .addImm(Offset);
5215 return Tmp;
5216}
5217
5220 const GCNSubtarget &ST) {
5221 const SIInstrInfo *TII = ST.getInstrInfo();
5222 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5223 MachineFunction *MF = MBB.getParent();
5225
5226 Register Dst = MI.getOperand(0).getReg();
5227 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5228 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5229 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5230
5231 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
5232 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5233
5234 unsigned SubReg;
5235 std::tie(SubReg, Offset) =
5236 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
5237
5238 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5239
5240 // Check for a SGPR index.
5241 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5243 const DebugLoc &DL = MI.getDebugLoc();
5244
5245 if (UseGPRIdxMode) {
5246 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5247 // to avoid interfering with other uses, so probably requires a new
5248 // optimization pass.
5250
5251 const MCInstrDesc &GPRIDXDesc =
5252 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5253 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5254 .addReg(SrcReg)
5255 .addReg(Idx)
5256 .addImm(SubReg);
5257 } else {
5259
5260 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5261 .addReg(SrcReg, 0, SubReg)
5262 .addReg(SrcReg, RegState::Implicit);
5263 }
5264
5265 MI.eraseFromParent();
5266
5267 return &MBB;
5268 }
5269
5270 // Control flow needs to be inserted if indexing with a VGPR.
5271 const DebugLoc &DL = MI.getDebugLoc();
5273
5274 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5275 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5276
5277 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
5278
5279 Register SGPRIdxReg;
5280 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
5281 UseGPRIdxMode, SGPRIdxReg);
5282
5283 MachineBasicBlock *LoopBB = InsPt->getParent();
5284
5285 if (UseGPRIdxMode) {
5286 const MCInstrDesc &GPRIDXDesc =
5287 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5288
5289 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5290 .addReg(SrcReg)
5291 .addReg(SGPRIdxReg)
5292 .addImm(SubReg);
5293 } else {
5294 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5295 .addReg(SrcReg, 0, SubReg)
5296 .addReg(SrcReg, RegState::Implicit);
5297 }
5298
5299 MI.eraseFromParent();
5300
5301 return LoopBB;
5302}
5303
5306 const GCNSubtarget &ST) {
5307 const SIInstrInfo *TII = ST.getInstrInfo();
5308 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5309 MachineFunction *MF = MBB.getParent();
5311
5312 Register Dst = MI.getOperand(0).getReg();
5313 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
5314 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5315 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
5316 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5317 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
5318 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5319
5320 // This can be an immediate, but will be folded later.
5321 assert(Val->getReg());
5322
5323 unsigned SubReg;
5324 std::tie(SubReg, Offset) =
5325 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
5326 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5327
5328 if (Idx->getReg() == AMDGPU::NoRegister) {
5330 const DebugLoc &DL = MI.getDebugLoc();
5331
5332 assert(Offset == 0);
5333
5334 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
5335 .add(*SrcVec)
5336 .add(*Val)
5337 .addImm(SubReg);
5338
5339 MI.eraseFromParent();
5340 return &MBB;
5341 }
5342
5343 // Check for a SGPR index.
5344 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5346 const DebugLoc &DL = MI.getDebugLoc();
5347
5348 if (UseGPRIdxMode) {
5350
5351 const MCInstrDesc &GPRIDXDesc =
5352 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5353 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5354 .addReg(SrcVec->getReg())
5355 .add(*Val)
5356 .addReg(Idx)
5357 .addImm(SubReg);
5358 } else {
5360
5361 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5362 TRI.getRegSizeInBits(*VecRC), 32, false);
5363 BuildMI(MBB, I, DL, MovRelDesc, Dst)
5364 .addReg(SrcVec->getReg())
5365 .add(*Val)
5366 .addImm(SubReg);
5367 }
5368 MI.eraseFromParent();
5369 return &MBB;
5370 }
5371
5372 // Control flow needs to be inserted if indexing with a VGPR.
5373 if (Val->isReg())
5374 MRI.clearKillFlags(Val->getReg());
5375
5376 const DebugLoc &DL = MI.getDebugLoc();
5377
5378 Register PhiReg = MRI.createVirtualRegister(VecRC);
5379
5380 Register SGPRIdxReg;
5381 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5382 UseGPRIdxMode, SGPRIdxReg);
5383 MachineBasicBlock *LoopBB = InsPt->getParent();
5384
5385 if (UseGPRIdxMode) {
5386 const MCInstrDesc &GPRIDXDesc =
5387 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5388
5389 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5390 .addReg(PhiReg)
5391 .add(*Val)
5392 .addReg(SGPRIdxReg)
5393 .addImm(SubReg);
5394 } else {
5395 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5396 TRI.getRegSizeInBits(*VecRC), 32, false);
5397 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5398 .addReg(PhiReg)
5399 .add(*Val)
5400 .addImm(SubReg);
5401 }
5402
5403 MI.eraseFromParent();
5404 return LoopBB;
5405}
5406
5408 MachineBasicBlock *BB) {
5409 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5410 // For GFX12, we emit s_add_u64 and s_sub_u64.
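//
// The pre-GFX12 expansion looks roughly like this (illustrative, add case):
//
//   lo  = s_add_u32  src0.lo, src1.lo    ; sets SCC as the carry-out
//   hi  = s_addc_u32 src0.hi, src1.hi    ; consumes SCC as the carry-in
//   dst = REG_SEQUENCE lo, sub0, hi, sub1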
5411 MachineFunction *MF = BB->getParent();
5412 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5413 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5415 const DebugLoc &DL = MI.getDebugLoc();
5416 MachineOperand &Dest = MI.getOperand(0);
5417 MachineOperand &Src0 = MI.getOperand(1);
5418 MachineOperand &Src1 = MI.getOperand(2);
5419 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5420 if (ST.hasScalarAddSub64()) {
5421 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5422 // clang-format off
5423 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5424 .add(Src0)
5425 .add(Src1);
5426 // clang-format on
5427 } else {
5428 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5429 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5430
5431 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5432 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5433
5434 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5435 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5436 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5437 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5438
5439 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5440 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5441 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5442 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5443
5444 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5445 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5446 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
5447 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
5448 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5449 .addReg(DestSub0)
5450 .addImm(AMDGPU::sub0)
5451 .addReg(DestSub1)
5452 .addImm(AMDGPU::sub1);
5453 }
5454 MI.eraseFromParent();
5455 return BB;
5456}
5457
5459 switch (Opc) {
5460 case AMDGPU::S_MIN_U32:
5461 return std::numeric_limits<uint32_t>::max();
5462 case AMDGPU::S_MIN_I32:
5463 return std::numeric_limits<int32_t>::max();
5464 case AMDGPU::S_MAX_U32:
5465 return std::numeric_limits<uint32_t>::min();
5466 case AMDGPU::S_MAX_I32:
5467 return std::numeric_limits<int32_t>::min();
5468 case AMDGPU::S_ADD_I32:
5469 case AMDGPU::S_SUB_I32:
5470 case AMDGPU::S_OR_B32:
5471 case AMDGPU::S_XOR_B32:
5472 return std::numeric_limits<uint32_t>::min();
5473 case AMDGPU::S_AND_B32:
5474 return std::numeric_limits<uint32_t>::max();
5475 default:
5477 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5478 }
5479}
5480
5482 switch (Opc) {
5483 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5484 return std::numeric_limits<uint64_t>::max();
5485 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5486 return std::numeric_limits<int64_t>::max();
5487 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5488 return std::numeric_limits<uint64_t>::min();
5489 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5490 return std::numeric_limits<int64_t>::min();
5491 case AMDGPU::S_ADD_U64_PSEUDO:
5492 case AMDGPU::S_SUB_U64_PSEUDO:
5493 case AMDGPU::S_OR_B64:
5494 case AMDGPU::S_XOR_B64:
5495 return std::numeric_limits<uint64_t>::min();
5496 case AMDGPU::S_AND_B64:
5497 return std::numeric_limits<uint64_t>::max();
5498 default:
5500 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5501 }
5502}
5503
5504static bool is32bitWaveReduceOperation(unsigned Opc) {
5505 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5506 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5507 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5508 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5509 Opc == AMDGPU::S_XOR_B32;
5510}
5511
5514 const GCNSubtarget &ST,
5515 unsigned Opc) {
5517 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5518 const DebugLoc &DL = MI.getDebugLoc();
5519 const SIInstrInfo *TII = ST.getInstrInfo();
5520
5521 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5522 Register SrcReg = MI.getOperand(1).getReg();
5523 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5524 Register DstReg = MI.getOperand(0).getReg();
5525 MachineBasicBlock *RetBB = nullptr;
5526 if (isSGPR) {
5527 switch (Opc) {
5528 case AMDGPU::S_MIN_U32:
5529 case AMDGPU::S_MIN_I32:
5530 case AMDGPU::S_MAX_U32:
5531 case AMDGPU::S_MAX_I32:
5532 case AMDGPU::S_AND_B32:
5533 case AMDGPU::S_OR_B32: {
5534 // Idempotent operations.
5535 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5536 RetBB = &BB;
5537 break;
5538 }
5539 case AMDGPU::V_CMP_LT_U64_e64: // umin
5540 case AMDGPU::V_CMP_LT_I64_e64: // min
5541 case AMDGPU::V_CMP_GT_U64_e64: // umax
5542 case AMDGPU::V_CMP_GT_I64_e64: // max
5543 case AMDGPU::S_AND_B64:
5544 case AMDGPU::S_OR_B64: {
5545 // Idempotent operations.
5546 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
5547 RetBB = &BB;
5548 break;
5549 }
5550 case AMDGPU::S_XOR_B32:
5551 case AMDGPU::S_XOR_B64:
5552 case AMDGPU::S_ADD_I32:
5553 case AMDGPU::S_ADD_U64_PSEUDO:
5554 case AMDGPU::S_SUB_I32:
5555 case AMDGPU::S_SUB_U64_PSEUDO: {
5556 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5557 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5558 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5559 Register NumActiveLanes =
5560 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5561
5562 bool IsWave32 = ST.isWave32();
5563 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5564 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5565 unsigned BitCountOpc =
5566 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5567
5568 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5569
5570 auto NewAccumulator =
5571 BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
5572 .addReg(ExecMask);
5573
5574 switch (Opc) {
5575 case AMDGPU::S_XOR_B32:
5576 case AMDGPU::S_XOR_B64: {
5577 // The result of an XOR reduction of a uniform value across the wave
5578 // depends only on the parity of the number of active lanes:
5579 // for even parity the result is 0, and for odd parity it is
5580 // the same as the input value.
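// For example, with 5 active lanes and a uniform input x the reduction is
// x ^ x ^ x ^ x ^ x = x, while with 4 active lanes it is 0. The S_AND_B32
// below extracts that parity bit (popcount(exec) & 1) and the multiply then
// selects x or 0 accordingly.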
5581 Register ParityRegister =
5582 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5583
5584 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5585 .addReg(NewAccumulator->getOperand(0).getReg())
5586 .addImm(1)
5587 .setOperandDead(3); // Dead scc
5588 if (Opc == AMDGPU::S_XOR_B32) {
5589 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5590 .addReg(SrcReg)
5591 .addReg(ParityRegister);
5592 } else {
5593 Register DestSub0 =
5594 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5595 Register DestSub1 =
5596 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5597
5598 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5599 const TargetRegisterClass *SrcSubRC =
5600 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5601
5602 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5603 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5604 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5605 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5606
5607 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5608 .add(Op1L)
5609 .addReg(ParityRegister);
5610
5611 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5612 .add(Op1H)
5613 .addReg(ParityRegister);
5614
5615 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5616 .addReg(DestSub0)
5617 .addImm(AMDGPU::sub0)
5618 .addReg(DestSub1)
5619 .addImm(AMDGPU::sub1);
5620 }
5621 break;
5622 }
5623 case AMDGPU::S_SUB_I32: {
5624 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5625
5626 // Take the negation of the source operand.
5627 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5628 .addImm(0)
5629 .addReg(SrcReg);
5630 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5631 .addReg(NegatedVal)
5632 .addReg(NewAccumulator->getOperand(0).getReg());
5633 break;
5634 }
5635 case AMDGPU::S_ADD_I32: {
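// For a uniform input, the wave-wide add reduction is simply
// input * popcount(exec); NumActiveLanes already holds that popcount
// (computed by the S_BCNT1 above).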
5636 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5637 .addReg(SrcReg)
5638 .addReg(NewAccumulator->getOperand(0).getReg());
5639 break;
5640 }
5641 case AMDGPU::S_ADD_U64_PSEUDO:
5642 case AMDGPU::S_SUB_U64_PSEUDO: {
5643 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5644 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5645 Register Op1H_Op0L_Reg =
5646 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5647 Register Op1L_Op0H_Reg =
5648 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5649 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5650 Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5651 Register NegatedValLo =
5652 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5653 Register NegatedValHi =
5654 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5655
5656 const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
5657 const TargetRegisterClass *Src1SubRC =
5658 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5659
5660 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5661 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5662 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5663 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5664
5665 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5666 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5667 .addImm(0)
5668 .addReg(NewAccumulator->getOperand(0).getReg())
5669 .setOperandDead(3); // Dead scc
5670 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5671 .addReg(NegatedValLo)
5672 .addImm(31)
5673 .setOperandDead(3); // Dead scc
5674 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5675 .add(Op1L)
5676 .addReg(NegatedValHi);
5677 }
5678 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5679 ? NegatedValLo
5680 : NewAccumulator->getOperand(0).getReg();
5681 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5682 .add(Op1L)
5683 .addReg(LowOpcode);
5684 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5685 .add(Op1L)
5686 .addReg(LowOpcode);
5687 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5688 .add(Op1H)
5689 .addReg(LowOpcode);
5690
5691 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5692 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5693 .addReg(CarryReg)
5694 .addReg(Op1H_Op0L_Reg)
5695 .setOperandDead(3); // Dead scc
5696
5697 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5698 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5699 .addReg(HiVal)
5700 .addReg(Op1L_Op0H_Reg)
5701 .setOperandDead(3); // Dead scc
5702 }
5703 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5704 .addReg(DestSub0)
5705 .addImm(AMDGPU::sub0)
5706 .addReg(DestSub1)
5707 .addImm(AMDGPU::sub1);
5708 break;
5709 }
5710 }
5711 RetBB = &BB;
5712 }
5713 }
5714 } else {
5715 // TODO: Implement the DPP strategy and switch based on the immediate strategy
5716 // operand. For now, for all the cases (default, Iterative and DPP) we use the
5717 // iterative approach by default.
5718
5719 // To reduce the VGPR using the iterative approach, we need to iterate
5720 // over all the active lanes. The lowering consists of ComputeLoop,
5721 // which iterates over only the active lanes. We use a copy of the EXEC
5722 // register as the induction variable, and every active lane clears its bit
5723 // with bitset0 so that we get the next active lane in the next iteration.
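//
// The loop body looks roughly like this for the 32-bit case (illustrative
// pseudo-MIR, wave64 mnemonics):
//
//   ComputeLoop:
//     acc        = PHI [identity, entry], [new_acc, ComputeLoop]
//     active     = PHI [exec copy, entry], [new_active, ComputeLoop]
//     lane       = s_ff1_i32_b64  active        ; lowest remaining active lane
//     val        = v_readlane_b32 src, lane
//     new_acc    = <Opc> acc, val
//     new_active = s_bitset0_b64  lane, active
//     s_cmp_lg_u64 new_active, 0
//     s_cbranch_scc1 ComputeLoop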
5725 Register SrcReg = MI.getOperand(1).getReg();
5726 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5727
5728 // Create Control flow for loop
5729 // Split MI's Machine Basic block into For loop
5730 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
5731
5732 // Create virtual registers required for lowering.
5733 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5734 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5735 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5736 Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
5737 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5738 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5739 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5740 Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5741 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
5742
5743 bool IsWave32 = ST.isWave32();
5744 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5745 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5746
5747 // Create initial values of induction variable from Exec, Accumulator and
5748 // insert branch instr to newly created ComputeBlock
5749 BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
5750 if (is32BitOpc) {
5752 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5753 .addImm(IdentityValue);
5754 } else {
5756 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
5757 .addImm(IdentityValue);
5758 }
5759 // clang-format off
5760 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5761 .addMBB(ComputeLoop);
5762 // clang-format on
5763
5764 // Start constructing ComputeLoop
5765 I = ComputeLoop->begin();
5766 auto Accumulator =
5767 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5768 .addReg(IdentityValReg)
5769 .addMBB(&BB);
5770 auto ActiveBits =
5771 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5772 .addReg(LoopIterator)
5773 .addMBB(&BB);
5774
5775 I = ComputeLoop->end();
5776 MachineInstr *NewAccumulator;
5777 // Perform the computations
5778 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5779 BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5780 .addReg(ActiveBitsReg);
5781 if (is32BitOpc) {
5782 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5783 LaneValueReg)
5784 .addReg(SrcReg)
5785 .addReg(FF1Reg);
5786 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5787 .addReg(Accumulator->getOperand(0).getReg())
5788 .addReg(LaneValueReg);
5789 } else {
5790 Register LaneValueLoReg =
5791 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5792 Register LaneValueHiReg =
5793 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5794 Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5795 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5796 const TargetRegisterClass *SrcSubRC =
5797 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5798 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5799 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5800 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5801 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5802 // The lane value input should be in an SGPR.
5803 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5804 LaneValueLoReg)
5805 .add(Op1L)
5806 .addReg(FF1Reg);
5807 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5808 LaneValueHiReg)
5809 .add(Op1H)
5810 .addReg(FF1Reg);
5811 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5812 TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
5813 .addReg(LaneValueLoReg)
5814 .addImm(AMDGPU::sub0)
5815 .addReg(LaneValueHiReg)
5816 .addImm(AMDGPU::sub1);
5817 switch (Opc) {
5818 case AMDGPU::S_OR_B64:
5819 case AMDGPU::S_AND_B64:
5820 case AMDGPU::S_XOR_B64: {
5821 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5822 .addReg(Accumulator->getOperand(0).getReg())
5823 .addReg(LaneValue->getOperand(0).getReg())
5824 .setOperandDead(3); // Dead scc
5825 break;
5826 }
5827 case AMDGPU::V_CMP_GT_I64_e64:
5828 case AMDGPU::V_CMP_GT_U64_e64:
5829 case AMDGPU::V_CMP_LT_I64_e64:
5830 case AMDGPU::V_CMP_LT_U64_e64: {
5831 Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
5832 Register ComparisonResultReg =
5833 MRI.createVirtualRegister(WaveMaskRegClass);
5834 const TargetRegisterClass *VregClass = TRI->getVGPR64Class();
5835 const TargetRegisterClass *VSubRegClass =
5836 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
5837 Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
5838 MachineOperand SrcReg0Sub0 =
5839 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5840 VregClass, AMDGPU::sub0, VSubRegClass);
5841 MachineOperand SrcReg0Sub1 =
5842 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5843 VregClass, AMDGPU::sub1, VSubRegClass);
5844 BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
5845 AccumulatorVReg)
5846 .add(SrcReg0Sub0)
5847 .addImm(AMDGPU::sub0)
5848 .add(SrcReg0Sub1)
5849 .addImm(AMDGPU::sub1);
5850 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
5851 .addReg(LaneValue->getOperand(0).getReg())
5852 .addReg(AccumulatorVReg);
5853
5854 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5855 BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
5856 .addReg(LaneMaskReg)
5857 .addReg(ActiveBitsReg);
5858
5859 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5860 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
5861 .addReg(LaneValue->getOperand(0).getReg())
5862 .addReg(Accumulator->getOperand(0).getReg());
5863 break;
5864 }
5865 case AMDGPU::S_ADD_U64_PSEUDO:
5866 case AMDGPU::S_SUB_U64_PSEUDO: {
5867 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5868 .addReg(Accumulator->getOperand(0).getReg())
5869 .addReg(LaneValue->getOperand(0).getReg());
5870 ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
5871 break;
5872 }
5873 }
5874 }
5875 // Manipulate the iterator to get the next active lane
5876 unsigned BITSETOpc =
5877 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5878 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5879 .addReg(FF1Reg)
5880 .addReg(ActiveBitsReg);
5881
5882 // Add phi nodes
5883 Accumulator.addReg(DstReg).addMBB(ComputeLoop);
5884 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
5885
5886 // Create the branch back to ComputeLoop while any active lanes remain.
5887 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5888 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5889 .addReg(NewActiveBitsReg)
5890 .addImm(0);
5891 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5892 .addMBB(ComputeLoop);
5893
5894 RetBB = ComputeEnd;
5895 }
5896 MI.eraseFromParent();
5897 return RetBB;
5898}
5899
5902 MachineBasicBlock *BB) const {
5903
5905 MachineFunction *MF = BB->getParent();
5907
5908 switch (MI.getOpcode()) {
5909 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5910 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5911 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
5912 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
5913 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5914 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
5915 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
5916 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
5917 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5918 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5919 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
5920 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
5921 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5922 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
5923 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
5924 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
5925 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5926 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5927 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
5928 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
5929 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5930 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5931 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
5932 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
5933 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5934 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
5935 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
5936 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
5937 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5938 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
5939 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
5940 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
5941 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5942 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
5943 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
5944 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
5945 case AMDGPU::S_UADDO_PSEUDO:
5946 case AMDGPU::S_USUBO_PSEUDO: {
5947 const DebugLoc &DL = MI.getDebugLoc();
5948 MachineOperand &Dest0 = MI.getOperand(0);
5949 MachineOperand &Dest1 = MI.getOperand(1);
5950 MachineOperand &Src0 = MI.getOperand(2);
5951 MachineOperand &Src1 = MI.getOperand(3);
5952
5953 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5954 ? AMDGPU::S_ADD_I32
5955 : AMDGPU::S_SUB_I32;
5956 // clang-format off
5957 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
5958 .add(Src0)
5959 .add(Src1);
5960 // clang-format on
5961
5962 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
5963 .addImm(1)
5964 .addImm(0);
5965
5966 MI.eraseFromParent();
5967 return BB;
5968 }
5969 case AMDGPU::S_ADD_U64_PSEUDO:
5970 case AMDGPU::S_SUB_U64_PSEUDO: {
5971 return Expand64BitScalarArithmetic(MI, BB);
5972 }
5973 case AMDGPU::V_ADD_U64_PSEUDO:
5974 case AMDGPU::V_SUB_U64_PSEUDO: {
5976 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5977 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5978 const DebugLoc &DL = MI.getDebugLoc();
5979
5980 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5981
5982 MachineOperand &Dest = MI.getOperand(0);
5983 MachineOperand &Src0 = MI.getOperand(1);
5984 MachineOperand &Src1 = MI.getOperand(2);
5985
5986 if (ST.hasAddSubU64Insts()) {
5987 auto I = BuildMI(*BB, MI, DL,
5988 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
5989 : AMDGPU::V_SUB_U64_e64),
5990 Dest.getReg())
5991 .add(Src0)
5992 .add(Src1)
5993 .addImm(0); // clamp
5994 TII->legalizeOperands(*I);
5995 MI.eraseFromParent();
5996 return BB;
5997 }
5998
5999 if (IsAdd && ST.hasLshlAddU64Inst()) {
6000 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
6001 Dest.getReg())
6002 .add(Src0)
6003 .addImm(0)
6004 .add(Src1);
6005 TII->legalizeOperands(*Add);
6006 MI.eraseFromParent();
6007 return BB;
6008 }
6009
6010 const auto *CarryRC = TRI->getWaveMaskRegClass();
6011
6012 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6013 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6014
6015 Register CarryReg = MRI.createVirtualRegister(CarryRC);
6016 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6017
6018 const TargetRegisterClass *Src0RC = Src0.isReg()
6019 ? MRI.getRegClass(Src0.getReg())
6020 : &AMDGPU::VReg_64RegClass;
6021 const TargetRegisterClass *Src1RC = Src1.isReg()
6022 ? MRI.getRegClass(Src1.getReg())
6023 : &AMDGPU::VReg_64RegClass;
6024
6025 const TargetRegisterClass *Src0SubRC =
6026 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6027 const TargetRegisterClass *Src1SubRC =
6028 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6029
6030 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
6031 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6032 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
6033 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6034
6035 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
6036 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6037 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
6038 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6039
6040 unsigned LoOpc =
6041 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6042 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
6043 .addReg(CarryReg, RegState::Define)
6044 .add(SrcReg0Sub0)
6045 .add(SrcReg1Sub0)
6046 .addImm(0); // clamp bit
6047
6048 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6049 MachineInstr *HiHalf =
6050 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
6051 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
6052 .add(SrcReg0Sub1)
6053 .add(SrcReg1Sub1)
6054 .addReg(CarryReg, RegState::Kill)
6055 .addImm(0); // clamp bit
6056
6057 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
6058 .addReg(DestSub0)
6059 .addImm(AMDGPU::sub0)
6060 .addReg(DestSub1)
6061 .addImm(AMDGPU::sub1);
6062 TII->legalizeOperands(*LoHalf);
6063 TII->legalizeOperands(*HiHalf);
6064 MI.eraseFromParent();
6065 return BB;
6066 }
6067 case AMDGPU::S_ADD_CO_PSEUDO:
6068 case AMDGPU::S_SUB_CO_PSEUDO: {
6069 // This pseudo can only be selected from a uniform add/subcarry node.
6070 // All the VGPR operands are therefore assumed to be splat
6071 // vectors.
6073 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
6074 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6076 const DebugLoc &DL = MI.getDebugLoc();
6077 MachineOperand &Dest = MI.getOperand(0);
6078 MachineOperand &CarryDest = MI.getOperand(1);
6079 MachineOperand &Src0 = MI.getOperand(2);
6080 MachineOperand &Src1 = MI.getOperand(3);
6081 MachineOperand &Src2 = MI.getOperand(4);
6082 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
6083 ? AMDGPU::S_ADDC_U32
6084 : AMDGPU::S_SUBB_U32;
6085 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
6086 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6087 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6088 .addReg(Src0.getReg());
6089 Src0.setReg(RegOp0);
6090 }
6091 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
6092 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6093 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6094 .addReg(Src1.getReg());
6095 Src1.setReg(RegOp1);
6096 }
6097 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6098 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
6099 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6100 .addReg(Src2.getReg());
6101 Src2.setReg(RegOp2);
6102 }
6103
6104 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
6105 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
6106 assert(WaveSize == 64 || WaveSize == 32);
6107
6108 if (WaveSize == 64) {
6109 if (ST.hasScalarCompareEq64()) {
6110 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
6111 .addReg(Src2.getReg())
6112 .addImm(0);
6113 } else {
6114 const TargetRegisterClass *SubRC =
6115 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6116 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
6117 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6118 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
6119 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6120 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6121
6122 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
6123 .add(Src2Sub0)
6124 .add(Src2Sub1);
6125
6126 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6127 .addReg(Src2_32, RegState::Kill)
6128 .addImm(0);
6129 }
6130 } else {
6131 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6132 .addReg(Src2.getReg())
6133 .addImm(0);
6134 }
6135
6136 // clang-format off
6137 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg())
6138 .add(Src0)
6139 .add(Src1);
6140 // clang-format on
6141
6142 unsigned SelOpc =
6143 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6144
6145 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
6146 .addImm(-1)
6147 .addImm(0);
6148
6149 MI.eraseFromParent();
6150 return BB;
6151 }
6152 case AMDGPU::SI_INIT_M0: {
6153 MachineOperand &M0Init = MI.getOperand(0);
6154 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6155 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6156 AMDGPU::M0)
6157 .add(M0Init);
6158 MI.eraseFromParent();
6159 return BB;
6160 }
6161 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6162 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6163 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6164 TII->get(AMDGPU::S_CMP_EQ_U32))
6165 .addImm(0)
6166 .addImm(0);
6167 return BB;
6168 }
6169 case AMDGPU::GET_GROUPSTATICSIZE: {
6170 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
6171 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
6172 DebugLoc DL = MI.getDebugLoc();
6173 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
6174 .add(MI.getOperand(0))
6175 .addImm(MFI->getLDSSize());
6176 MI.eraseFromParent();
6177 return BB;
6178 }
6179 case AMDGPU::GET_SHADERCYCLESHILO: {
6182 const DebugLoc &DL = MI.getDebugLoc();
6183 // The algorithm is:
6184 //
6185 // hi1 = getreg(SHADER_CYCLES_HI)
6186 // lo1 = getreg(SHADER_CYCLES_LO)
6187 // hi2 = getreg(SHADER_CYCLES_HI)
6188 //
6189 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
6190 // Otherwise there was overflow and the result is hi2:0. In both cases the
6191 // result should represent the actual time at some point during the sequence
6192 // of three getregs.
6193 using namespace AMDGPU::Hwreg;
6194 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6195 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
6196 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6197 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6198 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
6199 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6200 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6201 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
6202 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6203 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
6204 .addReg(RegHi1)
6205 .addReg(RegHi2);
6206 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6207 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
6208 .addReg(RegLo1)
6209 .addImm(0);
6210 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
6211 .add(MI.getOperand(0))
6212 .addReg(RegLo)
6213 .addImm(AMDGPU::sub0)
6214 .addReg(RegHi2)
6215 .addImm(AMDGPU::sub1);
6216 MI.eraseFromParent();
6217 return BB;
6218 }
6219 case AMDGPU::SI_INDIRECT_SRC_V1:
6220 case AMDGPU::SI_INDIRECT_SRC_V2:
6221 case AMDGPU::SI_INDIRECT_SRC_V4:
6222 case AMDGPU::SI_INDIRECT_SRC_V8:
6223 case AMDGPU::SI_INDIRECT_SRC_V9:
6224 case AMDGPU::SI_INDIRECT_SRC_V10:
6225 case AMDGPU::SI_INDIRECT_SRC_V11:
6226 case AMDGPU::SI_INDIRECT_SRC_V12:
6227 case AMDGPU::SI_INDIRECT_SRC_V16:
6228 case AMDGPU::SI_INDIRECT_SRC_V32:
6229 return emitIndirectSrc(MI, *BB, *getSubtarget());
6230 case AMDGPU::SI_INDIRECT_DST_V1:
6231 case AMDGPU::SI_INDIRECT_DST_V2:
6232 case AMDGPU::SI_INDIRECT_DST_V4:
6233 case AMDGPU::SI_INDIRECT_DST_V8:
6234 case AMDGPU::SI_INDIRECT_DST_V9:
6235 case AMDGPU::SI_INDIRECT_DST_V10:
6236 case AMDGPU::SI_INDIRECT_DST_V11:
6237 case AMDGPU::SI_INDIRECT_DST_V12:
6238 case AMDGPU::SI_INDIRECT_DST_V16:
6239 case AMDGPU::SI_INDIRECT_DST_V32:
6240 return emitIndirectDst(MI, *BB, *getSubtarget());
6241 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6242 case AMDGPU::SI_KILL_I1_PSEUDO:
6243 return splitKillBlock(MI, BB);
6244 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6246 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
6247 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6248
6249 Register Dst = MI.getOperand(0).getReg();
6250 const MachineOperand &Src0 = MI.getOperand(1);
6251 const MachineOperand &Src1 = MI.getOperand(2);
6252 const DebugLoc &DL = MI.getDebugLoc();
6253 Register SrcCond = MI.getOperand(3).getReg();
6254
6255 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6256 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6257 const auto *CondRC = TRI->getWaveMaskRegClass();
6258 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
6259
6260 const TargetRegisterClass *Src0RC = Src0.isReg()
6261 ? MRI.getRegClass(Src0.getReg())
6262 : &AMDGPU::VReg_64RegClass;
6263 const TargetRegisterClass *Src1RC = Src1.isReg()
6264 ? MRI.getRegClass(Src1.getReg())
6265 : &AMDGPU::VReg_64RegClass;
6266
6267 const TargetRegisterClass *Src0SubRC =
6268 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6269 const TargetRegisterClass *Src1SubRC =
6270 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6271
6272 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
6273 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6274 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
6275 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6276
6277 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
6278 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6279 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
6280 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6281
6282 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
6283 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
6284 .addImm(0)
6285 .add(Src0Sub0)
6286 .addImm(0)
6287 .add(Src1Sub0)
6288 .addReg(SrcCondCopy);
6289 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
6290 .addImm(0)
6291 .add(Src0Sub1)
6292 .addImm(0)
6293 .add(Src1Sub1)
6294 .addReg(SrcCondCopy);
6295
6296 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
6297 .addReg(DstLo)
6298 .addImm(AMDGPU::sub0)
6299 .addReg(DstHi)
6300 .addImm(AMDGPU::sub1);
6301 MI.eraseFromParent();
6302 return BB;
6303 }
6304 case AMDGPU::SI_BR_UNDEF: {
6306 const DebugLoc &DL = MI.getDebugLoc();
6307 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
6308 .add(MI.getOperand(0));
6309 Br->getOperand(1).setIsUndef(); // read undef SCC
6310 MI.eraseFromParent();
6311 return BB;
6312 }
6313 case AMDGPU::ADJCALLSTACKUP:
6314 case AMDGPU::ADJCALLSTACKDOWN: {
6316 MachineInstrBuilder MIB(*MF, &MI);
6317 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
6318 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
6319 return BB;
6320 }
6321 case AMDGPU::SI_CALL_ISEL: {
6323 const DebugLoc &DL = MI.getDebugLoc();
6324
6325 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
6326
6328 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6329
6330 for (const MachineOperand &MO : MI.operands())
6331 MIB.add(MO);
6332
6333 MIB.cloneMemRefs(MI);
6334 MI.eraseFromParent();
6335 return BB;
6336 }
6337 case AMDGPU::V_ADD_CO_U32_e32:
6338 case AMDGPU::V_SUB_CO_U32_e32:
6339 case AMDGPU::V_SUBREV_CO_U32_e32: {
6340 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
6341 const DebugLoc &DL = MI.getDebugLoc();
6342 unsigned Opc = MI.getOpcode();
6343
6344 bool NeedClampOperand = false;
6345 if (TII->pseudoToMCOpcode(Opc) == -1) {
6347 NeedClampOperand = true;
6348 }
6349
6350 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
6351 if (TII->isVOP3(*I)) {
6352 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
6353 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6354 I.addReg(TRI->getVCC(), RegState::Define);
6355 }
6356 I.add(MI.getOperand(1)).add(MI.getOperand(2));
6357 if (NeedClampOperand)
6358 I.addImm(0); // clamp bit for e64 encoding
6359
6360 TII->legalizeOperands(*I);
6361
6362 MI.eraseFromParent();
6363 return BB;
6364 }
6365 case AMDGPU::V_ADDC_U32_e32:
6366 case AMDGPU::V_SUBB_U32_e32:
6367 case AMDGPU::V_SUBBREV_U32_e32:
6368 // These instructions have an implicit use of vcc which counts towards the
6369 // constant bus limit.
6370 TII->legalizeOperands(MI);
6371 return BB;
6372 case AMDGPU::DS_GWS_INIT:
6373 case AMDGPU::DS_GWS_SEMA_BR:
6374 case AMDGPU::DS_GWS_BARRIER:
6375 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
6376 [[fallthrough]];
6377 case AMDGPU::DS_GWS_SEMA_V:
6378 case AMDGPU::DS_GWS_SEMA_P:
6379 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6380 // An s_waitcnt 0 is required to be the instruction immediately following.
6381 if (getSubtarget()->hasGWSAutoReplay()) {
6383 return BB;
6384 }
6385
6386 return emitGWSMemViolTestLoop(MI, BB);
6387 case AMDGPU::S_SETREG_B32: {
6388 // Try to optimize cases that only set the denormal mode or rounding mode.
6389 //
6390 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
6391 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
6392 // instead.
6393 //
6394 // FIXME: This could be predicated on the immediate, but tablegen doesn't
6395 // allow you to have a no-side-effect instruction in the output of a
6396 // side-effecting pattern.
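//
// For example (illustrative only), a write of the constant 3 to
// hwreg(HW_REG_MODE, 0, 4) only touches the rounding-mode bits and can be
// emitted as "s_round_mode 0x3" instead.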
6397 auto [ID, Offset, Width] =
6398 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
6400 return BB;
6401
6402 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
6403 const unsigned SetMask = WidthMask << Offset;
6404
6405 if (getSubtarget()->hasDenormModeInst()) {
6406 unsigned SetDenormOp = 0;
6407 unsigned SetRoundOp = 0;
6408
6409 // The dedicated instructions can only set the whole denorm or round mode
6410 // at once, not a subset of bits in either.
6411 if (SetMask ==
6413 // If this fully sets both the round and denorm mode, emit the two
6414 // dedicated instructions for these.
6415 SetRoundOp = AMDGPU::S_ROUND_MODE;
6416 SetDenormOp = AMDGPU::S_DENORM_MODE;
6417 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
6418 SetRoundOp = AMDGPU::S_ROUND_MODE;
6419 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
6420 SetDenormOp = AMDGPU::S_DENORM_MODE;
6421 }
6422
6423 if (SetRoundOp || SetDenormOp) {
6425 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
6426 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6427 unsigned ImmVal = Def->getOperand(1).getImm();
6428 if (SetRoundOp) {
6429 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
6430 .addImm(ImmVal & 0xf);
6431
6432 // If we also have the denorm mode, get just the denorm mode bits.
6433 ImmVal >>= 4;
6434 }
6435
6436 if (SetDenormOp) {
6437 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
6438 .addImm(ImmVal & 0xf);
6439 }
6440
6441 MI.eraseFromParent();
6442 return BB;
6443 }
6444 }
6445 }
6446
6447 // If only FP bits are touched, use the no-side-effects pseudo.
6448 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
6449 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
6450 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
6451
6452 return BB;
6453 }
6454 case AMDGPU::S_INVERSE_BALLOT_U32:
6455 case AMDGPU::S_INVERSE_BALLOT_U64:
6456 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
6457 // necessary. After that they are equivalent to a COPY.
6458 MI.setDesc(TII->get(AMDGPU::COPY));
6459 return BB;
6460 case AMDGPU::ENDPGM_TRAP: {
6461 const DebugLoc &DL = MI.getDebugLoc();
6462 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
6463 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
6464 MI.addOperand(MachineOperand::CreateImm(0));
6465 return BB;
6466 }
6467
6468 // We need a block split to make the real endpgm a terminator. We also don't
6469 // want to break phis in successor blocks, so we can't just delete to the
6470 // end of the block.
6471
6472 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
6474 MF->push_back(TrapBB);
6475 // clang-format off
6476 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
6477 .addImm(0);
6478 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
6479 .addMBB(TrapBB);
6480 // clang-format on
6481
6482 BB->addSuccessor(TrapBB);
6483 MI.eraseFromParent();
6484 return SplitBB;
6485 }
6486 case AMDGPU::SIMULATED_TRAP: {
6487 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6489 MachineBasicBlock *SplitBB =
6490 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6491 MI.eraseFromParent();
6492 return SplitBB;
6493 }
6494 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6495 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6497
6498 // During ISel, it's difficult to propagate the original EXEC mask to use as
6499 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
6500 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
6501 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6502 Register OriginalExec = Setup->getOperand(0).getReg();
6503 MF->getRegInfo().clearKillFlags(OriginalExec);
6504 MI.getOperand(0).setReg(OriginalExec);
6505 return BB;
6506 }
6507 default:
6508 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
6509 if (!MI.mayStore())
6511 return BB;
6512 }
6514 }
6515}
6516
6518 // This currently forces unfolding various combinations of fsub into fma with
6519 // free fneg'd operands. As long as we have fast FMA (controlled by
6520 // isFMAFasterThanFMulAndFAdd), we should perform these.
6521
6522 // When fma is quarter rate, for f64 where add / sub are at best half rate,
6523 // most of these combines appear to be cycle neutral but save on instruction
6524 // count / code size.
6525 return true;
6526}
6527
6529
6531 EVT VT) const {
6532 if (!VT.isVector()) {
6533 return MVT::i1;
6534 }
6535 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
6536}
6537
6539 // TODO: Should i16 be used always if legal? For now it would force VALU
6540 // shifts.
6541 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6542}
6543
6545 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6546 ? Ty.changeElementSize(16)
6547 : Ty.changeElementSize(32);
6548}
6549
6550 // Answering this is somewhat tricky and depends on the specific device, since
6551 // different devices have different rates for fma and for f64 operations.
6552//
6553// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
6554// regardless of which device (although the number of cycles differs between
6555// devices), so it is always profitable for f64.
6556//
6557// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
6558// only on full rate devices. Normally, we should prefer selecting v_mad_f32
6559// which we can always do even without fused FP ops since it returns the same
6560// result as the separate operations and since it is always full
6561// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
6562// however does not support denormals, so we do report fma as faster if we have
6563// a fast fma device and require denormals.
6564//
6566 EVT VT) const {
6567 VT = VT.getScalarType();
6568
6569 switch (VT.getSimpleVT().SimpleTy) {
6570 case MVT::f32: {
6571 // If mad is not available this depends only on if f32 fma is full rate.
6572 if (!Subtarget->hasMadMacF32Insts())
6573 return Subtarget->hasFastFMAF32();
6574
6575 // Otherwise f32 mad is always full rate and returns the same result as
6576 // the separate operations so should be preferred over fma.
6577 // However, it does not support denormals.
6579 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6580
6581 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
6582 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6583 }
6584 case MVT::f64:
6585 return true;
6586 case MVT::f16:
6587 case MVT::bf16:
6588 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
6589 default:
6590 break;
6591 }
6592
6593 return false;
6594}
6595
6597 LLT Ty) const {
6598 switch (Ty.getScalarSizeInBits()) {
6599 case 16:
6600 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
6601 case 32:
6602 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
6603 case 64:
6604 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
6605 default:
6606 break;
6607 }
6608
6609 return false;
6610}
6611
6613 if (!Ty.isScalar())
6614 return false;
6615
6616 if (Ty.getScalarSizeInBits() == 16)
6617 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
6618 if (Ty.getScalarSizeInBits() == 32)
6619 return Subtarget->hasMadMacF32Insts() &&
6620 denormalModeIsFlushAllF32(*MI.getMF());
6621
6622 return false;
6623}
6624
6626 const SDNode *N) const {
6627 // TODO: Check future ftz flag
6628 // v_mad_f32/v_mac_f32 do not support denormals.
6629 EVT VT = N->getValueType(0);
6630 if (VT == MVT::f32)
6631 return Subtarget->hasMadMacF32Insts() &&
6632 denormalModeIsFlushAllF32(DAG.getMachineFunction());
6633 if (VT == MVT::f16) {
6634 return Subtarget->hasMadF16() &&
6635 denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
6636 }
6637
6638 return false;
6639}
6640
6641//===----------------------------------------------------------------------===//
6642// Custom DAG Lowering Operations
6643//===----------------------------------------------------------------------===//
6644
6645// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6646// wider vector type is legal.
6648 SelectionDAG &DAG) const {
6649 unsigned Opc = Op.getOpcode();
6650 EVT VT = Op.getValueType();
6651 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
6652 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
6653 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6654 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
6655
6656 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
6657
6658 SDLoc SL(Op);
6659 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
6660 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
6661
6662 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6663}
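// --- Editorial note (not part of the upstream source) ---
// Illustrative example of the unary split performed above: a (fneg v4f16:%x)
// becomes
//   concat_vectors (fneg v2f16:%lo), (fneg v2f16:%hi)
// so each half is an operation on a natively supported packed type instead of
// LegalizeDAG scalarizing all four lanes.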
6664
6665// Enable lowering of ROTR for vxi32 types. This is a workaround for a
6666// regression whereby extra unnecessary instructions were added to codegen
6667// for rotr operations, caused by legalizing v2i32 or. This resulted in extra
6668// instructions to extract the result from the vector.
6670 [[maybe_unused]] EVT VT = Op.getValueType();
6671
6672 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6673 VT == MVT::v16i32) &&
6674 "Unexpected ValueType.");
6675
6676 return DAG.UnrollVectorOp(Op.getNode());
6677}
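// --- Editorial note (not part of the upstream source) ---
// Illustrative example: (rotr v2i32:%v, v2i32:%amt) is unrolled above into
// two scalar i32 rotr nodes plus a build_vector, which the selector can then
// typically match to per-lane v_alignbit_b32, instead of going through the
// v2i32 or/shift legalization that caused the regression described above.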
6678
6679// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6680// wider vector type is legal.
6682 SelectionDAG &DAG) const {
6683 unsigned Opc = Op.getOpcode();
6684 EVT VT = Op.getValueType();
6685 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6686 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6687 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6688 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6689 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6690 VT == MVT::v32bf16);
6691
6692 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
6693 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6694
6695 SDLoc SL(Op);
6696
6697 SDValue OpLo =
6698 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6699 SDValue OpHi =
6700 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6701
6702 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6703}
6704
6706 SelectionDAG &DAG) const {
6707 unsigned Opc = Op.getOpcode();
6708 EVT VT = Op.getValueType();
6709 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6710 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6711 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6712 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6713 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6714 VT == MVT::v32bf16);
6715
6716 SDValue Op0 = Op.getOperand(0);
6717 auto [Lo0, Hi0] = Op0.getValueType().isVector()
6718 ? DAG.SplitVectorOperand(Op.getNode(), 0)
6719 : std::pair(Op0, Op0);
6720
6721 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6722 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
6723
6724 SDLoc SL(Op);
6725 auto ResVT = DAG.GetSplitDestVTs(VT);
6726
6727 SDValue OpLo =
6728 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
6729 SDValue OpHi =
6730 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
6731
6732 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6733}
6734
6735SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
6736 switch (Op.getOpcode()) {
6737 default:
6738 return AMDGPUTargetLowering::LowerOperation(Op, DAG);
6739 case ISD::BRCOND:
6740 return LowerBRCOND(Op, DAG);
6741 case ISD::RETURNADDR:
6742 return LowerRETURNADDR(Op, DAG);
6743 case ISD::LOAD: {
6744 SDValue Result = LowerLOAD(Op, DAG);
6745 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6746 "Load should return a value and a chain");
6747 return Result;
6748 }
6749 case ISD::FSQRT: {
6750 EVT VT = Op.getValueType();
6751 if (VT == MVT::f32)
6752 return lowerFSQRTF32(Op, DAG);
6753 if (VT == MVT::f64)
6754 return lowerFSQRTF64(Op, DAG);
6755 return SDValue();
6756 }
6757 case ISD::FSIN:
6758 case ISD::FCOS:
6759 return LowerTrig(Op, DAG);
6760 case ISD::SELECT:
6761 return LowerSELECT(Op, DAG);
6762 case ISD::FDIV:
6763 return LowerFDIV(Op, DAG);
6764 case ISD::FFREXP:
6765 return LowerFFREXP(Op, DAG);
6766 case ISD::ATOMIC_CMP_SWAP:
6767 return LowerATOMIC_CMP_SWAP(Op, DAG);
6768 case ISD::STORE:
6769 return LowerSTORE(Op, DAG);
6770 case ISD::GlobalAddress: {
6771 MachineFunction &MF = DAG.getMachineFunction();
6772 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6773 return LowerGlobalAddress(MFI, Op, DAG);
6774 }
6775 case ISD::INTRINSIC_WO_CHAIN:
6776 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6777 case ISD::INTRINSIC_W_CHAIN:
6778 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6779 case ISD::INTRINSIC_VOID:
6780 return LowerINTRINSIC_VOID(Op, DAG);
6781 case ISD::ADDRSPACECAST:
6782 return lowerADDRSPACECAST(Op, DAG);
6783 case ISD::INSERT_SUBVECTOR:
6784 return lowerINSERT_SUBVECTOR(Op, DAG);
6785 case ISD::INSERT_VECTOR_ELT:
6786 return lowerINSERT_VECTOR_ELT(Op, DAG);
6787 case ISD::EXTRACT_VECTOR_ELT:
6788 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
6789 case ISD::VECTOR_SHUFFLE:
6790 return lowerVECTOR_SHUFFLE(Op, DAG);
6791 case ISD::SCALAR_TO_VECTOR:
6792 return lowerSCALAR_TO_VECTOR(Op, DAG);
6793 case ISD::BUILD_VECTOR:
6794 return lowerBUILD_VECTOR(Op, DAG);
6795 case ISD::FP_ROUND:
6796 case ISD::STRICT_FP_ROUND:
6797 return lowerFP_ROUND(Op, DAG);
6798 case ISD::TRAP:
6799 return lowerTRAP(Op, DAG);
6800 case ISD::DEBUGTRAP:
6801 return lowerDEBUGTRAP(Op, DAG);
6802 case ISD::ABS:
6803 case ISD::FABS:
6804 case ISD::FNEG:
6805 case ISD::FCANONICALIZE:
6806 case ISD::BSWAP:
6807 return splitUnaryVectorOp(Op, DAG);
6808 case ISD::FMINNUM:
6809 case ISD::FMAXNUM:
6810 return lowerFMINNUM_FMAXNUM(Op, DAG);
6811 case ISD::FMINIMUMNUM:
6812 case ISD::FMAXIMUMNUM:
6813 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
6814 case ISD::FMINIMUM:
6815 case ISD::FMAXIMUM:
6816 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
6817 case ISD::FLDEXP:
6818 case ISD::STRICT_FLDEXP:
6819 return lowerFLDEXP(Op, DAG);
6820 case ISD::FMA:
6821 return splitTernaryVectorOp(Op, DAG);
6822 case ISD::FP_TO_SINT:
6823 case ISD::FP_TO_UINT:
6824 return LowerFP_TO_INT(Op, DAG);
6825 case ISD::SHL:
6826 case ISD::SRA:
6827 case ISD::SRL:
6828 case ISD::ADD:
6829 case ISD::SUB:
6830 case ISD::SMIN:
6831 case ISD::SMAX:
6832 case ISD::UMIN:
6833 case ISD::UMAX:
6834 case ISD::FADD:
6835 case ISD::FMUL:
6836 case ISD::FMINNUM_IEEE:
6837 case ISD::FMAXNUM_IEEE:
6838 case ISD::UADDSAT:
6839 case ISD::USUBSAT:
6840 case ISD::SADDSAT:
6841 case ISD::SSUBSAT:
6842 return splitBinaryVectorOp(Op, DAG);
6843 case ISD::FCOPYSIGN:
6844 return lowerFCOPYSIGN(Op, DAG);
6845 case ISD::MUL:
6846 return lowerMUL(Op, DAG);
6847 case ISD::SMULO:
6848 case ISD::UMULO:
6849 return lowerXMULO(Op, DAG);
6850 case ISD::SMUL_LOHI:
6851 case ISD::UMUL_LOHI:
6852 return lowerXMUL_LOHI(Op, DAG);
6853 case ISD::DYNAMIC_STACKALLOC:
6854 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6855 case ISD::STACKSAVE:
6856 return LowerSTACKSAVE(Op, DAG);
6857 case ISD::GET_ROUNDING:
6858 return lowerGET_ROUNDING(Op, DAG);
6859 case ISD::SET_ROUNDING:
6860 return lowerSET_ROUNDING(Op, DAG);
6861 case ISD::PREFETCH:
6862 return lowerPREFETCH(Op, DAG);
6863 case ISD::FP_EXTEND:
6864 case ISD::STRICT_FP_EXTEND:
6865 return lowerFP_EXTEND(Op, DAG);
6866 case ISD::GET_FPENV:
6867 return lowerGET_FPENV(Op, DAG);
6868 case ISD::SET_FPENV:
6869 return lowerSET_FPENV(Op, DAG);
6870 case ISD::ROTR:
6871 return lowerROTR(Op, DAG);
6872 }
6873 return SDValue();
6874}
6875
6876// Used for D16: Casts the result of an instruction into the right vector,
6877// packs values if loads return unpacked values.
6879 const SDLoc &DL, SelectionDAG &DAG,
6880 bool Unpacked) {
6881 if (!LoadVT.isVector())
6882 return Result;
6883
6884 // Cast back to the original packed type or to a larger type that is a
6885 // multiple of 32 bits for D16. Widening the return type is required for
6886 // legalization.
6887 EVT FittingLoadVT = LoadVT;
6888 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6889 FittingLoadVT =
6891 LoadVT.getVectorNumElements() + 1);
6892 }
6893
6894 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6895 // Truncate to v2i16/v4i16.
6896 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6897
6898 // Workaround legalizer not scalarizing truncate after vector op
6899 // legalization but not creating intermediate vector trunc.
6901 DAG.ExtractVectorElements(Result, Elts);
6902 for (SDValue &Elt : Elts)
6903 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6904
6905 // Pad illegal v1i16/v3f16 to v4i16
6906 if ((LoadVT.getVectorNumElements() % 2) == 1)
6907 Elts.push_back(DAG.getPOISON(MVT::i16));
6908
6909 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6910
6911 // Bitcast to original type (v2f16/v4f16).
6912 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6913 }
6914
6915 // Cast back to the original packed type.
6916 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6917}
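// --- Editorial note (not part of the upstream source) ---
// Illustrative example: on a subtarget with unpacked D16 memory instructions
// (hasUnpackedD16VMem), a v4f16 D16 load is emitted with a v4i32 result; the
// code above truncates each lane to i16, rebuilds a v4i16, and bitcasts back
// to v4f16. On packed-D16 subtargets only the final bitcast is needed.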
6918
6919SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6920 SelectionDAG &DAG,
6922 bool IsIntrinsic) const {
6923 SDLoc DL(M);
6924
6925 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6926 EVT LoadVT = M->getValueType(0);
6927
6928 EVT EquivLoadVT = LoadVT;
6929 if (LoadVT.isVector()) {
6930 if (Unpacked) {
6931 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6932 LoadVT.getVectorNumElements());
6933 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
6934 // Widen v3f16 to legal type
6935 EquivLoadVT =
6937 LoadVT.getVectorNumElements() + 1);
6938 }
6939 }
6940
6941 // Change from v4f16/v2f16 to EquivLoadVT.
6942 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
6943
6944 SDValue Load = DAG.getMemIntrinsicNode(
6945 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
6946 M->getMemoryVT(), M->getMemOperand());
6947
6948 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
6949
6950 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
6951}
6952
6953SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
6954 SelectionDAG &DAG,
6955 ArrayRef<SDValue> Ops) const {
6956 SDLoc DL(M);
6957 EVT LoadVT = M->getValueType(0);
6958 EVT EltType = LoadVT.getScalarType();
6959 EVT IntVT = LoadVT.changeTypeToInteger();
6960
6961 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
6962
6963 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6964 bool IsTFE = M->getNumValues() == 3;
6965
6966 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
6967 : AMDGPUISD::BUFFER_LOAD_FORMAT)
6968 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
6969 : AMDGPUISD::BUFFER_LOAD;
6970
6971 if (IsD16) {
6972 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
6973 }
6974
6975 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6976 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
6977 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6978 IsTFE);
6979
6980 if (isTypeLegal(LoadVT)) {
6981 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6982 M->getMemOperand(), DAG);
6983 }
6984
6985 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
6986 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6987 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
6988 M->getMemOperand(), DAG);
6989 return DAG.getMergeValues(
6990 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6991 DL);
6992}
6993
6995 SelectionDAG &DAG) {
6996 EVT VT = N->getValueType(0);
6997 unsigned CondCode = N->getConstantOperandVal(3);
6998 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6999 return DAG.getPOISON(VT);
7000
7001 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
7002
7003 SDValue LHS = N->getOperand(1);
7004 SDValue RHS = N->getOperand(2);
7005
7006 SDLoc DL(N);
7007
7008 EVT CmpVT = LHS.getValueType();
7009 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
7010 unsigned PromoteOp =
7011 ICmpInst::isSigned(IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7012 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
7013 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
7014 }
7015
7016 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
7017
7018 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7019 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7020
7021 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
7022 DAG.getCondCode(CCOpcode));
7023 if (VT.bitsEq(CCVT))
7024 return SetCC;
7025 return DAG.getZExtOrTrunc(SetCC, DL, VT);
7026}
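// --- Editorial note (not part of the upstream source) ---
// Illustrative example for the llvm.amdgcn.icmp lowering above: with an "eq"
// predicate on a wave64 target it produces
//   (AMDGPUISD::SETCC i64, %lhs, %rhs, seteq)
// i.e. a lane-mask-wide compare. Illegal i16 operands are first promoted to
// i32 (sign- or zero-extended depending on the predicate), and the lane-mask
// result is zero-extended or truncated if the intrinsic's return type has a
// different width.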
7027
7029 SelectionDAG &DAG) {
7030 EVT VT = N->getValueType(0);
7031
7032 unsigned CondCode = N->getConstantOperandVal(3);
7033 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
7034 return DAG.getPOISON(VT);
7035
7036 SDValue Src0 = N->getOperand(1);
7037 SDValue Src1 = N->getOperand(2);
7038 EVT CmpVT = Src0.getValueType();
7039 SDLoc SL(N);
7040
7041 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
7042 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7043 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7044 }
7045
7046 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
7047 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
7048 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7049 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7050 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7051 DAG.getCondCode(CCOpcode));
7052 if (VT.bitsEq(CCVT))
7053 return SetCC;
7054 return DAG.getZExtOrTrunc(SetCC, SL, VT);
7055}
7056
7058 SelectionDAG &DAG) {
7059 EVT VT = N->getValueType(0);
7060 SDValue Src = N->getOperand(1);
7061 SDLoc SL(N);
7062
7063 if (Src.getOpcode() == ISD::SETCC) {
7064 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
7065 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
7066 Src.getOperand(1), Src.getOperand(2));
7067 }
7068 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
7069 // (ballot 0) -> 0
7070 if (Arg->isZero())
7071 return DAG.getConstant(0, SL, VT);
7072
7073 // (ballot 1) -> EXEC/EXEC_LO
7074 if (Arg->isOne()) {
7075 Register Exec;
7076 if (VT.getScalarSizeInBits() == 32)
7077 Exec = AMDGPU::EXEC_LO;
7078 else if (VT.getScalarSizeInBits() == 64)
7079 Exec = AMDGPU::EXEC;
7080 else
7081 return SDValue();
7082
7083 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
7084 }
7085 }
7086
7087 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
7088 // ISD::SETNE)
7089 return DAG.getNode(
7090 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
7091 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
7092}
7093
7095 SelectionDAG &DAG) {
7096 EVT VT = N->getValueType(0);
7097 unsigned ValSize = VT.getSizeInBits();
7098 unsigned IID = N->getConstantOperandVal(0);
7099 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7100 IID == Intrinsic::amdgcn_permlanex16;
7101 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7102 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7103 SDLoc SL(N);
7104 MVT IntVT = MVT::getIntegerVT(ValSize);
7105 const GCNSubtarget *ST = TLI.getSubtarget();
7106 unsigned SplitSize = 32;
7107 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7108 ST->hasDPALU_DPP() &&
7109 AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
7110 SplitSize = 64;
7111
7112 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
7113 SDValue Src2, MVT ValT) -> SDValue {
7115 switch (IID) {
7116 case Intrinsic::amdgcn_permlane16:
7117 case Intrinsic::amdgcn_permlanex16:
7118 case Intrinsic::amdgcn_update_dpp:
7119 Operands.push_back(N->getOperand(6));
7120 Operands.push_back(N->getOperand(5));
7121 Operands.push_back(N->getOperand(4));
7122 [[fallthrough]];
7123 case Intrinsic::amdgcn_writelane:
7124 Operands.push_back(Src2);
7125 [[fallthrough]];
7126 case Intrinsic::amdgcn_readlane:
7127 case Intrinsic::amdgcn_set_inactive:
7128 case Intrinsic::amdgcn_set_inactive_chain_arg:
7129 case Intrinsic::amdgcn_mov_dpp8:
7130 Operands.push_back(Src1);
7131 [[fallthrough]];
7132 case Intrinsic::amdgcn_readfirstlane:
7133 case Intrinsic::amdgcn_permlane64:
7134 Operands.push_back(Src0);
7135 break;
7136 default:
7137 llvm_unreachable("unhandled lane op");
7138 }
7139
7140 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
7141 std::reverse(Operands.begin(), Operands.end());
7142
7143 if (SDNode *GL = N->getGluedNode()) {
7144 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7145 GL = GL->getOperand(0).getNode();
7146 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7147 SDValue(GL, 0)));
7148 }
7149
7150 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
7151 };
7152
7153 SDValue Src0 = N->getOperand(1);
7154 SDValue Src1, Src2;
7155 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7156 IID == Intrinsic::amdgcn_mov_dpp8 ||
7157 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7158 Src1 = N->getOperand(2);
7159 if (IID == Intrinsic::amdgcn_writelane ||
7160 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7161 Src2 = N->getOperand(3);
7162 }
7163
7164 if (ValSize == SplitSize) {
7165 // Already legal
7166 return SDValue();
7167 }
7168
7169 if (ValSize < 32) {
7170 bool IsFloat = VT.isFloatingPoint();
7171 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
7172 SL, MVT::i32);
7173
7174 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7175 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
7176 SL, MVT::i32);
7177 }
7178
7179 if (IID == Intrinsic::amdgcn_writelane) {
7180 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
7181 SL, MVT::i32);
7182 }
7183
7184 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7185 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
7186 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
7187 }
7188
7189 if (ValSize % SplitSize != 0)
7190 return SDValue();
7191
7192 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
7193 EVT VT = N->getValueType(0);
7194 unsigned NE = VT.getVectorNumElements();
7195 EVT EltVT = VT.getVectorElementType();
7197 unsigned NumOperands = N->getNumOperands();
7198 SmallVector<SDValue, 4> Operands(NumOperands);
7199 SDNode *GL = N->getGluedNode();
7200
7201 // only handle convergencectrl_glue
7202 assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7203
7204 for (unsigned i = 0; i != NE; ++i) {
7205 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7206 ++j) {
7207 SDValue Operand = N->getOperand(j);
7208 EVT OperandVT = Operand.getValueType();
7209 if (OperandVT.isVector()) {
7210 // A vector operand; extract a single element.
7211 EVT OperandEltVT = OperandVT.getVectorElementType();
7212 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
7213 Operand, DAG.getVectorIdxConstant(i, SL));
7214 } else {
7215 // A scalar operand; just use it as is.
7216 Operands[j] = Operand;
7217 }
7218 }
7219
7220 if (GL)
7221 Operands[NumOperands - 1] =
7222 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7223 SDValue(GL->getOperand(0).getNode(), 0));
7224
7225 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
7226 }
7227
7228 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
7229 return DAG.getBuildVector(VecVT, SL, Scalars);
7230 };
7231
7232 if (VT.isVector()) {
7233 switch (MVT::SimpleValueType EltTy =
7235 case MVT::i32:
7236 case MVT::f32:
7237 if (SplitSize == 32) {
7238 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
7239 return unrollLaneOp(LaneOp.getNode());
7240 }
7241 [[fallthrough]];
7242 case MVT::i16:
7243 case MVT::f16:
7244 case MVT::bf16: {
7245 unsigned SubVecNumElt =
7246 SplitSize / VT.getVectorElementType().getSizeInBits();
7247 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
7249 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7250 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7251 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
7252 DAG.getConstant(EltIdx, SL, MVT::i32));
7253
7254 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7255 IsPermLane16)
7256 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
7257 DAG.getConstant(EltIdx, SL, MVT::i32));
7258
7259 if (IID == Intrinsic::amdgcn_writelane)
7260 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
7261 DAG.getConstant(EltIdx, SL, MVT::i32));
7262
7263 Pieces.push_back(
7264 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7265 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7266 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7267 EltIdx += SubVecNumElt;
7268 }
7269 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
7270 }
7271 default:
7272 // Handle all other cases by bitcasting to i32 vectors
7273 break;
7274 }
7275 }
7276
7277 MVT VecVT =
7278 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
7279 Src0 = DAG.getBitcast(VecVT, Src0);
7280
7281 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7282 Src1 = DAG.getBitcast(VecVT, Src1);
7283
7284 if (IID == Intrinsic::amdgcn_writelane)
7285 Src2 = DAG.getBitcast(VecVT, Src2);
7286
7287 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7288 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
7289 return DAG.getBitcast(VT, UnrolledLaneOp);
7290}
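// --- Editorial note (not part of the upstream source) ---
// Illustrative example: an f64 llvm.amdgcn.readlane is handled above by
// bitcasting the source to v2i32, emitting one 32-bit readlane per element,
// and bitcasting the rebuilt vector back to f64. Sub-32-bit types
// (i16/f16/bf16) instead take the any-extend-to-i32 path earlier in the
// function.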
7291
7294 SelectionDAG &DAG) const {
7295 switch (N->getOpcode()) {
7297 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
7298 Results.push_back(Res);
7299 return;
7300 }
7302 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
7303 Results.push_back(Res);
7304 return;
7305 }
7307 unsigned IID = N->getConstantOperandVal(0);
7308 switch (IID) {
7309 case Intrinsic::amdgcn_make_buffer_rsrc:
7310 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
7311 return;
7312 case Intrinsic::amdgcn_cvt_pkrtz: {
7313 SDValue Src0 = N->getOperand(1);
7314 SDValue Src1 = N->getOperand(2);
7315 SDLoc SL(N);
7316 SDValue Cvt =
7317 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
7318 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
7319 return;
7320 }
7321 case Intrinsic::amdgcn_cvt_pknorm_i16:
7322 case Intrinsic::amdgcn_cvt_pknorm_u16:
7323 case Intrinsic::amdgcn_cvt_pk_i16:
7324 case Intrinsic::amdgcn_cvt_pk_u16: {
7325 SDValue Src0 = N->getOperand(1);
7326 SDValue Src1 = N->getOperand(2);
7327 SDLoc SL(N);
7328 unsigned Opcode;
7329
7330 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7331 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7332 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7333 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7334 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7335 Opcode = AMDGPUISD::CVT_PK_I16_I32;
7336 else
7337 Opcode = AMDGPUISD::CVT_PK_U16_U32;
7338
7339 EVT VT = N->getValueType(0);
7340 if (isTypeLegal(VT))
7341 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
7342 else {
7343 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
7344 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
7345 }
7346 return;
7347 }
7348 case Intrinsic::amdgcn_s_buffer_load: {
7349 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
7350 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
7351 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
7352 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
7353 // s_buffer_load_i8.
7354 if (!Subtarget->hasScalarSubwordLoads())
7355 return;
7356 SDValue Op = SDValue(N, 0);
7357 SDValue Rsrc = Op.getOperand(1);
7358 SDValue Offset = Op.getOperand(2);
7359 SDValue CachePolicy = Op.getOperand(3);
7360 EVT VT = Op.getValueType();
7361 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
7362 SDLoc DL(Op);
7364 const DataLayout &DataLayout = DAG.getDataLayout();
7365 Align Alignment =
7371 VT.getStoreSize(), Alignment);
7372 SDValue LoadVal;
7373 if (!Offset->isDivergent()) {
7374 SDValue Ops[] = {Rsrc, // source register
7375 Offset, CachePolicy};
7376 SDValue BufferLoad =
7378 DAG.getVTList(MVT::i32), Ops, VT, MMO);
7379 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
7380 } else {
7381 SDValue Ops[] = {
7382 DAG.getEntryNode(), // Chain
7383 Rsrc, // rsrc
7384 DAG.getConstant(0, DL, MVT::i32), // vindex
7385 {}, // voffset
7386 {}, // soffset
7387 {}, // offset
7388 CachePolicy, // cachepolicy
7389 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7390 };
7391 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
7392 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
7393 }
7394 Results.push_back(LoadVal);
7395 return;
7396 }
7397 case Intrinsic::amdgcn_dead: {
7398 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
7399 Results.push_back(DAG.getPOISON(N->getValueType(I)));
7400 return;
7401 }
7402 }
7403 break;
7404 }
7406 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
7407 if (Res.getOpcode() == ISD::MERGE_VALUES) {
7408 // FIXME: Hacky
7409 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
7410 Results.push_back(Res.getOperand(I));
7411 }
7412 } else {
7413 Results.push_back(Res);
7414 Results.push_back(Res.getValue(1));
7415 }
7416 return;
7417 }
7418
7419 break;
7420 }
7421 case ISD::SELECT: {
7422 SDLoc SL(N);
7423 EVT VT = N->getValueType(0);
7424 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
7425 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
7426 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
7427
7428 EVT SelectVT = NewVT;
7429 if (NewVT.bitsLT(MVT::i32)) {
7430 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
7431 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
7432 SelectVT = MVT::i32;
7433 }
7434
7435 SDValue NewSelect =
7436 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
7437
7438 if (NewVT != SelectVT)
7439 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
7440 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
7441 return;
7442 }
7443 case ISD::FNEG: {
7444 if (N->getValueType(0) != MVT::v2f16)
7445 break;
7446
7447 SDLoc SL(N);
7448 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7449
7450 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
7451 DAG.getConstant(0x80008000, SL, MVT::i32));
7452 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7453 return;
7454 }
7455 case ISD::FABS: {
7456 if (N->getValueType(0) != MVT::v2f16)
7457 break;
7458
7459 SDLoc SL(N);
7460 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7461
7462 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
7463 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
7464 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7465 return;
7466 }
7467 case ISD::FSQRT: {
7468 if (N->getValueType(0) != MVT::f16)
7469 break;
7470 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
7471 break;
7472 }
7473 default:
7475 break;
7476 }
7477}
7478
7479/// Helper function for LowerBRCOND
7480static SDNode *findUser(SDValue Value, unsigned Opcode) {
7481
7482 for (SDUse &U : Value->uses()) {
7483 if (U.get() != Value)
7484 continue;
7485
7486 if (U.getUser()->getOpcode() == Opcode)
7487 return U.getUser();
7488 }
7489 return nullptr;
7490}
7491
7492unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7493 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
7494 switch (Intr->getConstantOperandVal(1)) {
7495 case Intrinsic::amdgcn_if:
7496 return AMDGPUISD::IF;
7497 case Intrinsic::amdgcn_else:
7498 return AMDGPUISD::ELSE;
7499 case Intrinsic::amdgcn_loop:
7500 return AMDGPUISD::LOOP;
7501 case Intrinsic::amdgcn_end_cf:
7502 llvm_unreachable("should not occur");
7503 default:
7504 return 0;
7505 }
7506 }
7507
7508 // break, if_break, else_break are all only used as inputs to loop, not
7509 // directly as branch conditions.
7510 return 0;
7511}
7512
7519
7521 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7522 return false;
7523
7524 // FIXME: Either avoid relying on address space here or change the default
7525 // address space for functions to avoid the explicit check.
7526 return (GV->getValueType()->isFunctionTy() ||
7529}
7530
7532 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
7533}
7534
7536 if (!GV->hasExternalLinkage())
7537 return true;
7538
7539 const auto OS = getTargetMachine().getTargetTriple().getOS();
7540 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
7541}
7542
7543/// This transforms the control flow intrinsics to get the branch destination as
7544/// the last parameter, and also switches the branch target with BR if needed.
7545SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
7546 SDLoc DL(BRCOND);
7547
7548 SDNode *Intr = BRCOND.getOperand(1).getNode();
7549 SDValue Target = BRCOND.getOperand(2);
7550 SDNode *BR = nullptr;
7551 SDNode *SetCC = nullptr;
7552
7553 if (Intr->getOpcode() == ISD::SETCC) {
7554 // As long as we negate the condition everything is fine
7555 SetCC = Intr;
7556 Intr = SetCC->getOperand(0).getNode();
7557
7558 } else {
7559 // Get the target from BR if we don't negate the condition
7560 BR = findUser(BRCOND, ISD::BR);
7561 assert(BR && "brcond missing unconditional branch user");
7562 Target = BR->getOperand(1);
7563 }
7564
7565 unsigned CFNode = isCFIntrinsic(Intr);
7566 if (CFNode == 0) {
7567 // This is a uniform branch so we don't need to legalize.
7568 return BRCOND;
7569 }
7570
7571 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
7572 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
7573
7574 assert(!SetCC ||
7575 (SetCC->getConstantOperandVal(1) == 1 &&
7576 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
7577 ISD::SETNE));
7578
7579 // operands of the new intrinsic call
7581 if (HaveChain)
7582 Ops.push_back(BRCOND.getOperand(0));
7583
7584 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
7585 Ops.push_back(Target);
7586
7587 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
7588
7589 // build the new intrinsic call
7590 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
7591
7592 if (!HaveChain) {
7593 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
7594
7596 }
7597
7598 if (BR) {
7599 // Give the branch instruction our target
7600 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
7601 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
7602 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
7603 }
7604
7605 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
7606
7607 // Copy the intrinsic results to registers
7608 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
7609 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
7610 if (!CopyToReg)
7611 continue;
7612
7613 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
7614 SDValue(Result, i - 1), SDValue());
7615
7616 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
7617 }
7618
7619 // Remove the old intrinsic from the chain
7620 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
7621 Intr->getOperand(0));
7622
7623 return Chain;
7624}
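// --- Editorial note (not part of the upstream source) ---
// Rough sketch of the transformation above: a BRCOND whose condition is
// (setcc (llvm.amdgcn.if %cond), 1, setne) is rewritten into an
// AMDGPUISD::IF node carrying the branch destination as its last operand;
// the original intrinsic is then spliced out of the chain and CopyToReg users
// of its results are rewired to the corresponding results of the new node.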
7625
7626SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
7627 MVT VT = Op.getSimpleValueType();
7628 SDLoc DL(Op);
7629 // Checking the depth
7630 if (Op.getConstantOperandVal(0) != 0)
7631 return DAG.getConstant(0, DL, VT);
7632
7633 MachineFunction &MF = DAG.getMachineFunction();
7634 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7635 // Check for kernel and shader functions
7636 if (Info->isEntryFunction())
7637 return DAG.getConstant(0, DL, VT);
7638
7639 MachineFrameInfo &MFI = MF.getFrameInfo();
7640 // There is a call to @llvm.returnaddress in this function
7641 MFI.setReturnAddressIsTaken(true);
7642
7643 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
7644 // Get the return address reg and mark it as an implicit live-in
7645 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
7646 getRegClassFor(VT, Op.getNode()->isDivergent()));
7647
7648 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
7649}
7650
7651SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
7652 const SDLoc &DL, EVT VT) const {
7653 return Op.getValueType().bitsLE(VT)
7654 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
7655 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
7656 DAG.getTargetConstant(0, DL, MVT::i32));
7657}
7658
7659SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
7660 SelectionDAG &DAG) const {
7661 EVT DstVT = Op.getValueType();
7662 unsigned NumElts = DstVT.getVectorNumElements();
7663 assert(NumElts > 2 && isPowerOf2_32(NumElts));
7664
7665 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
7666
7667 SDLoc DL(Op);
7668 unsigned Opc = Op.getOpcode();
7669 SDValue Flags = Op.getOperand(1);
7670 EVT HalfDstVT =
7671 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
7672 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
7673 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
7674
7675 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
7676}
7677
7678SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
7679 SDValue Src = Op.getOperand(0);
7680 EVT SrcVT = Src.getValueType();
7681 EVT DstVT = Op.getValueType();
7682
7683 if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
7684 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
7685 if (SrcVT.getScalarType() != MVT::f32)
7686 return SDValue();
7687 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
7688 }
7689
7690 if (SrcVT.getScalarType() != MVT::f64)
7691 return Op;
7692
7693 SDLoc DL(Op);
7694 if (DstVT == MVT::f16) {
7695 // TODO: Handle strictfp
7696 if (Op.getOpcode() != ISD::FP_ROUND)
7697 return Op;
7698
7699 if (!Subtarget->has16BitInsts()) {
7700 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
7701 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7702 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7703 }
7704 if (Op->getFlags().hasApproximateFuncs()) {
7705 SDValue Flags = Op.getOperand(1);
7706 SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
7707 return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
7708 }
7709 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
7710 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7711 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7712 }
7713
7714 assert(DstVT.getScalarType() == MVT::bf16 &&
7715 "custom lower FP_ROUND for f16 or bf16");
7716 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
7717
7718 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
7719 // hardware f32 -> bf16 instruction.
7720 EVT F32VT =
7721 SrcVT.isVector() ? SrcVT.changeVectorElementType(MVT::f32) : MVT::f32;
7722 SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
7723 return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
7724 DAG.getTargetConstant(0, DL, MVT::i32));
7725}
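// --- Editorial note (not part of the upstream source) ---
// Illustrative note on the f64 -> bf16 path above: rounding twice in sequence
// (f64 -> f32 -> bf16) could double-round, so the code first narrows f64 to
// f32 with round-to-odd (expandRoundInexactToOdd), which preserves enough
// information that the final hardware f32 -> bf16 rounding produces the
// correctly rounded result.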
7726
7727SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
7728 SelectionDAG &DAG) const {
7729 EVT VT = Op.getValueType();
7730 const MachineFunction &MF = DAG.getMachineFunction();
7731 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7732 bool IsIEEEMode = Info->getMode().IEEE;
7733
7734 // FIXME: Assert during selection that this is only selected for
7735 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
7736 // mode functions, but this happens to be OK since it's only done in cases
7737 // where it is known that there are no sNaNs.
7738 if (IsIEEEMode)
7739 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
7740
7741 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7742 VT == MVT::v16bf16)
7743 return splitBinaryVectorOp(Op, DAG);
7744 return Op;
7745}
7746
7747SDValue
7748SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
7749 SelectionDAG &DAG) const {
7750 EVT VT = Op.getValueType();
7751 const MachineFunction &MF = DAG.getMachineFunction();
7752 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7753 bool IsIEEEMode = Info->getMode().IEEE;
7754
7755 if (IsIEEEMode)
7756 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
7757
7758 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7759 VT == MVT::v16bf16)
7760 return splitBinaryVectorOp(Op, DAG);
7761 return Op;
7762}
7763
7764SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
7765 SelectionDAG &DAG) const {
7766 EVT VT = Op.getValueType();
7767 if (VT.isVector())
7768 return splitBinaryVectorOp(Op, DAG);
7769
7770 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7771 !Subtarget->hasMinimum3Maximum3F16() &&
7772 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7773 "should not need to widen f16 minimum/maximum to v2f16");
7774
7775 // Widen f16 operation to v2f16
7776
7777 // fminimum f16:x, f16:y ->
7778 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
7779 // (v2f16 (scalar_to_vector y))), 0
7780 SDLoc SL(Op);
7781 SDValue WideSrc0 =
7782 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
7783 SDValue WideSrc1 =
7784 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
7785
7786 SDValue Widened =
7787 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7788
7789 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
7790 DAG.getConstant(0, SL, MVT::i32));
7791}
7792
7793SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
7794 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
7795 EVT VT = Op.getValueType();
7796 assert(VT == MVT::f16);
7797
7798 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
7799 EVT ExpVT = Exp.getValueType();
7800 if (ExpVT == MVT::i16)
7801 return Op;
7802
7803 SDLoc DL(Op);
7804
7805 // Correct the exponent type for f16 to i16.
7806 // Clamp the range of the exponent to the instruction's range.
7807
7808 // TODO: This should be a generic narrowing legalization, and can easily be
7809 // done for GlobalISel as well.
7810
7811 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
7812 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
7813
7814 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
7815 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
7816
7817 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
7818
7819 if (IsStrict) {
7820 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
7821 {Op.getOperand(0), Op.getOperand(1), TruncExp});
7822 }
7823
7824 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
7825}
7826
7828 switch (Op->getOpcode()) {
7829 case ISD::SRA:
7830 case ISD::SMIN:
7831 case ISD::SMAX:
7832 return ISD::SIGN_EXTEND;
7833 case ISD::SRL:
7834 case ISD::UMIN:
7835 case ISD::UMAX:
7836 return ISD::ZERO_EXTEND;
7837 case ISD::ADD:
7838 case ISD::SUB:
7839 case ISD::AND:
7840 case ISD::OR:
7841 case ISD::XOR:
7842 case ISD::SHL:
7843 case ISD::SELECT:
7844 case ISD::MUL:
7845 // operation result won't be influenced by garbage high bits.
7846 // TODO: are all of those cases correct, and are there more?
7847 return ISD::ANY_EXTEND;
7848 case ISD::SETCC: {
7849 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7851 }
7852 default:
7853 llvm_unreachable("unexpected opcode!");
7854 }
7855}
7856
7857SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
7858 DAGCombinerInfo &DCI) const {
7859 const unsigned Opc = Op.getOpcode();
7860 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
7861 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
7862 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
7863 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
7864 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
7865
7866 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
7867 : Op->getOperand(0).getValueType();
7868 auto ExtTy = OpTy.changeElementType(MVT::i32);
7869
7870 if (DCI.isBeforeLegalizeOps() ||
7871 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
7872 return SDValue();
7873
7874 auto &DAG = DCI.DAG;
7875
7876 SDLoc DL(Op);
7877 SDValue LHS;
7878 SDValue RHS;
7879 if (Opc == ISD::SELECT) {
7880 LHS = Op->getOperand(1);
7881 RHS = Op->getOperand(2);
7882 } else {
7883 LHS = Op->getOperand(0);
7884 RHS = Op->getOperand(1);
7885 }
7886
7887 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
7888 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
7889
7890 // Special case: for shifts, the RHS always needs a zext.
7891 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
7892 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
7893 else
7894 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
7895
7896 // setcc always returns i1 (or an i1 vector), so there is no need to truncate after.
7897 if (Opc == ISD::SETCC) {
7898 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7899 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
7900 }
7901
7902 // For other ops, we extend the operation's return type as well so we need to
7903 // truncate back to the original type.
7904 SDValue NewVal;
7905 if (Opc == ISD::SELECT)
7906 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
7907 else
7908 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
7909
7910 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
7911}
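// --- Editorial note (not part of the upstream source) ---
// Illustrative example for promoteUniformOpToI32: a uniform (add i16:%a,
// i16:%b) is rewritten as
//   (trunc i16 (add i32 (any_extend %a), (any_extend %b)))
// so it can typically be selected to a 32-bit scalar (SALU) instruction.
// Shift amounts are always zero-extended, and setcc keeps its i1 result, so
// no truncate is needed in that case.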
7912
7913SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
7914 SDValue Mag = Op.getOperand(0);
7915 EVT MagVT = Mag.getValueType();
7916
7917 if (MagVT.getVectorNumElements() > 2)
7918 return splitBinaryVectorOp(Op, DAG);
7919
7920 SDValue Sign = Op.getOperand(1);
7921 EVT SignVT = Sign.getValueType();
7922
7923 if (MagVT == SignVT)
7924 return Op;
7925
7926 // fcopysign v2f16:mag, v2f32:sign ->
7927 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
7928
7929 SDLoc SL(Op);
7930 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
7931 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
7932
7933 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
7934
7935 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
7936}
7937
7938// Custom lowering for vector multiplications and s_mul_u64.
7939SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
7940 EVT VT = Op.getValueType();
7941
7942 // Split vector operands.
7943 if (VT.isVector())
7944 return splitBinaryVectorOp(Op, DAG);
7945
7946 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
7947
7948 // There are four ways to lower s_mul_u64:
7949 //
7950 // 1. If all the operands are uniform, then we lower it as it is.
7951 //
7952 // 2. If the operands are divergent, then we have to split s_mul_u64 into
7953 // 32-bit multiplications because there is no vector equivalent of s_mul_u64.
7954 //
7955 // 3. If the cost model decides that it is more efficient to use vector
7956 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
7957 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
7958 //
7959 // 4. If the cost model decides to use vector registers and both of the
7960 // operands are zero-extended/sign-extended from 32 bits, then we split the
7961 // s_mul_u64 into two 32-bit multiplications. The problem is that it is not
7962 // possible to check if the operands are zero-extended or sign-extended in
7963 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
7964 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
7965 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
7966 // If the cost model decides that we have to use vector registers, then
7967 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32/
7968 // s_mul_i64_i32_pseudo into two vector multiplications. If the cost model
7969 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
7970 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
7971 // SIInstrInfo.cpp .
7972
7973 if (Op->isDivergent())
7974 return SDValue();
7975
7976 SDValue Op0 = Op.getOperand(0);
7977 SDValue Op1 = Op.getOperand(1);
7978 // If all the operands are zero-extended to 32 bits, then we replace s_mul_u64
7979 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
7980 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
7981 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
7982 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
7983 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
7984 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
7985 SDLoc SL(Op);
7986 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7987 return SDValue(
7988 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
7989 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
7990 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
7991 if (Op0SignBits >= 33 && Op1SignBits >= 33)
7992 return SDValue(
7993 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
7994 // If all the operands are uniform, then we lower s_mul_u64 as it is.
7995 return Op;
7996}
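// --- Editorial note (not part of the upstream source) ---
// Illustrative example for lowerMUL on i64: if both operands of a uniform
// multiply are known to have their top 32 bits clear (e.g. both come from
// zext i32), S_MUL_U64_U32_PSEUDO is emitted; if both have at least 33 sign
// bits, S_MUL_I64_I32_PSEUDO is emitted; otherwise the plain s_mul_u64 is
// kept. Divergent multiplies fall back to the default handling and are split
// into 32-bit multiplications.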
7997
7998SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
7999 EVT VT = Op.getValueType();
8000 SDLoc SL(Op);
8001 SDValue LHS = Op.getOperand(0);
8002 SDValue RHS = Op.getOperand(1);
8003 bool isSigned = Op.getOpcode() == ISD::SMULO;
8004
8005 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
8006 const APInt &C = RHSC->getAPIntValue();
8007 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
8008 if (C.isPowerOf2()) {
8009 // smulo(x, signed_min) is same as umulo(x, signed_min).
8010 bool UseArithShift = isSigned && !C.isMinSignedValue();
8011 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
8012 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
8013 SDValue Overflow =
8014 DAG.getSetCC(SL, MVT::i1,
8015 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
8016 Result, ShiftAmt),
8017 LHS, ISD::SETNE);
8018 return DAG.getMergeValues({Result, Overflow}, SL);
8019 }
8020 }
8021
8022 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
8023 SDValue Top =
8024 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
8025
8026 SDValue Sign = isSigned
8027 ? DAG.getNode(ISD::SRA, SL, VT, Result,
8028 DAG.getConstant(VT.getScalarSizeInBits() - 1,
8029 SL, MVT::i32))
8030 : DAG.getConstant(0, SL, VT);
8031 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
8032
8033 return DAG.getMergeValues({Result, Overflow}, SL);
8034}
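// --- Editorial note (not part of the upstream source) ---
// Worked example for the power-of-two case above: (umulo i32 %x, 8) becomes
//   Result   = shl %x, 3
//   Overflow = setcc ne (srl Result, 3), %x
// i.e. the multiply overflowed exactly when shifting back does not recover
// the original value. For non-power-of-two multipliers the generic path
// compares the high half (mulhu/mulhs) against 0, or against the
// sign-replicated result for the signed case.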
8035
8036SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
8037 if (Op->isDivergent()) {
8038 // Select to V_MAD_[IU]64_[IU]32.
8039 return Op;
8040 }
8041 if (Subtarget->hasSMulHi()) {
8042 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
8043 return SDValue();
8044 }
8045 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
8046 // calculate the high part, so we might as well do the whole thing with
8047 // V_MAD_[IU]64_[IU]32.
8048 return Op;
8049}
8050
8051SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
8052 if (!Subtarget->isTrapHandlerEnabled() ||
8053 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
8054 return lowerTrapEndpgm(Op, DAG);
8055
8056 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
8057 : lowerTrapHsaQueuePtr(Op, DAG);
8058}
8059
8060SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
8061 SDLoc SL(Op);
8062 SDValue Chain = Op.getOperand(0);
8063 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8064}
8065
8066SDValue
8067SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
8068 const SDLoc &DL, Align Alignment,
8069 ImplicitParameter Param) const {
8070 MachineFunction &MF = DAG.getMachineFunction();
8071 uint64_t Offset = getImplicitParameterOffset(MF, Param);
8072 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
8073 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
8074 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
8077}
8078
8079SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
8080 SelectionDAG &DAG) const {
8081 SDLoc SL(Op);
8082 SDValue Chain = Op.getOperand(0);
8083
8084 SDValue QueuePtr;
8085 // For code object version 5, QueuePtr is passed through implicit kernarg.
8086 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8088 QueuePtr =
8089 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
8090 } else {
8091 MachineFunction &MF = DAG.getMachineFunction();
8092 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8093 Register UserSGPR = Info->getQueuePtrUserSGPR();
8094
8095 if (UserSGPR == AMDGPU::NoRegister) {
8096 // We probably are in a function incorrectly marked with
8097 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
8098 // trap, so just use a null pointer.
8099 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
8100 } else {
8101 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
8102 MVT::i64);
8103 }
8104 }
8105
8106 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
8107 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
8108
8109 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8110 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
8111 ToReg.getValue(1)};
8112 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8113}
8114
8115SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
8116 SDLoc SL(Op);
8117 SDValue Chain = Op.getOperand(0);
8118
8119 // We need to simulate the 's_trap 2' instruction on targets that run in
8120 // PRIV=1 (where it is treated as a nop).
8121 if (Subtarget->hasPrivEnabledTrap2NopBug())
8122 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
8123
8124 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8125 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8126 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8127}
8128
8129SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
8130 SDLoc SL(Op);
8131 SDValue Chain = Op.getOperand(0);
8132 MachineFunction &MF = DAG.getMachineFunction();
8133
8134 if (!Subtarget->isTrapHandlerEnabled() ||
8135 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
8136 LLVMContext &Ctx = MF.getFunction().getContext();
8137 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
8138 "debugtrap handler not supported",
8139 Op.getDebugLoc(), DS_Warning));
8140 return Chain;
8141 }
8142
8143 uint64_t TrapID =
8144 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
8145 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8146 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8147}
8148
8149SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
8150 SelectionDAG &DAG) const {
8151 if (Subtarget->hasApertureRegs()) {
8152 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
8153 ? AMDGPU::SRC_SHARED_BASE
8154 : AMDGPU::SRC_PRIVATE_BASE;
8155 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8156 !Subtarget->hasGloballyAddressableScratch()) &&
8157 "Cannot use src_private_base with globally addressable scratch!");
8158 // Note: this feature (register) is broken. When used as a 32-bit operand,
8159 // it returns a wrong value (all zeroes?). The real value is in the upper 32
8160 // bits.
8161 //
8162 // To work around the issue, emit a 64 bit copy from this register
8163 // then extract the high bits. Note that this shouldn't even result in a
8164 // shift being emitted and simply become a pair of registers (e.g.):
8165 // s_mov_b64 s[6:7], src_shared_base
8166 // v_mov_b32_e32 v1, s7
8167 SDValue Copy =
8168 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ApertureRegNo, MVT::v2i32);
8169 return DAG.getExtractVectorElt(DL, MVT::i32, Copy, 1);
8170 }
8171
8172 // For code object version 5, private_base and shared_base are passed through
8173 // implicit kernargs.
8174 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8178 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
8179 }
8180
8181 MachineFunction &MF = DAG.getMachineFunction();
8182 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8183 Register UserSGPR = Info->getQueuePtrUserSGPR();
8184 if (UserSGPR == AMDGPU::NoRegister) {
8185 // We probably are in a function incorrectly marked with
8186 // amdgpu-no-queue-ptr. This is undefined.
8187 return DAG.getPOISON(MVT::i32);
8188 }
8189
8190 SDValue QueuePtr =
8191 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
8192
8193 // Offset into amd_queue_t for group_segment_aperture_base_hi /
8194 // private_segment_aperture_base_hi.
8195 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
8196
8197 SDValue Ptr =
8198 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
8199
8200 // TODO: Use custom target PseudoSourceValue.
8201 // TODO: We should use the value from the IR intrinsic call, but it might not
8202 // be available and how do we get it?
8203 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
8204 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
8205 commonAlignment(Align(64), StructOffset),
8208}
8209
8210/// Return true if the value is a known valid address, such that a null check is
8211/// not necessary.
8213 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
8215 return true;
8216
8217 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
8218 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
8219
8220 // TODO: Search through arithmetic, handle arguments and loads
8221 // marked nonnull.
8222 return false;
8223}
8224
8225SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
8226 SelectionDAG &DAG) const {
8227 SDLoc SL(Op);
8228
8229 const AMDGPUTargetMachine &TM =
8230 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
8231
8232 unsigned DestAS, SrcAS;
8233 SDValue Src;
8234 bool IsNonNull = false;
8235 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
8236 SrcAS = ASC->getSrcAddressSpace();
8237 Src = ASC->getOperand(0);
8238 DestAS = ASC->getDestAddressSpace();
8239 } else {
8240 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
8241 Op.getConstantOperandVal(0) ==
8242 Intrinsic::amdgcn_addrspacecast_nonnull);
8243 Src = Op->getOperand(1);
8244 SrcAS = Op->getConstantOperandVal(2);
8245 DestAS = Op->getConstantOperandVal(3);
8246 IsNonNull = true;
8247 }
8248
8249 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
8250
8251 // flat -> local/private
8252 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
8253 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
8254 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
8255 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8256
8257 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
8258 Subtarget->hasGloballyAddressableScratch()) {
8259 // flat -> private with globally addressable scratch: subtract
8260 // src_flat_scratch_base_lo.
8261 SDValue FlatScratchBaseLo(
8262 DAG.getMachineNode(
8263 AMDGPU::S_MOV_B32, SL, MVT::i32,
8264 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8265 0);
8266 Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
8267 }
8268
8269 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8270 return Ptr;
8271
8272 unsigned NullVal = TM.getNullPointerValue(DestAS);
8273 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8274 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
8275
8276 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
8277 SegmentNullPtr);
8278 }
8279 }
8280
8281 // local/private -> flat
8282 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
8283 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
8284 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
8285 SDValue CvtPtr;
8286 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
8287 Subtarget->hasGloballyAddressableScratch()) {
8288 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
8289 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
8290 SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
8291 SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
8292 ThreadID = DAG.getNode(
8293 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8294 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
8295 AllOnes, ThreadID);
8296 if (Subtarget->isWave64())
8297 ThreadID = DAG.getNode(
8298 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8299 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
8300 AllOnes, ThreadID);
8301 SDValue ShAmt = DAG.getShiftAmountConstant(
8302 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8303 SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
8304 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
8305 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8306 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
8307 // 64-bit hi:lo value.
8308 SDValue FlatScratchBase = {
8309 DAG.getMachineNode(
8310 AMDGPU::S_MOV_B64, SL, MVT::i64,
8311 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8312 0};
8313 CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8314 } else {
8315 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8316 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
8317 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8318 }
8319
8320 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8321 return CvtPtr;
8322
8323 unsigned NullVal = TM.getNullPointerValue(SrcAS);
8324 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8325
8326 SDValue NonNull =
8327 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
8328
8329 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
8330 FlatNullPtr);
8331 }
8332 }
8333
8334 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8335 Op.getValueType() == MVT::i64) {
8336 const SIMachineFunctionInfo *Info =
8337 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
8338 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
8339 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
8340 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
8341 }
8342
8343 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8344 Src.getValueType() == MVT::i64)
8345 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8346
8347 // global <-> flat are no-ops and never emitted.
8348
8349 // Invalid casts are poison.
8350 return DAG.getPOISON(Op->getValueType(0));
8351}
8352
8353// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
8354// the small vector and inserting them into the big vector. That is better than
8355// the default expansion of doing it via a stack slot. Even though the use of
8356// the stack slot would be optimized away afterwards, the stack slot itself
8357// remains.
8358SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
8359 SelectionDAG &DAG) const {
8360 SDValue Vec = Op.getOperand(0);
8361 SDValue Ins = Op.getOperand(1);
8362 SDValue Idx = Op.getOperand(2);
8363 EVT VecVT = Vec.getValueType();
8364 EVT InsVT = Ins.getValueType();
8365 EVT EltVT = VecVT.getVectorElementType();
8366 unsigned InsNumElts = InsVT.getVectorNumElements();
8367 unsigned IdxVal = Idx->getAsZExtVal();
8368 SDLoc SL(Op);
8369
8370 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
8371 // Insert 32-bit registers at a time.
8372 assert(InsNumElts % 2 == 0 && "expect legal vector types");
8373
8374 unsigned VecNumElts = VecVT.getVectorNumElements();
8375 EVT NewVecVT =
8376 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
8377 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8378 : EVT::getVectorVT(*DAG.getContext(),
8379 MVT::i32, InsNumElts / 2);
8380
8381 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
8382 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
8383
8384 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
8385 SDValue Elt;
8386 if (InsNumElts == 2) {
8387 Elt = Ins;
8388 } else {
8389 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
8390 DAG.getConstant(I, SL, MVT::i32));
8391 }
8392 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
8393 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
8394 }
8395
8396 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
8397 }
8398
8399 for (unsigned I = 0; I != InsNumElts; ++I) {
8400 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
8401 DAG.getConstant(I, SL, MVT::i32));
8402 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
8403 DAG.getConstant(IdxVal + I, SL, MVT::i32));
8404 }
8405 return Vec;
8406}
8407
8408SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
8409 SelectionDAG &DAG) const {
8410 SDValue Vec = Op.getOperand(0);
8411 SDValue InsVal = Op.getOperand(1);
8412 SDValue Idx = Op.getOperand(2);
8413 EVT VecVT = Vec.getValueType();
8414 EVT EltVT = VecVT.getVectorElementType();
8415 unsigned VecSize = VecVT.getSizeInBits();
8416 unsigned EltSize = EltVT.getSizeInBits();
8417 SDLoc SL(Op);
8418
8419 // Specially handle the case of v4i16 with static indexing.
8420 unsigned NumElts = VecVT.getVectorNumElements();
8421 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
8422 if (NumElts == 4 && EltSize == 16 && KIdx) {
8423 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
8424
8425 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8426 DAG.getConstant(0, SL, MVT::i32));
8427 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8428 DAG.getConstant(1, SL, MVT::i32));
8429
8430 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
8431 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
8432
8433 unsigned Idx = KIdx->getZExtValue();
8434 bool InsertLo = Idx < 2;
8435 SDValue InsHalf = DAG.getNode(
8436 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
8437 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
8438 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8439
8440 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
8441
8442 SDValue Concat =
8443 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
8444 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8445
8446 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
8447 }
8448
8449 // Static indexing does not lower to stack access, and hence there is no need
8450 // for special custom lowering to avoid stack access.
8451 if (isa<ConstantSDNode>(Idx))
8452 return SDValue();
8453
8454 // Avoid stack access for dynamic indexing by custom lowering to
8455 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
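// For example, inserting into element 1 of a v4i16 vector (EltSize = 16)
// gives ScaledIdx = 16 and BFM = 0xffff << 16; the result computed below is
// (BFM & splat(val)) | (~BFM & vec), i.e. only the selected 16-bit lane is
// replaced.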
8456
8457 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
8458
8459 MVT IntVT = MVT::getIntegerVT(VecSize);
8460
8461 // Convert vector index to bit-index and get the required bit mask.
8462 assert(isPowerOf2_32(EltSize));
8463 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
8464 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8465 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8466 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
8467 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
8468
8469 // 1. Create a congruent vector with the target value in each element.
8470 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
8471 DAG.getSplatBuildVector(VecVT, SL, InsVal));
8472
8473 // 2. Mask off all other indices except the required index within (1).
8474 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
8475
8476 // 3. Mask off the required index within the target vector.
8477 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8478 SDValue RHS =
8479 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
8480
8481 // 4. Get (2) and (3) ORed into the target vector.
8482 SDValue BFI =
8483 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
8484
8485 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
8486}
8487
8488SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
8489 SelectionDAG &DAG) const {
8490 SDLoc SL(Op);
8491
8492 EVT ResultVT = Op.getValueType();
8493 SDValue Vec = Op.getOperand(0);
8494 SDValue Idx = Op.getOperand(1);
8495 EVT VecVT = Vec.getValueType();
8496 unsigned VecSize = VecVT.getSizeInBits();
8497 EVT EltVT = VecVT.getVectorElementType();
8498
8499 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
8500
8501 // Make sure we do any optimizations that will make it easier to fold
8502 // source modifiers before obscuring it with bit operations.
8503
8504 // XXX - Why doesn't this get called when vector_shuffle is expanded?
8505 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
8506 return Combined;
8507
8508 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8509 SDValue Lo, Hi;
8510 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
8511
8512 if (VecSize == 128) {
8513 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
8514 Lo = DAG.getBitcast(LoVT,
8515 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8516 DAG.getConstant(0, SL, MVT::i32)));
8517 Hi = DAG.getBitcast(HiVT,
8518 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8519 DAG.getConstant(1, SL, MVT::i32)));
8520 } else if (VecSize == 256) {
8521 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
8522 SDValue Parts[4];
8523 for (unsigned P = 0; P < 4; ++P) {
8524 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8525 DAG.getConstant(P, SL, MVT::i32));
8526 }
8527
8528 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8529 Parts[0], Parts[1]));
8530 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8531 Parts[2], Parts[3]));
8532 } else {
8533 assert(VecSize == 512);
8534
8535 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
8536 SDValue Parts[8];
8537 for (unsigned P = 0; P < 8; ++P) {
8538 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8539 DAG.getConstant(P, SL, MVT::i32));
8540 }
8541
8542 Lo = DAG.getBitcast(LoVT,
8543 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8544 Parts[0], Parts[1], Parts[2], Parts[3]));
8545 Hi = DAG.getBitcast(HiVT,
8546 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8547 Parts[4], Parts[5], Parts[6], Parts[7]));
8548 }
8549
8550 EVT IdxVT = Idx.getValueType();
8551 unsigned NElem = VecVT.getVectorNumElements();
8552 assert(isPowerOf2_32(NElem));
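// Select the half containing the element and re-index into it. For a v8
// source (NElem = 8), IdxMask = 3: Half = (Idx > 3) ? Hi : Lo and
// NewIdx = Idx & 3.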
8553 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
8554 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
8555 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
8556 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
8557 }
8558
8559 assert(VecSize <= 64);
8560
8561 MVT IntVT = MVT::getIntegerVT(VecSize);
8562
8563 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
8564 SDValue VecBC = peekThroughBitcasts(Vec);
8565 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8566 SDValue Src = VecBC.getOperand(0);
8567 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8568 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
8569 }
8570
8571 unsigned EltSize = EltVT.getSizeInBits();
8572 assert(isPowerOf2_32(EltSize));
8573
8574 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8575
8576 // Convert vector index to bit-index (* EltSize)
8577 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8578
8579 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8580 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
8581
8582 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8583 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
8584 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
8585 }
8586
8587 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
8588}
8589
8590static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
8591 assert(Elt % 2 == 0);
8592 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8593}
8594
8595static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
8596 assert(Elt % 2 == 0);
8597 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8598 !(Mask[Elt + 1] & 1);
8599}
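// Example mask pairs (checked at an even Elt): <4, 5> is contiguous (an even
// source index followed by its successor), while <3, 2> is odd-to-even and is
// handled below with a small 2-element shuffle instead of scalarization.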
8600
8601SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
8602 SelectionDAG &DAG) const {
8603 SDLoc SL(Op);
8604 EVT ResultVT = Op.getValueType();
8605 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
8606 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
8607 const int NewSrcNumElts = 2;
8608 MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
8609 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
8610
8611 // Break up the shuffle into register-sized pieces.
8612 //
8613 // We're trying to form sub-shuffles that the register allocation pipeline
8614 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
8615 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
8616 // pair of copies into a consecutive register copy, so use the ordinary
8617 // extract_vector_elt lowering unless we can use the shuffle.
8618 //
8619 // TODO: This is a bit of a hack, and we should probably always use
8620 // extract_subvector for the largest possible subvector we can (or at least
8621 // use it for PackVT aligned pieces). However, we have worse support for
8622 // combines on them and don't directly treat extract_subvector /
8623 // insert_subvector as legal. The DAG scheduler also ends up doing a worse
8624 // job with the extract_subvectors.
8625 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
8626
8627 // vector_shuffle <0,1,6,7> lhs, rhs
8628 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
8629 //
8630 // vector_shuffle <6,7,2,3> lhs, rhs
8631 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
8632 //
8633 // vector_shuffle <6,7,0,1> lhs, rhs
8634 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
8635
8636 // Avoid scalarizing when both halves are reading from consecutive elements.
8637
8638 // If we're treating 2 element shuffles as legal, also create odd-to-even
8639 // shuffles of neighboring pairs.
8640 //
8641 // vector_shuffle <3,2,7,6> lhs, rhs
8642 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
8643 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
8644
8646 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
8647 if (ShouldUseConsecutiveExtract &&
8648 elementPairIsContiguous(SVN->getMask(), I)) {
8649 const int Idx = SVN->getMaskElt(I);
8650 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8651 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8652 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
8653 SVN->getOperand(VecIdx),
8654 DAG.getConstant(EltIdx, SL, MVT::i32));
8655 Pieces.push_back(SubVec);
8657 } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
8658 isOperationLegal(ISD::VECTOR_SHUFFLE, PackVT)) {
8658 int Idx0 = SVN->getMaskElt(I);
8659 int Idx1 = SVN->getMaskElt(I + 1);
8660
8661 SDValue SrcOp0 = SVN->getOperand(0);
8662 SDValue SrcOp1 = SrcOp0;
8663 if (Idx0 >= SrcNumElts) {
8664 SrcOp0 = SVN->getOperand(1);
8665 Idx0 -= SrcNumElts;
8666 }
8667
8668 if (Idx1 >= SrcNumElts) {
8669 SrcOp1 = SVN->getOperand(1);
8670 Idx1 -= SrcNumElts;
8671 }
8672
8673 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8674 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8675
8676 // Extract nearest even aligned piece.
8677 SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
8678 DAG.getConstant(AlignedIdx0, SL, MVT::i32));
8679 SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
8680 DAG.getConstant(AlignedIdx1, SL, MVT::i32));
8681
8682 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8683 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8684
8685 SDValue Result0 = SubVec0;
8686 SDValue Result1 = SubVec0;
8687
8688 if (SubVec0 != SubVec1) {
8689 NewMaskIdx1 += NewSrcNumElts;
8690 Result1 = SubVec1;
8691 } else {
8692 Result1 = DAG.getPOISON(PackVT);
8693 }
8694
8695 SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
8696 {NewMaskIdx0, NewMaskIdx1});
8697 Pieces.push_back(Shuf);
8698 } else {
8699 const int Idx0 = SVN->getMaskElt(I);
8700 const int Idx1 = SVN->getMaskElt(I + 1);
8701 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8702 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8703 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8704 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8705
8706 SDValue Vec0 = SVN->getOperand(VecIdx0);
8707 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
8708 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
8709
8710 SDValue Vec1 = SVN->getOperand(VecIdx1);
8711 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
8712 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
8713 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
8714 }
8715 }
8716
8717 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
8718}
8719
8720SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
8721 SelectionDAG &DAG) const {
8722 SDValue SVal = Op.getOperand(0);
8723 EVT ResultVT = Op.getValueType();
8724 EVT SValVT = SVal.getValueType();
8725 SDValue UndefVal = DAG.getPOISON(SValVT);
8726 SDLoc SL(Op);
8727 
8728 SmallVector<SDValue, 16> VElts;
8729 VElts.push_back(SVal);
8730 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
8731 VElts.push_back(UndefVal);
8732
8733 return DAG.getBuildVector(ResultVT, SL, VElts);
8734}
8735
8736SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
8737 SelectionDAG &DAG) const {
8738 SDLoc SL(Op);
8739 EVT VT = Op.getValueType();
8740
8741 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8742 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
8743
8744 SDValue Lo = Op.getOperand(0);
8745 SDValue Hi = Op.getOperand(1);
8746
8747 // Avoid adding defined bits with the zero_extend.
8748 if (Hi.isUndef()) {
8749 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8750 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
8751 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
8752 }
8753
8754 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
8755 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
8756
8757 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
8758 DAG.getConstant(16, SL, MVT::i32));
8759 if (Lo.isUndef())
8760 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
8761
8762 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8763 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
8764
8765 SDValue Or =
8766 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
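// The two halves are packed as lo | (hi << 16); the zero_extends above
// guarantee the operands have no overlapping set bits, hence the disjoint OR.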
8767 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
8768 }
8769
8770 // Split into 2-element chunks.
8771 const unsigned NumParts = VT.getVectorNumElements() / 2;
8772 EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
8773 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
8774 
8775 SmallVector<SDValue, 8> Casts;
8776 for (unsigned P = 0; P < NumParts; ++P) {
8777 SDValue Vec = DAG.getBuildVector(
8778 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
8779 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
8780 }
8781
8782 SDValue Blend =
8783 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
8784 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
8785}
8786
8787 bool SITargetLowering::isOffsetFoldingLegal(
8788 const GlobalAddressSDNode *GA) const {
8789 // OSes that use ELF REL relocations (instead of RELA) can only store a
8790 // 32-bit addend in the instruction, so it is not safe to allow offset folding
8791 // which can create arbitrary 64-bit addends. (This is only a problem for
8792 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
8793 // the high 32 bits of the addend.)
8794 //
8795 // This should be kept in sync with how HasRelocationAddend is initialized in
8796 // the constructor of ELFAMDGPUAsmBackend.
8797 if (!Subtarget->isAmdHsaOS())
8798 return false;
8799
8800 // We can fold offsets for anything that doesn't require a GOT relocation.
8801 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
8802 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
8803 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
8804 !shouldEmitGOTReloc(GA->getGlobal());
8805 }
8806
8807static SDValue
8808 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
8809 const SDLoc &DL, int64_t Offset, EVT PtrVT,
8810 unsigned GAFlags = SIInstrInfo::MO_NONE) {
8811 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
8812 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
8813 // lowered to the following code sequence:
8814 //
8815 // For constant address space:
8816 // s_getpc_b64 s[0:1]
8817 // s_add_u32 s0, s0, $symbol
8818 // s_addc_u32 s1, s1, 0
8819 //
8820 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8821 // a fixup or relocation is emitted to replace $symbol with a literal
8822 // constant, which is a pc-relative offset from the encoding of the $symbol
8823 // operand to the global variable.
8824 //
8825 // For global address space:
8826 // s_getpc_b64 s[0:1]
8827 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
8828 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
8829 //
8830 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8831 // fixups or relocations are emitted to replace $symbol@*@lo and
8832 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
8833 // which is a 64-bit pc-relative offset from the encoding of the $symbol
8834 // operand to the global variable.
8835 if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
8836 assert(GAFlags != SIInstrInfo::MO_NONE);
8837
8838 SDValue Ptr =
8839 DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
8840 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
8841 }
8842
8843 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
8844 SDValue PtrHi;
8845 if (GAFlags == SIInstrInfo::MO_NONE)
8846 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
8847 else
8848 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
8849 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
8850}
8851
8852SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
8853 SDValue Op,
8854 SelectionDAG &DAG) const {
8855 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
8856 SDLoc DL(GSD);
8857 EVT PtrVT = Op.getValueType();
8858
8859 const GlobalValue *GV = GSD->getGlobal();
8860 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8861 shouldUseLDSConstAddress(GV)) ||
8862 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
8863 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
8864 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8865 GV->hasExternalLinkage()) {
8866 Type *Ty = GV->getValueType();
8867 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
8868 // zero-sized type in other languages to declare dynamic shared
8869 // memory whose size is not known at compile time. It will be
8870 // allocated by the runtime and placed directly after the statically
8871 // allocated ones. They all share the same offset.
8872 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
8873 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
8874 // Adjust alignment for that dynamic shared memory array.
8875 Function &F = DAG.getMachineFunction().getFunction();
8876 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
8877 MFI->setUsesDynamicLDS(true);
8878 return SDValue(
8879 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
8880 }
8881 }
8882 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
8883 }
8884
8885 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
8886 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
8887 SIInstrInfo::MO_ABS32_LO);
8888 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
8889 }
8890
8891 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
8892 if (Subtarget->has64BitLiterals()) {
8893 SDValue Addr = DAG.getTargetGlobalAddress(
8894 GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
8895 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
8896 0);
8897 }
8898
8899 SDValue AddrLo = DAG.getTargetGlobalAddress(
8900 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
8901 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
8902
8903 SDValue AddrHi = DAG.getTargetGlobalAddress(
8904 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
8905 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
8906
8907 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
8908 }
8909
8910 if (shouldEmitFixup(GV))
8911 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
8912
8913 if (shouldEmitPCReloc(GV))
8914 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
8915 SIInstrInfo::MO_REL32);
8916 
8917 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
8918 SIInstrInfo::MO_GOTPCREL32);
8919 PointerType *PtrTy =
8920 PointerType::get(*DAG.getContext(), AMDGPUAS::CONSTANT_ADDRESS);
8921 const DataLayout &DataLayout = DAG.getDataLayout();
8922 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
8923 MachinePointerInfo PtrInfo =
8924 MachinePointerInfo::getGOT(DAG.getMachineFunction());
8925 
8926 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
8927 MachineMemOperand::MODereferenceable |
8928 MachineMemOperand::MOInvariant);
8929 }
8930
8931 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
8932 const SDLoc &DL, SDValue V) const {
8933 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
8934 // the destination register.
8935 //
8936 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
8937 // so we will end up with redundant moves to m0.
8938 //
8939 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
8940
8941 // A Null SDValue creates a glue result.
8942 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
8943 V, Chain);
8944 return SDValue(M0, 0);
8945}
8946
8947SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
8948 MVT VT,
8949 unsigned Offset) const {
8950 SDLoc SL(Op);
8951 SDValue Param = lowerKernargMemParameter(
8952 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
8953 // The local size values will have the hi 16-bits as zero.
8954 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
8955 DAG.getValueType(VT));
8956}
8957
8958 static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8959 EVT VT) {
8960 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8961 DAG.getMachineFunction().getFunction(),
8962 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
8963 return DAG.getPOISON(VT);
8964}
8965
8966 static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8967 EVT VT) {
8968 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8969 DAG.getMachineFunction().getFunction(),
8970 "intrinsic not supported on subtarget", DL.getDebugLoc()));
8971 return DAG.getPOISON(VT);
8972}
8973
8974 static SDValue getBuildDwordsVector(SelectionDAG &DAG, const SDLoc &DL,
8975 ArrayRef<SDValue> Elts) {
8976 assert(!Elts.empty());
8977 MVT Type;
8978 unsigned NumElts = Elts.size();
8979
8980 if (NumElts <= 12) {
8981 Type = MVT::getVectorVT(MVT::f32, NumElts);
8982 } else {
8983 assert(Elts.size() <= 16);
8984 Type = MVT::v16f32;
8985 NumElts = 16;
8986 }
8987
8988 SmallVector<SDValue, 16> VecElts(NumElts);
8989 for (unsigned i = 0; i < Elts.size(); ++i) {
8990 SDValue Elt = Elts[i];
8991 if (Elt.getValueType() != MVT::f32)
8992 Elt = DAG.getBitcast(MVT::f32, Elt);
8993 VecElts[i] = Elt;
8994 }
8995 for (unsigned i = Elts.size(); i < NumElts; ++i)
8996 VecElts[i] = DAG.getPOISON(MVT::f32);
8997
8998 if (NumElts == 1)
8999 return VecElts[0];
9000 return DAG.getBuildVector(Type, DL, VecElts);
9001}
9002
9003static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
9004 SDValue Src, int ExtraElts) {
9005 EVT SrcVT = Src.getValueType();
9006
9007 SmallVector<SDValue, 8> Elts;
9008 
9009 if (SrcVT.isVector())
9010 DAG.ExtractVectorElements(Src, Elts);
9011 else
9012 Elts.push_back(Src);
9013
9014 SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
9015 while (ExtraElts--)
9016 Elts.push_back(Undef);
9017
9018 return DAG.getBuildVector(CastVT, DL, Elts);
9019}
9020
9021 // Re-construct the required return value for an image load intrinsic.
9022 // This is more complicated due to the optional use of TexFailCtrl, which
9023 // means the required return type is an aggregate.
9024 static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
9025 ArrayRef<EVT> ResultTypes, bool IsTexFail,
9026 bool Unpacked, bool IsD16, int DMaskPop,
9027 int NumVDataDwords, bool IsAtomicPacked16Bit,
9028 const SDLoc &DL) {
9029 // Determine the required return type. This is the same regardless of
9030 // IsTexFail flag
9031 EVT ReqRetVT = ResultTypes[0];
9032 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
9033 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9034 ? (ReqRetNumElts + 1) / 2
9035 : ReqRetNumElts;
9036
9037 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
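// Example: a D16 load with a dmask popcount of 3 on a packing target pops
// (3 + 1) / 2 = 2 dwords of packed 16-bit data; unpacked targets keep one
// dword per enabled dmask lane.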
9038
9039 MVT DataDwordVT =
9040 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
9041
9042 MVT MaskPopVT =
9043 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
9044
9045 SDValue Data(Result, 0);
9046 SDValue TexFail;
9047
9048 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
9049 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
9050 if (MaskPopVT.isVector()) {
9051 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
9052 SDValue(Result, 0), ZeroIdx);
9053 } else {
9054 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
9055 SDValue(Result, 0), ZeroIdx);
9056 }
9057 }
9058
9059 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
9060 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
9061 NumDataDwords - MaskPopDwords);
9062
9063 if (IsD16)
9064 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
9065
9066 EVT LegalReqRetVT = ReqRetVT;
9067 if (!ReqRetVT.isVector()) {
9068 if (!Data.getValueType().isInteger())
9069 Data = DAG.getNode(ISD::BITCAST, DL,
9070 Data.getValueType().changeTypeToInteger(), Data);
9071 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
9072 } else {
9073 // We need to widen the return vector to a legal type
9074 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
9075 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
9076 LegalReqRetVT =
9077 EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
9078 ReqRetVT.getVectorNumElements() + 1);
9079 }
9080 }
9081 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
9082
9083 if (IsTexFail) {
9084 TexFail =
9085 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
9086 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
9087
9088 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
9089 }
9090
9091 if (Result->getNumValues() == 1)
9092 return Data;
9093
9094 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
9095}
9096
9097static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
9098 SDValue *LWE, bool &IsTexFail) {
9099 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
9100
9101 uint64_t Value = TexFailCtrlConst->getZExtValue();
9102 if (Value) {
9103 IsTexFail = true;
9104 }
9105
9106 SDLoc DL(TexFailCtrlConst);
9107 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
9108 Value &= ~(uint64_t)0x1;
9109 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
9110 Value &= ~(uint64_t)0x2;
9111
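// Bits 0 (TFE) and 1 (LWE) have been consumed above; any remaining bit is an
// invalid texfailctrl value, so return false and let the caller leave the
// intrinsic unlowered.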
9112 return Value == 0;
9113}
9114
9115 static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
9116 MVT PackVectorVT,
9117 SmallVectorImpl<SDValue> &PackedAddrs,
9118 unsigned DimIdx, unsigned EndIdx,
9119 unsigned NumGradients) {
9120 SDLoc DL(Op);
9121 for (unsigned I = DimIdx; I < EndIdx; I++) {
9122 SDValue Addr = Op.getOperand(I);
9123
9124 // Gradients are packed with undef for each coordinate.
9125 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
9126 // 1D: undef,dx/dh; undef,dx/dv
9127 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
9128 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
9129 if (((I + 1) >= EndIdx) ||
9130 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
9131 I == DimIdx + NumGradients - 1))) {
9132 if (Addr.getValueType() != MVT::i16)
9133 Addr = DAG.getBitcast(MVT::i16, Addr);
9134 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
9135 } else {
9136 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
9137 I++;
9138 }
9139 Addr = DAG.getBitcast(MVT::f32, Addr);
9140 PackedAddrs.push_back(Addr);
9141 }
9142}
9143
9144SDValue SITargetLowering::lowerImage(SDValue Op,
9145 const AMDGPU::ImageDimIntrinsicInfo *Intr,
9146 SelectionDAG &DAG, bool WithChain) const {
9147 SDLoc DL(Op);
9148 MachineFunction &MF = DAG.getMachineFunction();
9149 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
9150 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9151 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
9152 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
9153 unsigned IntrOpcode = Intr->BaseOpcode;
9154 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
9155 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9156 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9157
9158 SmallVector<EVT, 3> ResultTypes(Op->values());
9159 SmallVector<EVT, 3> OrigResultTypes(Op->values());
9160 bool IsD16 = false;
9161 bool IsG16 = false;
9162 bool IsA16 = false;
9163 SDValue VData;
9164 int NumVDataDwords = 0;
9165 bool AdjustRetType = false;
9166 bool IsAtomicPacked16Bit = false;
9167
9168 // Offset of intrinsic arguments
9169 const unsigned ArgOffset = WithChain ? 2 : 1;
9170
9171 unsigned DMask;
9172 unsigned DMaskLanes = 0;
9173
9174 if (BaseOpcode->Atomic) {
9175 VData = Op.getOperand(2);
9176
9177 IsAtomicPacked16Bit =
9178 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9179 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
9180
9181 bool Is64Bit = VData.getValueSizeInBits() == 64;
9182 if (BaseOpcode->AtomicX2) {
9183 SDValue VData2 = Op.getOperand(3);
9184 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
9185 {VData, VData2});
9186 if (Is64Bit)
9187 VData = DAG.getBitcast(MVT::v4i32, VData);
9188
9189 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9190 DMask = Is64Bit ? 0xf : 0x3;
9191 NumVDataDwords = Is64Bit ? 4 : 2;
9192 } else {
9193 DMask = Is64Bit ? 0x3 : 0x1;
9194 NumVDataDwords = Is64Bit ? 2 : 1;
9195 }
9196 } else {
9197 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
9198 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
9199
9200 if (BaseOpcode->Store) {
9201 VData = Op.getOperand(2);
9202
9203 MVT StoreVT = VData.getSimpleValueType();
9204 if (StoreVT.getScalarType() == MVT::f16) {
9205 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9206 return Op; // D16 is unsupported for this instruction
9207
9208 IsD16 = true;
9209 VData = handleD16VData(VData, DAG, true);
9210 }
9211
9212 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
9213 } else if (!BaseOpcode->NoReturn) {
9214 // Work out the num dwords based on the dmask popcount and underlying type
9215 // and whether packing is supported.
9216 MVT LoadVT = ResultTypes[0].getSimpleVT();
9217 if (LoadVT.getScalarType() == MVT::f16) {
9218 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9219 return Op; // D16 is unsupported for this instruction
9220
9221 IsD16 = true;
9222 }
9223
9224 // Confirm that the return type is large enough for the dmask specified
9225 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
9226 (!LoadVT.isVector() && DMaskLanes > 1))
9227 return Op;
9228
9229 // The sq block of gfx8 and gfx9 do not estimate register use correctly
9230 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
9231 // instructions.
9232 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9233 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9234 NumVDataDwords = (DMaskLanes + 1) / 2;
9235 else
9236 NumVDataDwords = DMaskLanes;
9237
9238 AdjustRetType = true;
9239 }
9240 }
9241
9242 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
9243 SmallVector<SDValue, 4> VAddrs;
9244 
9245 // Check for 16 bit addresses or derivatives and pack if true.
9246 MVT VAddrVT =
9247 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
9248 MVT VAddrScalarVT = VAddrVT.getScalarType();
9249 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9250 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9251
9252 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
9253 VAddrScalarVT = VAddrVT.getScalarType();
9254 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9255 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9256
9257 // Push back extra arguments.
9258 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
9259 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
9260 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
9261 // Special handling of bias when A16 is on. Bias is of type half but
9262 // occupies full 32-bit.
9263 SDValue Bias = DAG.getBuildVector(
9264 MVT::v2f16, DL,
9265 {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
9266 VAddrs.push_back(Bias);
9267 } else {
9268 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
9269 "Bias needs to be converted to 16 bit in A16 mode");
9270 VAddrs.push_back(Op.getOperand(ArgOffset + I));
9271 }
9272 }
9273
9274 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
9275 // 16 bit gradients are supported, but are tied to the A16 control
9276 // so both gradients and addresses must be 16 bit
9277 LLVM_DEBUG(
9278 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
9279 "require 16 bit args for both gradients and addresses");
9280 return Op;
9281 }
9282
9283 if (IsA16) {
9284 if (!ST->hasA16()) {
9285 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
9286 "support 16 bit addresses\n");
9287 return Op;
9288 }
9289 }
9290
9291 // We've dealt with incorrect input so we know that if IsA16, IsG16
9292 // are set then we have to compress/pack operands (either address,
9293 // gradient or both)
9294 // In the case where a16 and gradients are tied (no G16 support) then we
9295 // have already verified that both IsA16 and IsG16 are true
9296 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
9297 // Activate g16
9298 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9299 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
9300 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
9301 }
9302
9303 // Add gradients (packed or unpacked)
9304 if (IsG16) {
9305 // Pack the gradients
9306 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
9307 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
9308 ArgOffset + Intr->GradientStart,
9309 ArgOffset + Intr->CoordStart, Intr->NumGradients);
9310 } else {
9311 for (unsigned I = ArgOffset + Intr->GradientStart;
9312 I < ArgOffset + Intr->CoordStart; I++)
9313 VAddrs.push_back(Op.getOperand(I));
9314 }
9315
9316 // Add addresses (packed or unpacked)
9317 if (IsA16) {
9318 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
9319 ArgOffset + Intr->CoordStart, VAddrEnd,
9320 0 /* No gradients */);
9321 } else {
9322 // Add uncompressed address
9323 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
9324 VAddrs.push_back(Op.getOperand(I));
9325 }
9326
9327 // If the register allocator cannot place the address registers contiguously
9328 // without introducing moves, then using the non-sequential address encoding
9329 // is always preferable, since it saves VALU instructions and is usually a
9330 // wash in terms of code size or even better.
9331 //
9332 // However, we currently have no way of hinting to the register allocator that
9333 // MIMG addresses should be placed contiguously when it is possible to do so,
9334 // so force non-NSA for the common 2-address case as a heuristic.
9335 //
9336 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
9337 // allocation when possible.
9338 //
9339 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
9340 // set of the remaining addresses.
9341 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
9342 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
9343 const bool UseNSA = ST->hasNSAEncoding() &&
9344 VAddrs.size() >= ST->getNSAThreshold(MF) &&
9345 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
9346 const bool UsePartialNSA =
9347 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
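// Example: with NSAMaxSize = 4 and 6 address dwords on a target with partial
// NSA, the first 3 dwords remain separate NSA operands and the remaining 3
// are packed into the single contiguous VAddr vector built below.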
9348
9349 SDValue VAddr;
9350 if (UsePartialNSA) {
9351 VAddr = getBuildDwordsVector(DAG, DL,
9352 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9353 } else if (!UseNSA) {
9354 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
9355 }
9356
9357 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
9358 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
9359 SDValue Unorm;
9360 if (!BaseOpcode->Sampler) {
9361 Unorm = True;
9362 } else {
9363 uint64_t UnormConst =
9364 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
9365
9366 Unorm = UnormConst ? True : False;
9367 }
9368
9369 SDValue TFE;
9370 SDValue LWE;
9371 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
9372 bool IsTexFail = false;
9373 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9374 return Op;
9375
9376 if (IsTexFail) {
9377 if (!DMaskLanes) {
9378 // Expecting to get an error flag since TFC is on - and dmask is 0
9379 // Force dmask to be at least 1 otherwise the instruction will fail
9380 DMask = 0x1;
9381 DMaskLanes = 1;
9382 NumVDataDwords = 1;
9383 }
9384 NumVDataDwords += 1;
9385 AdjustRetType = true;
9386 }
9387
9388 // Check whether something earlier tagged the return type as needing
9389 // adjustment. This happens if the instruction is a load or has set
9390 // TexFailCtrl flags.
9390 if (AdjustRetType) {
9391 // NumVDataDwords reflects the true number of dwords required in the return
9392 // type
9393 if (DMaskLanes == 0 && !BaseOpcode->Store) {
9394 // This is a no-op load. This can be eliminated
9395 SDValue Undef = DAG.getPOISON(Op.getValueType());
9396 if (isa<MemSDNode>(Op))
9397 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
9398 return Undef;
9399 }
9400
9401 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
9402 MVT::i32, NumVDataDwords)
9403 : MVT::i32;
9404
9405 ResultTypes[0] = NewVT;
9406 if (ResultTypes.size() == 3) {
9407 // Original result was aggregate type used for TexFailCtrl results
9408 // The actual instruction returns as a vector type which has now been
9409 // created. Remove the aggregate result.
9410 ResultTypes.erase(&ResultTypes[1]);
9411 }
9412 }
9413
9414 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
9415 if (BaseOpcode->Atomic)
9416 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
9417 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
9418 AMDGPU::CPol::VOLATILE))
9419 return Op;
9420
9421 SmallVector<SDValue, 26> Ops;
9422 if (BaseOpcode->Store || BaseOpcode->Atomic)
9423 Ops.push_back(VData); // vdata
9424 if (UsePartialNSA) {
9425 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
9426 Ops.push_back(VAddr);
9427 } else if (UseNSA)
9428 append_range(Ops, VAddrs);
9429 else
9430 Ops.push_back(VAddr);
9431 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
9432 EVT RsrcVT = Rsrc.getValueType();
9433 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9434 return Op;
9435 Ops.push_back(Rsrc);
9436 if (BaseOpcode->Sampler) {
9437 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
9438 if (Samp.getValueType() != MVT::v4i32)
9439 return Op;
9440 Ops.push_back(Samp);
9441 }
9442 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
9443 if (IsGFX10Plus)
9444 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
9445 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9446 Ops.push_back(Unorm);
9447 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
9448 Ops.push_back(IsA16 && // r128, a16 for gfx9
9449 ST->hasFeature(AMDGPU::FeatureR128A16)
9450 ? True
9451 : False);
9452 if (IsGFX10Plus)
9453 Ops.push_back(IsA16 ? True : False);
9454
9455 if (!Subtarget->hasGFX90AInsts())
9456 Ops.push_back(TFE); // tfe
9457 else if (TFE->getAsZExtVal()) {
9458 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9459 MF.getFunction(),
9460 "TFE is not supported on this GPU", DL.getDebugLoc()));
9461 }
9462
9463 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9464 Ops.push_back(LWE); // lwe
9465 if (!IsGFX10Plus)
9466 Ops.push_back(DimInfo->DA ? True : False);
9467 if (BaseOpcode->HasD16)
9468 Ops.push_back(IsD16 ? True : False);
9469 if (isa<MemSDNode>(Op))
9470 Ops.push_back(Op.getOperand(0)); // chain
9471
9472 int NumVAddrDwords =
9473 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
9474 int Opcode = -1;
9475
9476 if (IsGFX12Plus) {
9477 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
9478 NumVDataDwords, NumVAddrDwords);
9479 } else if (IsGFX11Plus) {
9480 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9481 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9482 : AMDGPU::MIMGEncGfx11Default,
9483 NumVDataDwords, NumVAddrDwords);
9484 } else if (IsGFX10Plus) {
9485 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9486 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9487 : AMDGPU::MIMGEncGfx10Default,
9488 NumVDataDwords, NumVAddrDwords);
9489 } else {
9490 if (Subtarget->hasGFX90AInsts()) {
9491 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
9492 NumVDataDwords, NumVAddrDwords);
9493 if (Opcode == -1) {
9494 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9495 MF.getFunction(),
9496 "requested image instruction is not supported on this GPU",
9497 DL.getDebugLoc()));
9498
9499 unsigned Idx = 0;
9500 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
9501 for (EVT VT : OrigResultTypes) {
9502 if (VT == MVT::Other)
9503 RetValues[Idx++] = Op.getOperand(0); // Chain
9504 else
9505 RetValues[Idx++] = DAG.getPOISON(VT);
9506 }
9507
9508 return DAG.getMergeValues(RetValues, DL);
9509 }
9510 }
9511 if (Opcode == -1 &&
9512 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9513 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
9514 NumVDataDwords, NumVAddrDwords);
9515 if (Opcode == -1)
9516 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
9517 NumVDataDwords, NumVAddrDwords);
9518 }
9519 if (Opcode == -1)
9520 return Op;
9521
9522 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
9523 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
9524 MachineMemOperand *MemRef = MemOp->getMemOperand();
9525 DAG.setNodeMemRefs(NewNode, {MemRef});
9526 }
9527
9528 if (BaseOpcode->AtomicX2) {
9529 SmallVector<SDValue, 1> Elt;
9530 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
9531 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
9532 }
9533 if (BaseOpcode->NoReturn)
9534 return SDValue(NewNode, 0);
9535 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
9536 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9537 NumVDataDwords, IsAtomicPacked16Bit, DL);
9538}
9539
9540SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
9541 SDValue Offset, SDValue CachePolicy,
9542 SelectionDAG &DAG) const {
9543 MachineFunction &MF = DAG.getMachineFunction();
9544
9545 const DataLayout &DataLayout = DAG.getDataLayout();
9546 Align Alignment =
9547 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
9548
9549 MachineMemOperand *MMO = MF.getMachineMemOperand(
9550 MachinePointerInfo(),
9551 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
9552 MachineMemOperand::MOInvariant,
9553 VT.getStoreSize(), Alignment);
9554
9555 if (!Offset->isDivergent()) {
9556 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
9557
9558 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
9559 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
9560 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
9561 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
9562 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9563 SDValue BufferLoad =
9564 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
9565 DAG.getVTList(MVT::i32), Ops, VT, MMO);
9566 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
9567 }
9568
9569 // Widen vec3 load to vec4.
9570 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
9571 !Subtarget->hasScalarDwordx3Loads()) {
9572 EVT WidenedVT =
9573 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
9574 auto WidenedOp = DAG.getMemIntrinsicNode(
9575 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
9576 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
9577 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
9578 DAG.getVectorIdxConstant(0, DL));
9579 return Subvector;
9580 }
9581
9582 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
9583 DAG.getVTList(VT), Ops, VT, MMO);
9584 }
9585
9586 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
9587 // assume that the buffer is unswizzled.
9588 SDValue Ops[] = {
9589 DAG.getEntryNode(), // Chain
9590 Rsrc, // rsrc
9591 DAG.getConstant(0, DL, MVT::i32), // vindex
9592 {}, // voffset
9593 {}, // soffset
9594 {}, // offset
9595 CachePolicy, // cachepolicy
9596 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9597 };
9598 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9599 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
9600 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
9601 }
9602
9603 SmallVector<SDValue, 4> Loads;
9604 unsigned NumLoads = 1;
9605 MVT LoadVT = VT.getSimpleVT();
9606 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
9607 assert((LoadVT.getScalarType() == MVT::i32 ||
9608 LoadVT.getScalarType() == MVT::f32));
9609
9610 if (NumElts == 8 || NumElts == 16) {
9611 NumLoads = NumElts / 4;
9612 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
9613 }
9614
9615 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
9616
9617 // Use the alignment to ensure that the required offsets will fit into the
9618 // immediate offsets.
9619 setBufferOffsets(Offset, DAG, &Ops[3],
9620 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
9621
9622 uint64_t InstOffset = Ops[5]->getAsZExtVal();
9623 for (unsigned i = 0; i < NumLoads; ++i) {
9624 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
9625 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
9626 LoadVT, MMO, DAG));
9627 }
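// Example: a divergent-offset v16f32 load is split into four v4f32 buffer
// loads at consecutive 16-byte offsets, which are concatenated below.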
9628
9629 if (NumElts == 8 || NumElts == 16)
9630 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
9631
9632 return Loads[0];
9633}
9634
9635SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
9636 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
9637 if (!Subtarget->hasArchitectedSGPRs())
9638 return {};
9639 SDLoc SL(Op);
9640 MVT VT = MVT::i32;
9641 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
9642 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
9643 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
9644}
9645
9646SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
9647 AMDGPU::Hwreg::Id HwReg,
9648 unsigned LowBit,
9649 unsigned Width) const {
9650 SDLoc SL(Op);
9651 using namespace AMDGPU::Hwreg;
9652 return {DAG.getMachineNode(
9653 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
9654 DAG.getTargetConstant(HwregEncoding::encode(HwReg, LowBit, Width),
9655 SL, MVT::i32)),
9656 0};
9657}
9658
9659SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
9660 unsigned Dim,
9661 const ArgDescriptor &Arg) const {
9662 SDLoc SL(Op);
9663 MachineFunction &MF = DAG.getMachineFunction();
9664 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
9665 if (MaxID == 0)
9666 return DAG.getConstant(0, SL, MVT::i32);
9667
9668 // It's undefined behavior if a function marked with the amdgpu-no-*
9669 // attributes uses the corresponding intrinsic.
9670 if (!Arg)
9671 return DAG.getPOISON(Op->getValueType(0));
9672
9673 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
9674 SDLoc(DAG.getEntryNode()), Arg);
9675
9676 // Don't bother inserting AssertZext for packed IDs since we're emitting the
9677 // masking operations anyway.
9678 //
9679 // TODO: We could assert the top bit is 0 for the source copy.
9680 if (Arg.isMasked())
9681 return Val;
9682
9683 // Preserve the known bits after expansion to a copy.
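// For example, with a known workgroup size of 64 in this dimension, MaxID
// would be 63 and the copy is asserted to fit in an i6.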
9684 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
9685 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
9686 DAG.getValueType(SmallVT));
9687}
9688
9689SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
9690 SelectionDAG &DAG) const {
9691 MachineFunction &MF = DAG.getMachineFunction();
9692 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
9693
9694 EVT VT = Op.getValueType();
9695 SDLoc DL(Op);
9696 unsigned IntrinsicID = Op.getConstantOperandVal(0);
9697
9698 // TODO: Should this propagate fast-math-flags?
9699
9700 switch (IntrinsicID) {
9701 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9702 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
9703 return emitNonHSAIntrinsicError(DAG, DL, VT);
9704 return getPreloadedValue(DAG, *MFI, VT,
9705 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
9706 }
9707 case Intrinsic::amdgcn_dispatch_ptr:
9708 case Intrinsic::amdgcn_queue_ptr: {
9709 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
9710 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9711 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
9712 DL.getDebugLoc()));
9713 return DAG.getPOISON(VT);
9714 }
9715
9716 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9717 ? AMDGPUFunctionArgInfo::DISPATCH_PTR
9718 : AMDGPUFunctionArgInfo::QUEUE_PTR;
9719 return getPreloadedValue(DAG, *MFI, VT, RegID);
9720 }
9721 case Intrinsic::amdgcn_implicitarg_ptr: {
9722 if (MFI->isEntryFunction())
9723 return getImplicitArgPtr(DAG, DL);
9724 return getPreloadedValue(DAG, *MFI, VT,
9725 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
9726 }
9727 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9728 if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
9729 // This only makes sense to call in a kernel, so just lower to null.
9730 return DAG.getConstant(0, DL, VT);
9731 }
9732
9733 return getPreloadedValue(DAG, *MFI, VT,
9734 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
9735 }
9736 case Intrinsic::amdgcn_dispatch_id: {
9737 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
9738 }
9739 case Intrinsic::amdgcn_rcp:
9740 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
9741 case Intrinsic::amdgcn_rsq:
9742 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9743 case Intrinsic::amdgcn_rsq_legacy:
9744 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9745 return emitRemovedIntrinsicError(DAG, DL, VT);
9746 return SDValue();
9747 case Intrinsic::amdgcn_rcp_legacy:
9748 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9749 return emitRemovedIntrinsicError(DAG, DL, VT);
9750 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
9751 case Intrinsic::amdgcn_rsq_clamp: {
9752 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9753 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
9754
9755 Type *Type = VT.getTypeForEVT(*DAG.getContext());
9756 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
9757 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
9758
9759 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9760 SDValue Tmp =
9761 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
9762 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
9763 DAG.getConstantFP(Min, DL, VT));
9764 }
9765 case Intrinsic::r600_read_ngroups_x:
9766 if (Subtarget->isAmdHsaOS())
9767 return emitNonHSAIntrinsicError(DAG, DL, VT);
9768
9769 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9771 false);
9772 case Intrinsic::r600_read_ngroups_y:
9773 if (Subtarget->isAmdHsaOS())
9774 return emitNonHSAIntrinsicError(DAG, DL, VT);
9775
9776 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9778 false);
9779 case Intrinsic::r600_read_ngroups_z:
9780 if (Subtarget->isAmdHsaOS())
9781 return emitNonHSAIntrinsicError(DAG, DL, VT);
9782
9783 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9785 false);
9786 case Intrinsic::r600_read_local_size_x:
9787 if (Subtarget->isAmdHsaOS())
9788 return emitNonHSAIntrinsicError(DAG, DL, VT);
9789
9790 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9792 case Intrinsic::r600_read_local_size_y:
9793 if (Subtarget->isAmdHsaOS())
9794 return emitNonHSAIntrinsicError(DAG, DL, VT);
9795
9796 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9798 case Intrinsic::r600_read_local_size_z:
9799 if (Subtarget->isAmdHsaOS())
9800 return emitNonHSAIntrinsicError(DAG, DL, VT);
9801
9802 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9804 case Intrinsic::amdgcn_workgroup_id_x:
9805 return lowerWorkGroupId(DAG, *MFI, VT,
9809 case Intrinsic::amdgcn_workgroup_id_y:
9810 return lowerWorkGroupId(DAG, *MFI, VT,
9814 case Intrinsic::amdgcn_workgroup_id_z:
9815 return lowerWorkGroupId(DAG, *MFI, VT,
9819 case Intrinsic::amdgcn_cluster_id_x:
9820 return Subtarget->hasClusters()
9821 ? getPreloadedValue(DAG, *MFI, VT,
9823 : DAG.getPOISON(VT);
9824 case Intrinsic::amdgcn_cluster_id_y:
9825 return Subtarget->hasClusters()
9826 ? getPreloadedValue(DAG, *MFI, VT,
9828 : DAG.getPOISON(VT);
9829 case Intrinsic::amdgcn_cluster_id_z:
9830 return Subtarget->hasClusters()
9831 ? getPreloadedValue(DAG, *MFI, VT,
9833 : DAG.getPOISON(VT);
9834 case Intrinsic::amdgcn_cluster_workgroup_id_x:
9835 return Subtarget->hasClusters()
9836 ? getPreloadedValue(
9837 DAG, *MFI, VT,
9839 : DAG.getPOISON(VT);
9840 case Intrinsic::amdgcn_cluster_workgroup_id_y:
9841 return Subtarget->hasClusters()
9842 ? getPreloadedValue(
9843 DAG, *MFI, VT,
9845 : DAG.getPOISON(VT);
9846 case Intrinsic::amdgcn_cluster_workgroup_id_z:
9847 return Subtarget->hasClusters()
9848 ? getPreloadedValue(
9849 DAG, *MFI, VT,
9851 : DAG.getPOISON(VT);
9852 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
9853 return Subtarget->hasClusters()
9854 ? lowerConstHwRegRead(DAG, Op, AMDGPU::Hwreg::ID_IB_STS2, 21, 4)
9855 : SDValue();
9856 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
9857 return Subtarget->hasClusters()
9858 ? getPreloadedValue(
9859 DAG, *MFI, VT,
9861 : DAG.getPOISON(VT);
9862 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
9863 return Subtarget->hasClusters()
9864 ? getPreloadedValue(
9865 DAG, *MFI, VT,
9867 : DAG.getPOISON(VT);
9868 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
9869 return Subtarget->hasClusters()
9870 ? getPreloadedValue(
9871 DAG, *MFI, VT,
9873 : DAG.getPOISON(VT);
9874 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
9875 return Subtarget->hasClusters()
9876 ? getPreloadedValue(
9877 DAG, *MFI, VT,
9879 : DAG.getPOISON(VT);
9880 case Intrinsic::amdgcn_wave_id:
9881 return lowerWaveID(DAG, Op);
9882 case Intrinsic::amdgcn_lds_kernel_id: {
9883 if (MFI->isEntryFunction())
9884 return getLDSKernelId(DAG, DL);
9885 return getPreloadedValue(DAG, *MFI, VT,
9886 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
9887 }
9888 case Intrinsic::amdgcn_workitem_id_x:
9889 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
9890 case Intrinsic::amdgcn_workitem_id_y:
9891 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
9892 case Intrinsic::amdgcn_workitem_id_z:
9893 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
9894 case Intrinsic::amdgcn_wavefrontsize:
9895 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
9896 SDLoc(Op), MVT::i32);
9897 case Intrinsic::amdgcn_s_buffer_load: {
9898 unsigned CPol = Op.getConstantOperandVal(3);
9899 // s_buffer_load, because of how it's optimized, can't be volatile
9900 // so reject ones with the volatile bit set.
9901 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
9902 ? AMDGPU::CPol::ALL
9903 : AMDGPU::CPol::ALL_pregfx12))
9904 return Op;
9905 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
9906 Op.getOperand(3), DAG);
9907 }
9908 case Intrinsic::amdgcn_fdiv_fast:
9909 return lowerFDIV_FAST(Op, DAG);
9910 case Intrinsic::amdgcn_sin:
9911 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
9912
9913 case Intrinsic::amdgcn_cos:
9914 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
9915
9916 case Intrinsic::amdgcn_mul_u24:
9917 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
9918 Op.getOperand(2));
9919 case Intrinsic::amdgcn_mul_i24:
9920 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
9921 Op.getOperand(2));
9922
9923 case Intrinsic::amdgcn_log_clamp: {
9924 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9925 return SDValue();
9926
9927 return emitRemovedIntrinsicError(DAG, DL, VT);
9928 }
9929 case Intrinsic::amdgcn_fract:
9930 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
9931
9932 case Intrinsic::amdgcn_class:
9933 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
9934 Op.getOperand(2));
9935 case Intrinsic::amdgcn_div_fmas:
9936 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
9937 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9938
9939 case Intrinsic::amdgcn_div_fixup:
9940 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
9941 Op.getOperand(2), Op.getOperand(3));
9942
9943 case Intrinsic::amdgcn_div_scale: {
9944 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
9945
9946 // Translate to the operands expected by the machine instruction. The
9947 // first parameter must be the same as the first instruction.
9948 SDValue Numerator = Op.getOperand(1);
9949 SDValue Denominator = Op.getOperand(2);
9950
9951 // Note this order is opposite of the machine instruction's operations,
9952 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
9953 // intrinsic has the numerator as the first operand to match a normal
9954 // division operation.
9955
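// For example, @llvm.amdgcn.div.scale(%a, %b, i1 true) selects the numerator,
// so the node below is built with operands (%a, %b, %a).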
9956 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
9957
9958 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
9959 Denominator, Numerator);
9960 }
9961 case Intrinsic::amdgcn_icmp: {
9962 // There is a Pat that handles this variant, so return it as-is.
9963 if (Op.getOperand(1).getValueType() == MVT::i1 &&
9964 Op.getConstantOperandVal(2) == 0 &&
9965 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
9966 return Op;
9967 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
9968 }
9969 case Intrinsic::amdgcn_fcmp: {
9970 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
9971 }
9972 case Intrinsic::amdgcn_ballot:
9973 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
9974 case Intrinsic::amdgcn_fmed3:
9975 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
9976 Op.getOperand(2), Op.getOperand(3));
9977 case Intrinsic::amdgcn_fdot2:
9978 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
9979 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9980 case Intrinsic::amdgcn_fmul_legacy:
9981 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
9982 Op.getOperand(2));
9983 case Intrinsic::amdgcn_sffbh:
9984 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
9985 case Intrinsic::amdgcn_sbfe:
9986 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
9987 Op.getOperand(2), Op.getOperand(3));
9988 case Intrinsic::amdgcn_ubfe:
9989 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
9990 Op.getOperand(2), Op.getOperand(3));
9991 case Intrinsic::amdgcn_cvt_pkrtz:
9992 case Intrinsic::amdgcn_cvt_pknorm_i16:
9993 case Intrinsic::amdgcn_cvt_pknorm_u16:
9994 case Intrinsic::amdgcn_cvt_pk_i16:
9995 case Intrinsic::amdgcn_cvt_pk_u16: {
9996 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
9997 EVT VT = Op.getValueType();
9998 unsigned Opcode;
9999
10000 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10001 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
10002 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10003 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
10004 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10005 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
10006 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10007 Opcode = AMDGPUISD::CVT_PK_I16_I32;
10008 else
10009 Opcode = AMDGPUISD::CVT_PK_U16_U32;
10010
10011 if (isTypeLegal(VT))
10012 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
10013
10014 SDValue Node =
10015 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
10016 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
10017 }
10018 case Intrinsic::amdgcn_fmad_ftz:
10019 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
10020 Op.getOperand(2), Op.getOperand(3));
10021
10022 case Intrinsic::amdgcn_if_break:
10023 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
10024 Op->getOperand(1), Op->getOperand(2)),
10025 0);
10026
10027 case Intrinsic::amdgcn_groupstaticsize: {
10028 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
10029 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
10030 return Op;
10031
10032 const Module *M = MF.getFunction().getParent();
10033 const GlobalValue *GV =
10034 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
10035 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
10036 SIInstrInfo::MO_ABS32_LO);
10037 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10038 }
10039 case Intrinsic::amdgcn_is_shared:
10040 case Intrinsic::amdgcn_is_private: {
10041 SDLoc SL(Op);
10042 SDValue SrcVec =
10043 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10044 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
10045 DAG.getConstant(1, SL, MVT::i32));
10046
10047 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10048 ? AMDGPUAS::LOCAL_ADDRESS
10049 : AMDGPUAS::PRIVATE_ADDRESS;
10050 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
10051 Subtarget->hasGloballyAddressableScratch()) {
10052 SDValue FlatScratchBaseHi(
10053 DAG.getMachineNode(
10054 AMDGPU::S_MOV_B32, DL, MVT::i32,
10055 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10056 0);
10057 // Test bits 63..58 against the aperture address.
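      // SrcHi ^ FlatScratchBaseHi < (1 << 26) holds iff bits 31..26 of the two
      // high dwords (i.e. bits 63..58 of the full address) are equal.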
10058 return DAG.getSetCC(
10059 SL, MVT::i1,
10060 DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
10061 DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
10062 }
10063
10064 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10065 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
10066 }
10067 case Intrinsic::amdgcn_perm:
10068 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
10069 Op.getOperand(2), Op.getOperand(3));
10070 case Intrinsic::amdgcn_reloc_constant: {
10071 Module *M = MF.getFunction().getParent();
10072 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
10073 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
10074 auto *RelocSymbol = cast<GlobalVariable>(
10075 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
10076 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
10077 SIInstrInfo::MO_ABS32_LO);
10078 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10079 }
10080 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10081 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10082 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10083 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10084 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10085 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10086 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10087 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10088 if (Op.getOperand(4).getValueType() == MVT::i32)
10089 return SDValue();
10090
10091 SDLoc SL(Op);
10092 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
10093 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10094 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10095 Op.getOperand(3), IndexKeyi32);
10096 }
10097 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10098 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10099 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10100 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10101 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10102 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10103 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10104 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10105 if (Op.getOperand(4).getValueType() == MVT::i64)
10106 return SDValue();
10107
10108 SDLoc SL(Op);
10109 auto IndexKeyi64 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
10110 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10111 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10112 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10113 Op.getOperand(6)});
10114 }
10115 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10116 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10117 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10118 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10119 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10120 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10121 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10122 ? MVT::i64
10123 : MVT::i32;
10124 if (Op.getOperand(6).getValueType() == IndexKeyTy)
10125 return SDValue();
10126
10127 SDLoc SL(Op);
10128 auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
10129 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10130 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10131 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10132 IndexKey, Op.getOperand(7),
10133 Op.getOperand(8)}); // No clamp operand
10134 }
10135 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10136 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10137 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10138 if (Op.getOperand(6).getValueType() == MVT::i32)
10139 return SDValue();
10140
10141 SDLoc SL(Op);
10142 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
10143 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10144 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10145 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10146 IndexKeyi32, Op.getOperand(7)});
10147 }
10148 case Intrinsic::amdgcn_addrspacecast_nonnull:
10149 return lowerADDRSPACECAST(Op, DAG);
10150 case Intrinsic::amdgcn_readlane:
10151 case Intrinsic::amdgcn_readfirstlane:
10152 case Intrinsic::amdgcn_writelane:
10153 case Intrinsic::amdgcn_permlane16:
10154 case Intrinsic::amdgcn_permlanex16:
10155 case Intrinsic::amdgcn_permlane64:
10156 case Intrinsic::amdgcn_set_inactive:
10157 case Intrinsic::amdgcn_set_inactive_chain_arg:
10158 case Intrinsic::amdgcn_mov_dpp8:
10159 case Intrinsic::amdgcn_update_dpp:
10160 return lowerLaneOp(*this, Op.getNode(), DAG);
10161 case Intrinsic::amdgcn_dead: {
10163 for (const EVT ValTy : Op.getNode()->values())
10164 Poisons.push_back(DAG.getPOISON(ValTy));
10165 return DAG.getMergeValues(Poisons, SDLoc(Op));
10166 }
10167 default:
10168 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10169 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
10170 return lowerImage(Op, ImageDimIntr, DAG, false);
10171
10172 return Op;
10173 }
10174}
10175
10176// On targets not supporting constant in soffset field, turn zero to
10177// SGPR_NULL to avoid generating an extra s_mov with zero.
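// SGPR_NULL reads as zero, so substituting it for a zero soffset does not
// change the computed address.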
10178 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
10179 const GCNSubtarget *Subtarget) {
10180 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
10181 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10182 return SOffset;
10183}
10184
10185SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
10186 SelectionDAG &DAG,
10187 unsigned NewOpcode) const {
10188 SDLoc DL(Op);
10189
10190 SDValue VData = Op.getOperand(2);
10191 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10192 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10193 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10194 SDValue Ops[] = {
10195 Op.getOperand(0), // Chain
10196 VData, // vdata
10197 Rsrc, // rsrc
10198 DAG.getConstant(0, DL, MVT::i32), // vindex
10199 VOffset, // voffset
10200 SOffset, // soffset
10201 Offset, // offset
10202 Op.getOperand(6), // cachepolicy
10203 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10204 };
10205
10206 auto *M = cast<MemSDNode>(Op);
10207
10208 EVT MemVT = VData.getValueType();
10209 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10210 M->getMemOperand());
10211}
10212
10213SDValue
10214SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
10215 unsigned NewOpcode) const {
10216 SDLoc DL(Op);
10217
10218 SDValue VData = Op.getOperand(2);
10219 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10220 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10221 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10222 SDValue Ops[] = {
10223 Op.getOperand(0), // Chain
10224 VData, // vdata
10225 Rsrc, // rsrc
10226 Op.getOperand(4), // vindex
10227 VOffset, // voffset
10228 SOffset, // soffset
10229 Offset, // offset
10230 Op.getOperand(7), // cachepolicy
10231 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10232 };
10233
10234 auto *M = cast<MemSDNode>(Op);
10235
10236 EVT MemVT = VData.getValueType();
10237 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10238 M->getMemOperand());
10239}
10240
10241SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
10242 SelectionDAG &DAG) const {
10243 unsigned IntrID = Op.getConstantOperandVal(1);
10244 SDLoc DL(Op);
10245
10246 switch (IntrID) {
10247 case Intrinsic::amdgcn_ds_ordered_add:
10248 case Intrinsic::amdgcn_ds_ordered_swap: {
10249 MemSDNode *M = cast<MemSDNode>(Op);
10250 SDValue Chain = M->getOperand(0);
10251 SDValue M0 = M->getOperand(2);
10252 SDValue Value = M->getOperand(3);
10253 unsigned IndexOperand = M->getConstantOperandVal(7);
10254 unsigned WaveRelease = M->getConstantOperandVal(8);
10255 unsigned WaveDone = M->getConstantOperandVal(9);
10256
10257 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10258 IndexOperand &= ~0x3f;
10259 unsigned CountDw = 0;
10260
10261 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
10262 CountDw = (IndexOperand >> 24) & 0xf;
10263 IndexOperand &= ~(0xf << 24);
10264
10265 if (CountDw < 1 || CountDw > 4) {
10266 const Function &Fn = DAG.getMachineFunction().getFunction();
10267 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10268 Fn, "ds_ordered_count: dword count must be between 1 and 4",
10269 DL.getDebugLoc()));
10270 CountDw = 1;
10271 }
10272 }
10273
10274 if (IndexOperand) {
10275 const Function &Fn = DAG.getMachineFunction().getFunction();
10276 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10277 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
10278 }
10279
10280 if (WaveDone && !WaveRelease) {
10281 // TODO: Move this to IR verifier
10282 const Function &Fn = DAG.getMachineFunction().getFunction();
10283 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10284 Fn, "ds_ordered_count: wave_done requires wave_release",
10285 DL.getDebugLoc()));
10286 }
10287
10288 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10289 unsigned ShaderType =
10290 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
10291 unsigned Offset0 = OrderedCountIndex << 2;
10292 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10293
10294 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
10295 Offset1 |= (CountDw - 1) << 6;
10296
10297 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
10298 Offset1 |= ShaderType << 2;
10299
10300 unsigned Offset = Offset0 | (Offset1 << 8);
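    // Worked example (illustrative values): amdgcn_ds_ordered_add with
    // OrderedCountIndex = 1, WaveRelease = 1, WaveDone = 0 and CountDw = 1 on
    // gfx11 gives Offset0 = 0x4 and Offset1 = 0x1, i.e. Offset = 0x104.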
10301
10302 SDValue Ops[] = {
10303 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
10304 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
10305 };
10306 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
10307 M->getVTList(), Ops, M->getMemoryVT(),
10308 M->getMemOperand());
10309 }
10310 case Intrinsic::amdgcn_raw_buffer_load:
10311 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10312 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10313 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10314 case Intrinsic::amdgcn_raw_buffer_load_format:
10315 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10316 const bool IsFormat =
10317 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10318 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10319
10320 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10321 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10322 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10323 SDValue Ops[] = {
10324 Op.getOperand(0), // Chain
10325 Rsrc, // rsrc
10326 DAG.getConstant(0, DL, MVT::i32), // vindex
10327 VOffset, // voffset
10328 SOffset, // soffset
10329 Offset, // offset
10330 Op.getOperand(5), // cachepolicy, swizzled buffer
10331 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10332 };
10333
10334 auto *M = cast<MemSDNode>(Op);
10335 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
10336 }
10337 case Intrinsic::amdgcn_struct_buffer_load:
10338 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10339 case Intrinsic::amdgcn_struct_buffer_load_format:
10340 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10341 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10342 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10343 const bool IsFormat =
10344 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10345 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10346
10347 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10348 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10349 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10350 SDValue Ops[] = {
10351 Op.getOperand(0), // Chain
10352 Rsrc, // rsrc
10353 Op.getOperand(3), // vindex
10354 VOffset, // voffset
10355 SOffset, // soffset
10356 Offset, // offset
10357 Op.getOperand(6), // cachepolicy, swizzled buffer
10358 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10359 };
10360
10361 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
10362 }
10363 case Intrinsic::amdgcn_raw_tbuffer_load:
10364 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10365 MemSDNode *M = cast<MemSDNode>(Op);
10366 EVT LoadVT = Op.getValueType();
10367 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10368 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10369 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10370
10371 SDValue Ops[] = {
10372 Op.getOperand(0), // Chain
10373 Rsrc, // rsrc
10374 DAG.getConstant(0, DL, MVT::i32), // vindex
10375 VOffset, // voffset
10376 SOffset, // soffset
10377 Offset, // offset
10378 Op.getOperand(5), // format
10379 Op.getOperand(6), // cachepolicy, swizzled buffer
10380 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10381 };
10382
10383 if (LoadVT.getScalarType() == MVT::f16)
10384 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10385 Ops);
10386 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10387 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10388 DAG);
10389 }
10390 case Intrinsic::amdgcn_struct_tbuffer_load:
10391 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10392 MemSDNode *M = cast<MemSDNode>(Op);
10393 EVT LoadVT = Op.getValueType();
10394 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10395 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10396 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10397
10398 SDValue Ops[] = {
10399 Op.getOperand(0), // Chain
10400 Rsrc, // rsrc
10401 Op.getOperand(3), // vindex
10402 VOffset, // voffset
10403 SOffset, // soffset
10404 Offset, // offset
10405 Op.getOperand(6), // format
10406 Op.getOperand(7), // cachepolicy, swizzled buffer
10407 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10408 };
10409
10410 if (LoadVT.getScalarType() == MVT::f16)
10411 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10412 Ops);
10413 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10414 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10415 DAG);
10416 }
10417 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10418 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10419 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
10420 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10421 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10422 return lowerStructBufferAtomicIntrin(Op, DAG,
10423 AMDGPUISD::BUFFER_ATOMIC_FADD);
10424 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10425 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10426 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
10427 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10428 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10429 return lowerStructBufferAtomicIntrin(Op, DAG,
10430 AMDGPUISD::BUFFER_ATOMIC_FMIN);
10431 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10432 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10433 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
10434 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10435 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10436 return lowerStructBufferAtomicIntrin(Op, DAG,
10437 AMDGPUISD::BUFFER_ATOMIC_FMAX);
10438 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10439 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10440 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
10441 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10442 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10443 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10444 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10445 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10446 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10447 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10448 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10449 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
10450 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10451 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10452 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
10453 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10454 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10455 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
10456 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10457 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10458 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
10459 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10460 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10461 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10462 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10463 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10464 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10465 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10466 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10467 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10468 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10469 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10470 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10471 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10472 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10473 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10474 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10475 return lowerRawBufferAtomicIntrin(Op, DAG,
10476 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10477 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10478 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10479 return lowerStructBufferAtomicIntrin(Op, DAG,
10480 AMDGPUISD::BUFFER_ATOMIC_SWAP);
10481 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10482 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10483 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10484 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10485 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10486 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10487 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10488 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10489 return lowerStructBufferAtomicIntrin(Op, DAG,
10490 AMDGPUISD::BUFFER_ATOMIC_SMIN);
10491 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10492 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10493 return lowerStructBufferAtomicIntrin(Op, DAG,
10494 AMDGPUISD::BUFFER_ATOMIC_UMIN);
10495 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10496 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10497 return lowerStructBufferAtomicIntrin(Op, DAG,
10498 AMDGPUISD::BUFFER_ATOMIC_SMAX);
10499 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10500 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10501 return lowerStructBufferAtomicIntrin(Op, DAG,
10502 AMDGPUISD::BUFFER_ATOMIC_UMAX);
10503 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10504 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10505 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10506 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10507 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10508 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10509 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10510 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10511 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10512 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10513 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10514 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10515 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10516 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10517 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10518 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10519 return lowerStructBufferAtomicIntrin(Op, DAG,
10520 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10521
10522 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10523 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10524 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
10525 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10526 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10527 SDValue Ops[] = {
10528 Op.getOperand(0), // Chain
10529 Op.getOperand(2), // src
10530 Op.getOperand(3), // cmp
10531 Rsrc, // rsrc
10532 DAG.getConstant(0, DL, MVT::i32), // vindex
10533 VOffset, // voffset
10534 SOffset, // soffset
10535 Offset, // offset
10536 Op.getOperand(7), // cachepolicy
10537 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10538 };
10539 EVT VT = Op.getValueType();
10540 auto *M = cast<MemSDNode>(Op);
10541
10542 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10543 Op->getVTList(), Ops, VT,
10544 M->getMemOperand());
10545 }
10546 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10547 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10548 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
10549 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
10550 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
10551 SDValue Ops[] = {
10552 Op.getOperand(0), // Chain
10553 Op.getOperand(2), // src
10554 Op.getOperand(3), // cmp
10555 Rsrc, // rsrc
10556 Op.getOperand(5), // vindex
10557 VOffset, // voffset
10558 SOffset, // soffset
10559 Offset, // offset
10560 Op.getOperand(8), // cachepolicy
10561 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10562 };
10563 EVT VT = Op.getValueType();
10564 auto *M = cast<MemSDNode>(Op);
10565
10566 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10567 Op->getVTList(), Ops, VT,
10568 M->getMemOperand());
10569 }
10570 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10571 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10572 MemSDNode *M = cast<MemSDNode>(Op);
10573 SDValue NodePtr = M->getOperand(2);
10574 SDValue RayExtent = M->getOperand(3);
10575 SDValue InstanceMask = M->getOperand(4);
10576 SDValue RayOrigin = M->getOperand(5);
10577 SDValue RayDir = M->getOperand(6);
10578 SDValue Offsets = M->getOperand(7);
10579 SDValue TDescr = M->getOperand(8);
10580
10581 assert(NodePtr.getValueType() == MVT::i64);
10582 assert(RayDir.getValueType() == MVT::v3f32);
10583
10584 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10585 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10586 return SDValue();
10587 }
10588
10589 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10590 const unsigned NumVDataDwords = 10;
10591 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10592 int Opcode = AMDGPU::getMIMGOpcode(
10593 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10594 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10595 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10596 assert(Opcode != -1);
10597
10599 Ops.push_back(NodePtr);
10600 Ops.push_back(DAG.getBuildVector(
10601 MVT::v2i32, DL,
10602 {DAG.getBitcast(MVT::i32, RayExtent),
10603 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10604 Ops.push_back(RayOrigin);
10605 Ops.push_back(RayDir);
10606 Ops.push_back(Offsets);
10607 Ops.push_back(TDescr);
10608 Ops.push_back(M->getChain());
10609
10610 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10611 MachineMemOperand *MemRef = M->getMemOperand();
10612 DAG.setNodeMemRefs(NewNode, {MemRef});
10613 return SDValue(NewNode, 0);
10614 }
10615 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10616 MemSDNode *M = cast<MemSDNode>(Op);
10617 SDValue NodePtr = M->getOperand(2);
10618 SDValue RayExtent = M->getOperand(3);
10619 SDValue RayOrigin = M->getOperand(4);
10620 SDValue RayDir = M->getOperand(5);
10621 SDValue RayInvDir = M->getOperand(6);
10622 SDValue TDescr = M->getOperand(7);
10623
10624 assert(NodePtr.getValueType() == MVT::i32 ||
10625 NodePtr.getValueType() == MVT::i64);
10626 assert(RayDir.getValueType() == MVT::v3f16 ||
10627 RayDir.getValueType() == MVT::v3f32);
10628
10629 if (!Subtarget->hasGFX10_AEncoding()) {
10630 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10631 return SDValue();
10632 }
10633
10634 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
10635 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
10636 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10637 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
10638 const bool Is64 = NodePtr.getValueType() == MVT::i64;
10639 const unsigned NumVDataDwords = 4;
10640 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10641 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10642 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10643 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
10644 IsGFX12Plus;
10645 const unsigned BaseOpcodes[2][2] = {
10646 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10647 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10648 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10649 int Opcode;
10650 if (UseNSA) {
10651 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10652 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10653 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10654 : AMDGPU::MIMGEncGfx10NSA,
10655 NumVDataDwords, NumVAddrDwords);
10656 } else {
10657 assert(!IsGFX12Plus);
10658 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10659 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10660 : AMDGPU::MIMGEncGfx10Default,
10661 NumVDataDwords, NumVAddrDwords);
10662 }
10663 assert(Opcode != -1);
10664
10665 SmallVector<SDValue, 16> Ops;
10666
10667 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
10668 SmallVector<SDValue, 3> Lanes;
10669 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
10670 if (Lanes[0].getValueSizeInBits() == 32) {
10671 for (unsigned I = 0; I < 3; ++I)
10672 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
10673 } else {
10674 if (IsAligned) {
10675 Ops.push_back(DAG.getBitcast(
10676 MVT::i32,
10677 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
10678 Ops.push_back(Lanes[2]);
10679 } else {
10680 SDValue Elt0 = Ops.pop_back_val();
10681 Ops.push_back(DAG.getBitcast(
10682 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
10683 Ops.push_back(DAG.getBitcast(
10684 MVT::i32,
10685 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
10686 }
10687 }
10688 };
10689
10690 if (UseNSA && IsGFX11Plus) {
10691 Ops.push_back(NodePtr);
10692 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10693 Ops.push_back(RayOrigin);
10694 if (IsA16) {
10695 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
10696 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
10697 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
10698 for (unsigned I = 0; I < 3; ++I) {
10699 MergedLanes.push_back(DAG.getBitcast(
10700 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
10701 {DirLanes[I], InvDirLanes[I]})));
10702 }
10703 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
10704 } else {
10705 Ops.push_back(RayDir);
10706 Ops.push_back(RayInvDir);
10707 }
10708 } else {
10709 if (Is64)
10710 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
10711 2);
10712 else
10713 Ops.push_back(NodePtr);
10714
10715 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10716 packLanes(RayOrigin, true);
10717 packLanes(RayDir, true);
10718 packLanes(RayInvDir, false);
10719 }
10720
10721 if (!UseNSA) {
10722 // Build a single vector containing all the operands so far prepared.
10723 if (NumVAddrDwords > 12) {
10724 SDValue Undef = DAG.getPOISON(MVT::i32);
10725 Ops.append(16 - Ops.size(), Undef);
10726 }
10727 assert(Ops.size() >= 8 && Ops.size() <= 12);
10728 SDValue MergedOps =
10729 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
10730 Ops.clear();
10731 Ops.push_back(MergedOps);
10732 }
10733
10734 Ops.push_back(TDescr);
10735 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
10736 Ops.push_back(M->getChain());
10737
10738 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10739 MachineMemOperand *MemRef = M->getMemOperand();
10740 DAG.setNodeMemRefs(NewNode, {MemRef});
10741 return SDValue(NewNode, 0);
10742 }
10743 case Intrinsic::amdgcn_global_atomic_fmin_num:
10744 case Intrinsic::amdgcn_global_atomic_fmax_num:
10745 case Intrinsic::amdgcn_flat_atomic_fmin_num:
10746 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10747 MemSDNode *M = cast<MemSDNode>(Op);
10748 SDValue Ops[] = {
10749 M->getOperand(0), // Chain
10750 M->getOperand(2), // Ptr
10751 M->getOperand(3) // Value
10752 };
10753 unsigned Opcode = 0;
10754 switch (IntrID) {
10755 case Intrinsic::amdgcn_global_atomic_fmin_num:
10756 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10757 Opcode = ISD::ATOMIC_LOAD_FMIN;
10758 break;
10759 }
10760 case Intrinsic::amdgcn_global_atomic_fmax_num:
10761 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10762 Opcode = ISD::ATOMIC_LOAD_FMAX;
10763 break;
10764 }
10765 default:
10766 llvm_unreachable("unhandled atomic opcode");
10767 }
10768 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
10769 Ops, M->getMemOperand());
10770 }
10771 case Intrinsic::amdgcn_s_get_barrier_state:
10772 case Intrinsic::amdgcn_s_get_named_barrier_state: {
10773 SDValue Chain = Op->getOperand(0);
10775 unsigned Opc;
10776
10777 if (isa<ConstantSDNode>(Op->getOperand(2))) {
10778 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
10779 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10780 BarID = (BarID >> 4) & 0x3F;
10781 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10782 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10783 Ops.push_back(K);
10784 Ops.push_back(Chain);
10785 } else {
10786 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10787 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
10788 SDValue M0Val;
10789 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
10790 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10791 M0Val = SDValue(
10792 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10793 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10794 0);
10795 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10796 } else
10797 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
10798 }
10799
10800 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10801 return SDValue(NewMI, 0);
10802 }
10803 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
10804 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
10805 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
10806 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
10807 SDValue Chain = Op->getOperand(0);
10808 SDValue Ptr = Op->getOperand(2);
10809 EVT VT = Op->getValueType(0);
10810 return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
10811 Chain, Ptr, MII->getMemOperand());
10812 }
10813 default:
10814
10815 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10816 AMDGPU::getImageDimIntrinsicInfo(IntrID))
10817 return lowerImage(Op, ImageDimIntr, DAG, true);
10818
10819 return SDValue();
10820 }
10821}
10822
10823// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
10824// dwordx4 if on SI and handle TFE loads.
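// For example, a TFE load with result type (v2f32, i32 status) is emitted as a
// v3i32 intrinsic: dwords 0-1 are bitcast back to v2f32 and dword 2 becomes
// the status value.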
10825SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
10826 SDVTList VTList,
10827 ArrayRef<SDValue> Ops, EVT MemVT,
10828 MachineMemOperand *MMO,
10829 SelectionDAG &DAG) const {
10830 LLVMContext &C = *DAG.getContext();
10831 MachineFunction &MF = DAG.getMachineFunction();
10832 EVT VT = VTList.VTs[0];
10833
10834 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
10835 bool IsTFE = VTList.NumVTs == 3;
10836 if (IsTFE) {
10837 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
10838 unsigned NumOpDWords = NumValueDWords + 1;
10839 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
10840 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
10841 MachineMemOperand *OpDWordsMMO =
10842 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
10843 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
10844 OpDWordsVT, OpDWordsMMO, DAG);
10845 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10846 DAG.getVectorIdxConstant(NumValueDWords, DL));
10847 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
10848 SDValue ValueDWords =
10849 NumValueDWords == 1
10850 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
10851 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
10852 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
10853 ZeroIdx);
10854 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
10855 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10856 }
10857
10858 if (!Subtarget->hasDwordx3LoadStores() &&
10859 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10860 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
10861 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
10862 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
10863 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
10864 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
10865 WidenedMemVT, WidenedMMO);
10866 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
10867 DAG.getVectorIdxConstant(0, DL));
10868 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
10869 }
10870
10871 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
10872}
10873
10874SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
10875 bool ImageStore) const {
10876 EVT StoreVT = VData.getValueType();
10877
10878 // No change for f16 and legal vector D16 types.
10879 if (!StoreVT.isVector())
10880 return VData;
10881
10882 SDLoc DL(VData);
10883 unsigned NumElements = StoreVT.getVectorNumElements();
10884
10885 if (Subtarget->hasUnpackedD16VMem()) {
10886 // We need to unpack the packed data to store.
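    // For example, v2f16 data is bitcast to v2i16 and zero-extended
    // element-wise to v2i32, so each half occupies the low 16 bits of its own
    // dword.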
10887 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10888 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10889
10890 EVT EquivStoreVT =
10891 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
10892 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
10893 return DAG.UnrollVectorOp(ZExt.getNode());
10894 }
10895
10896 // The sq block of gfx8.1 does not estimate register use correctly for d16
10897 // image store instructions. The data operand is computed as if it were not a
10898 // d16 image instruction.
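  // For example, a v4f16 store value is repacked below as two i32 lanes and
  // then padded with poison i32s back up to the original element count, so the
  // data operand occupies as many registers as an unpacked store would.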
10899 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
10900 // Bitcast to i16
10901 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10902 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10903
10904 // Decompose into scalars
10905 SmallVector<SDValue, 4> Elts;
10906 DAG.ExtractVectorElements(IntVData, Elts);
10907
10908 // Group pairs of i16 into v2i16 and bitcast to i32
10909 SmallVector<SDValue, 4> PackedElts;
10910 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
10911 SDValue Pair =
10912 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
10913 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10914 PackedElts.push_back(IntPair);
10915 }
10916 if ((NumElements % 2) == 1) {
10917 // Handle v3i16
10918 unsigned I = Elts.size() / 2;
10919 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
10920 {Elts[I * 2], DAG.getPOISON(MVT::i16)});
10921 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10922 PackedElts.push_back(IntPair);
10923 }
10924
10925 // Pad using UNDEF
10926 PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));
10927
10928 // Build final vector
10929 EVT VecVT =
10930 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
10931 return DAG.getBuildVector(VecVT, DL, PackedElts);
10932 }
10933
10934 if (NumElements == 3) {
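    // For example, v3f16 (48 bits) is bitcast to i48, zero-extended to i64,
    // and bitcast back to v4f16 (the element count widened to NumElements + 1
    // below).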
10935 EVT IntStoreVT =
10936 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
10937 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10938
10939 EVT WidenedStoreVT = EVT::getVectorVT(
10940 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
10941 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
10942 WidenedStoreVT.getStoreSizeInBits());
10943 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
10944 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
10945 }
10946
10947 assert(isTypeLegal(StoreVT));
10948 return VData;
10949}
10950
10951SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
10952 SelectionDAG &DAG) const {
10953 SDLoc DL(Op);
10954 SDValue Chain = Op.getOperand(0);
10955 unsigned IntrinsicID = Op.getConstantOperandVal(1);
10956 MachineFunction &MF = DAG.getMachineFunction();
10957
10958 switch (IntrinsicID) {
10959 case Intrinsic::amdgcn_exp_compr: {
10960 if (!Subtarget->hasCompressedExport()) {
10961 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10963 "intrinsic not supported on subtarget", DL.getDebugLoc()));
10964 }
10965 SDValue Src0 = Op.getOperand(4);
10966 SDValue Src1 = Op.getOperand(5);
10967 // Hack around illegal type on SI by directly selecting it.
10968 if (isTypeLegal(Src0.getValueType()))
10969 return SDValue();
10970
10971 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
10972 SDValue Undef = DAG.getPOISON(MVT::f32);
10973 const SDValue Ops[] = {
10974 Op.getOperand(2), // tgt
10975 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
10976 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
10977 Undef, // src2
10978 Undef, // src3
10979 Op.getOperand(7), // vm
10980 DAG.getTargetConstant(1, DL, MVT::i1), // compr
10981 Op.getOperand(3), // en
10982 Op.getOperand(0) // Chain
10983 };
10984
10985 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
10986 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
10987 }
10988
10989 case Intrinsic::amdgcn_struct_tbuffer_store:
10990 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
10991 SDValue VData = Op.getOperand(2);
10992 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
10993 if (IsD16)
10994 VData = handleD16VData(VData, DAG);
10995 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10996 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10997 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10998 SDValue Ops[] = {
10999 Chain,
11000 VData, // vdata
11001 Rsrc, // rsrc
11002 Op.getOperand(4), // vindex
11003 VOffset, // voffset
11004 SOffset, // soffset
11005 Offset, // offset
11006 Op.getOperand(7), // format
11007 Op.getOperand(8), // cachepolicy, swizzled buffer
11008 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11009 };
11010 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11011 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11012 MemSDNode *M = cast<MemSDNode>(Op);
11013 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11014 M->getMemoryVT(), M->getMemOperand());
11015 }
11016
11017 case Intrinsic::amdgcn_raw_tbuffer_store:
11018 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11019 SDValue VData = Op.getOperand(2);
11020 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11021 if (IsD16)
11022 VData = handleD16VData(VData, DAG);
11023 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11024 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11025 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11026 SDValue Ops[] = {
11027 Chain,
11028 VData, // vdata
11029 Rsrc, // rsrc
11030 DAG.getConstant(0, DL, MVT::i32), // vindex
11031 VOffset, // voffset
11032 SOffset, // soffset
11033 Offset, // offset
11034 Op.getOperand(6), // format
11035 Op.getOperand(7), // cachepolicy, swizzled buffer
11036 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11037 };
11038 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11039 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11040 MemSDNode *M = cast<MemSDNode>(Op);
11041 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11042 M->getMemoryVT(), M->getMemOperand());
11043 }
11044
11045 case Intrinsic::amdgcn_raw_buffer_store:
11046 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11047 case Intrinsic::amdgcn_raw_buffer_store_format:
11048 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11049 const bool IsFormat =
11050 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11051 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11052
11053 SDValue VData = Op.getOperand(2);
11054 EVT VDataVT = VData.getValueType();
11055 EVT EltType = VDataVT.getScalarType();
11056 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11057 if (IsD16) {
11058 VData = handleD16VData(VData, DAG);
11059 VDataVT = VData.getValueType();
11060 }
11061
11062 if (!isTypeLegal(VDataVT)) {
11063 VData =
11064 DAG.getNode(ISD::BITCAST, DL,
11065 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11066 }
11067
11068 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11069 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11070 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11071 SDValue Ops[] = {
11072 Chain,
11073 VData,
11074 Rsrc,
11075 DAG.getConstant(0, DL, MVT::i32), // vindex
11076 VOffset, // voffset
11077 SOffset, // soffset
11078 Offset, // offset
11079 Op.getOperand(6), // cachepolicy, swizzled buffer
11080 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11081 };
11082 unsigned Opc =
11083 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
11084 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11085 MemSDNode *M = cast<MemSDNode>(Op);
11086
11087 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11088 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11089 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
11090
11091 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11092 M->getMemoryVT(), M->getMemOperand());
11093 }
11094
11095 case Intrinsic::amdgcn_struct_buffer_store:
11096 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11097 case Intrinsic::amdgcn_struct_buffer_store_format:
11098 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11099 const bool IsFormat =
11100 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11101 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11102
11103 SDValue VData = Op.getOperand(2);
11104 EVT VDataVT = VData.getValueType();
11105 EVT EltType = VDataVT.getScalarType();
11106 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11107
11108 if (IsD16) {
11109 VData = handleD16VData(VData, DAG);
11110 VDataVT = VData.getValueType();
11111 }
11112
11113 if (!isTypeLegal(VDataVT)) {
11114 VData =
11115 DAG.getNode(ISD::BITCAST, DL,
11116 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11117 }
11118
11119 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11120 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11121 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11122 SDValue Ops[] = {
11123 Chain,
11124 VData,
11125 Rsrc,
11126 Op.getOperand(4), // vindex
11127 VOffset, // voffset
11128 SOffset, // soffset
11129 Offset, // offset
11130 Op.getOperand(7), // cachepolicy, swizzled buffer
11131 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11132 };
11133 unsigned Opc =
11134 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
11135 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11136 MemSDNode *M = cast<MemSDNode>(Op);
11137
11138 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11139 EVT VDataType = VData.getValueType().getScalarType();
11140 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11141 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
11142
11143 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11144 M->getMemoryVT(), M->getMemOperand());
11145 }
11146 case Intrinsic::amdgcn_raw_buffer_load_lds:
11147 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11148 case Intrinsic::amdgcn_struct_buffer_load_lds:
11149 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
11150 if (!Subtarget->hasVMemToLDSLoad())
11151 return SDValue();
11152 unsigned Opc;
11153 bool HasVIndex =
11154 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11155 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
11156 unsigned OpOffset = HasVIndex ? 1 : 0;
11157 SDValue VOffset = Op.getOperand(5 + OpOffset);
11158 bool HasVOffset = !isNullConstant(VOffset);
11159 unsigned Size = Op->getConstantOperandVal(4);
11160
11161 switch (Size) {
11162 default:
11163 return SDValue();
11164 case 1:
11165 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11166 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11167 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11168 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11169 break;
11170 case 2:
11171 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11172 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11173 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11174 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11175 break;
11176 case 4:
11177 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11178 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11179 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11180 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11181 break;
11182 case 12:
11183 if (!Subtarget->hasLDSLoadB96_B128())
11184 return SDValue();
11185 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11186 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11187 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11188 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11189 break;
11190 case 16:
11191 if (!Subtarget->hasLDSLoadB96_B128())
11192 return SDValue();
11193 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11194 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11195 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11196 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
11197 break;
11198 }
11199
11200 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11201
11202 SmallVector<SDValue, 8> Ops;
11203
11204 if (HasVIndex && HasVOffset)
11205 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
11206 {Op.getOperand(5), // VIndex
11207 VOffset}));
11208 else if (HasVIndex)
11209 Ops.push_back(Op.getOperand(5));
11210 else if (HasVOffset)
11211 Ops.push_back(VOffset);
11212
11213 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11214 Ops.push_back(Rsrc);
11215 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
11216 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
11217 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
11218 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
11219 Ops.push_back(DAG.getTargetConstant(
11220 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
11221 DL, MVT::i8)); // cpol
11222 Ops.push_back(DAG.getTargetConstant(
11223 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
11224 ? 1
11225 : 0,
11226 DL, MVT::i8)); // swz
11227 Ops.push_back(M0Val.getValue(0)); // Chain
11228 Ops.push_back(M0Val.getValue(1)); // Glue
11229
11230 auto *M = cast<MemSDNode>(Op);
11231 MachineMemOperand *LoadMMO = M->getMemOperand();
11232 // Don't set the offset value here because the pointer points to the base of
11233 // the buffer.
11234 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11235
11236 MachinePointerInfo StorePtrI = LoadPtrI;
11237 LoadPtrI.V = PoisonValue::get(
11238 PointerType::get(*DAG.getContext(), AMDGPUAS::BUFFER_RESOURCE));
11239 LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
11240 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
11241
11242 auto F = LoadMMO->getFlags() &
11243 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
11244 LoadMMO =
11245 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
11246 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11247
11248 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11249 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
11250 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11251
11252 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
11253 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11254
11255 return SDValue(Load, 0);
11256 }
11257 // Buffers are handled by LowerBufferFatPointers, and we're going to go
11258 // for "trust me" that the remaining cases are global pointers until
11259 // such time as we can put two mem operands on an intrinsic.
11260 case Intrinsic::amdgcn_load_to_lds:
11261 case Intrinsic::amdgcn_global_load_lds: {
11262 if (!Subtarget->hasVMemToLDSLoad())
11263 return SDValue();
11264
11265 unsigned Opc;
11266 unsigned Size = Op->getConstantOperandVal(4);
11267 switch (Size) {
11268 default:
11269 return SDValue();
11270 case 1:
11271 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11272 break;
11273 case 2:
11274 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11275 break;
11276 case 4:
11277 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11278 break;
11279 case 12:
11280 if (!Subtarget->hasLDSLoadB96_B128())
11281 return SDValue();
11282 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11283 break;
11284 case 16:
11285 if (!Subtarget->hasLDSLoadB96_B128())
11286 return SDValue();
11287 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11288 break;
11289 }
11290
11291 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11292
11294
11295 SDValue Addr = Op.getOperand(2); // Global ptr
11296 SDValue VOffset;
11297 // Try to split SAddr and VOffset. Global and LDS pointers share the same
11298 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
11299 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
11300 SDValue LHS = Addr.getOperand(0);
11301 SDValue RHS = Addr.getOperand(1);
11302
11303 if (LHS->isDivergent())
11304 std::swap(LHS, RHS);
11305
11306 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
11307 RHS.getOperand(0).getValueType() == MVT::i32) {
11308 // add (i64 sgpr), (zero_extend (i32 vgpr))
11309 Addr = LHS;
11310 VOffset = RHS.getOperand(0);
11311 }
11312 }
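// As a rough illustration: for an address of the form
//   %addr = add i64 %sgpr_base, (zext i32 %vgpr_off)
// with a uniform %sgpr_base and a divergent %vgpr_off, the split above yields
// Addr = %sgpr_base (usable as saddr) and VOffset = %vgpr_off, while a fully
// divergent address is left as-is and no VOffset is peeled off here.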
11313
11314 Ops.push_back(Addr);
11315 if (!Addr->isDivergent()) {
11316 Opc = AMDGPU::getGlobalSaddrOp(Opc);
11317 if (!VOffset)
11318 VOffset =
11319 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
11320 DAG.getTargetConstant(0, DL, MVT::i32)),
11321 0);
11322 Ops.push_back(VOffset);
11323 }
11324
11325 Ops.push_back(Op.getOperand(5)); // Offset
11326 Ops.push_back(Op.getOperand(6)); // CPol
11327 Ops.push_back(M0Val.getValue(0)); // Chain
11328 Ops.push_back(M0Val.getValue(1)); // Glue
11329
11330 auto *M = cast<MemSDNode>(Op);
11331 MachineMemOperand *LoadMMO = M->getMemOperand();
11332 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11333 LoadPtrI.Offset = Op->getConstantOperandVal(5);
11334 MachinePointerInfo StorePtrI = LoadPtrI;
11335 LoadPtrI.V = PoisonValue::get(
11336 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
11337 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
11338 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
11339 auto F = LoadMMO->getFlags() &
11340 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
11341 LoadMMO =
11342 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
11343 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11344 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11345 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
11346 LoadMMO->getAAInfo());
11347
11348 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11349 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11350
11351 return SDValue(Load, 0);
11352 }
11353 case Intrinsic::amdgcn_end_cf:
11354 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
11355 Op->getOperand(2), Chain),
11356 0);
11357 case Intrinsic::amdgcn_s_barrier_init:
11358 case Intrinsic::amdgcn_s_barrier_signal_var: {
11359 // these two intrinsics have two operands: barrier pointer and member count
11360 SDValue Chain = Op->getOperand(0);
11361 SmallVector<SDValue, 2> Ops;
11362 SDValue BarOp = Op->getOperand(2);
11363 SDValue CntOp = Op->getOperand(3);
11364 SDValue M0Val;
11365 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11366 ? AMDGPU::S_BARRIER_INIT_M0
11367 : AMDGPU::S_BARRIER_SIGNAL_M0;
11368 // extract the BarrierID from bits 4-9 of BarOp
11369 SDValue BarID;
11370 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11371 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11372 BarID =
11373 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
11374 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11375 0);
11376 // Member count should be put into M0[ShAmt:+6]
11377 // Barrier ID should be put into M0[5:0]
11378 M0Val =
11379 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
11380 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11381 0);
11382 constexpr unsigned ShAmt = 16;
11383 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
11384 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
11385
11386 M0Val = SDValue(
11387 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
11388
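// Worked example (illustrative only): for BarOp = 0x150 and CntOp = 8, the
// barrier ID is (0x150 >> 4) & 0x3F = 0x15, the member count lands in bits
// [21:16] as 8 << 16 = 0x80000, and the final M0 value is 0x80015.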
11389 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11390
11391 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11392 return SDValue(NewMI, 0);
11393 }
11394 case Intrinsic::amdgcn_s_barrier_join: {
11395 // This intrinsic has one operand: barrier pointer
11396 SDValue Chain = Op->getOperand(0);
11397 SmallVector<SDValue, 2> Ops;
11398 SDValue BarOp = Op->getOperand(2);
11399 unsigned Opc;
11400
11401 if (isa<ConstantSDNode>(BarOp)) {
11402 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
11403 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11404
11405 // extract the BarrierID from bits 4-9 of the immediate
11406 unsigned BarID = (BarVal >> 4) & 0x3F;
11407 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
11408 Ops.push_back(K);
11409 Ops.push_back(Chain);
11410 } else {
11411 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11412
11413 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
11414 SDValue M0Val;
11415 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11416 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11417 M0Val =
11418 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
11419 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11420 0);
11421 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11422 }
11423
11424 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11425 return SDValue(NewMI, 0);
11426 }
11427 case Intrinsic::amdgcn_s_prefetch_data: {
11428 // For non-global address space preserve the chain and remove the call.
11429 if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
11430 return Op.getOperand(0);
11431 return Op;
11432 }
11433 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11434 SDValue Ops[] = {
11435 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
11436 Op.getOperand(3), // offset
11437 Op.getOperand(4), // length
11438 };
11439
11440 MemSDNode *M = cast<MemSDNode>(Op);
11441 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
11442 Op->getVTList(), Ops, M->getMemoryVT(),
11443 M->getMemOperand());
11444 }
11445 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11446 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11447 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11448 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11449 SDValue Chain = Op->getOperand(0);
11450 SDValue Ptr = Op->getOperand(2);
11451 SDValue Val = Op->getOperand(3);
11452 return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
11453 Ptr, MII->getMemOperand());
11454 }
11455 default: {
11456 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11457 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
11458 return lowerImage(Op, ImageDimIntr, DAG, true);
11459
11460 return Op;
11461 }
11462 }
11463}
11464
11465// Return whether the operation has NoUnsignedWrap property.
11466static bool isNoUnsignedWrap(SDValue Addr) {
11467 return (Addr.getOpcode() == ISD::ADD &&
11468 Addr->getFlags().hasNoUnsignedWrap()) ||
11469 Addr->getOpcode() == ISD::OR;
11470}
11471
11472 bool SITargetLowering::shouldPreservePtrArith(const Function &F,
11473 EVT PtrVT) const {
11474 return UseSelectionDAGPTRADD && PtrVT == MVT::i64;
11475}
11476
11477// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
11478// offset (the offset that is included in bounds checking and swizzling, to be
11479// split between the instruction's voffset and immoffset fields) and soffset
11480// (the offset that is excluded from bounds checking and swizzling, to go in
11481// the instruction's soffset field). This function takes the first kind of
11482// offset and figures out how to split it between voffset and immoffset.
11483std::pair<SDValue, SDValue>
11484SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
11485 SDLoc DL(Offset);
11486 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
11487 SDValue N0 = Offset;
11488 ConstantSDNode *C1 = nullptr;
11489
11490 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
11491 N0 = SDValue();
11492 else if (DAG.isBaseWithConstantOffset(N0)) {
11493 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
11494 // being added, so we can only safely match a 32-bit addition with no
11495 // unsigned overflow.
11496 bool CheckNUW = AMDGPU::isGFX1250(*Subtarget);
11497 if (!CheckNUW || isNoUnsignedWrap(N0)) {
11498 C1 = cast<ConstantSDNode>(N0.getOperand(1));
11499 N0 = N0.getOperand(0);
11500 }
11501 }
11502
11503 if (C1) {
11504 unsigned ImmOffset = C1->getZExtValue();
11505 // If the immediate value is too big for the immoffset field, put only bits
11506 // that would normally fit in the immoffset field. The remaining value that
11507 // is copied/added for the voffset field is a large power of 2, and it
11508 // stands more chance of being CSEd with the copy/add for another similar
11509 // load/store.
11510 // However, do not do that rounding down if that is a negative
11511 // number, as it appears to be illegal to have a negative offset in the
11512 // vgpr, even if adding the immediate offset makes it positive.
11513 unsigned Overflow = ImmOffset & ~MaxImm;
11514 ImmOffset -= Overflow;
11515 if ((int32_t)Overflow < 0) {
11516 Overflow += ImmOffset;
11517 ImmOffset = 0;
11518 }
11519 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
11520 if (Overflow) {
11521 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
11522 if (!N0)
11523 N0 = OverflowVal;
11524 else {
11525 SDValue Ops[] = {N0, OverflowVal};
11526 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
11527 }
11528 }
11529 }
11530 if (!N0)
11531 N0 = DAG.getConstant(0, DL, MVT::i32);
11532 if (!C1)
11533 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
11534 return {N0, SDValue(C1, 0)};
11535}
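// A standalone sketch of the split above on plain integers (illustrative only;
// the helper name is ad hoc, MaxImm stands in for the value returned by
// SIInstrInfo::getMaxMUBUFImmOffset, and <cstdint>/<utility> are assumed to be
// reachable through the existing includes). The first result is the part that
// goes to voffset, the second the part that fits in the immoffset field.
static std::pair<uint32_t, uint32_t> splitBufferOffsetSketch(uint32_t Offset,
                                                             uint32_t MaxImm) {
  uint32_t ImmOffset = Offset;
  uint32_t Overflow = ImmOffset & ~MaxImm; // bits that do not fit in immoffset
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    // A negative value may not live in the voffset VGPR, so fall back to
    // keeping the whole offset there.
    Overflow += ImmOffset;
    ImmOffset = 0;
  }
  return {Overflow, ImmOffset};
}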
11536
11537// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
11538// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
11539// pointed to by Offsets.
11540void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
11541 SelectionDAG &DAG, SDValue *Offsets,
11542 Align Alignment) const {
11543 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11544 SDLoc DL(CombinedOffset);
11545 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
11546 uint32_t Imm = C->getZExtValue();
11547 uint32_t SOffset, ImmOffset;
11548 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11549 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
11550 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11551 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11552 return;
11553 }
11554 }
11555 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
11556 SDValue N0 = CombinedOffset.getOperand(0);
11557 SDValue N1 = CombinedOffset.getOperand(1);
11558 uint32_t SOffset, ImmOffset;
11559 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
11560 if (Offset >= 0 &&
11561 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
11562 Offsets[0] = N0;
11563 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11564 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11565 return;
11566 }
11567 }
11568
11569 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11570 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
11571 : DAG.getConstant(0, DL, MVT::i32);
11572
11573 Offsets[0] = CombinedOffset;
11574 Offsets[1] = SOffsetZero;
11575 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
11576}
11577
11578SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11579 SelectionDAG &DAG) const {
11580 if (!MaybePointer.getValueType().isScalarInteger())
11581 return MaybePointer;
11582
11583 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
11584 return Rsrc;
11585}
11586
11587// Wrap a global or flat pointer into a buffer intrinsic using the flags
11588// specified in the intrinsic.
11589SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
11590 SelectionDAG &DAG) const {
11591 SDLoc Loc(Op);
11592
11593 SDValue Pointer = Op->getOperand(1);
11594 SDValue Stride = Op->getOperand(2);
11595 SDValue NumRecords = Op->getOperand(3);
11596 SDValue Flags = Op->getOperand(4);
11597
11598 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11599 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
11600 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
11601 std::optional<uint32_t> ConstStride = std::nullopt;
11602 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
11603 ConstStride = ConstNode->getZExtValue();
11604
11605 SDValue NewHighHalf = Masked;
11606 if (!ConstStride || *ConstStride != 0) {
11607 SDValue ShiftedStride;
11608 if (ConstStride) {
11609 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
11610 } else {
11611 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
11612 ShiftedStride =
11613 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11614 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
11615 }
11616 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
11617 }
11618
11619 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
11620 NewHighHalf, NumRecords, Flags);
11621 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
11622 return RsrcPtr;
11623}
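// A standalone sketch of how the second descriptor word is assembled above
// (illustrative only; the helper name is ad hoc): the low 16 bits keep the
// upper bits of the base pointer and the stride is packed into bits [31:16].
static uint32_t packRsrcWord1Sketch(uint32_t PointerHi, uint32_t Stride) {
  return (PointerHi & 0x0000ffffu) | (Stride << 16);
}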
11624
11625// Handle 8 bit and 16 bit buffer loads
11626SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
11627 EVT LoadVT, SDLoc DL,
11628 ArrayRef<SDValue> Ops,
11629 MachineMemOperand *MMO,
11630 bool IsTFE) const {
11631 EVT IntVT = LoadVT.changeTypeToInteger();
11632
11633 if (IsTFE) {
11634 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
11635 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
11636 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
11637 MachineFunction &MF = DAG.getMachineFunction();
11638 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
11639 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
11640 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
11641 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11642 DAG.getConstant(1, DL, MVT::i32));
11643 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11644 DAG.getConstant(0, DL, MVT::i32));
11645 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
11646 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
11647 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
11648 }
11649
11650 unsigned Opc = LoadVT.getScalarType() == MVT::i8
11651 ? AMDGPUISD::BUFFER_LOAD_UBYTE
11652 : AMDGPUISD::BUFFER_LOAD_USHORT;
11653
11654 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
11655 SDValue BufferLoad =
11656 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
11657 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
11658 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
11659
11660 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
11661}
11662
11663// Handle 8 bit and 16 bit buffer stores
11664SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
11665 EVT VDataType, SDLoc DL,
11666 SDValue Ops[],
11667 MemSDNode *M) const {
11668 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11669 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
11670
11671 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
11672 Ops[1] = BufferStoreExt;
11673 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
11674 : AMDGPUISD::BUFFER_STORE_SHORT;
11675 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
11676 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
11677 M->getMemOperand());
11678}
11679
11680 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
11681 SDValue Op, const SDLoc &SL, EVT VT) {
11682 if (VT.bitsLT(Op.getValueType()))
11683 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
11684
11685 switch (ExtType) {
11686 case ISD::SEXTLOAD:
11687 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
11688 case ISD::ZEXTLOAD:
11689 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
11690 case ISD::EXTLOAD:
11691 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
11692 case ISD::NON_EXTLOAD:
11693 return Op;
11694 }
11695
11696 llvm_unreachable("invalid ext type");
11697}
11698
11699// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
11700// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
11701SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
11702 DAGCombinerInfo &DCI) const {
11703 SelectionDAG &DAG = DCI.DAG;
11704 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
11705 return SDValue();
11706
11707 // FIXME: Constant loads should all be marked invariant.
11708 unsigned AS = Ld->getAddressSpace();
11709 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
11710 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
11711 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
11712 return SDValue();
11713
11714 // Don't do this early, since it may interfere with adjacent load merging for
11715 // illegal types. We can avoid losing alignment information for exotic types
11716 // pre-legalize.
11717 EVT MemVT = Ld->getMemoryVT();
11718 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
11719 MemVT.getSizeInBits() >= 32)
11720 return SDValue();
11721
11722 SDLoc SL(Ld);
11723
11724 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
11725 "unexpected vector extload");
11726
11727 // TODO: Drop only high part of range.
11728 SDValue Ptr = Ld->getBasePtr();
11729 SDValue NewLoad = DAG.getLoad(
11730 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
11731 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
11732 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
11733 nullptr); // Drop ranges
11734
11735 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
11736 if (MemVT.isFloatingPoint()) {
11737 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
11738 "unexpected fp extload");
11739 TruncVT = MemVT.changeTypeToInteger();
11740 }
11741
11742 SDValue Cvt = NewLoad;
11743 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
11744 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
11745 DAG.getValueType(TruncVT));
11746 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
11747 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
11748 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
11749 } else {
11750 assert(Ld->getExtensionType() == ISD::EXTLOAD);
11751 }
11752
11753 EVT VT = Ld->getValueType(0);
11754 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
11755
11756 DCI.AddToWorklist(Cvt.getNode());
11757
11758 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
11759 // the appropriate extension from the 32-bit load.
11760 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
11761 DCI.AddToWorklist(Cvt.getNode());
11762
11763 // Handle conversion back to floating point if necessary.
11764 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
11765
11766 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
11767}
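// As a rough illustration: a naturally aligned, uniform zextload of an i8 from
// the constant address space is rebuilt here as a 32-bit load followed by a
// zero-extend-in-register of the low 8 bits, which makes it eligible for the
// scalar (SMEM) load path instead of a per-lane byte load.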
11768
11769 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
11770 const SIMachineFunctionInfo &Info) {
11771 // TODO: Should check if the address can definitely not access stack.
11772 if (Info.isEntryFunction())
11773 return Info.getUserSGPRInfo().hasFlatScratchInit();
11774 return true;
11775}
11776
11777SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
11778 SDLoc DL(Op);
11779 LoadSDNode *Load = cast<LoadSDNode>(Op);
11780 ISD::LoadExtType ExtType = Load->getExtensionType();
11781 EVT MemVT = Load->getMemoryVT();
11782 MachineMemOperand *MMO = Load->getMemOperand();
11783
11784 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
11785 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
11786 return SDValue();
11787
11788 // FIXME: Copied from PPC
11789 // First, load into 32 bits, then truncate to 1 bit.
11790
11791 SDValue Chain = Load->getChain();
11792 SDValue BasePtr = Load->getBasePtr();
11793
11794 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
11795
11796 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
11797 RealMemVT, MMO);
11798
11799 if (!MemVT.isVector()) {
11800 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
11801 NewLD.getValue(1)};
11802
11803 return DAG.getMergeValues(Ops, DL);
11804 }
11805
11806 SmallVector<SDValue, 3> Elts;
11807 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
11808 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
11809 DAG.getConstant(I, DL, MVT::i32));
11810
11811 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
11812 }
11813
11814 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
11815
11816 return DAG.getMergeValues(Ops, DL);
11817 }
11818
11819 if (!MemVT.isVector())
11820 return SDValue();
11821
11822 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
11823 "Custom lowering for non-i32 vectors hasn't been implemented.");
11824
11825 Align Alignment = Load->getAlign();
11826 unsigned AS = Load->getAddressSpace();
11827 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
11828 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
11829 return SplitVectorLoad(Op, DAG);
11830 }
11831
11832 MachineFunction &MF = DAG.getMachineFunction();
11833 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11834 // If there is a possibility that flat instruction access scratch memory
11835 // then we need to use the same legalization rules we use for private.
11836 if (AS == AMDGPUAS::FLAT_ADDRESS &&
11837 !Subtarget->hasMultiDwordFlatScratchAddressing())
11838 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
11839 ? AMDGPUAS::PRIVATE_ADDRESS
11840 : AMDGPUAS::GLOBAL_ADDRESS;
11841
11842 unsigned NumElements = MemVT.getVectorNumElements();
11843
11844 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11845 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
11846 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
11847 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
11848 isMemOpHasNoClobberedMemOperand(Load))) {
11849 if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
11850 Alignment >= Align(4) && NumElements < 32) {
11851 if (MemVT.isPow2VectorType() ||
11852 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
11853 return SDValue();
11854 return WidenOrSplitVectorLoad(Op, DAG);
11855 }
11856 // Non-uniform loads will be selected to MUBUF instructions, so they
11857 // have the same legalization requirements as global and private
11858 // loads.
11859 //
11860 }
11861 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11862 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
11863 AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
11864 if (NumElements > 4)
11865 return SplitVectorLoad(Op, DAG);
11866 // v3 loads not supported on SI.
11867 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11868 return WidenOrSplitVectorLoad(Op, DAG);
11869
11870 // v3 and v4 loads are supported for private and global memory.
11871 return SDValue();
11872 }
11873 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11874 // Depending on the setting of the private_element_size field in the
11875 // resource descriptor, we can only make private accesses up to a certain
11876 // size.
11877 switch (Subtarget->getMaxPrivateElementSize()) {
11878 case 4: {
11879 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
11880 return DAG.getMergeValues({Op0, Op1}, DL);
11881 }
11882 case 8:
11883 if (NumElements > 2)
11884 return SplitVectorLoad(Op, DAG);
11885 return SDValue();
11886 case 16:
11887 // Same as global/flat
11888 if (NumElements > 4)
11889 return SplitVectorLoad(Op, DAG);
11890 // v3 loads not supported on SI.
11891 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11892 return WidenOrSplitVectorLoad(Op, DAG);
11893
11894 return SDValue();
11895 default:
11896 llvm_unreachable("unsupported private_element_size");
11897 }
11898 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11899 unsigned Fast = 0;
11900 auto Flags = Load->getMemOperand()->getFlags();
11901 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
11902 Load->getAlign(), Flags, &Fast) &&
11903 Fast > 1)
11904 return SDValue();
11905
11906 if (MemVT.isVector())
11907 return SplitVectorLoad(Op, DAG);
11908 }
11909
11910 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
11911 MemVT, *Load->getMemOperand())) {
11912 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
11913 return DAG.getMergeValues({Op0, Op1}, DL);
11914 }
11915
11916 return SDValue();
11917}
11918
11919SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
11920 EVT VT = Op.getValueType();
11921 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
11922 VT.getSizeInBits() == 512)
11923 return splitTernaryVectorOp(Op, DAG);
11924
11925 assert(VT.getSizeInBits() == 64);
11926
11927 SDLoc DL(Op);
11928 SDValue Cond = DAG.getFreeze(Op.getOperand(0));
11929
11930 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
11931 SDValue One = DAG.getConstant(1, DL, MVT::i32);
11932
11933 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
11934 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
11935
11936 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
11937 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
11938
11939 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
11940
11941 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
11942 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
11943
11944 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
11945
11946 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
11947 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
11948}
11949
11950// Catch division cases where we can use shortcuts with rcp and rsq
11951// instructions.
11952SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
11953 SelectionDAG &DAG) const {
11954 SDLoc SL(Op);
11955 SDValue LHS = Op.getOperand(0);
11956 SDValue RHS = Op.getOperand(1);
11957 EVT VT = Op.getValueType();
11958 const SDNodeFlags Flags = Op->getFlags();
11959
11960 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
11961
11962 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
11963 // Without !fpmath accuracy information, we can't do more because we don't
11964 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
11965 // f16 is always accurate enough
11966 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
11967 return SDValue();
11968
11969 if (CLHS->isExactlyValue(1.0)) {
11970 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
11971 // the CI documentation has a worst case error of 1 ulp.
11972 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
11973 // use it as long as we aren't trying to use denormals.
11974 //
11975 // v_rcp_f16 and v_rsq_f16 DO support denormals and are accurate to 0.51 ulp.
11976
11977 // 1.0 / sqrt(x) -> rsq(x)
11978
11979 // XXX - Is afn sufficient to do this for f64? The maximum ULP
11980 // error seems really high at 2^29 ULP.
11981 // 1.0 / x -> rcp(x)
11982 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
11983 }
11984
11985 // Same as for 1.0, but expand the sign out of the constant.
11986 if (CLHS->isExactlyValue(-1.0)) {
11987 // -1.0 / x -> rcp (fneg x)
11988 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
11989 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
11990 }
11991 }
11992
11993 // For f16 and bf16 require afn or arcp.
11994 // For f32 require afn.
11995 if (!AllowInaccurateRcp &&
11996 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
11997 return SDValue();
11998
11999 // Turn into multiply by the reciprocal.
12000 // x / y -> x * (1.0 / y)
12001 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12002 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
12003}
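// A standalone sketch of the rewrite above in plain C++ (illustrative only;
// the plain divide stands in for the AMDGPUISD::RCP node):
static float fastFDivSketch(float X, float Y) {
  float Recip = 1.0f / Y; // rcp(y)
  if (X == 1.0f)
    return Recip;         // 1.0 / y -> rcp(y)
  if (X == -1.0f)
    return 1.0f / -Y;     // -1.0 / y -> rcp(-y)
  return X * Recip;       // x / y -> x * (1.0 / y)
}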
12004
12005SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
12006 SelectionDAG &DAG) const {
12007 SDLoc SL(Op);
12008 SDValue X = Op.getOperand(0);
12009 SDValue Y = Op.getOperand(1);
12010 EVT VT = Op.getValueType();
12011 const SDNodeFlags Flags = Op->getFlags();
12012
12013 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
12014 if (!AllowInaccurateDiv)
12015 return SDValue();
12016
12017 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
12018 SDValue One = DAG.getConstantFP(1.0, SL, VT);
12019
12020 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
12021 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12022
12023 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
12024 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12025 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
12026 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
12027 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
12028 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
12029}
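// A standalone sketch of the refinement sequence above (illustrative only;
// std::fma from <cmath> stands in for the ISD::FMA nodes and the plain divide
// for the initial AMDGPUISD::RCP estimate):
static double fastFDiv64Sketch(double X, double Y) {
  double R = 1.0 / Y;                            // initial reciprocal estimate
  R = std::fma(std::fma(-Y, R, 1.0), R, R);      // Newton-Raphson step 1
  R = std::fma(std::fma(-Y, R, 1.0), R, R);      // Newton-Raphson step 2
  double Ret = X * R;                            // quotient estimate
  return std::fma(std::fma(-Y, Ret, X), R, Ret); // correct the residual
}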
12030
12031static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12032 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
12033 SDNodeFlags Flags) {
12034 if (GlueChain->getNumValues() <= 1) {
12035 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
12036 }
12037
12038 assert(GlueChain->getNumValues() == 3);
12039
12040 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12041 switch (Opcode) {
12042 default:
12043 llvm_unreachable("no chain equivalent for opcode");
12044 case ISD::FMUL:
12045 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12046 break;
12047 }
12048
12049 return DAG.getNode(Opcode, SL, VTList,
12050 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
12051 Flags);
12052}
12053
12054static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12055 EVT VT, SDValue A, SDValue B, SDValue C,
12056 SDValue GlueChain, SDNodeFlags Flags) {
12057 if (GlueChain->getNumValues() <= 1) {
12058 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
12059 }
12060
12061 assert(GlueChain->getNumValues() == 3);
12062
12063 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12064 switch (Opcode) {
12065 default:
12066 llvm_unreachable("no chain equivalent for opcode");
12067 case ISD::FMA:
12068 Opcode = AMDGPUISD::FMA_W_CHAIN;
12069 break;
12070 }
12071
12072 return DAG.getNode(Opcode, SL, VTList,
12073 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
12074 Flags);
12075}
12076
12077SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
12078 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12079 return FastLowered;
12080
12081 SDLoc SL(Op);
12082 EVT VT = Op.getValueType();
12083 SDValue LHS = Op.getOperand(0);
12084 SDValue RHS = Op.getOperand(1);
12085
12086 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
12087 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
12088
12089 if (VT == MVT::bf16) {
12090 SDValue ExtDiv =
12091 DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
12092 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
12093 DAG.getTargetConstant(0, SL, MVT::i32));
12094 }
12095
12096 assert(VT == MVT::f16);
12097
12098 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
12099 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
12100 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
12101 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
12102 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12103 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
12104 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12105 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
12106 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
12107 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
12108 // q16.u = opx(V_CVT_F16_F32, q32.u);
12109 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
12110
12111 // We will use ISD::FMA on targets that don't support ISD::FMAD.
12112 unsigned FMADOpCode =
12113 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), MVT::f32) ? ISD::FMA : ISD::FMAD;
12114 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
12115 SDValue Rcp =
12116 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
12117 SDValue Quot =
12118 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
12119 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12120 Op->getFlags());
12121 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
12122 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12123 Op->getFlags());
12124 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
12125 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
12126 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
12127 DAG.getConstant(0xff800000, SL, MVT::i32));
12128 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
12129 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
12130 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
12131 DAG.getTargetConstant(0, SL, MVT::i32));
12132 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
12133 Op->getFlags());
12134}
12135
12136// Faster 2.5 ULP division that does not support denormals.
12137SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
12138 SDNodeFlags Flags = Op->getFlags();
12139 SDLoc SL(Op);
12140 SDValue LHS = Op.getOperand(1);
12141 SDValue RHS = Op.getOperand(2);
12142
12143 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
12144
12145 const APFloat K0Val(0x1p+96f);
12146 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
12147
12148 const APFloat K1Val(0x1p-32f);
12149 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
12150
12151 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12152
12153 EVT SetCCVT =
12154 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
12155
12156 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
12157
12158 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
12159
12160 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
12161
12162 // rcp does not support denormals.
12163 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
12164
12165 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
12166
12167 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
12168}
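// A standalone sketch of the scaling trick above (illustrative only; std::fabs
// is from <cmath> and the plain divide stands in for AMDGPUISD::RCP): huge
// denominators are pre-scaled by 2^-32 so the reciprocal does not flush to
// zero, and the quotient is multiplied by the same factor to compensate.
static float fdivFastSketch(float LHS, float RHS) {
  float Scale = std::fabs(RHS) > 0x1p+96f ? 0x1p-32f : 1.0f;
  float Recip = 1.0f / (RHS * Scale); // rcp of the scaled denominator
  return Scale * (LHS * Recip);
}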
12169
12170// Returns immediate value for setting the F32 denorm mode when using the
12171// S_DENORM_MODE instruction.
12172 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
12173 const SIMachineFunctionInfo *Info,
12174 const GCNSubtarget *ST) {
12175 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
12176 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
12177 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
12178 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
12179}
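// Worked example (illustrative only): with SPDenormMode = FP_DENORM_FLUSH_NONE
// and a function whose FP64/FP16 mode defaults to IEEE, both fields encode as
// 3, so the immediate is 3 | (3 << 2) = 0xF, i.e. denormals stay enabled in
// both halves of the MODE register's denorm field.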
12180
12181SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
12182 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12183 return FastLowered;
12184
12185 // The selection matcher assumes anything with a chain selecting to a
12186 // mayRaiseFPException machine instruction. Since we're introducing a chain
12187 // here, we need to explicitly report nofpexcept for the regular fdiv
12188 // lowering.
12189 SDNodeFlags Flags = Op->getFlags();
12190 Flags.setNoFPExcept(true);
12191
12192 SDLoc SL(Op);
12193 SDValue LHS = Op.getOperand(0);
12194 SDValue RHS = Op.getOperand(1);
12195
12196 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12197
12198 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
12199
12200 SDValue DenominatorScaled =
12201 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
12202 SDValue NumeratorScaled =
12203 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
12204
12205 // Denominator is scaled to not be denormal, so using rcp is ok.
12206 SDValue ApproxRcp =
12207 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
12208 SDValue NegDivScale0 =
12209 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
12210
12211 using namespace AMDGPU::Hwreg;
12212 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
12213 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
12214
12215 const MachineFunction &MF = DAG.getMachineFunction();
12216 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12217 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
12218
12219 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
12220 const bool HasDynamicDenormals =
12221 (DenormMode.Input == DenormalMode::Dynamic) ||
12222 (DenormMode.Output == DenormalMode::Dynamic);
12223
12224 SDValue SavedDenormMode;
12225
12226 if (!PreservesDenormals) {
12227 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
12228 // lowering. The chain dependence is insufficient, and we need glue. We do
12229 // not need the glue variants in a strictfp function.
12230
12231 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12232
12233 SDValue Glue = DAG.getEntryNode();
12234 if (HasDynamicDenormals) {
12235 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
12236 DAG.getVTList(MVT::i32, MVT::Glue),
12237 {BitField, Glue});
12238 SavedDenormMode = SDValue(GetReg, 0);
12239
12240 Glue = DAG.getMergeValues(
12241 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
12242 }
12243
12244 SDNode *EnableDenorm;
12245 if (Subtarget->hasDenormModeInst()) {
12246 const SDValue EnableDenormValue =
12247 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
12248
12249 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
12250 EnableDenormValue)
12251 .getNode();
12252 } else {
12253 const SDValue EnableDenormValue =
12254 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
12255 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12256 {EnableDenormValue, BitField, Glue});
12257 }
12258
12259 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
12260 SDValue(EnableDenorm, 1)};
12261
12262 NegDivScale0 = DAG.getMergeValues(Ops, SL);
12263 }
12264
12265 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
12266 ApproxRcp, One, NegDivScale0, Flags);
12267
12268 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
12269 ApproxRcp, Fma0, Flags);
12270
12271 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
12272 Fma1, Flags);
12273
12274 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
12275 NumeratorScaled, Mul, Flags);
12276
12277 SDValue Fma3 =
12278 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
12279
12280 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
12281 NumeratorScaled, Fma3, Flags);
12282
12283 if (!PreservesDenormals) {
12284 SDNode *DisableDenorm;
12285 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12286 const SDValue DisableDenormValue = getSPDenormModeValue(
12287 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
12288
12289 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12290 DisableDenorm =
12291 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
12292 Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
12293 .getNode();
12294 } else {
12295 assert(HasDynamicDenormals == (bool)SavedDenormMode);
12296 const SDValue DisableDenormValue =
12297 HasDynamicDenormals
12298 ? SavedDenormMode
12299 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
12300
12301 DisableDenorm = DAG.getMachineNode(
12302 AMDGPU::S_SETREG_B32, SL, MVT::Other,
12303 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
12304 }
12305
12306 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
12307 SDValue(DisableDenorm, 0), DAG.getRoot());
12308 DAG.setRoot(OutputChain);
12309 }
12310
12311 SDValue Scale = NumeratorScaled.getValue(1);
12312 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
12313 {Fma4, Fma1, Fma3, Scale}, Flags);
12314
12315 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
12316}
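// A standalone sketch of the refinement sequence above with the scaling and
// denorm-mode switching stripped out (illustrative only; std::fma from <cmath>
// stands in for the FMA nodes, and DIV_SCALE/DIV_FMAS/DIV_FIXUP are modeled as
// the identity):
static float fdiv32Sketch(float LHS, float RHS) {
  float Rcp = 1.0f / RHS;                 // approximate reciprocal
  float Fma0 = std::fma(-RHS, Rcp, 1.0f); // error of the estimate
  float Fma1 = std::fma(Fma0, Rcp, Rcp);  // refined reciprocal
  float Mul = LHS * Fma1;                 // first quotient estimate
  float Fma2 = std::fma(-RHS, Mul, LHS);  // residual
  float Fma3 = std::fma(Fma2, Fma1, Mul); // refined quotient
  float Fma4 = std::fma(-RHS, Fma3, LHS); // final residual
  return std::fma(Fma4, Fma1, Fma3);      // final correction
}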
12317
12318SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
12319 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
12320 return FastLowered;
12321
12322 SDLoc SL(Op);
12323 SDValue X = Op.getOperand(0);
12324 SDValue Y = Op.getOperand(1);
12325
12326 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
12327
12328 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
12329
12330 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
12331
12332 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
12333
12334 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
12335
12336 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
12337
12338 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
12339
12340 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
12341
12342 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
12343
12344 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
12345 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
12346
12347 SDValue Fma4 =
12348 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
12349
12350 SDValue Scale;
12351
12352 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12353 // Work around a hardware bug on SI where the condition output from div_scale
12354 // is not usable.
12355
12356 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
12357
12358 // Figure out which scale to use for div_fmas.
12359 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
12360 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
12361 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
12362 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
12363
12364 SDValue NumHi =
12365 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
12366 SDValue DenHi =
12367 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
12368
12369 SDValue Scale0Hi =
12370 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
12371 SDValue Scale1Hi =
12372 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
12373
12374 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
12375 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
12376 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
12377 } else {
12378 Scale = DivScale1.getValue(1);
12379 }
12380
12381 SDValue Fmas =
12382 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
12383
12384 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
12385}
12386
12387SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
12388 EVT VT = Op.getValueType();
12389
12390 if (VT == MVT::f32)
12391 return LowerFDIV32(Op, DAG);
12392
12393 if (VT == MVT::f64)
12394 return LowerFDIV64(Op, DAG);
12395
12396 if (VT == MVT::f16 || VT == MVT::bf16)
12397 return LowerFDIV16(Op, DAG);
12398
12399 llvm_unreachable("Unexpected type for fdiv");
12400}
12401
12402SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
12403 SDLoc dl(Op);
12404 SDValue Val = Op.getOperand(0);
12405 EVT VT = Val.getValueType();
12406 EVT ResultExpVT = Op->getValueType(1);
12407 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12408
12409 SDValue Mant = DAG.getNode(
12410 ISD::INTRINSIC_WO_CHAIN, dl, VT,
12411 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
12412
12413 SDValue Exp = DAG.getNode(
12414 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
12415 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
12416
12417 if (Subtarget->hasFractBug()) {
12418 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
12419 SDValue Inf =
12420 DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT);
12421
12422 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
12423 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
12424 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
12425 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
12426 }
12427
12428 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
12429 return DAG.getMergeValues({Mant, CastExp}, dl);
12430}
12431
12432SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
12433 SDLoc DL(Op);
12434 StoreSDNode *Store = cast<StoreSDNode>(Op);
12435 EVT VT = Store->getMemoryVT();
12436
12437 if (VT == MVT::i1) {
12438 return DAG.getTruncStore(
12439 Store->getChain(), DL,
12440 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
12441 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
12442 }
12443
12444 assert(VT.isVector() &&
12445 Store->getValue().getValueType().getScalarType() == MVT::i32);
12446
12447 unsigned AS = Store->getAddressSpace();
12448 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
12449 Store->getAlign().value() < VT.getStoreSize() &&
12450 VT.getSizeInBits() > 32) {
12451 return SplitVectorStore(Op, DAG);
12452 }
12453
12454 MachineFunction &MF = DAG.getMachineFunction();
12455 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12456 // If there is a possibility that flat instruction access scratch memory
12457 // then we need to use the same legalization rules we use for private.
12458 if (AS == AMDGPUAS::FLAT_ADDRESS &&
12459 !Subtarget->hasMultiDwordFlatScratchAddressing())
12460 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
12461 ? AMDGPUAS::PRIVATE_ADDRESS
12462 : AMDGPUAS::GLOBAL_ADDRESS;
12463
12464 unsigned NumElements = VT.getVectorNumElements();
12465 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
12466 if (NumElements > 4)
12467 return SplitVectorStore(Op, DAG);
12468 // v3 stores not supported on SI.
12469 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12470 return SplitVectorStore(Op, DAG);
12471
12472 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
12473 VT, *Store->getMemOperand()))
12474 return expandUnalignedStore(Store, DAG);
12475
12476 return SDValue();
12477 }
12478 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
12479 switch (Subtarget->getMaxPrivateElementSize()) {
12480 case 4:
12481 return scalarizeVectorStore(Store, DAG);
12482 case 8:
12483 if (NumElements > 2)
12484 return SplitVectorStore(Op, DAG);
12485 return SDValue();
12486 case 16:
12487 if (NumElements > 4 ||
12488 (NumElements == 3 && !Subtarget->enableFlatScratch()))
12489 return SplitVectorStore(Op, DAG);
12490 return SDValue();
12491 default:
12492 llvm_unreachable("unsupported private_element_size");
12493 }
12494 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
12495 unsigned Fast = 0;
12496 auto Flags = Store->getMemOperand()->getFlags();
12497 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
12498 Store->getAlign(), Flags, &Fast) &&
12499 Fast > 1)
12500 return SDValue();
12501
12502 if (VT.isVector())
12503 return SplitVectorStore(Op, DAG);
12504
12505 return expandUnalignedStore(Store, DAG);
12506 }
12507
12508 // Probably an invalid store. If so we'll end up emitting a selection error.
12509 return SDValue();
12510}
12511
12512// Avoid the full correct expansion for f32 sqrt when promoting from f16.
12513SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
12514 SDLoc SL(Op);
12515 assert(!Subtarget->has16BitInsts());
12516 SDNodeFlags Flags = Op->getFlags();
12517 SDValue Ext =
12518 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
12519
12520 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
12521 SDValue Sqrt =
12522 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
12523
12524 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
12525 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
12526}
12527
12528SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
12529 SDLoc DL(Op);
12530 SDNodeFlags Flags = Op->getFlags();
12531 MVT VT = Op.getValueType().getSimpleVT();
12532 const SDValue X = Op.getOperand(0);
12533
12534 if (allowApproxFunc(DAG, Flags)) {
12535 // Instruction is 1ulp but ignores denormals.
12536 return DAG.getNode(
12537 ISD::INTRINSIC_WO_CHAIN, DL, VT,
12538 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
12539 }
12540
12541 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
12542 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
12543
12544 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
12545
12546 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
12547
12548 SDValue SqrtX =
12549 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
12550
12551 SDValue SqrtS;
12552 if (needsDenormHandlingF32(DAG, X, Flags)) {
12553 SDValue SqrtID =
12554 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
12555 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
12556
12557 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
12558 SDValue SqrtSNextDownInt =
12559 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12560 DAG.getAllOnesConstant(DL, MVT::i32));
12561 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
12562
12563 SDValue NegSqrtSNextDown =
12564 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
12565
12566 SDValue SqrtVP =
12567 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
12568
12569 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12570 DAG.getConstant(1, DL, MVT::i32));
12571 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
12572
12573 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
12574 SDValue SqrtVS =
12575 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
12576
12577 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
12578 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
12579
12580 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
12581 Flags);
12582
12583 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
12584 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
12585 Flags);
12586 } else {
12587 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
12588
12589 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
12590
12591 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
12592 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
12593 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
12594
12595 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
12596 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
12597 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
12598
12599 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
12600 SDValue SqrtD =
12601 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
12602 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
12603 }
12604
12605 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
12606
12607 SDValue ScaledDown =
12608 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
12609
12610 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
12611 SDValue IsZeroOrInf =
12612 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12613 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12614
12615 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
12616}
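// A standalone sketch of the +/-1 ulp correction above (illustrative only;
// std::sqrt stands in for the hardware estimate, and <cmath>, <cstring> and
// <cstdint> are assumed to be reachable through the existing includes): the
// two neighbouring floats of the estimate are tested against their residuals
// and the estimate is nudged down or up accordingly.
static float sqrtNudgeSketch(float X) {
  float S = std::sqrt(X); // stands in for the 1 ulp hardware estimate
  uint32_t Bits;
  std::memcpy(&Bits, &S, sizeof(Bits));
  uint32_t DownBits = Bits - 1, UpBits = Bits + 1;
  float Down, Up;
  std::memcpy(&Down, &DownBits, sizeof(Down));
  std::memcpy(&Up, &UpBits, sizeof(Up));
  float RDown = std::fma(-Down, S, X); // x - down * s
  float RUp = std::fma(-Up, S, X);     // x - up * s
  if (RDown <= 0.0f)
    S = Down; // the estimate was a touch too high
  if (RUp > 0.0f)
    S = Up;   // the estimate was a touch too low
  return S;
}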
12617
12618SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
12619 // For double type, the SQRT and RSQ instructions don't have required
12620 // precision, we apply Goldschmidt's algorithm to improve the result:
12621 //
12622 // y0 = rsq(x)
12623 // g0 = x * y0
12624 // h0 = 0.5 * y0
12625 //
12626 // r0 = 0.5 - h0 * g0
12627 // g1 = g0 * r0 + g0
12628 // h1 = h0 * r0 + h0
12629 //
12630 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
12631 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
12632 // h2 = h1 * r1 + h1
12633 //
12634 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
12635 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
12636 //
12637 // sqrt(x) = g3
12638
12639 SDNodeFlags Flags = Op->getFlags();
12640
12641 SDLoc DL(Op);
12642
12643 SDValue X = Op.getOperand(0);
12644 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
12645
12646 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
12647
12648 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
12649
12650 // Scale up input if it is too small.
12651 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
12652 SDValue ScaleUp =
12653 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
12654 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
12655
12656 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
12657
12658 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
12659
12660 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
12661 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
12662
12663 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
12664 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
12665
12666 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
12667
12668 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
12669
12670 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
12671 SDValue SqrtD0 =
12672 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
12673
12674 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
12675
12676 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
12677 SDValue SqrtD1 =
12678 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
12679
12680 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
12681
12682 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
12683 SDValue ScaleDown =
12684 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
12685 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
12686
12687 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
12688 // with finite only or nsz because rsq(+/-0) = +/-inf
12689
12690 // TODO: Check for DAZ and expand to subnormals
12691 SDValue IsZeroOrInf =
12692 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12693 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12694
12695 // If x is +INF, +0, or -0, use its original value
12696 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
12697 Flags);
12698}
12699
12700SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
12701 SDLoc DL(Op);
12702 EVT VT = Op.getValueType();
12703 SDValue Arg = Op.getOperand(0);
12704 SDValue TrigVal;
12705
12706 // Propagate fast-math flags so that the multiply we introduce can be folded
12707 // if Arg is already the result of a multiply by constant.
12708 auto Flags = Op->getFlags();
12709
12710 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
12711
12712 if (Subtarget->hasTrigReducedRange()) {
12713 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12714 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
12715 } else {
12716 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12717 }
12718
12719 switch (Op.getOpcode()) {
12720 case ISD::FCOS:
12721 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
12722 case ISD::FSIN:
12723 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
12724 default:
12725 llvm_unreachable("Wrong trig opcode");
12726 }
12727}
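// A standalone sketch of the range reduction above (illustrative only;
// std::floor from <cmath> stands in for AMDGPUISD::FRACT): on subtargets with
// the reduced trig range, the argument is scaled by 1/(2*pi) and wrapped into
// [0, 1) before being handed to the hardware sin/cos.
static float trigPreopSketch(float Arg) {
  const float OneOver2Pi = 0.15915494f; // 0.5 * (1 / pi)
  float Scaled = Arg * OneOver2Pi;
  return Scaled - std::floor(Scaled); // fract(x) = x - floor(x)
}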
12728
12729SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
12730 SelectionDAG &DAG) const {
12731 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
12732 assert(AtomicNode->isCompareAndSwap());
12733 unsigned AS = AtomicNode->getAddressSpace();
12734
12735 // No custom lowering required for local address space
12736 if (AS == AMDGPUAS::LOCAL_ADDRESS)
12737 return Op;
12738
12739 // Non-local address space requires custom lowering for atomic compare
12740 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
12741 SDLoc DL(Op);
12742 SDValue ChainIn = Op.getOperand(0);
12743 SDValue Addr = Op.getOperand(1);
12744 SDValue Old = Op.getOperand(2);
12745 SDValue New = Op.getOperand(3);
12746 EVT VT = Op.getValueType();
12747 MVT SimpleVT = VT.getSimpleVT();
12748 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
12749
12750 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
12751 SDValue Ops[] = {ChainIn, Addr, NewOld};
12752
12753 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
12754 Op->getVTList(), Ops, VT,
12755 AtomicNode->getMemOperand());
12756}
12757
12758//===----------------------------------------------------------------------===//
12759// Custom DAG optimizations
12760//===----------------------------------------------------------------------===//
12761
12762SDValue
12763SITargetLowering::performUCharToFloatCombine(SDNode *N,
12764 DAGCombinerInfo &DCI) const {
12765 EVT VT = N->getValueType(0);
12766 EVT ScalarVT = VT.getScalarType();
12767 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
12768 return SDValue();
12769
12770 SelectionDAG &DAG = DCI.DAG;
12771 SDLoc DL(N);
12772
12773 SDValue Src = N->getOperand(0);
12774 EVT SrcVT = Src.getValueType();
12775
12776 // TODO: We could try to match extracting the higher bytes, which would be
12777 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
12778 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
12779 // about in practice.
12780 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
12781 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
12782 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
12783 DCI.AddToWorklist(Cvt.getNode());
12784
12785 // For the f16 case, fold to a cast to f32 and then cast back to f16.
12786 if (ScalarVT != MVT::f32) {
12787 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
12788 DAG.getTargetConstant(0, DL, MVT::i32));
12789 }
12790 return Cvt;
12791 }
12792 }
12793
12794 return SDValue();
12795}
12796
12797SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
12798 DAGCombinerInfo &DCI) const {
12799 SDValue MagnitudeOp = N->getOperand(0);
12800 SDValue SignOp = N->getOperand(1);
12801
12802 // The generic combine for fcopysign + fp cast is too conservative with
12803 // vectors, and also gets confused by the splitting we will perform here, so
12804 // peek through FP casts.
12805 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
12806 SignOp.getOpcode() == ISD::FP_ROUND)
12807 SignOp = SignOp.getOperand(0);
12808
12809 SelectionDAG &DAG = DCI.DAG;
12810 SDLoc DL(N);
12811 EVT SignVT = SignOp.getValueType();
12812
12813 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
12814 // lower half with a copy.
12815 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
12816 EVT MagVT = MagnitudeOp.getValueType();
12817
12818 unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
12819
12820 if (MagVT.getScalarType() == MVT::f64) {
12821 EVT F32VT = MagVT.isVector()
12822 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12823 : MVT::v2f32;
12824
12825 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
12826
12827     SmallVector<SDValue, 8> NewElts;
12828     for (unsigned I = 0; I != NumElts; ++I) {
12829 SDValue MagLo =
12830 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
12831 DAG.getConstant(2 * I, DL, MVT::i32));
12832 SDValue MagHi =
12833 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
12834 DAG.getConstant(2 * I + 1, DL, MVT::i32));
12835
12836 SDValue SignOpElt =
12837 MagVT.isVector()
12838               ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(),
12839                             SignOp, DAG.getConstant(I, DL, MVT::i32))
12840 : SignOp;
12841
12842 SDValue HiOp =
12843 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
12844
12845 SDValue Vector =
12846 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
12847
12848 SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
12849 NewElts.push_back(NewElt);
12850 }
12851
12852 if (NewElts.size() == 1)
12853 return NewElts[0];
12854
12855 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
12856 }
12857
12858 if (SignVT.getScalarType() != MVT::f64)
12859 return SDValue();
12860
12861 // Reduce width of sign operand, we only need the highest bit.
12862 //
12863 // fcopysign f64:x, f64:y ->
12864 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
12865 // TODO: In some cases it might make sense to go all the way to f16.
12866
12867 EVT F32VT = MagVT.isVector()
12868 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12869 : MVT::v2f32;
12870
12871 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
12872
12873 SmallVector<SDValue, 8> F32Signs;
12874 for (unsigned I = 0; I != NumElts; ++I) {
12875 // Take sign from odd elements of cast vector
12876 SDValue SignAsF32 =
12877 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
12878 DAG.getConstant(2 * I + 1, DL, MVT::i32));
12879 F32Signs.push_back(SignAsF32);
12880 }
12881
12882 SDValue NewSign =
12883 NumElts == 1
12884 ? F32Signs.back()
12885           : DAG.getNode(ISD::BUILD_VECTOR, DL,
12886                 EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
12887 F32Signs);
12888
12889 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
12890 NewSign);
12891}
12892
12893// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
12894// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
12895// bits
12896
12897// This is a variant of
12898// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
12899//
12900// The normal DAG combiner will do this, but only if the add has one use since
12901// that would increase the number of instructions.
12902//
12903// This prevents us from seeing a constant offset that can be folded into a
12904// memory instruction's addressing mode. If we know the resulting add offset of
12905// a pointer can be folded into an addressing offset, we can replace the pointer
12906// operand with the add of new constant offset. This eliminates one of the uses,
12907// and may allow the remaining use to also be simplified.
12908//
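// Illustrative example: for a pointer computed as (shl (add x, 16), 2), this
// combine produces (add (shl x, 2), 64); the 64-byte constant offset can then
// be folded into the memory instruction's addressing mode, provided
// isLegalAddressingMode accepts it for the given address space and type.
//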
12909SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
12910 EVT MemVT,
12911 DAGCombinerInfo &DCI) const {
12912 SDValue N0 = N->getOperand(0);
12913 SDValue N1 = N->getOperand(1);
12914
12915 // We only do this to handle cases where it's profitable when there are
12916 // multiple uses of the add, so defer to the standard combine.
12917 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
12918 N0->hasOneUse())
12919 return SDValue();
12920
12921 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
12922 if (!CN1)
12923 return SDValue();
12924
12925 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
12926 if (!CAdd)
12927 return SDValue();
12928
12929 SelectionDAG &DAG = DCI.DAG;
12930
12931 if (N0->getOpcode() == ISD::OR &&
12932 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
12933 return SDValue();
12934
12935 // If the resulting offset is too large, we can't fold it into the
12936 // addressing mode offset.
12937 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
12938 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
12939
12940 AddrMode AM;
12941 AM.HasBaseReg = true;
12942 AM.BaseOffs = Offset.getSExtValue();
12943 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
12944 return SDValue();
12945
12946 SDLoc SL(N);
12947 EVT VT = N->getValueType(0);
12948
12949 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
12950 SDValue COffset = DAG.getConstant(Offset, SL, VT);
12951
12952 SDNodeFlags Flags;
12953 Flags.setNoUnsignedWrap(
12954 N->getFlags().hasNoUnsignedWrap() &&
12955 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
12956
12957 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
12958}
12959
12960 /// MemSDNode::getBasePtr() does not work for intrinsics, whose pointer operand
12961 /// index is offset by the chain and intrinsic ID. Theoretically we would also
12962 /// need to check the specific intrinsic, but they all place the pointer first.
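/// For example, a store's operands are (chain, value, ptr, ...) and a chained
/// intrinsic's are (chain, intrinsic ID, ptr, ...), so both use index 2, while
/// a plain load (chain, ptr, ...) uses index 1.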
12963static unsigned getBasePtrIndex(const MemSDNode *N) {
12964 switch (N->getOpcode()) {
12965 case ISD::STORE:
12966   case ISD::INTRINSIC_W_CHAIN:
12967   case ISD::INTRINSIC_VOID:
12968     return 2;
12969 default:
12970 return 1;
12971 }
12972}
12973
12974SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
12975 DAGCombinerInfo &DCI) const {
12976 SelectionDAG &DAG = DCI.DAG;
12977
12978 unsigned PtrIdx = getBasePtrIndex(N);
12979 SDValue Ptr = N->getOperand(PtrIdx);
12980
12981 // TODO: We could also do this for multiplies.
12982 if (Ptr.getOpcode() == ISD::SHL) {
12983 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
12984 N->getMemoryVT(), DCI);
12985 if (NewPtr) {
12986 SmallVector<SDValue, 8> NewOps(N->ops());
12987
12988 NewOps[PtrIdx] = NewPtr;
12989 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
12990 }
12991 }
12992
12993 return SDValue();
12994}
12995
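// Returns true if a 32-bit bit operation with this constant is trivial: an AND
// with 0 or 0xffffffff, an OR with 0xffffffff or 0, or an XOR with 0 folds to a
// constant or to the other operand.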
12996static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
12997 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
12998 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
12999 (Opc == ISD::XOR && Val == 0);
13000}
13001
13002// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
13003// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
13004// integer combine opportunities since most 64-bit operations are decomposed
13005// this way. TODO: We won't want this for SALU especially if it is an inline
13006// immediate.
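// Illustrative example: (and i64:x, 0x00000000ffffffff) splits into
// (and lo_32(x), 0xffffffff) and (and hi_32(x), 0); the low half then folds to
// lo_32(x) and the high half to zero, so no 64-bit immediate is materialized.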
13007SDValue SITargetLowering::splitBinaryBitConstantOp(
13008 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
13009 const ConstantSDNode *CRHS) const {
13010 uint64_t Val = CRHS->getZExtValue();
13011 uint32_t ValLo = Lo_32(Val);
13012 uint32_t ValHi = Hi_32(Val);
13013 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13014
13015 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
13016        bitOpWithConstantIsReducible(Opc, ValHi)) ||
13017       (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
13018 // We have 64-bit scalar and/or/xor, but do not have vector forms.
13019 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
13020 !CRHS->user_begin()->isDivergent())
13021 return SDValue();
13022
13023 // If we need to materialize a 64-bit immediate, it will be split up later
13024 // anyway. Avoid creating the harder to understand 64-bit immediate
13025 // materialization.
13026 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
13027 }
13028
13029 return SDValue();
13030}
13031
13032 bool llvm::isBoolSGPR(SDValue V) {
13033   if (V.getValueType() != MVT::i1)
13034 return false;
13035 switch (V.getOpcode()) {
13036 default:
13037 break;
13038 case ISD::SETCC:
13039 case ISD::IS_FPCLASS:
13040   case AMDGPUISD::FP_CLASS:
13041     return true;
13042 case ISD::AND:
13043 case ISD::OR:
13044 case ISD::XOR:
13045 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
13046 case ISD::SADDO:
13047 case ISD::UADDO:
13048 case ISD::SSUBO:
13049 case ISD::USUBO:
13050 case ISD::SMULO:
13051 case ISD::UMULO:
13052 return V.getResNo() == 1;
13053   case ISD::INTRINSIC_WO_CHAIN: {
13054     unsigned IntrinsicID = V.getConstantOperandVal(0);
13055 switch (IntrinsicID) {
13056 case Intrinsic::amdgcn_is_shared:
13057 case Intrinsic::amdgcn_is_private:
13058 return true;
13059 default:
13060 return false;
13061 }
13062
13063 return false;
13064 }
13065 }
13066 return false;
13067}
13068
13069// If a constant has all zeroes or all ones within each byte return it.
13070// Otherwise return 0.
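// For example, 0x00ff00ff is returned unchanged since every byte is all zeroes
// or all ones, while 0x00f000ff returns 0 because byte 2 (0xf0) selects only
// part of a byte.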
13071 static uint32_t getConstantPermuteMask(uint32_t C) {
13072   // 0xff for any zero byte in the mask
13073 uint32_t ZeroByteMask = 0;
13074 if (!(C & 0x000000ff))
13075 ZeroByteMask |= 0x000000ff;
13076 if (!(C & 0x0000ff00))
13077 ZeroByteMask |= 0x0000ff00;
13078 if (!(C & 0x00ff0000))
13079 ZeroByteMask |= 0x00ff0000;
13080 if (!(C & 0xff000000))
13081 ZeroByteMask |= 0xff000000;
13082 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
13083 if ((NonZeroByteMask & C) != NonZeroByteMask)
13084 return 0; // Partial bytes selected.
13085 return C;
13086}
13087
13088// Check if a node selects whole bytes from its operand 0 starting at a byte
13089// boundary while masking the rest. Returns select mask as in the v_perm_b32
13090 // or ~0 if it did not succeed.
13091// Note byte select encoding:
13092// value 0-3 selects corresponding source byte;
13093// value 0xc selects zero;
13094// value 0xff selects 0xff.
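// For example, (and x, 0x0000ffff) yields the mask 0x0c0c0100 (keep bytes 1:0
// of x, zero the upper two bytes), and (shl x, 8) yields 0x0201000c (source
// bytes shifted up by one, byte 0 becomes zero).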
13095 static uint32_t getPermuteMask(SDValue V) {
13096   assert(V.getValueSizeInBits() == 32);
13097
13098 if (V.getNumOperands() != 2)
13099 return ~0;
13100
13101 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
13102 if (!N1)
13103 return ~0;
13104
13105 uint32_t C = N1->getZExtValue();
13106
13107 switch (V.getOpcode()) {
13108 default:
13109 break;
13110 case ISD::AND:
13111 if (uint32_t ConstMask = getConstantPermuteMask(C))
13112 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13113 break;
13114
13115 case ISD::OR:
13116 if (uint32_t ConstMask = getConstantPermuteMask(C))
13117 return (0x03020100 & ~ConstMask) | ConstMask;
13118 break;
13119
13120 case ISD::SHL:
13121 if (C % 8)
13122 return ~0;
13123
13124 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
13125
13126 case ISD::SRL:
13127 if (C % 8)
13128 return ~0;
13129
13130 return uint32_t(0x0c0c0c0c03020100ull >> C);
13131 }
13132
13133 return ~0;
13134}
13135
13136SDValue SITargetLowering::performAndCombine(SDNode *N,
13137 DAGCombinerInfo &DCI) const {
13138 if (DCI.isBeforeLegalize())
13139 return SDValue();
13140
13141 SelectionDAG &DAG = DCI.DAG;
13142 EVT VT = N->getValueType(0);
13143 SDValue LHS = N->getOperand(0);
13144 SDValue RHS = N->getOperand(1);
13145
13146 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
13147 if (VT == MVT::i64 && CRHS) {
13148 if (SDValue Split =
13149 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
13150 return Split;
13151 }
13152
13153 if (CRHS && VT == MVT::i32) {
13154 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
13155 // nb = number of trailing zeroes in mask
13156 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
13157 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
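    // Illustrative example: (and (srl x, 8), 0xff00) has Bits = 8, Shift = 8
    // and nb = 8, so it becomes (shl (AssertZext i8 (bfe x, 16, 8)), 8).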
13158 uint64_t Mask = CRHS->getZExtValue();
13159 unsigned Bits = llvm::popcount(Mask);
13160 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
13161 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
13162 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
13163 unsigned Shift = CShift->getZExtValue();
13164 unsigned NB = CRHS->getAPIntValue().countr_zero();
13165 unsigned Offset = NB + Shift;
13166 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
13167 SDLoc SL(N);
13168 SDValue BFE =
13169 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
13170 DAG.getConstant(Offset, SL, MVT::i32),
13171 DAG.getConstant(Bits, SL, MVT::i32));
13172 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
13173 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
13174 DAG.getValueType(NarrowVT));
13175 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
13176 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
13177 return Shl;
13178 }
13179 }
13180 }
13181
13182 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13183 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
13184 isa<ConstantSDNode>(LHS.getOperand(2))) {
13185 uint32_t Sel = getConstantPermuteMask(Mask);
13186 if (!Sel)
13187 return SDValue();
13188
13189 // Select 0xc for all zero bytes
13190 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13191 SDLoc DL(N);
13192 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13193 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13194 }
13195 }
13196
13197 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
13198 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
13199 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
13200 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13201 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
13202
13203 SDValue X = LHS.getOperand(0);
13204 SDValue Y = RHS.getOperand(0);
13205 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
13206 !isTypeLegal(X.getValueType()))
13207 return SDValue();
13208
13209 if (LCC == ISD::SETO) {
13210 if (X != LHS.getOperand(1))
13211 return SDValue();
13212
13213 if (RCC == ISD::SETUNE) {
13214 const ConstantFPSDNode *C1 =
13215 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
13216 if (!C1 || !C1->isInfinity() || C1->isNegative())
13217 return SDValue();
13218
13219         const uint32_t Mask = SIInstrFlags::N_NORMAL |
13220                               SIInstrFlags::P_NORMAL | SIInstrFlags::N_ZERO |
13221                               SIInstrFlags::P_ZERO | SIInstrFlags::N_SUBNORMAL |
13222                               SIInstrFlags::P_SUBNORMAL;
13223 
13224         static_assert(
13225             ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
13226                 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
13227                 0x3ff) == Mask,
13228             "mask not equal");
13229
13230 SDLoc DL(N);
13231 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
13232 DAG.getConstant(Mask, DL, MVT::i32));
13233 }
13234 }
13235 }
13236
13237 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
13238 std::swap(LHS, RHS);
13239
13240 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13241 RHS.hasOneUse()) {
13242 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13243 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
13244 // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
13245 // | n_nan)
13246 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13247 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
13248 (RHS.getOperand(0) == LHS.getOperand(0) &&
13249 LHS.getOperand(0) == LHS.getOperand(1))) {
13250 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
13251 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
13252 : Mask->getZExtValue() & OrdMask;
13253
13254 SDLoc DL(N);
13255 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
13256 DAG.getConstant(NewMask, DL, MVT::i32));
13257 }
13258 }
13259
13260 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
13261 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
13262 // and x, (sext cc from i1) => select cc, x, 0
13263 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
13264 std::swap(LHS, RHS);
13265 if (isBoolSGPR(RHS.getOperand(0)))
13266 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
13267 DAG.getConstant(0, SDLoc(N), MVT::i32));
13268 }
13269
13270 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13271 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13272 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13273 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13274 uint32_t LHSMask = getPermuteMask(LHS);
13275 uint32_t RHSMask = getPermuteMask(RHS);
13276 if (LHSMask != ~0u && RHSMask != ~0u) {
13277 // Canonicalize the expression in an attempt to have fewer unique masks
13278 // and therefore fewer registers used to hold the masks.
13279 if (LHSMask > RHSMask) {
13280 std::swap(LHSMask, RHSMask);
13281 std::swap(LHS, RHS);
13282 }
13283
13284 // Select 0xc for each lane used from source operand. Zero has 0xc mask
13285 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
13286 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13287 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13288
13289       // Check if we need to combine values from two sources within a byte.
13290 if (!(LHSUsedLanes & RHSUsedLanes) &&
13291 // If we select high and lower word keep it for SDWA.
13292 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
13293 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13294         // Each byte in each mask is either a selector value 0-3, or has
13295         // higher bits set: 0xff selects 0xff and 0x0c selects zero. If 0x0c
13296         // is in either mask, the result byte must be 0x0c. Otherwise the
13297         // mask byte that is not 0xff wins. ANDing both masks gives the
13298         // correct result, except that 0x0c must be restored to exactly 0x0c.
13299 uint32_t Mask = LHSMask & RHSMask;
13300 for (unsigned I = 0; I < 32; I += 8) {
13301 uint32_t ByteSel = 0xff << I;
13302 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13303 Mask &= (0x0c << I) & 0xffffffff;
13304 }
13305
13306 // Add 4 to each active LHS lane. It will not affect any existing 0xff
13307 // or 0x0c.
13308 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
13309 SDLoc DL(N);
13310
13311 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13312 RHS.getOperand(0),
13313 DAG.getConstant(Sel, DL, MVT::i32));
13314 }
13315 }
13316 }
13317
13318 return SDValue();
13319}
13320
13321// A key component of v_perm is a mapping between byte position of the src
13322// operands, and the byte position of the dest. To provide such, we need: 1. the
13323// node that provides x byte of the dest of the OR, and 2. the byte of the node
13324// used to provide that x byte. calculateByteProvider finds which node provides
13325// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
13326 // and finds an ultimate src and byte position. For example, the supported
13327// LoadCombine pattern for vector loads is as follows
13328// t1
13329// or
13330// / \
13331// t2 t3
13332// zext shl
13333// | | \
13334// t4 t5 16
13335// or anyext
13336// / \ |
13337// t6 t7 t8
13338// srl shl or
13339// / | / \ / \
13340// t9 t10 t11 t12 t13 t14
13341// trunc* 8 trunc* 8 and and
13342// | | / | | \
13343// t15 t16 t17 t18 t19 t20
13344// trunc* 255 srl -256
13345// | / \
13346// t15 t15 16
13347//
13348// *In this example, the truncs are from i32->i16
13349//
13350// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
13351// respectively. calculateSrcByte would find (given node) -> ultimate src &
13352 // byte position: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
13353// After finding the mapping, we can combine the tree into vperm t15, t16,
13354// 0x05000407
13355
13356// Find the source and byte position from a node.
13357// \p DestByte is the byte position of the dest of the or that the src
13358// ultimately provides. \p SrcIndex is the byte of the src that maps to this
13359// dest of the or byte. \p Depth tracks how many recursive iterations we have
13360// performed.
13361static const std::optional<ByteProvider<SDValue>>
13362calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
13363 unsigned Depth = 0) {
13364 // We may need to recursively traverse a series of SRLs
13365 if (Depth >= 6)
13366 return std::nullopt;
13367
13368 if (Op.getValueSizeInBits() < 8)
13369 return std::nullopt;
13370
13371 if (Op.getValueType().isVector())
13372 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13373
13374 switch (Op->getOpcode()) {
13375 case ISD::TRUNCATE: {
13376 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13377 }
13378
13379 case ISD::SIGN_EXTEND:
13380 case ISD::ZERO_EXTEND:
13381   case ISD::SIGN_EXTEND_INREG: {
13382     SDValue NarrowOp = Op->getOperand(0);
13383 auto NarrowVT = NarrowOp.getValueType();
13384 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
13385 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13386 NarrowVT = VTSign->getVT();
13387 }
13388 if (!NarrowVT.isByteSized())
13389 return std::nullopt;
13390 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
13391
13392 if (SrcIndex >= NarrowByteWidth)
13393 return std::nullopt;
13394 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13395 }
13396
13397 case ISD::SRA:
13398 case ISD::SRL: {
13399 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13400 if (!ShiftOp)
13401 return std::nullopt;
13402
13403 uint64_t BitShift = ShiftOp->getZExtValue();
13404
13405 if (BitShift % 8 != 0)
13406 return std::nullopt;
13407
13408 SrcIndex += BitShift / 8;
13409
13410 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13411 }
13412
13413 default: {
13414 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13415 }
13416 }
13417 llvm_unreachable("fully handled switch");
13418}
13419
13420// For a byte position in the result of an Or, traverse the tree and find the
13421// node (and the byte of the node) which ultimately provides this {Or,
13422// BytePosition}. \p Op is the operand we are currently examining. \p Index is
13423// the byte position of the Op that corresponds with the originally requested
13424// byte of the Or \p Depth tracks how many recursive iterations we have
13425// performed. \p StartingIndex is the originally requested byte of the Or
13426static const std::optional<ByteProvider<SDValue>>
13427calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
13428 unsigned StartingIndex = 0) {
13429 // Finding Src tree of RHS of or typically requires at least 1 additional
13430 // depth
13431 if (Depth > 6)
13432 return std::nullopt;
13433
13434 unsigned BitWidth = Op.getScalarValueSizeInBits();
13435 if (BitWidth % 8 != 0)
13436 return std::nullopt;
13437 if (Index > BitWidth / 8 - 1)
13438 return std::nullopt;
13439
13440 bool IsVec = Op.getValueType().isVector();
13441 switch (Op.getOpcode()) {
13442 case ISD::OR: {
13443 if (IsVec)
13444 return std::nullopt;
13445
13446 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
13447 StartingIndex);
13448 if (!RHS)
13449 return std::nullopt;
13450 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13451 StartingIndex);
13452 if (!LHS)
13453 return std::nullopt;
13454 // A well formed Or will have two ByteProviders for each byte, one of which
13455 // is constant zero
13456 if (!LHS->isConstantZero() && !RHS->isConstantZero())
13457 return std::nullopt;
13458 if (!LHS || LHS->isConstantZero())
13459 return RHS;
13460 if (!RHS || RHS->isConstantZero())
13461 return LHS;
13462 return std::nullopt;
13463 }
13464
13465 case ISD::AND: {
13466 if (IsVec)
13467 return std::nullopt;
13468
13469 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13470 if (!BitMaskOp)
13471 return std::nullopt;
13472
13473 uint32_t BitMask = BitMaskOp->getZExtValue();
13474 // Bits we expect for our StartingIndex
13475 uint32_t IndexMask = 0xFF << (Index * 8);
13476
13477 if ((IndexMask & BitMask) != IndexMask) {
13478 // If the result of the and partially provides the byte, then it
13479 // is not well formatted
13480 if (IndexMask & BitMask)
13481 return std::nullopt;
13482       return ByteProvider<SDValue>::getConstantZero();
13483     }
13484
13485 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
13486 }
13487
13488 case ISD::FSHR: {
13489 if (IsVec)
13490 return std::nullopt;
13491
13492 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
13493 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13494 if (!ShiftOp || Op.getValueType().isVector())
13495 return std::nullopt;
13496
13497 uint64_t BitsProvided = Op.getValueSizeInBits();
13498 if (BitsProvided % 8 != 0)
13499 return std::nullopt;
13500
13501 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13502 if (BitShift % 8)
13503 return std::nullopt;
13504
13505 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13506 uint64_t ByteShift = BitShift / 8;
13507
13508 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13509 uint64_t BytesProvided = BitsProvided / 8;
13510 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13511 NewIndex %= BytesProvided;
13512 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
13513 }
13514
13515 case ISD::SRA:
13516 case ISD::SRL: {
13517 if (IsVec)
13518 return std::nullopt;
13519
13520 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13521 if (!ShiftOp)
13522 return std::nullopt;
13523
13524 uint64_t BitShift = ShiftOp->getZExtValue();
13525 if (BitShift % 8)
13526 return std::nullopt;
13527
13528 auto BitsProvided = Op.getScalarValueSizeInBits();
13529 if (BitsProvided % 8 != 0)
13530 return std::nullopt;
13531
13532 uint64_t BytesProvided = BitsProvided / 8;
13533 uint64_t ByteShift = BitShift / 8;
13534 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
13535 // If the byte we are trying to provide (as tracked by index) falls in this
13536 // range, then the SRL provides the byte. The byte of interest of the src of
13537 // the SRL is Index + ByteShift
13538 return BytesProvided - ByteShift > Index
13539 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
13540 Index + ByteShift)
13541                : ByteProvider<SDValue>::getConstantZero();
13542   }
13543
13544 case ISD::SHL: {
13545 if (IsVec)
13546 return std::nullopt;
13547
13548 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13549 if (!ShiftOp)
13550 return std::nullopt;
13551
13552 uint64_t BitShift = ShiftOp->getZExtValue();
13553 if (BitShift % 8 != 0)
13554 return std::nullopt;
13555 uint64_t ByteShift = BitShift / 8;
13556
13557 // If we are shifting by an amount greater than (or equal to)
13558 // the index we are trying to provide, then it provides 0s. If not,
13559     // then this byte is not definitively 0, and the corresponding byte
13560 // of interest is Index - ByteShift of the src
13561 return Index < ByteShift
13562                ? ByteProvider<SDValue>::getConstantZero()
13563                : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
13564 Depth + 1, StartingIndex);
13565 }
13566 case ISD::ANY_EXTEND:
13567 case ISD::SIGN_EXTEND:
13568 case ISD::ZERO_EXTEND:
13569   case ISD::SIGN_EXTEND_INREG:
13570   case ISD::AssertZext:
13571 case ISD::AssertSext: {
13572 if (IsVec)
13573 return std::nullopt;
13574
13575 SDValue NarrowOp = Op->getOperand(0);
13576 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
13577 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
13578 Op->getOpcode() == ISD::AssertZext ||
13579 Op->getOpcode() == ISD::AssertSext) {
13580 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13581 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13582 }
13583 if (NarrowBitWidth % 8 != 0)
13584 return std::nullopt;
13585 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13586
13587 if (Index >= NarrowByteWidth)
13588 return Op.getOpcode() == ISD::ZERO_EXTEND
13589 ? std::optional<ByteProvider<SDValue>>(
13590                        ByteProvider<SDValue>::getConstantZero())
13591                  : std::nullopt;
13592 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
13593 }
13594
13595 case ISD::TRUNCATE: {
13596 if (IsVec)
13597 return std::nullopt;
13598
13599 uint64_t NarrowByteWidth = BitWidth / 8;
13600
13601 if (NarrowByteWidth >= Index) {
13602 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13603 StartingIndex);
13604 }
13605
13606 return std::nullopt;
13607 }
13608
13609 case ISD::CopyFromReg: {
13610 if (BitWidth / 8 > Index)
13611 return calculateSrcByte(Op, StartingIndex, Index);
13612
13613 return std::nullopt;
13614 }
13615
13616 case ISD::LOAD: {
13617 auto *L = cast<LoadSDNode>(Op.getNode());
13618
13619 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13620 if (NarrowBitWidth % 8 != 0)
13621 return std::nullopt;
13622 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13623
13624     // If the width of the load does not reach the byte we are trying to provide for
13625 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
13626 // question
13627 if (Index >= NarrowByteWidth) {
13628 return L->getExtensionType() == ISD::ZEXTLOAD
13629 ? std::optional<ByteProvider<SDValue>>(
13630                        ByteProvider<SDValue>::getConstantZero())
13631                  : std::nullopt;
13632 }
13633
13634 if (NarrowByteWidth > Index) {
13635 return calculateSrcByte(Op, StartingIndex, Index);
13636 }
13637
13638 return std::nullopt;
13639 }
13640
13641 case ISD::BSWAP: {
13642 if (IsVec)
13643 return std::nullopt;
13644
13645 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
13646 Depth + 1, StartingIndex);
13647 }
13648
13649   case ISD::EXTRACT_VECTOR_ELT: {
13650     auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13651 if (!IdxOp)
13652 return std::nullopt;
13653 auto VecIdx = IdxOp->getZExtValue();
13654 auto ScalarSize = Op.getScalarValueSizeInBits();
13655 if (ScalarSize < 32)
13656 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13657 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
13658 StartingIndex, Index);
13659 }
13660
13661 case AMDGPUISD::PERM: {
13662 if (IsVec)
13663 return std::nullopt;
13664
13665 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13666 if (!PermMask)
13667 return std::nullopt;
13668
13669 auto IdxMask =
13670 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13671 if (IdxMask > 0x07 && IdxMask != 0x0c)
13672 return std::nullopt;
13673
13674 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13675 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13676
13677 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
13678                            : ByteProvider<SDValue>(
13679                                  ByteProvider<SDValue>::getConstantZero());
13680   }
13681
13682 default: {
13683 return std::nullopt;
13684 }
13685 }
13686
13687 llvm_unreachable("fully handled switch");
13688}
13689
13690 // Returns true if the Operand is a scalar extended or loaded from a 16-bit value
13691static bool isExtendedFrom16Bits(SDValue &Operand) {
13692
13693 switch (Operand.getOpcode()) {
13694 case ISD::ANY_EXTEND:
13695 case ISD::SIGN_EXTEND:
13696 case ISD::ZERO_EXTEND: {
13697 auto OpVT = Operand.getOperand(0).getValueType();
13698 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
13699 }
13700 case ISD::LOAD: {
13701 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
13702 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
13703 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
13704 ExtType == ISD::EXTLOAD) {
13705 auto MemVT = L->getMemoryVT();
13706 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
13707 }
13708 return L->getMemoryVT().getSizeInBits() == 16;
13709 }
13710 default:
13711 return false;
13712 }
13713}
13714
13715 // Returns true if the mask matches consecutive bytes, and the first byte
13716 // begins at an even (16-bit aligned) byte offset from the 0th byte
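// For example, mask 0x0504 addresses the 16-bit half formed by source bytes 4
// and 5 (consecutive, even starting offset), while 0x0201 does not because it
// starts at the odd byte 1.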
13717static bool addresses16Bits(int Mask) {
13718 int Low8 = Mask & 0xff;
13719 int Hi8 = (Mask & 0xff00) >> 8;
13720
13721 assert(Low8 < 8 && Hi8 < 8);
13722 // Are the bytes contiguous in the order of increasing addresses.
13723 bool IsConsecutive = (Hi8 - Low8 == 1);
13724 // Is the first byte at location that is aligned for 16 bit instructions.
13725 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
13726 // In this case, we still need code to extract the 16 bit operand, so it
13727 // is better to use i8 v_perm
13728 bool Is16Aligned = !(Low8 % 2);
13729
13730 return IsConsecutive && Is16Aligned;
13731}
13732
13733// Do not lower into v_perm if the operands are actually 16 bit
13734// and the selected bits (based on PermMask) correspond with two
13735// easily addressable 16 bit operands.
13736 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
13737                                 SDValue &OtherOp) {
13738 int Low16 = PermMask & 0xffff;
13739 int Hi16 = (PermMask & 0xffff0000) >> 16;
13740
13741 auto TempOp = peekThroughBitcasts(Op);
13742 auto TempOtherOp = peekThroughBitcasts(OtherOp);
13743
13744 auto OpIs16Bit =
13745 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
13746 if (!OpIs16Bit)
13747 return true;
13748
13749 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13750 isExtendedFrom16Bits(TempOtherOp);
13751 if (!OtherOpIs16Bit)
13752 return true;
13753
13754 // Do we cleanly address both
13755 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
13756}
13757
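// Returns the 32-bit dword at dword index \p DWordOffset of \p Src as an i32,
// extracting and shifting through vector or wide scalar types as needed.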
13758 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
13759                                   unsigned DWordOffset) {
13760 SDValue Ret;
13761
13762 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
13763 // ByteProvider must be at least 8 bits
13764 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
13765
13766 if (TypeSize <= 32)
13767 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
13768
13769 if (Src.getValueType().isVector()) {
13770 auto ScalarTySize = Src.getScalarValueSizeInBits();
13771 auto ScalarTy = Src.getValueType().getScalarType();
13772 if (ScalarTySize == 32) {
13773 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
13774 DAG.getConstant(DWordOffset, SL, MVT::i32));
13775 }
13776 if (ScalarTySize > 32) {
13777 Ret = DAG.getNode(
13778 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
13779 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
13780 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
13781 if (ShiftVal)
13782 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
13783 DAG.getConstant(ShiftVal, SL, MVT::i32));
13784 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13785 }
13786
13787 assert(ScalarTySize < 32);
13788 auto NumElements = TypeSize / ScalarTySize;
13789 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
13790 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
13791 auto NumElementsIn32 = 32 / ScalarTySize;
13792 auto NumAvailElements = DWordOffset < Trunc32Elements
13793 ? NumElementsIn32
13794 : NumElements - NormalizedTrunc;
13795
13796     SmallVector<SDValue, 4> VecSrcs;
13797     DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
13798 NumAvailElements);
13799
13800 Ret = DAG.getBuildVector(
13801 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
13802 VecSrcs);
13803 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13804 }
13805
13806 /// Scalar Type
13807 auto ShiftVal = 32 * DWordOffset;
13808 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
13809 DAG.getConstant(ShiftVal, SL, MVT::i32));
13810 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13811}
13812
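// Attempt to rewrite an i32 OR whose bytes are produced as described by
// calculateByteProvider above into a single AMDGPUISD::PERM of at most two
// 32-bit source dwords.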
13813 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
13814   SelectionDAG &DAG = DCI.DAG;
13815 [[maybe_unused]] EVT VT = N->getValueType(0);
13816   SmallVector<ByteProvider<SDValue>, 8> PermNodes;
13817 
13818 // VT is known to be MVT::i32, so we need to provide 4 bytes.
13819 assert(VT == MVT::i32);
13820 for (int i = 0; i < 4; i++) {
13821 // Find the ByteProvider that provides the ith byte of the result of OR
13822 std::optional<ByteProvider<SDValue>> P =
13823 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
13824 // TODO support constantZero
13825 if (!P || P->isConstantZero())
13826 return SDValue();
13827
13828 PermNodes.push_back(*P);
13829 }
13830 if (PermNodes.size() != 4)
13831 return SDValue();
13832
13833 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
13834 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
13835 uint64_t PermMask = 0x00000000;
13836 for (size_t i = 0; i < PermNodes.size(); i++) {
13837 auto PermOp = PermNodes[i];
13838 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
13839 // by sizeof(Src2) = 4
13840 int SrcByteAdjust = 4;
13841
13842 // If the Src uses a byte from a different DWORD, then it corresponds
13843     // with a different source
13844 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
13845 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
13846 if (SecondSrc)
13847 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
13848 ((PermOp.SrcOffset / 4) != SecondSrc->second))
13849 return SDValue();
13850
13851 // Set the index of the second distinct Src node
13852 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
13853 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
13854 SrcByteAdjust = 0;
13855 }
13856 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
13857     assert(!DAG.getDataLayout().isBigEndian());
13858     PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
13859 }
13860 SDLoc DL(N);
13861 SDValue Op = *PermNodes[FirstSrc.first].Src;
13862 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
13863 assert(Op.getValueSizeInBits() == 32);
13864
13865 // Check that we are not just extracting the bytes in order from an op
13866 if (!SecondSrc) {
13867 int Low16 = PermMask & 0xffff;
13868 int Hi16 = (PermMask & 0xffff0000) >> 16;
13869
13870 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
13871 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
13872
13873 // The perm op would really just produce Op. So combine into Op
13874 if (WellFormedLow && WellFormedHi)
13875 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
13876 }
13877
13878 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
13879
13880 if (SecondSrc) {
13881 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
13882 assert(OtherOp.getValueSizeInBits() == 32);
13883 }
13884
13885 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
13886
13887 assert(Op.getValueType().isByteSized() &&
13888 OtherOp.getValueType().isByteSized());
13889
13890 // If the ultimate src is less than 32 bits, then we will only be
13891 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
13892 // CalculateByteProvider would not have returned Op as source if we
13893 // used a byte that is outside its ValueType. Thus, we are free to
13894 // ANY_EXTEND as the extended bits are dont-cares.
13895 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
13896 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
13897
13898 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
13899 DAG.getConstant(PermMask, DL, MVT::i32));
13900 }
13901 return SDValue();
13902}
13903
13904SDValue SITargetLowering::performOrCombine(SDNode *N,
13905 DAGCombinerInfo &DCI) const {
13906 SelectionDAG &DAG = DCI.DAG;
13907 SDValue LHS = N->getOperand(0);
13908 SDValue RHS = N->getOperand(1);
13909
13910 EVT VT = N->getValueType(0);
13911 if (VT == MVT::i1) {
13912 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
13913 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13914 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
13915 SDValue Src = LHS.getOperand(0);
13916 if (Src != RHS.getOperand(0))
13917 return SDValue();
13918
13919 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
13920 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13921 if (!CLHS || !CRHS)
13922 return SDValue();
13923
13924 // Only 10 bits are used.
13925 static const uint32_t MaxMask = 0x3ff;
13926
13927 uint32_t NewMask =
13928 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
13929 SDLoc DL(N);
13930 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
13931 DAG.getConstant(NewMask, DL, MVT::i32));
13932 }
13933
13934 return SDValue();
13935 }
13936
13937 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13938   if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
13939       LHS.getOpcode() == AMDGPUISD::PERM &&
13940 isa<ConstantSDNode>(LHS.getOperand(2))) {
13941 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
13942 if (!Sel)
13943 return SDValue();
13944
13945 Sel |= LHS.getConstantOperandVal(2);
13946 SDLoc DL(N);
13947 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13948 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13949 }
13950
13951 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13952 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13953 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13954 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13955
13956 // If all the uses of an or need to extract the individual elements, do not
13957 // attempt to lower into v_perm
13958 auto usesCombinedOperand = [](SDNode *OrUse) {
13959 // If we have any non-vectorized use, then it is a candidate for v_perm
13960 if (OrUse->getOpcode() != ISD::BITCAST ||
13961 !OrUse->getValueType(0).isVector())
13962 return true;
13963
13964 // If we have any non-vectorized use, then it is a candidate for v_perm
13965 for (auto *VUser : OrUse->users()) {
13966 if (!VUser->getValueType(0).isVector())
13967 return true;
13968
13969 // If the use of a vector is a store, then combining via a v_perm
13970 // is beneficial.
13971 // TODO -- whitelist more uses
13972 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
13973 if (VUser->getOpcode() == VectorwiseOp)
13974 return true;
13975 }
13976 return false;
13977 };
13978
13979 if (!any_of(N->users(), usesCombinedOperand))
13980 return SDValue();
13981
13982 uint32_t LHSMask = getPermuteMask(LHS);
13983 uint32_t RHSMask = getPermuteMask(RHS);
13984
13985 if (LHSMask != ~0u && RHSMask != ~0u) {
13986 // Canonicalize the expression in an attempt to have fewer unique masks
13987 // and therefore fewer registers used to hold the masks.
13988 if (LHSMask > RHSMask) {
13989 std::swap(LHSMask, RHSMask);
13990 std::swap(LHS, RHS);
13991 }
13992
13993 // Select 0xc for each lane used from source operand. Zero has 0xc mask
13994 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
13995 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13996 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13997
13998     // Check if we need to combine values from two sources within a byte.
13999 if (!(LHSUsedLanes & RHSUsedLanes) &&
14000 // If we select high and lower word keep it for SDWA.
14001 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
14002 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14003 // Kill zero bytes selected by other mask. Zero value is 0xc.
14004 LHSMask &= ~RHSUsedLanes;
14005 RHSMask &= ~LHSUsedLanes;
14006 // Add 4 to each active LHS lane
14007 LHSMask |= LHSUsedLanes & 0x04040404;
14008 // Combine masks
14009 uint32_t Sel = LHSMask | RHSMask;
14010 SDLoc DL(N);
14011
14012 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14013 RHS.getOperand(0),
14014 DAG.getConstant(Sel, DL, MVT::i32));
14015 }
14016 }
14017 if (LHSMask == ~0u || RHSMask == ~0u) {
14018 if (SDValue Perm = matchPERM(N, DCI))
14019 return Perm;
14020 }
14021 }
14022
14023 // Detect identity v2i32 OR and replace with identity source node.
14024 // Specifically an Or that has operands constructed from the same source node
14025 // via extract_vector_elt and build_vector. I.E.
14026 // v2i32 or(
14027 // v2i32 build_vector(
14028 // i32 extract_elt(%IdentitySrc, 0),
14029 // i32 0
14030 // ),
14031 // v2i32 build_vector(
14032 // i32 0,
14033 // i32 extract_elt(%IdentitySrc, 1)
14034 // ) )
14035 // =>
14036 // v2i32 %IdentitySrc
14037
14038 if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
14039 RHS->getOpcode() == ISD::BUILD_VECTOR) {
14040
14041 ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
14042 ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
14043
14044 // Test for and normalise build vectors.
14045 if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
14046
14047 // Get the extract_vector_element operands.
14048 SDValue LEVE = LHS->getOperand(0);
14049 SDValue REVE = RHS->getOperand(1);
14050
14051 if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
14052         REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
14053       // Check that different elements from the same vector are
14054 // extracted.
14055 if (LEVE->getOperand(0) == REVE->getOperand(0) &&
14056 LEVE->getOperand(1) != REVE->getOperand(1)) {
14057 SDValue IdentitySrc = LEVE.getOperand(0);
14058 return IdentitySrc;
14059 }
14060 }
14061 }
14062 }
14063
14064 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14065 return SDValue();
14066
14067 // TODO: This could be a generic combine with a predicate for extracting the
14068 // high half of an integer being free.
14069
14070 // (or i64:x, (zero_extend i32:y)) ->
14071 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
14072 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
14073 RHS.getOpcode() != ISD::ZERO_EXTEND)
14074 std::swap(LHS, RHS);
14075
14076 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
14077 SDValue ExtSrc = RHS.getOperand(0);
14078 EVT SrcVT = ExtSrc.getValueType();
14079 if (SrcVT == MVT::i32) {
14080 SDLoc SL(N);
14081 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
14082 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
14083
14084 DCI.AddToWorklist(LowOr.getNode());
14085 DCI.AddToWorklist(HiBits.getNode());
14086
14087 SDValue Vec =
14088 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
14089 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
14090 }
14091 }
14092
14093 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
14094 if (CRHS) {
14095 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
14096 N->getOperand(0), CRHS))
14097 return Split;
14098 }
14099
14100 return SDValue();
14101}
14102
14103SDValue SITargetLowering::performXorCombine(SDNode *N,
14104 DAGCombinerInfo &DCI) const {
14105 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
14106 return RV;
14107
14108 SDValue LHS = N->getOperand(0);
14109 SDValue RHS = N->getOperand(1);
14110
14111 const ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
14112 SelectionDAG &DAG = DCI.DAG;
14113
14114 EVT VT = N->getValueType(0);
14115 if (CRHS && VT == MVT::i64) {
14116 if (SDValue Split =
14117 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
14118 return Split;
14119 }
14120
14121 // v2i32 (xor (vselect cc, x, y), K) ->
14122   //   (v2i32 vselect cc, (xor x, K), (xor y, K)). This enables the xor to be
14123 // replaced with source modifiers when the select is lowered to CNDMASK.
14124 unsigned Opc = LHS.getOpcode();
14125 if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
14126 (Opc == ISD::SELECT && VT == MVT::i64)) &&
14127 CRHS && CRHS->getAPIntValue().isSignMask()) {
14128 SDValue CC = LHS->getOperand(0);
14129 SDValue TRUE = LHS->getOperand(1);
14130 SDValue FALSE = LHS->getOperand(2);
14131 SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS);
14132 SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS);
14133 SDValue XSelect =
14134 DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
14135 return XSelect;
14136 }
14137
14138 // Make sure to apply the 64-bit constant splitting fold before trying to fold
14139 // fneg-like xors into 64-bit select.
14140 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
14141 // This looks like an fneg, try to fold as a source modifier.
14142 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
14143         shouldFoldFNegIntoSrc(N, LHS)) {
14144       // xor (select c, a, b), 0x80000000 ->
14145 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
14146 SDLoc DL(N);
14147 SDValue CastLHS =
14148 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
14149 SDValue CastRHS =
14150 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
14151 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
14152 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
14153 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
14154 LHS->getOperand(0), FNegLHS, FNegRHS);
14155 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
14156 }
14157 }
14158
14159 return SDValue();
14160}
14161
14162SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
14163 DAGCombinerInfo &DCI) const {
14164 if (!Subtarget->has16BitInsts() ||
14165 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14166 return SDValue();
14167
14168 EVT VT = N->getValueType(0);
14169 if (VT != MVT::i32)
14170 return SDValue();
14171
14172 SDValue Src = N->getOperand(0);
14173 if (Src.getValueType() != MVT::i16)
14174 return SDValue();
14175
14176 return SDValue();
14177}
14178
14179SDValue
14180SITargetLowering::performSignExtendInRegCombine(SDNode *N,
14181 DAGCombinerInfo &DCI) const {
14182 SDValue Src = N->getOperand(0);
14183 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
14184
14185 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
14186 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
14187 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
14188 VTSign->getVT() == MVT::i8) ||
14189 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
14190 VTSign->getVT() == MVT::i16))) {
14191 assert(Subtarget->hasScalarSubwordLoads() &&
14192 "s_buffer_load_{u8, i8} are supported "
14193 "in GFX12 (or newer) architectures.");
14194 EVT VT = Src.getValueType();
14195 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
14196                        ? AMDGPUISD::SBUFFER_LOAD_BYTE
14197                        : AMDGPUISD::SBUFFER_LOAD_SHORT;
14198     SDLoc DL(N);
14199 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
14200 SDValue Ops[] = {
14201 Src.getOperand(0), // source register
14202 Src.getOperand(1), // offset
14203 Src.getOperand(2) // cachePolicy
14204 };
14205 auto *M = cast<MemSDNode>(Src);
14206 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
14207 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14208 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
14209 return LoadVal;
14210 }
14211 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
14212 VTSign->getVT() == MVT::i8) ||
14213 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
14214 VTSign->getVT() == MVT::i16)) &&
14215 Src.hasOneUse()) {
14216 auto *M = cast<MemSDNode>(Src);
14217 SDValue Ops[] = {Src.getOperand(0), // Chain
14218 Src.getOperand(1), // rsrc
14219 Src.getOperand(2), // vindex
14220 Src.getOperand(3), // voffset
14221 Src.getOperand(4), // soffset
14222 Src.getOperand(5), // offset
14223 Src.getOperand(6), Src.getOperand(7)};
14224 // replace with BUFFER_LOAD_BYTE/SHORT
14225 SDVTList ResList =
14226 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
14227 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
14228                        ? AMDGPUISD::BUFFER_LOAD_BYTE
14229                        : AMDGPUISD::BUFFER_LOAD_SHORT;
14230     SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
14231 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14232 return DCI.DAG.getMergeValues(
14233 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
14234 }
14235 return SDValue();
14236}
14237
14238SDValue SITargetLowering::performClassCombine(SDNode *N,
14239 DAGCombinerInfo &DCI) const {
14240 SelectionDAG &DAG = DCI.DAG;
14241 SDValue Mask = N->getOperand(1);
14242
14243 // fp_class x, 0 -> false
14244 if (isNullConstant(Mask))
14245 return DAG.getConstant(0, SDLoc(N), MVT::i1);
14246
14247 if (N->getOperand(0).isUndef())
14248 return DAG.getUNDEF(MVT::i1);
14249
14250 return SDValue();
14251}
14252
14253SDValue SITargetLowering::performRcpCombine(SDNode *N,
14254 DAGCombinerInfo &DCI) const {
14255 EVT VT = N->getValueType(0);
14256 SDValue N0 = N->getOperand(0);
14257
14258 if (N0.isUndef()) {
14259 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
14260 SDLoc(N), VT);
14261 }
14262
14263 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
14264 N0.getOpcode() == ISD::SINT_TO_FP)) {
14265 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
14266 N->getFlags());
14267 }
14268
14269 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
14270 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
14271 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
14272 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
14273 N->getFlags());
14274 }
14275
14276   return AMDGPUTargetLowering::performRcpCombine(N, DCI);
14277 }
14278
14279 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
14280                                        unsigned MaxDepth) const {
14281 unsigned Opcode = Op.getOpcode();
14282 if (Opcode == ISD::FCANONICALIZE)
14283 return true;
14284
14285 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14286 const auto &F = CFP->getValueAPF();
14287 if (F.isNaN() && F.isSignaling())
14288 return false;
14289 if (!F.isDenormal())
14290 return true;
14291
14292 DenormalMode Mode =
14293 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
14294 return Mode == DenormalMode::getIEEE();
14295 }
14296
14297 // If source is a result of another standard FP operation it is already in
14298 // canonical form.
14299 if (MaxDepth == 0)
14300 return false;
14301
14302 switch (Opcode) {
14303 // These will flush denorms if required.
14304 case ISD::FADD:
14305 case ISD::FSUB:
14306 case ISD::FMUL:
14307 case ISD::FCEIL:
14308 case ISD::FFLOOR:
14309 case ISD::FMA:
14310 case ISD::FMAD:
14311 case ISD::FSQRT:
14312 case ISD::FDIV:
14313 case ISD::FREM:
14314 case ISD::FP_ROUND:
14315 case ISD::FP_EXTEND:
14316 case ISD::FP16_TO_FP:
14317 case ISD::FP_TO_FP16:
14318 case ISD::BF16_TO_FP:
14319 case ISD::FP_TO_BF16:
14320 case ISD::FLDEXP:
14321   case AMDGPUISD::FMUL_LEGACY:
14322   case AMDGPUISD::FMAD_FTZ:
14323   case AMDGPUISD::RCP:
14324   case AMDGPUISD::RSQ:
14325   case AMDGPUISD::RSQ_CLAMP:
14326   case AMDGPUISD::RCP_LEGACY:
14327   case AMDGPUISD::RCP_IFLAG:
14328   case AMDGPUISD::LOG:
14329   case AMDGPUISD::EXP:
14330   case AMDGPUISD::DIV_SCALE:
14331   case AMDGPUISD::DIV_FMAS:
14332   case AMDGPUISD::DIV_FIXUP:
14333   case AMDGPUISD::FRACT:
14334   case AMDGPUISD::CVT_PKRTZ_F16_F32:
14335   case AMDGPUISD::CVT_F32_UBYTE0:
14336   case AMDGPUISD::CVT_F32_UBYTE1:
14337   case AMDGPUISD::CVT_F32_UBYTE2:
14338   case AMDGPUISD::CVT_F32_UBYTE3:
14339   case AMDGPUISD::FP_TO_FP16:
14340   case AMDGPUISD::SIN_HW:
14341 case AMDGPUISD::COS_HW:
14342 return true;
14343
14344 // It can/will be lowered or combined as a bit operation.
14345 // Need to check their input recursively to handle.
14346 case ISD::FNEG:
14347 case ISD::FABS:
14348 case ISD::FCOPYSIGN:
14349 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14350
14351 case ISD::AND:
14352 if (Op.getValueType() == MVT::i32) {
14353 // Be careful as we only know it is a bitcast floating point type. It
14354 // could be f32, v2f16, we have no way of knowing. Luckily the constant
14355 // value that we optimize for, which comes up in fp32 to bf16 conversions,
14356 // is valid to optimize for all types.
14357 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
14358 if (RHS->getZExtValue() == 0xffff0000) {
14359 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14360 }
14361 }
14362 }
14363 break;
14364
14365 case ISD::FSIN:
14366 case ISD::FCOS:
14367 case ISD::FSINCOS:
14368 return Op.getValueType().getScalarType() != MVT::f16;
14369
14370 case ISD::FMINNUM:
14371 case ISD::FMAXNUM:
14372 case ISD::FMINNUM_IEEE:
14373 case ISD::FMAXNUM_IEEE:
14374 case ISD::FMINIMUM:
14375 case ISD::FMAXIMUM:
14376 case ISD::FMINIMUMNUM:
14377 case ISD::FMAXIMUMNUM:
14378 case AMDGPUISD::CLAMP:
14379 case AMDGPUISD::FMED3:
14380 case AMDGPUISD::FMAX3:
14381 case AMDGPUISD::FMIN3:
14382   case AMDGPUISD::FMAXIMUM3:
14383   case AMDGPUISD::FMINIMUM3: {
14384     // FIXME: Shouldn't treat the generic operations differently based on these.
14385     // However, we aren't really required to flush the result from
14386     // minnum/maxnum.
14387
14388 // snans will be quieted, so we only need to worry about denormals.
14389 if (Subtarget->supportsMinMaxDenormModes() ||
14390 // FIXME: denormalsEnabledForType is broken for dynamic
14391 denormalsEnabledForType(DAG, Op.getValueType()))
14392 return true;
14393
14394 // Flushing may be required.
14395 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
14396 // targets need to check their input recursively.
14397
14398 // FIXME: Does this apply with clamp? It's implemented with max.
14399 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
14400 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
14401 return false;
14402 }
14403
14404 return true;
14405 }
14406 case ISD::SELECT: {
14407 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
14408 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
14409 }
14410 case ISD::BUILD_VECTOR: {
14411 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
14412 SDValue SrcOp = Op.getOperand(i);
14413 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
14414 return false;
14415 }
14416
14417 return true;
14418 }
14419   case ISD::EXTRACT_VECTOR_ELT:
14420   case ISD::EXTRACT_SUBVECTOR: {
14421     return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14422 }
14423   case ISD::INSERT_VECTOR_ELT: {
14424     return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
14425 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
14426 }
14427 case ISD::UNDEF:
14428 // Could be anything.
14429 return false;
14430
14431 case ISD::BITCAST:
14432 // TODO: This is incorrect as it loses track of the operand's type. We may
14433 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
14434 // same bits that are canonicalized in one type need not be in the other.
14435 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14436 case ISD::TRUNCATE: {
14437 // Hack round the mess we make when legalizing extract_vector_elt
14438 if (Op.getValueType() == MVT::i16) {
14439 SDValue TruncSrc = Op.getOperand(0);
14440 if (TruncSrc.getValueType() == MVT::i32 &&
14441 TruncSrc.getOpcode() == ISD::BITCAST &&
14442 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
14443 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
14444 }
14445 }
14446 return false;
14447 }
14448 case ISD::INTRINSIC_WO_CHAIN: {
14449 unsigned IntrinsicID = Op.getConstantOperandVal(0);
14450 // TODO: Handle more intrinsics
14451 switch (IntrinsicID) {
14452 case Intrinsic::amdgcn_cvt_pkrtz:
14453 case Intrinsic::amdgcn_cubeid:
14454 case Intrinsic::amdgcn_frexp_mant:
14455 case Intrinsic::amdgcn_fdot2:
14456 case Intrinsic::amdgcn_rcp:
14457 case Intrinsic::amdgcn_rsq:
14458 case Intrinsic::amdgcn_rsq_clamp:
14459 case Intrinsic::amdgcn_rcp_legacy:
14460 case Intrinsic::amdgcn_rsq_legacy:
14461 case Intrinsic::amdgcn_trig_preop:
14462 case Intrinsic::amdgcn_tanh:
14463 case Intrinsic::amdgcn_log:
14464 case Intrinsic::amdgcn_exp2:
14465 case Intrinsic::amdgcn_sqrt:
14466 return true;
14467 default:
14468 break;
14469 }
14470
14471 break;
14472 }
14473 default:
14474 break;
14475 }
14476
14477 // FIXME: denormalsEnabledForType is broken for dynamic
14478 return denormalsEnabledForType(DAG, Op.getValueType()) &&
14479 DAG.isKnownNeverSNaN(Op);
14480}
14481
14482 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
14483 unsigned MaxDepth) const {
14484 const MachineRegisterInfo &MRI = MF.getRegInfo();
14485 MachineInstr *MI = MRI.getVRegDef(Reg);
14486 unsigned Opcode = MI->getOpcode();
14487
14488 if (Opcode == AMDGPU::G_FCANONICALIZE)
14489 return true;
14490
14491 std::optional<FPValueAndVReg> FCR;
14492 // Constant splat (can be padded with undef) or scalar constant.
14493 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
14494 if (FCR->Value.isSignaling())
14495 return false;
14496 if (!FCR->Value.isDenormal())
14497 return true;
14498
14499 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
14500 return Mode == DenormalMode::getIEEE();
14501 }
14502
14503 if (MaxDepth == 0)
14504 return false;
14505
14506 switch (Opcode) {
14507 case AMDGPU::G_FADD:
14508 case AMDGPU::G_FSUB:
14509 case AMDGPU::G_FMUL:
14510 case AMDGPU::G_FCEIL:
14511 case AMDGPU::G_FFLOOR:
14512 case AMDGPU::G_FRINT:
14513 case AMDGPU::G_FNEARBYINT:
14514 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14515 case AMDGPU::G_INTRINSIC_TRUNC:
14516 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14517 case AMDGPU::G_FMA:
14518 case AMDGPU::G_FMAD:
14519 case AMDGPU::G_FSQRT:
14520 case AMDGPU::G_FDIV:
14521 case AMDGPU::G_FREM:
14522 case AMDGPU::G_FPOW:
14523 case AMDGPU::G_FPEXT:
14524 case AMDGPU::G_FLOG:
14525 case AMDGPU::G_FLOG2:
14526 case AMDGPU::G_FLOG10:
14527 case AMDGPU::G_FPTRUNC:
14528 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14529 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14530 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14531 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14532 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14533 return true;
14534 case AMDGPU::G_FNEG:
14535 case AMDGPU::G_FABS:
14536 case AMDGPU::G_FCOPYSIGN:
14537 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
14538 case AMDGPU::G_FMINNUM:
14539 case AMDGPU::G_FMAXNUM:
14540 case AMDGPU::G_FMINNUM_IEEE:
14541 case AMDGPU::G_FMAXNUM_IEEE:
14542 case AMDGPU::G_FMINIMUM:
14543 case AMDGPU::G_FMAXIMUM:
14544 case AMDGPU::G_FMINIMUMNUM:
14545 case AMDGPU::G_FMAXIMUMNUM: {
14546 if (Subtarget->supportsMinMaxDenormModes() ||
14547 // FIXME: denormalsEnabledForType is broken for dynamic
14548 denormalsEnabledForType(MRI.getType(Reg), MF))
14549 return true;
14550
14551 [[fallthrough]];
14552 }
14553 case AMDGPU::G_BUILD_VECTOR:
14554 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
14555 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
14556 return false;
14557 return true;
14558 case AMDGPU::G_INTRINSIC:
14559 case AMDGPU::G_INTRINSIC_CONVERGENT:
14560 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
14561 case Intrinsic::amdgcn_fmul_legacy:
14562 case Intrinsic::amdgcn_fmad_ftz:
14563 case Intrinsic::amdgcn_sqrt:
14564 case Intrinsic::amdgcn_fmed3:
14565 case Intrinsic::amdgcn_sin:
14566 case Intrinsic::amdgcn_cos:
14567 case Intrinsic::amdgcn_log:
14568 case Intrinsic::amdgcn_exp2:
14569 case Intrinsic::amdgcn_log_clamp:
14570 case Intrinsic::amdgcn_rcp:
14571 case Intrinsic::amdgcn_rcp_legacy:
14572 case Intrinsic::amdgcn_rsq:
14573 case Intrinsic::amdgcn_rsq_clamp:
14574 case Intrinsic::amdgcn_rsq_legacy:
14575 case Intrinsic::amdgcn_div_scale:
14576 case Intrinsic::amdgcn_div_fmas:
14577 case Intrinsic::amdgcn_div_fixup:
14578 case Intrinsic::amdgcn_fract:
14579 case Intrinsic::amdgcn_cvt_pkrtz:
14580 case Intrinsic::amdgcn_cubeid:
14581 case Intrinsic::amdgcn_cubema:
14582 case Intrinsic::amdgcn_cubesc:
14583 case Intrinsic::amdgcn_cubetc:
14584 case Intrinsic::amdgcn_frexp_mant:
14585 case Intrinsic::amdgcn_fdot2:
14586 case Intrinsic::amdgcn_trig_preop:
14587 case Intrinsic::amdgcn_tanh:
14588 return true;
14589 default:
14590 break;
14591 }
14592
14593 [[fallthrough]];
14594 default:
14595 return false;
14596 }
14597
14598 llvm_unreachable("invalid operation");
14599}
14600
14601// Constant fold canonicalize.
14602SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
14603 const SDLoc &SL, EVT VT,
14604 const APFloat &C) const {
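// For example, with f32 denormals in preserve-sign mode a denormal constant
// such as -1.0e-40f folds to -0.0, and any f32 sNaN constant folds to the
// default quiet NaN bit pattern (0x7fc00000).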
14605 // Flush denormals to 0 if not enabled.
14606 if (C.isDenormal()) {
14607 DenormalMode Mode =
14608 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
14609 if (Mode == DenormalMode::getPreserveSign()) {
14610 return DAG.getConstantFP(
14611 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
14612 }
14613
14614 if (Mode != DenormalMode::getIEEE())
14615 return SDValue();
14616 }
14617
14618 if (C.isNaN()) {
14619 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
14620 if (C.isSignaling()) {
14621 // Quiet a signaling NaN.
14622 // FIXME: Is this supposed to preserve payload bits?
14623 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14624 }
14625
14626 // Make sure it is the canonical NaN bitpattern.
14627 //
14628 // TODO: Can we use -1 as the canonical NaN value since it's an inline
14629 // immediate?
14630 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
14631 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14632 }
14633
14634 // Already canonical.
14635 return DAG.getConstantFP(C, SL, VT);
14636}
14637
14638 static bool vectorEltWillFoldAway(SDValue Op) {
14639 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
14640}
14641
14642SDValue
14643SITargetLowering::performFCanonicalizeCombine(SDNode *N,
14644 DAGCombinerInfo &DCI) const {
14645 SelectionDAG &DAG = DCI.DAG;
14646 SDValue N0 = N->getOperand(0);
14647 EVT VT = N->getValueType(0);
14648
14649 // fcanonicalize undef -> qnan
14650 if (N0.isUndef()) {
14651 APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics());
14652 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
14653 }
14654
14655 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
14656 EVT VT = N->getValueType(0);
14657 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
14658 }
14659
14660 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
14661 // (fcanonicalize k)
14662 //
14663 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
14664
14665 // TODO: This could be better with wider vectors that will be split to v2f16,
14666 // and to consider uses since there aren't that many packed operations.
14667 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
14668 isTypeLegal(MVT::v2f16)) {
14669 SDLoc SL(N);
14670 SDValue NewElts[2];
14671 SDValue Lo = N0.getOperand(0);
14672 SDValue Hi = N0.getOperand(1);
14673 EVT EltVT = Lo.getValueType();
14674
14675 if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
14676 for (unsigned I = 0; I != 2; ++I) {
14677 SDValue Op = N0.getOperand(I);
14678 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14679 NewElts[I] =
14680 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14681 } else if (Op.isUndef()) {
14682 // Handled below based on what the other operand is.
14683 NewElts[I] = Op;
14684 } else {
14685 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
14686 }
14687 }
14688
14689 // If one half is undef, and one is constant, prefer a splat vector rather
14690 // than the normal qNaN. If it's a register, prefer 0.0 since that's
14691 // cheaper to use and may be free with a packed operation.
14692 if (NewElts[0].isUndef()) {
14693 if (isa<ConstantFPSDNode>(NewElts[1]))
14694 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
14695 ? NewElts[1]
14696 : DAG.getConstantFP(0.0f, SL, EltVT);
14697 }
14698
14699 if (NewElts[1].isUndef()) {
14700 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
14701 ? NewElts[0]
14702 : DAG.getConstantFP(0.0f, SL, EltVT);
14703 }
14704
14705 return DAG.getBuildVector(VT, SL, NewElts);
14706 }
14707 }
14708
14709 return SDValue();
14710}
14711
14712static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
14713 switch (Opc) {
14714 case ISD::FMAXNUM:
14715 case ISD::FMAXNUM_IEEE:
14716 case ISD::FMAXIMUMNUM:
14717 return AMDGPUISD::FMAX3;
14718 case ISD::FMAXIMUM:
14719 return AMDGPUISD::FMAXIMUM3;
14720 case ISD::SMAX:
14721 return AMDGPUISD::SMAX3;
14722 case ISD::UMAX:
14723 return AMDGPUISD::UMAX3;
14724 case ISD::FMINNUM:
14725 case ISD::FMINNUM_IEEE:
14726 case ISD::FMINIMUMNUM:
14727 return AMDGPUISD::FMIN3;
14728 case ISD::FMINIMUM:
14729 return AMDGPUISD::FMINIMUM3;
14730 case ISD::SMIN:
14731 return AMDGPUISD::SMIN3;
14732 case ISD::UMIN:
14733 return AMDGPUISD::UMIN3;
14734 default:
14735 llvm_unreachable("Not a min/max opcode");
14736 }
14737}
14738
14739SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
14740 const SDLoc &SL, SDValue Src,
14741 SDValue MinVal,
14742 SDValue MaxVal,
14743 bool Signed) const {
14744
14745 // med3 comes from
14746 // min(max(x, K0), K1), K0 < K1
14747 // max(min(x, K0), K1), K1 < K0
14748 //
14749 // "MinVal" and "MaxVal" respectively refer to the rhs of the
14750 // min/max op.
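// For example, (smin (smax x, 2), 7) becomes (smed3 x, 2, 7), clamping x to
// the signed range [2, 7].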
14751 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
14752 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
14753
14754 if (!MinK || !MaxK)
14755 return SDValue();
14756
14757 if (Signed) {
14758 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
14759 return SDValue();
14760 } else {
14761 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
14762 return SDValue();
14763 }
14764
14765 EVT VT = MinK->getValueType(0);
14766 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
14767 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
14768 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
14769
14770 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
14771 // not available, but this is unlikely to be profitable as constants
14772 // will often need to be materialized & extended, especially on
14773 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
14774 return SDValue();
14775}
14776
14779 return C;
14780
14782 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
14783 return C;
14784 }
14785
14786 return nullptr;
14787}
14788
14789SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
14790 const SDLoc &SL, SDValue Op0,
14791 SDValue Op1) const {
14792 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
14793 if (!K1)
14794 return SDValue();
14795
14796 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
14797 if (!K0)
14798 return SDValue();
14799
14800 // Ordered >= (although NaN inputs should have folded away by now).
14801 if (K0->getValueAPF() > K1->getValueAPF())
14802 return SDValue();
14803
14804 // med3 with a nan input acts like
14805 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
14806 //
14807 // So with a signaling nan input, the result depends on whether the IEEE
14808 // mode bit is enabled.
14809 // ieee=1
14810 // s0 snan: yields s2
14811 // s1 snan: yields s2
14812 // s2 snan: qnan
14813
14814 // s0 qnan: min(s1, s2)
14815 // s1 qnan: min(s0, s2)
14816 // s2 qnan: min(s0, s1)
14817
14818 // ieee=0
14819 // s0 snan: min(s1, s2)
14820 // s1 snan: min(s0, s2)
14821 // s2 snan: qnan
14822
14823 // s0 qnan: min(s1, s2)
14824 // s1 qnan: min(s0, s2)
14825 // s2 qnan: min(s0, s1)
14826 const MachineFunction &MF = DAG.getMachineFunction();
14827 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
14828
14829 // TODO: Check whether the IEEE bit is enabled. We can form fmed3 with IEEE=0
14830 // regardless of whether the input is a signaling nan if op0 is fmaximum or
14831 // fmaximumnum. We can only form it from fmaxnum_ieee if IEEE=1.
14832 EVT VT = Op0.getValueType();
14833 if (Info->getMode().DX10Clamp) {
14834 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
14835 // hardware fmed3 behavior converting to a min.
14836 // FIXME: Should this be allowing -0.0?
14837 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
14838 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
14839 }
14840
14841 // med3 for f16 is only available on gfx9+, and not available for v2f16.
14842 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
14843 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
14844 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
14845 // then give the other result, which is different from med3 with a NaN
14846 // input.
14847 SDValue Var = Op0.getOperand(0);
14848 if (!DAG.isKnownNeverSNaN(Var))
14849 return SDValue();
14850
14851 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14852
14853 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
14854 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
14855 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
14856 SDValue(K0, 0), SDValue(K1, 0));
14857 }
14858 }
14859
14860 return SDValue();
14861}
14862
14863/// \return true if the subtarget supports minimum3 and maximum3 with the given
14864/// base min/max opcode \p Opc for type \p VT.
14865static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
14866 EVT VT) {
14867 switch (Opc) {
14868 case ISD::FMINNUM:
14869 case ISD::FMAXNUM:
14870 case ISD::FMINNUM_IEEE:
14871 case ISD::FMAXNUM_IEEE:
14872 case ISD::FMINIMUMNUM:
14873 case ISD::FMAXIMUMNUM:
14874 case AMDGPUISD::FMIN_LEGACY:
14875 case AMDGPUISD::FMAX_LEGACY:
14876 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
14877 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
14878 case ISD::FMINIMUM:
14879 case ISD::FMAXIMUM:
14880 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
14881 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
14882 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
14883 case ISD::SMAX:
14884 case ISD::SMIN:
14885 case ISD::UMAX:
14886 case ISD::UMIN:
14887 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
14888 default:
14889 return false;
14890 }
14891
14892 llvm_unreachable("not a min/max opcode");
14893}
14894
14895SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
14896 DAGCombinerInfo &DCI) const {
14897 SelectionDAG &DAG = DCI.DAG;
14898
14899 EVT VT = N->getValueType(0);
14900 unsigned Opc = N->getOpcode();
14901 SDValue Op0 = N->getOperand(0);
14902 SDValue Op1 = N->getOperand(1);
14903
14904 // Only do this if the inner op has one use since this will just increase
14905 // register pressure for no benefit.
14906
14907 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
14908 // max(max(a, b), c) -> max3(a, b, c)
14909 // min(min(a, b), c) -> min3(a, b, c)
14910 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
14911 SDLoc DL(N);
14912 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14913 Op0.getOperand(0), Op0.getOperand(1), Op1);
14914 }
14915
14916 // Try commuted.
14917 // max(a, max(b, c)) -> max3(a, b, c)
14918 // min(a, min(b, c)) -> min3(a, b, c)
14919 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
14920 SDLoc DL(N);
14921 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14922 Op0, Op1.getOperand(0), Op1.getOperand(1));
14923 }
14924 }
14925
14926 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
14927 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
14928 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
14929 if (SDValue Med3 = performIntMed3ImmCombine(
14930 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
14931 return Med3;
14932 }
14933 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
14934 if (SDValue Med3 = performIntMed3ImmCombine(
14935 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
14936 return Med3;
14937 }
14938
14939 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
14940 if (SDValue Med3 = performIntMed3ImmCombine(
14941 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
14942 return Med3;
14943 }
14944 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
14945 if (SDValue Med3 = performIntMed3ImmCombine(
14946 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
14947 return Med3;
14948 }
14949
14950 // if !is_snan(x):
14951 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14952 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14953 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14954 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14955 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
14956 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
14957 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
14958 (Opc == AMDGPUISD::FMIN_LEGACY &&
14959 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
14960 (VT == MVT::f32 || VT == MVT::f64 ||
14961 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
14962 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
14963 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
14964 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
14965 Op0.hasOneUse()) {
14966 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
14967 return Res;
14968 }
14969
14970 // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
14971 // for some types, but at a higher cost since it's implemented with a 3
14972 // operand form.
14973 const SDNodeFlags Flags = N->getFlags();
14974 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
14975 !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
14976 unsigned NewOpc =
14977 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
14978 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
14979 }
14980
14981 return SDValue();
14982}
14983
14984 static bool isClampZeroToOne(SDValue A, SDValue B) {
14985 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
14986 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
14987 // FIXME: Should this be allowing -0.0?
14988 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
14989 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
14990 }
14991 }
14992
14993 return false;
14994}
14995
14996// FIXME: Should only worry about snans for version with chain.
14997SDValue SITargetLowering::performFMed3Combine(SDNode *N,
14998 DAGCombinerInfo &DCI) const {
14999 EVT VT = N->getValueType(0);
15000 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
15001 // NaNs. With a NaN input, the order of the operands may change the result.
15002
15003 SelectionDAG &DAG = DCI.DAG;
15004 SDLoc SL(N);
15005
15006 SDValue Src0 = N->getOperand(0);
15007 SDValue Src1 = N->getOperand(1);
15008 SDValue Src2 = N->getOperand(2);
15009
15010 if (isClampZeroToOne(Src0, Src1)) {
15011 // const_a, const_b, x -> clamp is safe in all cases including signaling
15012 // nans.
15013 // FIXME: Should this be allowing -0.0?
15014 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
15015 }
15016
15017 const MachineFunction &MF = DAG.getMachineFunction();
15018 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15019
15020 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
15021 // handling no dx10-clamp?
15022 if (Info->getMode().DX10Clamp) {
15023 // If NaNs are clamped to 0, we are free to reorder the inputs.
15024
15025 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15026 std::swap(Src0, Src1);
15027
15028 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
15029 std::swap(Src1, Src2);
15030
15031 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15032 std::swap(Src0, Src1);
15033
15034 if (isClampZeroToOne(Src1, Src2))
15035 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
15036 }
15037
15038 return SDValue();
15039}
15040
15041SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
15042 DAGCombinerInfo &DCI) const {
15043 SDValue Src0 = N->getOperand(0);
15044 SDValue Src1 = N->getOperand(1);
15045 if (Src0.isUndef() && Src1.isUndef())
15046 return DCI.DAG.getUNDEF(N->getValueType(0));
15047 return SDValue();
15048}
15049
15050// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
15051// expanded into a set of cmp/select instructions.
15052 bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
15053 unsigned NumElem,
15054 bool IsDivergentIdx,
15055 const GCNSubtarget *Subtarget) {
15056 if (UseDivergentRegisterIndexing)
15057 return false;
15058
15059 unsigned VecSize = EltSize * NumElem;
15060
15061 // Sub-dword vectors of two dwords or less have a better implementation.
15062 if (VecSize <= 64 && EltSize < 32)
15063 return false;
15064
15065 // Always expand the remaining sub-dword cases, otherwise they will be
15066 // lowered via memory.
15067 if (EltSize < 32)
15068 return true;
15069
15070 // Always do this if var-idx is divergent, otherwise it will become a loop.
15071 if (IsDivergentIdx)
15072 return true;
15073
15074 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
15075 unsigned NumInsts = NumElem /* Number of compares */ +
15076 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
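// For example, a variable-index access into <8 x i64> needs 8 compares plus
// 16 v_cndmask_b32s (24 instructions), which is above the thresholds checked
// below.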
15077
15078 // On some architectures (GFX9) movrel is not available and it's better
15079 // to expand.
15080 if (Subtarget->useVGPRIndexMode())
15081 return NumInsts <= 16;
15082
15083 // If movrel is available, use it instead of expanding for vector of 8
15084 // elements.
15085 if (Subtarget->hasMovrel())
15086 return NumInsts <= 15;
15087
15088 return true;
15089}
15090
15091 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
15092 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
15093 if (isa<ConstantSDNode>(Idx))
15094 return false;
15095
15096 SDValue Vec = N->getOperand(0);
15097 EVT VecVT = Vec.getValueType();
15098 EVT EltVT = VecVT.getVectorElementType();
15099 unsigned EltSize = EltVT.getSizeInBits();
15100 unsigned NumElem = VecVT.getVectorNumElements();
15101
15102 return SITargetLowering::shouldExpandVectorDynExt(
15103 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
15104}
15105
15106SDValue
15107SITargetLowering::performExtractVectorEltCombine(SDNode *N,
15108 DAGCombinerInfo &DCI) const {
15109 SDValue Vec = N->getOperand(0);
15110 SelectionDAG &DAG = DCI.DAG;
15111
15112 EVT VecVT = Vec.getValueType();
15113 EVT VecEltVT = VecVT.getVectorElementType();
15114 EVT ResVT = N->getValueType(0);
15115
15116 unsigned VecSize = VecVT.getSizeInBits();
15117 unsigned VecEltSize = VecEltVT.getSizeInBits();
15118
15119 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
15120 allUsesHaveSourceMods(N)) {
15121 SDLoc SL(N);
15122 SDValue Idx = N->getOperand(1);
15123 SDValue Elt =
15124 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
15125 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
15126 }
15127
15128 // (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
15129 // -> (and (extract_vector_element {y0, y1}, index), 0x1f)
15130 // There are optimisations to transform 64-bit shifts into 32-bit shifts
15131 // depending on the shift operand. See e.g. performSraCombine().
15132 // This combine ensures that the optimisation is compatible with v2i32
15133 // legalised AND.
15134 if (VecVT == MVT::v2i32 && Vec->getOpcode() == ISD::AND &&
15135 Vec->getOperand(1)->getOpcode() == ISD::BUILD_VECTOR) {
15136
15137 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Vec->getOperand(1)->getOperand(0));
15138 if (!C || C->getZExtValue() != 0x1f)
15139 return SDValue();
15140
15141 SDLoc SL(N);
15142 SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
15143 SDValue EVE = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
15144 Vec->getOperand(0), N->getOperand(1));
15145 SDValue A = DAG.getNode(ISD::AND, SL, MVT::i32, EVE, AndMask);
15146 DAG.ReplaceAllUsesWith(N, A.getNode());
15147 }
15148
15149 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
15150 // =>
15151 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
15152 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
15153 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
15154 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15155 SDLoc SL(N);
15156 SDValue Idx = N->getOperand(1);
15157 unsigned Opc = Vec.getOpcode();
15158
15159 switch (Opc) {
15160 default:
15161 break;
15162 // TODO: Support other binary operations.
15163 case ISD::FADD:
15164 case ISD::FSUB:
15165 case ISD::FMUL:
15166 case ISD::ADD:
15167 case ISD::UMIN:
15168 case ISD::UMAX:
15169 case ISD::SMIN:
15170 case ISD::SMAX:
15171 case ISD::FMAXNUM:
15172 case ISD::FMINNUM:
15173 case ISD::FMAXNUM_IEEE:
15174 case ISD::FMINNUM_IEEE:
15175 case ISD::FMAXIMUM:
15176 case ISD::FMINIMUM: {
15177 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15178 Vec.getOperand(0), Idx);
15179 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15180 Vec.getOperand(1), Idx);
15181
15182 DCI.AddToWorklist(Elt0.getNode());
15183 DCI.AddToWorklist(Elt1.getNode());
15184 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
15185 }
15186 }
15187 }
15188
15189 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
15190 if (shouldExpandVectorDynExt(N)) {
15191 SDLoc SL(N);
15192 SDValue Idx = N->getOperand(1);
15193 SDValue V;
15194 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15195 SDValue IC = DAG.getVectorIdxConstant(I, SL);
15196 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
15197 if (I == 0)
15198 V = Elt;
15199 else
15200 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
15201 }
15202 return V;
15203 }
15204
15205 if (!DCI.isBeforeLegalize())
15206 return SDValue();
15207
15208 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
15209 // elements. This exposes more load reduction opportunities by replacing
15210 // multiple small extract_vector_elements with a single 32-bit extract.
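// For example, (extract_vector_elt (v8i8 load), 5) becomes a bitcast to
// v2i32, an extract of dword 1, a right shift by 8 and a truncate back to
// the 8-bit element type.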
15211 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
15212 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
15213 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15214 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
15215
15216 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15217 unsigned EltIdx = BitIndex / 32;
15218 unsigned LeftoverBitIdx = BitIndex % 32;
15219 SDLoc SL(N);
15220
15221 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
15222 DCI.AddToWorklist(Cast.getNode());
15223
15224 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
15225 DAG.getConstant(EltIdx, SL, MVT::i32));
15226 DCI.AddToWorklist(Elt.getNode());
15227 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
15228 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
15229 DCI.AddToWorklist(Srl.getNode());
15230
15231 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
15232 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
15233 DCI.AddToWorklist(Trunc.getNode());
15234
15235 if (VecEltVT == ResVT) {
15236 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
15237 }
15238
15239 assert(ResVT.isScalarInteger());
15240 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
15241 }
15242
15243 return SDValue();
15244}
15245
15246SDValue
15247SITargetLowering::performInsertVectorEltCombine(SDNode *N,
15248 DAGCombinerInfo &DCI) const {
15249 SDValue Vec = N->getOperand(0);
15250 SDValue Idx = N->getOperand(2);
15251 EVT VecVT = Vec.getValueType();
15252 EVT EltVT = VecVT.getVectorElementType();
15253
15254 // INSERT_VECTOR_ELT (<n x e>, var-idx)
15255 // => BUILD_VECTOR n x select (e, const-idx)
15256 if (!shouldExpandVectorDynExt(N))
15257 return SDValue();
15258
15259 SelectionDAG &DAG = DCI.DAG;
15260 SDLoc SL(N);
15261 SDValue Ins = N->getOperand(1);
15262 EVT IdxVT = Idx.getValueType();
15263
15264 SmallVector<SDValue, 16> Ops;
15265 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15266 SDValue IC = DAG.getConstant(I, SL, IdxVT);
15267 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
15268 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
15269 Ops.push_back(V);
15270 }
15271
15272 return DAG.getBuildVector(VecVT, SL, Ops);
15273}
15274
15275/// Return the source of an fp_extend from f16 to f32, or a converted FP
15276/// constant.
15277 static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
15278 if (Src.getOpcode() == ISD::FP_EXTEND &&
15279 Src.getOperand(0).getValueType() == MVT::f16) {
15280 return Src.getOperand(0);
15281 }
15282
15283 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
15284 APFloat Val = CFP->getValueAPF();
15285 bool LosesInfo = true;
15286 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
15287 if (!LosesInfo)
15288 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
15289 }
15290
15291 return SDValue();
15292}
15293
15294SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
15295 DAGCombinerInfo &DCI) const {
15296 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15297 "combine only useful on gfx8");
15298
15299 SDValue TruncSrc = N->getOperand(0);
15300 EVT VT = N->getValueType(0);
15301 if (VT != MVT::f16)
15302 return SDValue();
15303
15304 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
15305 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
15306 return SDValue();
15307
15308 SelectionDAG &DAG = DCI.DAG;
15309 SDLoc SL(N);
15310
15311 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
15312 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
15313 // casting back.
15314
15315 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
15316 // fmin(fmax(a, b), fmax(fmin(a, b), c))
15317 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
15318 if (!A)
15319 return SDValue();
15320
15321 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
15322 if (!B)
15323 return SDValue();
15324
15325 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
15326 if (!C)
15327 return SDValue();
15328
15329 // This changes signaling nan behavior. If an input is a signaling nan, it
15330 // would have been quieted by the fpext originally. We don't care because
15331 // these are unconstrained ops. If we needed to insert quieting canonicalizes
15332 // we would be worse off than just doing the promotion.
15333 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
15334 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
15335 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
15336 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
15337}
15338
15339unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
15340 const SDNode *N0,
15341 const SDNode *N1) const {
15342 EVT VT = N0->getValueType(0);
15343
15344 // Only do this if we are not trying to support denormals. v_mad_f32 does not
15345 // support denormals ever.
15346 if (((VT == MVT::f32 &&
15347 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
15348 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15349 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
15350 isOperationLegal(ISD::FMAD, VT))
15351 return ISD::FMAD;
15352
15353 const TargetOptions &Options = DAG.getTarget().Options;
15354 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
15355 (N0->getFlags().hasAllowContract() &&
15356 N1->getFlags().hasAllowContract())) &&
15357 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
15358 return ISD::FMA;
15359 }
15360
15361 return 0;
15362}
15363
15364// For a reassociatable opcode perform:
15365// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
15366SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
15367 SelectionDAG &DAG) const {
15368 EVT VT = N->getValueType(0);
15369 if (VT != MVT::i32 && VT != MVT::i64)
15370 return SDValue();
15371
15372 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
15373 return SDValue();
15374
15375 unsigned Opc = N->getOpcode();
15376 SDValue Op0 = N->getOperand(0);
15377 SDValue Op1 = N->getOperand(1);
15378
15379 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
15380 return SDValue();
15381
15382 if (Op0->isDivergent())
15383 std::swap(Op0, Op1);
15384
15385 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
15386 return SDValue();
15387
15388 SDValue Op2 = Op1.getOperand(1);
15389 Op1 = Op1.getOperand(0);
15390 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
15391 return SDValue();
15392
15393 if (Op1->isDivergent())
15394 std::swap(Op1, Op2);
15395
15396 SDLoc SL(N);
15397 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
15398 return DAG.getNode(Opc, SL, VT, Add1, Op2);
15399}
15400
15401static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
15402 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
15403 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
15404 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
15405 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
15406 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
15407}
15408
15409// Fold
15410// y = lshr i64 x, 32
15411// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
15412// with Const.hi == -1
15413// To
15414// res = mad_u64_u32 y.lo ,Const.lo, x.lo
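// For example, with Const = 0xffffffff00000005:
// add (mul (srl x, 32), Const), x --> mad_u64_u32 x.hi, 5, x.lo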
15415 static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
15416 SDValue MulLHS, SDValue MulRHS,
15417 SDValue AddRHS) {
15418 if (MulRHS.getOpcode() == ISD::SRL)
15419 std::swap(MulLHS, MulRHS);
15420
15421 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
15422 return SDValue();
15423
15424 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
15425 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
15426 MulLHS.getOperand(0) != AddRHS)
15427 return SDValue();
15428
15430 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
15431 return SDValue();
15432
15433 SDValue ConstMul =
15434 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
15435 return getMad64_32(DAG, SL, MVT::i64,
15436 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
15437 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
15438}
15439
15440// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
15441// multiplies, if any.
15442//
15443// Full 64-bit multiplies that feed into an addition are lowered here instead
15444// of using the generic expansion. The generic expansion ends up with
15445// a tree of ADD nodes that prevents us from using the "add" part of the
15446// MAD instruction. The expansion produced here results in a chain of ADDs
15447// instead of a tree.
15448SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
15449 DAGCombinerInfo &DCI) const {
15450 assert(N->isAnyAdd());
15451
15452 SelectionDAG &DAG = DCI.DAG;
15453 EVT VT = N->getValueType(0);
15454 SDLoc SL(N);
15455 SDValue LHS = N->getOperand(0);
15456 SDValue RHS = N->getOperand(1);
15457
15458 if (VT.isVector())
15459 return SDValue();
15460
15461 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
15462 // result in scalar registers for uniform values.
15463 if (!N->isDivergent() && Subtarget->hasSMulHi())
15464 return SDValue();
15465
15466 unsigned NumBits = VT.getScalarSizeInBits();
15467 if (NumBits <= 32 || NumBits > 64)
15468 return SDValue();
15469
15470 if (LHS.getOpcode() != ISD::MUL) {
15471 assert(RHS.getOpcode() == ISD::MUL);
15472 std::swap(LHS, RHS);
15473 }
15474
15475 // Avoid the fold if it would unduly increase the number of multiplies due to
15476 // multiple uses, except on hardware with full-rate multiply-add (which is
15477 // part of full-rate 64-bit ops).
15478 if (!Subtarget->hasFullRate64Ops()) {
15479 unsigned NumUsers = 0;
15480 for (SDNode *User : LHS->users()) {
15481 // There is a use that does not feed into addition, so the multiply can't
15482 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
15483 if (!User->isAnyAdd())
15484 return SDValue();
15485
15486 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
15487 // MUL + 3xADD + 3xADDC over 3xMAD.
15488 ++NumUsers;
15489 if (NumUsers >= 3)
15490 return SDValue();
15491 }
15492 }
15493
15494 SDValue MulLHS = LHS.getOperand(0);
15495 SDValue MulRHS = LHS.getOperand(1);
15496 SDValue AddRHS = RHS;
15497
15498 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
15499 return FoldedMAD;
15500
15501 // Always check whether operands are small unsigned values, since that
15502 // knowledge is useful in more cases. Check for small signed values only if
15503 // doing so can unlock a shorter code sequence.
15504 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
15505 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
15506
15507 bool MulSignedLo = false;
15508 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
15509 MulSignedLo =
15510 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
15511 }
15512
15513 // The operands and final result all have the same number of bits. If
15514 // operands need to be extended, they can be extended with garbage. The
15515 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
15516 // truncated away in the end.
15517 if (VT != MVT::i64) {
15518 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
15519 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
15520 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
15521 }
15522
15523 // The basic code generated is conceptually straightforward. Pseudo code:
15524 //
15525 // accum = mad_64_32 lhs.lo, rhs.lo, accum
15526 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
15527 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
15528 //
15529 // The second and third lines are optional, depending on whether the factors
15530 // are {sign,zero}-extended or not.
15531 //
15532 // The actual DAG is noisier than the pseudo code, but only due to
15533 // instructions that disassemble values into low and high parts, and
15534 // assemble the final result.
15535 SDValue One = DAG.getConstant(1, SL, MVT::i32);
15536
15537 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
15538 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
15539 SDValue Accum =
15540 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
15541
15542 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
15543 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
15544
15545 if (!MulLHSUnsigned32) {
15546 auto MulLHSHi =
15547 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
15548 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
15549 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15550 }
15551
15552 if (!MulRHSUnsigned32) {
15553 auto MulRHSHi =
15554 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
15555 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
15556 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15557 }
15558
15559 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
15560 Accum = DAG.getBitcast(MVT::i64, Accum);
15561 }
15562
15563 if (VT != MVT::i64)
15564 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
15565 return Accum;
15566}
15567
15568SDValue
15569SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
15570 DAGCombinerInfo &DCI) const {
15571 SDValue RHS = N->getOperand(1);
15572 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
15573 if (!CRHS)
15574 return SDValue();
15575
15576 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
15577 // common.
15578 uint64_t Val = CRHS->getZExtValue();
15579 if (countr_zero(Val) >= 32) {
15580 SelectionDAG &DAG = DCI.DAG;
15581 SDLoc SL(N);
15582 SDValue LHS = N->getOperand(0);
15583
15584 // Avoid carry machinery if we know the low half of the add does not
15585 // contribute to the final result.
15586 //
15587 // add i64:x, K if computeTrailingZeros(K) >= 32
15588 // => build_pair (add x.hi, K.hi), x.lo
15589
15590 // Breaking the 64-bit add here with this strange constant is unlikely
15591 // to interfere with addressing mode patterns.
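// For example, add i64:x, 0x500000000 becomes
// build_pair (add x.hi, 5), x.lo.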
15592
15593 SDValue Hi = getHiHalf64(LHS, DAG);
15594 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
15595 unsigned Opcode = N->getOpcode();
15596 if (Opcode == ISD::PTRADD)
15597 Opcode = ISD::ADD;
15598 SDValue AddHi =
15599 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
15600
15601 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
15602 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
15603 }
15604
15605 return SDValue();
15606}
15607
15608// Collect the ultimate src of each of the mul node's operands, and confirm
15609 // each operand is no wider than a single byte.
15610static std::optional<ByteProvider<SDValue>>
15611handleMulOperand(const SDValue &MulOperand) {
15612 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
15613 if (!Byte0 || Byte0->isConstantZero()) {
15614 return std::nullopt;
15615 }
15616 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
15617 if (Byte1 && !Byte1->isConstantZero()) {
15618 return std::nullopt;
15619 }
15620 return Byte0;
15621}
15622
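// Each byte of a PermMask is a v_perm_b32 selector; the value 0x0c selects a
// constant zero byte. addPermMasks below merges two selector masks, keeping
// 0x0c only in byte positions where both inputs select zero.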
15623static unsigned addPermMasks(unsigned First, unsigned Second) {
15624 unsigned FirstCs = First & 0x0c0c0c0c;
15625 unsigned SecondCs = Second & 0x0c0c0c0c;
15626 unsigned FirstNoCs = First & ~0x0c0c0c0c;
15627 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
15628
15629 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
15630 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
15631 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
15632 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
15633
15634 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
15635}
15636
15637struct DotSrc {
15638 SDValue SrcOp;
15639 int64_t PermMask;
15640 int64_t DWordOffset;
15641};
15642
15643 static void placeSources(ByteProvider<SDValue> &Src0,
15644 ByteProvider<SDValue> &Src1,
15645 SmallVectorImpl<DotSrc> &Src0s,
15646 SmallVectorImpl<DotSrc> &Src1s, int Step) {
15647
15648 assert(Src0.Src.has_value() && Src1.Src.has_value());
15649 // Src0s and Src1s are empty, just place arbitrarily.
15650 if (Step == 0) {
15651 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
15652 Src0.SrcOffset / 4});
15653 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
15654 Src1.SrcOffset / 4});
15655 return;
15656 }
15657
15658 for (int BPI = 0; BPI < 2; BPI++) {
15659 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
15660 if (BPI == 1) {
15661 BPP = {Src1, Src0};
15662 }
15663 unsigned ZeroMask = 0x0c0c0c0c;
15664 unsigned FMask = 0xFF << (8 * (3 - Step));
15665
15666 unsigned FirstMask =
15667 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15668 unsigned SecondMask =
15669 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15670 // Attempt to find a Src vector which contains our SDValue; if so, add our
15671 // perm mask to the existing one. If we are unable to find a match for the
15672 // first SDValue, attempt to find a match for the second.
15673 int FirstGroup = -1;
15674 for (int I = 0; I < 2; I++) {
15675 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
15676 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
15677 return IterElt.SrcOp == *BPP.first.Src &&
15678 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15679 };
15680
15681 auto *Match = llvm::find_if(Srcs, MatchesFirst);
15682 if (Match != Srcs.end()) {
15683 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
15684 FirstGroup = I;
15685 break;
15686 }
15687 }
15688 if (FirstGroup != -1) {
15689 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
15690 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
15691 return IterElt.SrcOp == *BPP.second.Src &&
15692 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15693 };
15694 auto *Match = llvm::find_if(Srcs, MatchesSecond);
15695 if (Match != Srcs.end()) {
15696 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
15697 } else
15698 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15699 return;
15700 }
15701 }
15702
15703 // If we have made it here, then we could not find a match in Src0s or Src1s
15704 // for either Src0 or Src1, so just place them arbitrarily.
15705
15706 unsigned ZeroMask = 0x0c0c0c0c;
15707 unsigned FMask = 0xFF << (8 * (3 - Step));
15708
15709 Src0s.push_back(
15710 {*Src0.Src,
15711 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15712 Src0.SrcOffset / 4});
15713 Src1s.push_back(
15714 {*Src1.Src,
15715 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15716 Src1.SrcOffset / 4});
15717}
15718
15719 static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
15720 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
15721 bool IsAny) {
15722
15723 // If we have just one source, permute it accordingly.
15724 if (Srcs.size() == 1) {
15725 auto *Elt = Srcs.begin();
15726 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
15727
15728 // v_perm will produce the original value
15729 if (Elt->PermMask == 0x3020100)
15730 return EltOp;
15731
15732 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15733 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
15734 }
15735
15736 auto *FirstElt = Srcs.begin();
15737 auto *SecondElt = std::next(FirstElt);
15738 SmallVector<SDValue, 2> Perms;
15738
15740
15741 // If we have multiple sources in the chain, combine them via perms (using
15742 // calculated perm mask) and Ors.
15743 while (true) {
15744 auto FirstMask = FirstElt->PermMask;
15745 auto SecondMask = SecondElt->PermMask;
15746
15747 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15748 unsigned FirstPlusFour = FirstMask | 0x04040404;
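// In the combined v_perm the byte selectors 0-3 address one source operand
// and 4-7 the other, so adding 4 points FirstMask's live selectors at
// FirstVal; the zero (0x0c) selectors are put back via FirstCs below.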
15749 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
15750 // original 0x0C.
15751 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
15752
15753 auto PermMask = addPermMasks(FirstMask, SecondMask);
15754 auto FirstVal =
15755 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15756 auto SecondVal =
15757 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
15758
15759 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
15760 SecondVal,
15761 DAG.getConstant(PermMask, SL, MVT::i32)));
15762
15763 FirstElt = std::next(SecondElt);
15764 if (FirstElt == Srcs.end())
15765 break;
15766
15767 SecondElt = std::next(FirstElt);
15768 // If we only have a FirstElt, then just combine that into the cumulative
15769 // source node.
15770 if (SecondElt == Srcs.end()) {
15771 auto EltOp =
15772 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15773
15774 Perms.push_back(
15775 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15776 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
15777 break;
15778 }
15779 }
15780
15781 assert(Perms.size() == 1 || Perms.size() == 2);
15782 return Perms.size() == 2
15783 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
15784 : Perms[0];
15785}
15786
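// For example, with ChainLength == 2 a mask of 0x01000c0c becomes 0x0c0c0100:
// the live selectors move into the low bytes and the upper bytes read as zero.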
15787static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
15788 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15789 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15790 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15791 EntryMask += ZeroMask;
15792 }
15793}
15794
15795static bool isMul(const SDValue Op) {
15796 auto Opcode = Op.getOpcode();
15797
15798 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
15799 Opcode == AMDGPUISD::MUL_I24);
15800}
15801
15802static std::optional<bool>
15803 checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
15804 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
15805 const SDValue &S1Op, const SelectionDAG &DAG) {
15806 // If both ops are i8s (pre legalize-dag), then the signedness semantics
15807 // of the dot4 are irrelevant.
15808 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
15809 return false;
15810
15811 auto Known0 = DAG.computeKnownBits(S0Op, 0);
15812 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
15813 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
15814 auto Known1 = DAG.computeKnownBits(S1Op, 0);
15815 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
15816 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
15817
15818 assert(!(S0IsUnsigned && S0IsSigned));
15819 assert(!(S1IsUnsigned && S1IsSigned));
15820
15821 // There are 9 possible permutations of
15822 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
15823
15824 // In two permutations, the sign bits are known to be the same for both Ops,
15825 // so simply return Signed / Unsigned corresponding to the MSB
15826
15827 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
15828 return S0IsSigned;
15829
15830 // In another two permutations, the sign bits are known to be opposite. In
15831 // this case return std::nullopt to indicate a bad match.
15832
15833 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
15834 return std::nullopt;
15835
15836 // In the remaining five permutations, we don't know the value of the sign
15837 // bit for at least one Op. Since we have a valid ByteProvider, we know that
15838 // the upper bits must be extension bits. Thus, the only way for the sign
15839 // bit to be unknown is if it was sign extended from an unknown value, or if
15840 // it was any extended. In either case, it is correct to use the signed
15841 // version of the signedness semantics of dot4.
15842
15843 // In two of these permutations, we know the sign bit is set for
15844 // one op, and the other is unknown. It is okay to use the signed version of
15845 // dot4.
15846 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
15847 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
15848 return true;
15849
15850 // In one such permutation, we don't know either of the sign bits. It is okay
15851 // to use the signed version of dot4.
15852 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
15853 return true;
15854
15855 // In two of these permutations, we know the sign bit is unset for
15856 // one op, and the other is unknown. Return std::nullopt to indicate a
15857 // bad match.
15858 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
15859 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
15860 return std::nullopt;
15861
15862 llvm_unreachable("Fully covered condition");
15863}
15864
15865SDValue SITargetLowering::performAddCombine(SDNode *N,
15866 DAGCombinerInfo &DCI) const {
15867 SelectionDAG &DAG = DCI.DAG;
15868 EVT VT = N->getValueType(0);
15869 SDLoc SL(N);
15870 SDValue LHS = N->getOperand(0);
15871 SDValue RHS = N->getOperand(1);
15872
15873 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
15874 if (Subtarget->hasMad64_32()) {
15875 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
15876 return Folded;
15877 }
15878 }
15879
15880 if (SDValue V = reassociateScalarOps(N, DAG)) {
15881 return V;
15882 }
15883
15884 if (VT == MVT::i64) {
15885 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15886 return Folded;
15887 }
15888
15889 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
15890 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
15891 SDValue TempNode(N, 0);
15892 std::optional<bool> IsSigned;
15893 SmallVector<DotSrc, 4> Src0s;
15894 SmallVector<DotSrc, 4> Src1s;
15895 SmallVector<SDValue, 4> Src2s;
15896
15897 // Match the v_dot4 tree, while collecting src nodes.
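// The shape being matched is add(mul, add(mul, add(mul, mul))): up to four
// byte-sized products accumulated through a chain of adds, which is mapped
// onto amdgcn_sdot4/udot4 below.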
15898 int ChainLength = 0;
15899 for (int I = 0; I < 4; I++) {
15900 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
15901 if (MulIdx == -1)
15902 break;
15903 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
15904 if (!Src0)
15905 break;
15906 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
15907 if (!Src1)
15908 break;
15909
15910 auto IterIsSigned = checkDot4MulSignedness(
15911 TempNode->getOperand(MulIdx), *Src0, *Src1,
15912 TempNode->getOperand(MulIdx)->getOperand(0),
15913 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
15914 if (!IterIsSigned)
15915 break;
15916 if (!IsSigned)
15917 IsSigned = *IterIsSigned;
15918 if (*IterIsSigned != *IsSigned)
15919 break;
15920 placeSources(*Src0, *Src1, Src0s, Src1s, I);
15921 auto AddIdx = 1 - MulIdx;
15922 // Allow the special case where add (add (mul24, 0), mul24) became
15923 // add (mul24, mul24).
15924 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
15925 Src2s.push_back(TempNode->getOperand(AddIdx));
15926 auto Src0 =
15927 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
15928 if (!Src0)
15929 break;
15930 auto Src1 =
15931 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
15932 if (!Src1)
15933 break;
15934 auto IterIsSigned = checkDot4MulSignedness(
15935 TempNode->getOperand(AddIdx), *Src0, *Src1,
15936 TempNode->getOperand(AddIdx)->getOperand(0),
15937 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
15938 if (!IterIsSigned)
15939 break;
15940 assert(IsSigned);
15941 if (*IterIsSigned != *IsSigned)
15942 break;
15943 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
15944 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
15945 ChainLength = I + 2;
15946 break;
15947 }
15948
15949 TempNode = TempNode->getOperand(AddIdx);
15950 Src2s.push_back(TempNode);
15951 ChainLength = I + 1;
15952 if (TempNode->getNumOperands() < 2)
15953 break;
15954 LHS = TempNode->getOperand(0);
15955 RHS = TempNode->getOperand(1);
15956 }
15957
15958 if (ChainLength < 2)
15959 return SDValue();
15960
15961 // Masks were constructed with the assumption that we would find a chain of
15962 // length 4. If not, then we need to zero out the unused MSB bytes (via a
15963 // perm mask of 0x0c) so they do not affect the dot calculation.
15964 if (ChainLength < 4) {
15965 fixMasks(Src0s, ChainLength);
15966 fixMasks(Src1s, ChainLength);
15967 }
15968
15969 SDValue Src0, Src1;
15970
15971 // If we are just using a single source for both, and have permuted the
15972 // bytes consistently, we can just use the sources without permuting
15973 // (commutation).
15974 bool UseOriginalSrc = false;
15975 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
15976 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
15977 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
15978 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
15979 SmallVector<unsigned, 4> SrcBytes;
15980 auto Src0Mask = Src0s.begin()->PermMask;
15981 SrcBytes.push_back(Src0Mask & 0xFF000000);
15982 bool UniqueEntries = true;
15983 for (auto I = 1; I < 4; I++) {
15984 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
15985
15986 if (is_contained(SrcBytes, NextByte)) {
15987 UniqueEntries = false;
15988 break;
15989 }
15990 SrcBytes.push_back(NextByte);
15991 }
15992
15993 if (UniqueEntries) {
15994 UseOriginalSrc = true;
15995
15996 auto *FirstElt = Src0s.begin();
15997 auto FirstEltOp =
15998 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15999
16000 auto *SecondElt = Src1s.begin();
16001 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
16002 SecondElt->DWordOffset);
16003
16004 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
16005 MVT::getIntegerVT(32));
16006 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
16007 MVT::getIntegerVT(32));
16008 }
16009 }
16010
16011 if (!UseOriginalSrc) {
16012 Src0 = resolveSources(DAG, SL, Src0s, false, true);
16013 Src1 = resolveSources(DAG, SL, Src1s, false, true);
16014 }
16015
16016 assert(IsSigned);
16017 SDValue Src2 =
16018 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16019
16020 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
16021 : Intrinsic::amdgcn_udot4,
16022 SL, MVT::i64);
16023
16024 assert(!VT.isVector());
16025 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
16026 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
16027
16028 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
16029 }
16030
16031 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16032 return SDValue();
16033
16034 // add x, zext (setcc) => uaddo_carry x, 0, setcc
16035 // add x, sext (setcc) => usubo_carry x, 0, setcc
16036 unsigned Opc = LHS.getOpcode();
16037 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
16038 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
16039 std::swap(RHS, LHS);
16040
16041 Opc = RHS.getOpcode();
16042 switch (Opc) {
16043 default:
16044 break;
16045 case ISD::ZERO_EXTEND:
16046 case ISD::SIGN_EXTEND:
16047 case ISD::ANY_EXTEND: {
16048 auto Cond = RHS.getOperand(0);
16049 // If this won't be a real VOPC output, we would still need to insert an
16050 // extra instruction anyway.
16051 if (!isBoolSGPR(Cond))
16052 break;
16053 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16054 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16055 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
16056 return DAG.getNode(Opc, SL, VTList, Args);
16057 }
16058 case ISD::UADDO_CARRY: {
16059 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
16060 if (!isNullConstant(RHS.getOperand(1)))
16061 break;
16062 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
16063 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
16064 }
16065 }
16066 return SDValue();
16067}
16068
16069SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
16070 DAGCombinerInfo &DCI) const {
16071 SelectionDAG &DAG = DCI.DAG;
16072 SDLoc DL(N);
16073 EVT VT = N->getValueType(0);
16074 SDValue N0 = N->getOperand(0);
16075 SDValue N1 = N->getOperand(1);
16076
16077 // The following folds transform PTRADDs into regular arithmetic in cases
16078 // where the PTRADD wouldn't be folded as an immediate offset into memory
16079 // instructions anyway. They are target-specific in that other targets might
16080 // prefer to not lose information about the pointer arithmetic.
16081
16082 // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
16083 // Adapted from DAGCombiner::visitADDLikeCommutative.
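// For example, (ptradd %p, (shl (sub 0, %idx), 3)) becomes
// (sub %p, (shl %idx, 3)); the value is identical, but the explicit
// negation of %idx disappears.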
16084 SDValue V, K;
16085 if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
16086 SDNodeFlags ShlFlags = N1->getFlags();
16087 // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
16088 // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
16089 // preserved.
16090 SDNodeFlags NewShlFlags =
16091 ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
16092 ? SDNodeFlags::NoSignedWrap
16093 : SDNodeFlags();
16094 SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
16095 DCI.AddToWorklist(Inner.getNode());
16096 return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
16097 }
16098
16099 // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
16100 // performAddCombine.
16101 if (N1.getOpcode() == ISD::MUL) {
16102 if (Subtarget->hasMad64_32()) {
16103 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16104 return Folded;
16105 }
16106 }
16107
16108 // If the 32 low bits of the constant are all zero, there is nothing to fold
16109 // into an immediate offset, so it's better to eliminate the unnecessary
16110 // addition for the lower 32 bits than to preserve the PTRADD.
16111 // Analogous to a fold in performAddCombine.
16112 if (VT == MVT::i64) {
16113 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16114 return Folded;
16115 }
16116
16117 if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) {
16118 // Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c) v) with
16119 // global address GA and constant c, such that c can be folded into GA.
16120 SDValue GAValue = N0.getOperand(0);
16121 if (const GlobalAddressSDNode *GA =
16122 dyn_cast<GlobalAddressSDNode>(GAValue))
16123 if (DCI.isBeforeLegalizeOps() && isOffsetFoldingLegal(GA)) {
16124 // If both additions in the original were NUW, reassociation preserves
16125 // that.
16126 SDNodeFlags Flags =
16127 (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
16128 SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags);
16129 DCI.AddToWorklist(Inner.getNode());
16130 return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags);
16131 }
16132 }
16133 }
16134
16135 if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
16136 return SDValue();
16137
16138 // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
16139 // y is not, and (add y, z) is used only once.
16140 // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
16141 // z is not, and (add y, z) is used only once.
16142 // The goal is to move constant offsets to the outermost ptradd, to create
16143 // more opportunities to fold offsets into memory instructions.
16144 // Together with the generic combines in DAGCombiner.cpp, this also
16145 // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
16146 //
16147 // This transform is here instead of in the general DAGCombiner as it can
16148 // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
16149 // AArch64's CPA.
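// For example, (ptradd %base, (add %voffset, 16)) is rewritten to
// (ptradd (ptradd %base, %voffset), 16), so the constant 16 can later be
// folded into the immediate offset field of a memory instruction.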
16150 SDValue X = N0;
16151 SDValue Y = N1.getOperand(0);
16152 SDValue Z = N1.getOperand(1);
16153 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
16154 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
16155
16156 // If both additions in the original were NUW, reassociation preserves that.
16157 SDNodeFlags ReassocFlags =
16158 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
16159
16160 if (ZIsConstant != YIsConstant) {
16161 if (YIsConstant)
16162 std::swap(Y, Z);
16163 SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
16164 DCI.AddToWorklist(Inner.getNode());
16165 return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags);
16166 }
16167
16168 // If exactly one of Y and Z is constant, that case has been handled above.
16169 // If both were constant, the addition would already have been folded in
16170 // SelectionDAG::getNode. This ensures that the generic DAG combines won't
16171 // undo the following reassociation.
16172 assert(!YIsConstant && !ZIsConstant);
16173
16174 if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) {
16175 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
16176 // y are uniform and z isn't.
16177 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
16178 // z are uniform and y isn't.
16179 // The goal is to push uniform operands up in the computation, so that they
16180 // can be handled with scalar operations. We can't use reassociateScalarOps
16181 // for this since it requires two identical commutative operations to
16182 // reassociate.
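// For example, if %x and %y are uniform and %z is divergent, the inner
// (ptradd %x, %y) is uniform and can be computed with scalar instructions;
// only the final add of %z needs the VALU.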
16183 if (Y->isDivergent())
16184 std::swap(Y, Z);
16185 SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
16186 DCI.AddToWorklist(UniformInner.getNode());
16187 return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
16188 }
16189
16190 return SDValue();
16191}
16192
16193SDValue SITargetLowering::performSubCombine(SDNode *N,
16194 DAGCombinerInfo &DCI) const {
16195 SelectionDAG &DAG = DCI.DAG;
16196 EVT VT = N->getValueType(0);
16197
16198 if (VT == MVT::i64) {
16199 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16200 return Folded;
16201 }
16202
16203 if (VT != MVT::i32)
16204 return SDValue();
16205
16206 SDLoc SL(N);
16207 SDValue LHS = N->getOperand(0);
16208 SDValue RHS = N->getOperand(1);
16209
16210 // sub x, zext (setcc) => usubo_carry x, 0, setcc
16211 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
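// Note that sext of a true i1 is -1, so subtracting it adds 1; that is why
// the sext case maps to uaddo_carry and the zext case to usubo_carry here.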
16212 unsigned Opc = RHS.getOpcode();
16213 switch (Opc) {
16214 default:
16215 break;
16216 case ISD::ZERO_EXTEND:
16217 case ISD::SIGN_EXTEND:
16218 case ISD::ANY_EXTEND: {
16219 auto Cond = RHS.getOperand(0);
16220 // If this won't be a real VOPC output, we would still need to insert an
16221 // extra instruction anyway.
16222 if (!isBoolSGPR(Cond))
16223 break;
16224 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16225 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16226 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
16227 return DAG.getNode(Opc, SL, VTList, Args);
16228 }
16229 }
16230
16231 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
16232 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
16233 if (!isNullConstant(LHS.getOperand(1)))
16234 return SDValue();
16235 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
16236 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
16237 }
16238 return SDValue();
16239}
16240
16241SDValue
16242SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
16243 DAGCombinerInfo &DCI) const {
16244
16245 if (N->getValueType(0) != MVT::i32)
16246 return SDValue();
16247
16248 if (!isNullConstant(N->getOperand(1)))
16249 return SDValue();
16250
16251 SelectionDAG &DAG = DCI.DAG;
16252 SDValue LHS = N->getOperand(0);
16253
16254 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
16255 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
16256 unsigned LHSOpc = LHS.getOpcode();
16257 unsigned Opc = N->getOpcode();
16258 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
16259 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
16260 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
16261 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
16262 }
16263 return SDValue();
16264}
16265
16266SDValue SITargetLowering::performFAddCombine(SDNode *N,
16267 DAGCombinerInfo &DCI) const {
16268 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16269 return SDValue();
16270
16271 SelectionDAG &DAG = DCI.DAG;
16272 EVT VT = N->getValueType(0);
16273
16274 SDLoc SL(N);
16275 SDValue LHS = N->getOperand(0);
16276 SDValue RHS = N->getOperand(1);
16277
16278 // These should really be instruction patterns, but writing patterns with
16279 // source modifiers is a pain.
16280
16281 // fadd (fadd (a, a), b) -> mad 2.0, a, b
16282 if (LHS.getOpcode() == ISD::FADD) {
16283 SDValue A = LHS.getOperand(0);
16284 if (A == LHS.getOperand(1)) {
16285 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16286 if (FusedOp != 0) {
16287 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16288 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
16289 }
16290 }
16291 }
16292
16293 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
16294 if (RHS.getOpcode() == ISD::FADD) {
16295 SDValue A = RHS.getOperand(0);
16296 if (A == RHS.getOperand(1)) {
16297 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16298 if (FusedOp != 0) {
16299 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16300 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
16301 }
16302 }
16303 }
16304
16305 return SDValue();
16306}
16307
16308SDValue SITargetLowering::performFSubCombine(SDNode *N,
16309 DAGCombinerInfo &DCI) const {
16310 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16311 return SDValue();
16312
16313 SelectionDAG &DAG = DCI.DAG;
16314 SDLoc SL(N);
16315 EVT VT = N->getValueType(0);
16316 assert(!VT.isVector());
16317
16318 // Try to get the fneg to fold into the source modifier. This undoes generic
16319 // DAG combines and folds them into the mad.
16320 //
16321 // Only do this if we are not trying to support denormals. v_mad_f32 does
16322 // not support denormals ever.
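// For example, with denormals disabled, (fsub (fadd a, a), c) can be
// selected as v_mad_f32 a, 2.0, -c, with the negation folded into a source
// modifier on c.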
16323 SDValue LHS = N->getOperand(0);
16324 SDValue RHS = N->getOperand(1);
16325 if (LHS.getOpcode() == ISD::FADD) {
16326 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
16327 SDValue A = LHS.getOperand(0);
16328 if (A == LHS.getOperand(1)) {
16329 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16330 if (FusedOp != 0) {
16331 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16332 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
16333
16334 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
16335 }
16336 }
16337 }
16338
16339 if (RHS.getOpcode() == ISD::FADD) {
16340 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
16341
16342 SDValue A = RHS.getOperand(0);
16343 if (A == RHS.getOperand(1)) {
16344 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16345 if (FusedOp != 0) {
16346 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
16347 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
16348 }
16349 }
16350 }
16351
16352 return SDValue();
16353}
16354
16355SDValue SITargetLowering::performFDivCombine(SDNode *N,
16356 DAGCombinerInfo &DCI) const {
16357 SelectionDAG &DAG = DCI.DAG;
16358 SDLoc SL(N);
16359 EVT VT = N->getValueType(0);
16360 if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts())
16361 return SDValue();
16362
16363 SDValue LHS = N->getOperand(0);
16364 SDValue RHS = N->getOperand(1);
16365
16366 SDNodeFlags Flags = N->getFlags();
16367 SDNodeFlags RHSFlags = RHS->getFlags();
16368 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
16369 !RHS->hasOneUse())
16370 return SDValue();
16371
16372 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
16373 bool IsNegative = false;
16374 if (CLHS->isExactlyValue(1.0) ||
16375 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16376 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
16377 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
16378 if (RHS.getOpcode() == ISD::FSQRT) {
16379 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
16380 SDValue Rsq =
16381 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
16382 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
16383 }
16384 }
16385 }
16386
16387 return SDValue();
16388}
16389
16390SDValue SITargetLowering::performFMulCombine(SDNode *N,
16391 DAGCombinerInfo &DCI) const {
16392 SelectionDAG &DAG = DCI.DAG;
16393 EVT VT = N->getValueType(0);
16394 EVT ScalarVT = VT.getScalarType();
16395 EVT IntVT = VT.changeElementType(MVT::i32);
16396
16397 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
16398 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16399 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
16400 return SDValue();
16401 }
16402
16403 SDValue LHS = N->getOperand(0);
16404 SDValue RHS = N->getOperand(1);
16405
16406 // It is cheaper to realize i32 inline constants than to materialize
16407 // f16 or f64 (or even non-inline f32) values; this is possible via
16408 // ldexp, as shown below:
16409 //
16410 // Given: A = 2^a and B = 2^b, where a and b are integers.
16411 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
16412 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
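// For example, fmul %x, (select %y, 0.0625, 64.0) becomes
// ldexp(%x, (select i32 %y, -4, 6)), since 0.0625 = 2^-4 and 64.0 = 2^6
// and the small integer exponents are cheap inline constants.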
16413 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16414 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
16415 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
16416 if (!TrueNode)
16417 return SDValue();
16418 const ConstantFPSDNode *FalseNode =
16419 isConstOrConstSplatFP(RHS.getOperand(2));
16420 if (!FalseNode)
16421 return SDValue();
16422
16423 if (TrueNode->isNegative() != FalseNode->isNegative())
16424 return SDValue();
16425
16426 // For f32, only non-inline constants should be transformed.
16427 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16428 if (ScalarVT == MVT::f32 &&
16429 TII->isInlineConstant(TrueNode->getValueAPF()) &&
16430 TII->isInlineConstant(FalseNode->getValueAPF()))
16431 return SDValue();
16432
16433 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
16434 if (TrueNodeExpVal == INT_MIN)
16435 return SDValue();
16436 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
16437 if (FalseNodeExpVal == INT_MIN)
16438 return SDValue();
16439
16440 SDLoc SL(N);
16441 SDValue SelectNode =
16442 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
16443 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
16444 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
16445
16446 LHS = TrueNode->isNegative()
16447 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
16448 : LHS;
16449
16450 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
16451 }
16452
16453 return SDValue();
16454}
16455
16456SDValue SITargetLowering::performFMACombine(SDNode *N,
16457 DAGCombinerInfo &DCI) const {
16458 SelectionDAG &DAG = DCI.DAG;
16459 EVT VT = N->getValueType(0);
16460 SDLoc SL(N);
16461
16462 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16463 return SDValue();
16464
16465 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
16466 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
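// In other words, both f32 products must take their operands from the two
// f16 lanes of the same pair of v2f16 vectors (in either order); the checks
// below verify that the extract indices and source vectors line up.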
16467 SDValue Op1 = N->getOperand(0);
16468 SDValue Op2 = N->getOperand(1);
16469 SDValue FMA = N->getOperand(2);
16470
16471 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
16472 Op2.getOpcode() != ISD::FP_EXTEND)
16473 return SDValue();
16474
16475 // fdot2_f32_f16 always flushes fp32 denormal operands and outputs to zero,
16476 // regardless of the denorm mode setting. Therefore,
16477 // fp-contract is sufficient to allow generating fdot2.
16478 const TargetOptions &Options = DAG.getTarget().Options;
16479 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
16480 (N->getFlags().hasAllowContract() &&
16481 FMA->getFlags().hasAllowContract())) {
16482 Op1 = Op1.getOperand(0);
16483 Op2 = Op2.getOperand(0);
16484 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16485 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16486 return SDValue();
16487
16488 SDValue Vec1 = Op1.getOperand(0);
16489 SDValue Idx1 = Op1.getOperand(1);
16490 SDValue Vec2 = Op2.getOperand(0);
16491
16492 SDValue FMAOp1 = FMA.getOperand(0);
16493 SDValue FMAOp2 = FMA.getOperand(1);
16494 SDValue FMAAcc = FMA.getOperand(2);
16495
16496 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
16497 FMAOp2.getOpcode() != ISD::FP_EXTEND)
16498 return SDValue();
16499
16500 FMAOp1 = FMAOp1.getOperand(0);
16501 FMAOp2 = FMAOp2.getOperand(0);
16502 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16503 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16504 return SDValue();
16505
16506 SDValue Vec3 = FMAOp1.getOperand(0);
16507 SDValue Vec4 = FMAOp2.getOperand(0);
16508 SDValue Idx2 = FMAOp1.getOperand(1);
16509
16510 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
16511 // Idx1 and Idx2 cannot be the same.
16512 Idx1 == Idx2)
16513 return SDValue();
16514
16515 if (Vec1 == Vec2 || Vec3 == Vec4)
16516 return SDValue();
16517
16518 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
16519 return SDValue();
16520
16521 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16522 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
16523 DAG.getTargetConstant(0, SL, MVT::i1));
16524 }
16525 }
16526 return SDValue();
16527}
16528
16529SDValue SITargetLowering::performSetCCCombine(SDNode *N,
16530 DAGCombinerInfo &DCI) const {
16531 SelectionDAG &DAG = DCI.DAG;
16532 SDLoc SL(N);
16533
16534 SDValue LHS = N->getOperand(0);
16535 SDValue RHS = N->getOperand(1);
16536 EVT VT = LHS.getValueType();
16537 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
16538
16539 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
16540 if (!CRHS) {
16541 CRHS = dyn_cast<ConstantSDNode>(LHS);
16542 if (CRHS) {
16543 std::swap(LHS, RHS);
16544 CC = getSetCCSwappedOperands(CC);
16545 }
16546 }
16547
16548 if (CRHS) {
16549 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
16550 isBoolSGPR(LHS.getOperand(0))) {
16551 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
16552 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
16553 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
16554 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
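// For example, (setcc (sext i1 %cc to i32), -1, eq) is just %cc: the
// sext of a boolean is either 0 or -1, and it equals -1 exactly when
// %cc is true.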
16555 if ((CRHS->isAllOnes() &&
16556 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
16557 (CRHS->isZero() &&
16558 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
16559 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16560 DAG.getAllOnesConstant(SL, MVT::i1));
16561 if ((CRHS->isAllOnes() &&
16562 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
16563 (CRHS->isZero() &&
16564 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
16565 return LHS.getOperand(0);
16566 }
16567
16568 const APInt &CRHSVal = CRHS->getAPIntValue();
16569 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
16570 LHS.getOpcode() == ISD::SELECT &&
16571 isa<ConstantSDNode>(LHS.getOperand(1)) &&
16572 isa<ConstantSDNode>(LHS.getOperand(2)) &&
16573 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
16574 isBoolSGPR(LHS.getOperand(0))) {
16575 // Given CT != FT:
16576 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
16577 // setcc (select cc, CT, CF), CF, ne => cc
16578 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
16579 // setcc (select cc, CT, CF), CT, eq => cc
16580 const APInt &CT = LHS.getConstantOperandAPInt(1);
16581 const APInt &CF = LHS.getConstantOperandAPInt(2);
16582
16583 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
16584 (CT == CRHSVal && CC == ISD::SETNE))
16585 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16586 DAG.getAllOnesConstant(SL, MVT::i1));
16587 if ((CF == CRHSVal && CC == ISD::SETNE) ||
16588 (CT == CRHSVal && CC == ISD::SETEQ))
16589 return LHS.getOperand(0);
16590 }
16591 }
16592
16593 if (VT != MVT::f32 && VT != MVT::f64 &&
16594 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16595 return SDValue();
16596
16597 // Match isinf/isfinite pattern
16598 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
16599 // (fcmp one (fabs x), inf) -> (fp_class x,
16600 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
16601 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
16602 LHS.getOpcode() == ISD::FABS) {
16603 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
16604 if (!CRHS)
16605 return SDValue();
16606
16607 const APFloat &APF = CRHS->getValueAPF();
16608 if (APF.isInfinity() && !APF.isNegative()) {
16609 const unsigned IsInfMask =
16610 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
16611 const unsigned IsFiniteMask =
16612 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
16613 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
16614 SIInstrFlags::P_SUBNORMAL;
16615 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
16616 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
16617 DAG.getConstant(Mask, SL, MVT::i32));
16618 }
16619 }
16620
16621 return SDValue();
16622}
16623
16624SDValue
16625SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
16626 DAGCombinerInfo &DCI) const {
16627 SelectionDAG &DAG = DCI.DAG;
16628 SDLoc SL(N);
16629 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
16630
16631 SDValue Src = N->getOperand(0);
16632 SDValue Shift = N->getOperand(0);
16633
16634 // TODO: Extend type shouldn't matter (assuming legal types).
16635 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
16636 Shift = Shift.getOperand(0);
16637
16638 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
16639 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
16640 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
16641 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
16642 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
16643 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
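// For example, for cvt_f32_ubyte0 (srl x, 8): Offset is 0 and the srl
// adds 8, so ShiftOffset becomes 8 and we emit cvt_f32_ubyte1 x, which
// reads byte 1 of the unshifted value.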
16644 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
16645 SDValue Shifted = DAG.getZExtOrTrunc(
16646 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
16647
16648 unsigned ShiftOffset = 8 * Offset;
16649 if (Shift.getOpcode() == ISD::SHL)
16650 ShiftOffset -= C->getZExtValue();
16651 else
16652 ShiftOffset += C->getZExtValue();
16653
16654 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16655 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
16656 MVT::f32, Shifted);
16657 }
16658 }
16659 }
16660
16661 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16662 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
16663 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
16664 // We simplified Src. If this node is not dead, visit it again so it is
16665 // folded properly.
16666 if (N->getOpcode() != ISD::DELETED_NODE)
16667 DCI.AddToWorklist(N);
16668 return SDValue(N, 0);
16669 }
16670
16671 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
16672 if (SDValue DemandedSrc =
16673 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
16674 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
16675
16676 return SDValue();
16677}
16678
16679SDValue SITargetLowering::performClampCombine(SDNode *N,
16680 DAGCombinerInfo &DCI) const {
16681 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
16682 if (!CSrc)
16683 return SDValue();
16684
16685 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16686 const APFloat &F = CSrc->getValueAPF();
16687 APFloat Zero = APFloat::getZero(F.getSemantics());
16688 if (F < Zero ||
16689 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16690 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
16691 }
16692
16693 APFloat One(F.getSemantics(), "1.0");
16694 if (F > One)
16695 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
16696
16697 return SDValue(CSrc, 0);
16698}
16699
16700SDValue SITargetLowering::performSelectCombine(SDNode *N,
16701 DAGCombinerInfo &DCI) const {
16702
16703 // Try to fold CMP + SELECT patterns with shared constants (both FP and
16704 // integer).
16705 // Detect when CMP and SELECT use the same constant and fold them to avoid
16706 // loading the constant twice. Specifically handles patterns like:
16707 // %cmp = icmp eq i32 %val, 4242
16708 // %sel = select i1 %cmp, i32 4242, i32 %other
16709 // It can be optimized to reuse %val instead of 4242 in select.
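// The select then becomes: %sel = select i1 %cmp, i32 %val, i32 %other,
// which is equivalent because %cmp guarantees %val == 4242 on the path
// where %val is chosen.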
16710 SDValue Cond = N->getOperand(0);
16711 SDValue TrueVal = N->getOperand(1);
16712 SDValue FalseVal = N->getOperand(2);
16713
16714 // Check if condition is a comparison.
16715 if (Cond.getOpcode() != ISD::SETCC)
16716 return SDValue();
16717
16718 SDValue LHS = Cond.getOperand(0);
16719 SDValue RHS = Cond.getOperand(1);
16720 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
16721
16722 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
16723 bool isInteger = LHS.getValueType().isInteger();
16724
16725 // Handle simple floating-point and integer types only.
16726 if (!isFloatingPoint && !isInteger)
16727 return SDValue();
16728
16729 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
16730 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
16731 if (!isEquality && !isNonEquality)
16732 return SDValue();
16733
16734 SDValue ArgVal, ConstVal;
16735 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
16736 (isInteger && isa<ConstantSDNode>(RHS))) {
16737 ConstVal = RHS;
16738 ArgVal = LHS;
16739 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
16740 (isInteger && isa<ConstantSDNode>(LHS))) {
16741 ConstVal = LHS;
16742 ArgVal = RHS;
16743 } else {
16744 return SDValue();
16745 }
16746
16747 // Skip optimization for inlinable immediates.
16748 if (isFloatingPoint) {
16749 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
16750 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
16751 return SDValue();
16752 } else {
16753 if (AMDGPU::isInlinableIntLiteral(
16754 cast<ConstantSDNode>(ConstVal)->getSExtValue()))
16755 return SDValue();
16756 }
16757
16758 // For equality and non-equality comparisons, patterns:
16759 // select (setcc x, const), const, y -> select (setcc x, const), x, y
16760 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
16761 if (!(isEquality && TrueVal == ConstVal) &&
16762 !(isNonEquality && FalseVal == ConstVal))
16763 return SDValue();
16764
16765 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
16766 SDValue SelectRHS =
16767 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
16768 return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
16769 SelectLHS, SelectRHS);
16770}
16771
16772SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
16773 DAGCombinerInfo &DCI) const {
16774 switch (N->getOpcode()) {
16775 case ISD::ADD:
16776 case ISD::SUB:
16777 case ISD::SHL:
16778 case ISD::SRL:
16779 case ISD::SRA:
16780 case ISD::AND:
16781 case ISD::OR:
16782 case ISD::XOR:
16783 case ISD::MUL:
16784 case ISD::SETCC:
16785 case ISD::SELECT:
16786 case ISD::SMIN:
16787 case ISD::SMAX:
16788 case ISD::UMIN:
16789 case ISD::UMAX:
16790 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
16791 return Res;
16792 break;
16793 default:
16794 break;
16795 }
16796
16797 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
16798 return SDValue();
16799
16800 switch (N->getOpcode()) {
16801 case ISD::ADD:
16802 return performAddCombine(N, DCI);
16803 case ISD::PTRADD:
16804 return performPtrAddCombine(N, DCI);
16805 case ISD::SUB:
16806 return performSubCombine(N, DCI);
16807 case ISD::UADDO_CARRY:
16808 case ISD::USUBO_CARRY:
16809 return performAddCarrySubCarryCombine(N, DCI);
16810 case ISD::FADD:
16811 return performFAddCombine(N, DCI);
16812 case ISD::FSUB:
16813 return performFSubCombine(N, DCI);
16814 case ISD::FDIV:
16815 return performFDivCombine(N, DCI);
16816 case ISD::FMUL:
16817 return performFMulCombine(N, DCI);
16818 case ISD::SETCC:
16819 return performSetCCCombine(N, DCI);
16820 case ISD::SELECT:
16821 if (auto Res = performSelectCombine(N, DCI))
16822 return Res;
16823 break;
16824 case ISD::FMAXNUM:
16825 case ISD::FMINNUM:
16826 case ISD::FMAXNUM_IEEE:
16827 case ISD::FMINNUM_IEEE:
16828 case ISD::FMAXIMUM:
16829 case ISD::FMINIMUM:
16830 case ISD::FMAXIMUMNUM:
16831 case ISD::FMINIMUMNUM:
16832 case ISD::SMAX:
16833 case ISD::SMIN:
16834 case ISD::UMAX:
16835 case ISD::UMIN:
16836 case AMDGPUISD::FMIN_LEGACY:
16837 case AMDGPUISD::FMAX_LEGACY:
16838 return performMinMaxCombine(N, DCI);
16839 case ISD::FMA:
16840 return performFMACombine(N, DCI);
16841 case ISD::AND:
16842 return performAndCombine(N, DCI);
16843 case ISD::OR:
16844 return performOrCombine(N, DCI);
16845 case ISD::FSHR: {
16846 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16847 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
16848 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
16849 return matchPERM(N, DCI);
16850 }
16851 break;
16852 }
16853 case ISD::XOR:
16854 return performXorCombine(N, DCI);
16855 case ISD::ZERO_EXTEND:
16856 return performZeroExtendCombine(N, DCI);
16857 case ISD::SIGN_EXTEND_INREG:
16858 return performSignExtendInRegCombine(N, DCI);
16859 case AMDGPUISD::FP_CLASS:
16860 return performClassCombine(N, DCI);
16861 case ISD::FCANONICALIZE:
16862 return performFCanonicalizeCombine(N, DCI);
16863 case AMDGPUISD::RCP:
16864 return performRcpCombine(N, DCI);
16865 case ISD::FLDEXP:
16866 case AMDGPUISD::FRACT:
16867 case AMDGPUISD::RSQ:
16868 case AMDGPUISD::RCP_LEGACY:
16869 case AMDGPUISD::RCP_IFLAG:
16870 case AMDGPUISD::RSQ_CLAMP: {
16871 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
16872 SDValue Src = N->getOperand(0);
16873 if (Src.isUndef())
16874 return Src;
16875 break;
16876 }
16877 case ISD::SINT_TO_FP:
16878 case ISD::UINT_TO_FP:
16879 return performUCharToFloatCombine(N, DCI);
16880 case ISD::FCOPYSIGN:
16881 return performFCopySignCombine(N, DCI);
16882 case AMDGPUISD::CVT_F32_UBYTE0:
16883 case AMDGPUISD::CVT_F32_UBYTE1:
16884 case AMDGPUISD::CVT_F32_UBYTE2:
16885 case AMDGPUISD::CVT_F32_UBYTE3:
16886 return performCvtF32UByteNCombine(N, DCI);
16887 case AMDGPUISD::FMED3:
16888 return performFMed3Combine(N, DCI);
16889 case AMDGPUISD::CVT_PKRTZ_F16_F32:
16890 return performCvtPkRTZCombine(N, DCI);
16891 case AMDGPUISD::CLAMP:
16892 return performClampCombine(N, DCI);
16893 case ISD::SCALAR_TO_VECTOR: {
16894 SelectionDAG &DAG = DCI.DAG;
16895 EVT VT = N->getValueType(0);
16896
16897 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
16898 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
16899 SDLoc SL(N);
16900 SDValue Src = N->getOperand(0);
16901 EVT EltVT = Src.getValueType();
16902 if (EltVT != MVT::i16)
16903 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
16904
16905 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
16906 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
16907 }
16908
16909 break;
16910 }
16911 case ISD::EXTRACT_VECTOR_ELT:
16912 return performExtractVectorEltCombine(N, DCI);
16913 case ISD::INSERT_VECTOR_ELT:
16914 return performInsertVectorEltCombine(N, DCI);
16915 case ISD::FP_ROUND:
16916 return performFPRoundCombine(N, DCI);
16917 case ISD::LOAD: {
16918 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
16919 return Widened;
16920 [[fallthrough]];
16921 }
16922 default: {
16923 if (!DCI.isBeforeLegalize()) {
16924 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
16925 return performMemSDNodeCombine(MemNode, DCI);
16926 }
16927
16928 break;
16929 }
16930 }
16931
16932 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
16933}
16934
16935/// Helper function for adjustWritemask
16936static unsigned SubIdx2Lane(unsigned Idx) {
16937 switch (Idx) {
16938 default:
16939 return ~0u;
16940 case AMDGPU::sub0:
16941 return 0;
16942 case AMDGPU::sub1:
16943 return 1;
16944 case AMDGPU::sub2:
16945 return 2;
16946 case AMDGPU::sub3:
16947 return 3;
16948 case AMDGPU::sub4:
16949 return 4; // Possible with TFE/LWE
16950 }
16951}
16952
16953/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
16954SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
16955 SelectionDAG &DAG) const {
16956 unsigned Opcode = Node->getMachineOpcode();
16957
16958 // Subtract 1 because the vdata output is not a MachineSDNode operand.
16959 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
16960 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
16961 return Node; // not implemented for D16
16962
16963 SDNode *Users[5] = {nullptr};
16964 unsigned Lane = 0;
16965 unsigned DmaskIdx =
16966 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
16967 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
16968 unsigned NewDmask = 0;
16969 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
16970 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
16971 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
16972 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
16973 unsigned TFCLane = 0;
16974 bool HasChain = Node->getNumValues() > 1;
16975
16976 if (OldDmask == 0) {
16977 // These are folded out, but on the chance it happens don't assert.
16978 return Node;
16979 }
16980
16981 unsigned OldBitsSet = llvm::popcount(OldDmask);
16982 // Work out which is the TFE/LWE lane if that is enabled.
16983 if (UsesTFC) {
16984 TFCLane = OldBitsSet;
16985 }
16986
16987 // Try to figure out the used register components
16988 for (SDUse &Use : Node->uses()) {
16989
16990 // Don't look at users of the chain.
16991 if (Use.getResNo() != 0)
16992 continue;
16993
16994 SDNode *User = Use.getUser();
16995
16996 // Abort if we can't understand the usage
16997 if (!User->isMachineOpcode() ||
16998 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
16999 return Node;
17000
17001 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
17002 // Note that subregs are packed, i.e. Lane==0 is the first bit set
17003 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
17004 // set, etc.
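// For example, with OldDmask = 0b1010 only the Y and W components are
// read, so Lane 0 corresponds to Y and Lane 1 corresponds to W.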
17005 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
17006 if (Lane == ~0u)
17007 return Node;
17008
17009 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
17010 if (UsesTFC && Lane == TFCLane) {
17011 Users[Lane] = User;
17012 } else {
17013 // Set which texture component corresponds to the lane.
17014 unsigned Comp;
17015 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17016 Comp = llvm::countr_zero(Dmask);
17017 Dmask &= ~(1 << Comp);
17018 }
17019
17020 // Abort if we have more than one user per component.
17021 if (Users[Lane])
17022 return Node;
17023
17024 Users[Lane] = User;
17025 NewDmask |= 1 << Comp;
17026 }
17027 }
17028
17029 // Don't allow 0 dmask, as hardware assumes one channel enabled.
17030 bool NoChannels = !NewDmask;
17031 if (NoChannels) {
17032 if (!UsesTFC) {
17033 // No uses of the result and not using TFC. Then do nothing.
17034 return Node;
17035 }
17036 // If the original dmask has one channel - then nothing to do
17037 if (OldBitsSet == 1)
17038 return Node;
17039 // Use an arbitrary dmask - required for the instruction to work
17040 NewDmask = 1;
17041 }
17042 // Abort if there's no change
17043 if (NewDmask == OldDmask)
17044 return Node;
17045
17046 unsigned BitsSet = llvm::popcount(NewDmask);
17047
17048 // Check for TFE or LWE - increase the number of channels by one to account
17049 // for the extra return value
17050 // This will need adjustment for D16 if this is also included in
17051 // adjustWriteMask (this function) but at present D16 is excluded.
17052 unsigned NewChannels = BitsSet + UsesTFC;
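// For example, two used components plus TFE gives NewChannels = 3, which
// is then rounded up to a 4-element result type below.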
17053
17054 int NewOpcode =
17055 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
17056 assert(NewOpcode != -1 &&
17057 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17058 "failed to find equivalent MIMG op");
17059
17060 // Adjust the writemask in the node
17061 SmallVector<SDValue, 12> Ops;
17062 llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
17063 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
17064 llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));
17065
17066 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17067
17068 MVT ResultVT = NewChannels == 1
17069 ? SVT
17070 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
17071 : NewChannels == 5 ? 8
17072 : NewChannels);
17073 SDVTList NewVTList =
17074 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
17075
17076 MachineSDNode *NewNode =
17077 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
17078
17079 if (HasChain) {
17080 // Update chain.
17081 DAG.setNodeMemRefs(NewNode, Node->memoperands());
17082 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
17083 }
17084
17085 if (NewChannels == 1) {
17086 assert(Node->hasNUsesOfValue(1, 0));
17087 SDNode *Copy =
17088 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
17089 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
17090 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
17091 return nullptr;
17092 }
17093
17094 // Update the users of the node with the new indices
17095 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17096 SDNode *User = Users[i];
17097 if (!User) {
17098 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
17099 // Users[0] is still nullptr because channel 0 doesn't really have a use.
17100 if (i || !NoChannels)
17101 continue;
17102 } else {
17103 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
17104 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
17105 if (NewUser != User) {
17106 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
17107 DAG.RemoveDeadNode(User);
17108 }
17109 }
17110
17111 switch (Idx) {
17112 default:
17113 break;
17114 case AMDGPU::sub0:
17115 Idx = AMDGPU::sub1;
17116 break;
17117 case AMDGPU::sub1:
17118 Idx = AMDGPU::sub2;
17119 break;
17120 case AMDGPU::sub2:
17121 Idx = AMDGPU::sub3;
17122 break;
17123 case AMDGPU::sub3:
17124 Idx = AMDGPU::sub4;
17125 break;
17126 }
17127 }
17128
17129 DAG.RemoveDeadNode(Node);
17130 return nullptr;
17131}
17132
17133static bool isFrameIndexOp(SDValue Op) {
17134 if (Op.getOpcode() == ISD::AssertZext)
17135 Op = Op.getOperand(0);
17136
17137 return isa<FrameIndexSDNode>(Op);
17138}
17139
17140/// Legalize target independent instructions (e.g. INSERT_SUBREG)
17141/// with frame index operands.
17142/// LLVM assumes that inputs to these instructions are registers.
17143SDNode *
17144SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
17145 SelectionDAG &DAG) const {
17146 if (Node->getOpcode() == ISD::CopyToReg) {
17147 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
17148 SDValue SrcVal = Node->getOperand(2);
17149
17150 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
17151 // to try understanding copies to physical registers.
17152 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
17153 SDLoc SL(Node);
17154 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17155 SDValue VReg = DAG.getRegister(
17156 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17157
17158 SDNode *Glued = Node->getGluedNode();
17159 SDValue ToVReg = DAG.getCopyToReg(
17160 Node->getOperand(0), SL, VReg, SrcVal,
17161 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
17162 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
17163 VReg, ToVReg.getValue(1));
17164 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
17165 DAG.RemoveDeadNode(Node);
17166 return ToResultReg.getNode();
17167 }
17168 }
17169
17170 SmallVector<SDValue, 8> Ops;
17171 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17172 if (!isFrameIndexOp(Node->getOperand(i))) {
17173 Ops.push_back(Node->getOperand(i));
17174 continue;
17175 }
17176
17177 SDLoc DL(Node);
17178 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
17179 Node->getOperand(i).getValueType(),
17180 Node->getOperand(i)),
17181 0));
17182 }
17183
17184 return DAG.UpdateNodeOperands(Node, Ops);
17185}
17186
17187/// Fold the instructions after selecting them.
17188/// Returns null if users were already updated.
17189SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
17190 SelectionDAG &DAG) const {
17191 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17192 unsigned Opcode = Node->getMachineOpcode();
17193
17194 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17195 !TII->isGather4(Opcode) &&
17196 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
17197 return adjustWritemask(Node, DAG);
17198 }
17199
17200 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17201 legalizeTargetIndependentNode(Node, DAG);
17202 return Node;
17203 }
17204
17205 switch (Opcode) {
17206 case AMDGPU::V_DIV_SCALE_F32_e64:
17207 case AMDGPU::V_DIV_SCALE_F64_e64: {
17208 // Satisfy the operand register constraint when one of the inputs is
17209 // undefined. Ordinarily each undef value will have its own implicit_def of
17210 // a vreg, so force these to use a single register.
17211 SDValue Src0 = Node->getOperand(1);
17212 SDValue Src1 = Node->getOperand(3);
17213 SDValue Src2 = Node->getOperand(5);
17214
17215 if ((Src0.isMachineOpcode() &&
17216 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
17217 (Src0 == Src1 || Src0 == Src2))
17218 break;
17219
17220 MVT VT = Src0.getValueType().getSimpleVT();
17221 const TargetRegisterClass *RC =
17222 getRegClassFor(VT, Src0.getNode()->isDivergent());
17223
17224 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17225 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
17226
17227 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
17228 Src0, SDValue());
17229
17230 // src0 must be the same register as src1 or src2, even if the value is
17231 // undefined, so make sure we don't violate this constraint.
17232 if (Src0.isMachineOpcode() &&
17233 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
17234 if (Src1.isMachineOpcode() &&
17235 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17236 Src0 = Src1;
17237 else if (Src2.isMachineOpcode() &&
17238 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17239 Src0 = Src2;
17240 else {
17241 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
17242 Src0 = UndefReg;
17243 Src1 = UndefReg;
17244 }
17245 } else
17246 break;
17247
17248 SmallVector<SDValue, 9> Ops(Node->ops());
17249 Ops[1] = Src0;
17250 Ops[3] = Src1;
17251 Ops[5] = Src2;
17252 Ops.push_back(ImpDef.getValue(1));
17253 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
17254 }
17255 default:
17256 break;
17257 }
17258
17259 return Node;
17260}
17261
17262// Any MIMG instructions that use tfe or lwe require an initialization of the
17263// result register that will be written in the case of a memory access failure.
17264// The required code is also added to tie this init code to the result of the
17265// img instruction.
17266void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
17267 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17269 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
17270 MachineBasicBlock &MBB = *MI.getParent();
17271
17272 int DstIdx =
17273 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
17274 unsigned InitIdx = 0;
17275
17276 if (TII->isImage(MI)) {
17277 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
17278 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
17279 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
17280
17281 if (!TFE && !LWE) // intersect_ray
17282 return;
17283
17284 unsigned TFEVal = TFE ? TFE->getImm() : 0;
17285 unsigned LWEVal = LWE ? LWE->getImm() : 0;
17286 unsigned D16Val = D16 ? D16->getImm() : 0;
17287
17288 if (!TFEVal && !LWEVal)
17289 return;
17290
17291 // At least one of TFE or LWE is non-zero.
17292 // We have to insert a suitable initialization of the result value and
17293 // tie this to the dest of the image instruction.
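// Zero-initializing the destination ensures the shader sees well-defined
// (zero) values for channels the hardware did not write, e.g. on a PRT miss
// with TFE set, rather than stale register contents.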
17294
17295 // Calculate which dword we have to initialize to 0.
17296 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
17297
17298 // check that dmask operand is found.
17299 assert(MO_Dmask && "Expected dmask operand in instruction");
17300
17301 unsigned dmask = MO_Dmask->getImm();
17302 // Determine the number of active lanes taking into account the
17303 // Gather4 special case
17304 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
17305
17306 bool Packed = !Subtarget->hasUnpackedD16VMem();
17307
17308 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
17309
17310 // Abandon the attempt if the dst size isn't large enough
17311 // - this is in fact an error, but it is picked up elsewhere and
17312 // reported correctly.
17313 uint32_t DstSize =
17314 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17315 if (DstSize < InitIdx)
17316 return;
17317 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
17318 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17319 } else {
17320 return;
17321 }
17322
17323 const DebugLoc &DL = MI.getDebugLoc();
17324
17325 // Create a register for the initialization value.
17326 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
17327 unsigned NewDst = 0; // Final initialized value will be in here
17328
17329 // If PRTStrictNull feature is enabled (the default) then initialize
17330 // all the result registers to 0, otherwise just the error indication
17331 // register (VGPRn+1)
17332 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17333 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
17334
17335 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
17336 for (; SizeLeft; SizeLeft--, CurrIdx++) {
17337 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
17338 // Initialize dword
17339 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
17340 // clang-format off
17341 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
17342 .addImm(0);
17343 // clang-format on
17344 // Insert into the super-reg
17345 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
17346 .addReg(PrevDst)
17347 .addReg(SubReg)
17348 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
17349
17350 PrevDst = NewDst;
17351 }
17352
17353 // Add as an implicit operand
17354 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
17355
17356 // Tie the just added implicit operand to the dst
17357 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
17358}
17359
17360/// Assign the register class depending on the number of
17361/// bits set in the writemask
17362void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
17363 SDNode *Node) const {
17364 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17365
17366 MachineFunction *MF = MI.getParent()->getParent();
17367 MachineRegisterInfo &MRI = MF->getRegInfo();
17368 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
17369
17370 if (TII->isVOP3(MI.getOpcode())) {
17371 // Make sure constant bus requirements are respected.
17372 TII->legalizeOperandsVOP3(MRI, MI);
17373
17374 // Prefer VGPRs over AGPRs in mAI instructions where possible.
17375 // This saves a chain-copy of registers and better balances register
17376 // use between vgpr and agpr, as agpr tuples tend to be big.
17377 if (!MI.getDesc().operands().empty()) {
17378 unsigned Opc = MI.getOpcode();
17379 bool HasAGPRs = Info->mayNeedAGPRs();
17380 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17381 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
17382 for (auto I :
17383 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
17384 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
17385 if (I == -1)
17386 break;
17387 if ((I == Src2Idx) && (HasAGPRs))
17388 break;
17389 MachineOperand &Op = MI.getOperand(I);
17390 if (!Op.isReg() || !Op.getReg().isVirtual())
17391 continue;
17392 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
17393 if (!TRI->hasAGPRs(RC))
17394 continue;
17395 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
17396 if (!Src || !Src->isCopy() ||
17397 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
17398 continue;
17399 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
17400 // All uses of agpr64 and agpr32 can also accept vgpr except for
17401 // v_accvgpr_read, but we do not produce agpr reads during selection,
17402 // so no use checks are needed.
17403 MRI.setRegClass(Op.getReg(), NewRC);
17404 }
17405
17406 if (TII->isMAI(MI)) {
17407 // The ordinary src0, src1, src2 were legalized above.
17408 //
17409 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
17410 // as a separate instruction.
17411 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17412 AMDGPU::OpName::scale_src0);
17413 if (Src0Idx != -1) {
17414 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17415 AMDGPU::OpName::scale_src1);
17416 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17417 TII->usesConstantBus(MRI, MI, Src1Idx))
17418 TII->legalizeOpWithMove(MI, Src1Idx);
17419 }
17420 }
17421
17422 if (!HasAGPRs)
17423 return;
17424
17425 // Resolve the rest of AV operands to AGPRs.
17426 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
17427 if (Src2->isReg() && Src2->getReg().isVirtual()) {
17428 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
17429 if (TRI->isVectorSuperClass(RC)) {
17430 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
17431 MRI.setRegClass(Src2->getReg(), NewRC);
17432 if (Src2->isTied())
17433 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
17434 }
17435 }
17436 }
17437 }
17438
17439 return;
17440 }
17441
17442 if (TII->isImage(MI))
17443 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
17444}
17445
17446static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
17447 uint64_t Val) {
17448 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
17449 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
17450}
17451
17452MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
17453 const SDLoc &DL,
17454 SDValue Ptr) const {
17455 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17456
17457 // Build the half of the subregister with the constants before building the
17458 // full 128-bit register. If we are building multiple resource descriptors,
17459 // this will allow CSEing of the 2-component register.
17460 const SDValue Ops0[] = {
17461 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
17462 buildSMovImm32(DAG, DL, 0),
17463 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17464 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
17465 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
17466
17467 SDValue SubRegHi = SDValue(
17468 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
17469
17470 // Combine the constants and the pointer.
17471 const SDValue Ops1[] = {
17472 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
17473 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
17474 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
17475
17476 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
17477}
17478
17479/// Return a resource descriptor with the 'Add TID' bit enabled
17480/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
17481/// of the resource descriptor) to create an offset, which is added to
17482/// the resource pointer.
17483MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
17484 SDValue Ptr, uint32_t RsrcDword1,
17485 uint64_t RsrcDword2And3) const {
17486 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
17487 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
17488 if (RsrcDword1) {
17489 PtrHi =
17490 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
17491 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
17492 0);
17493 }
17494
17495 SDValue DataLo =
17496 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
17497 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
17498
17499 const SDValue Ops[] = {
17500 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
17501 PtrLo,
17502 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17503 PtrHi,
17504 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
17505 DataLo,
17506 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
17507 DataHi,
17508 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
17509
17510 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
17511}
17512
17513//===----------------------------------------------------------------------===//
17514// SI Inline Assembly Support
17515//===----------------------------------------------------------------------===//
17516
17517std::pair<unsigned, const TargetRegisterClass *>
17518SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
17519 StringRef Constraint,
17520 MVT VT) const {
17521 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
17522
17523 const TargetRegisterClass *RC = nullptr;
17524 if (Constraint.size() == 1) {
17525 // Check if we cannot determine the bit size of the given value type. This
17526 // can happen, for example, in this situation where we have an empty struct
17527 // (size 0): `call void asm "", "v"({} poison)`.
17528 if (VT == MVT::Other)
17529 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17530 const unsigned BitWidth = VT.getSizeInBits();
17531 switch (Constraint[0]) {
17532 default:
17533 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17534 case 's':
17535 case 'r':
17536 switch (BitWidth) {
17537 case 16:
17538 RC = &AMDGPU::SReg_32RegClass;
17539 break;
17540 case 64:
17541 RC = &AMDGPU::SGPR_64RegClass;
17542 break;
17543 default:
17544 RC = TRI->getSGPRClassForBitWidth(BitWidth);
17545 if (!RC)
17546 return std::pair(0U, nullptr);
17547 break;
17548 }
17549 break;
17550 case 'v':
17551 switch (BitWidth) {
17552 case 16:
17553 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17554 : &AMDGPU::VGPR_32_Lo256RegClass;
17555 break;
17556 default:
17557 RC = Subtarget->has1024AddressableVGPRs()
17558 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17559 : TRI->getVGPRClassForBitWidth(BitWidth);
17560 if (!RC)
17561 return std::pair(0U, nullptr);
17562 break;
17563 }
17564 break;
17565 case 'a':
17566 if (!Subtarget->hasMAIInsts())
17567 break;
17568 switch (BitWidth) {
17569 case 16:
17570 RC = &AMDGPU::AGPR_32RegClass;
17571 break;
17572 default:
17573 RC = TRI->getAGPRClassForBitWidth(BitWidth);
17574 if (!RC)
17575 return std::pair(0U, nullptr);
17576 break;
17577 }
17578 break;
17579 }
17580 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17581 const unsigned BitWidth = VT.getSizeInBits();
17582 switch (BitWidth) {
17583 case 16:
17584 RC = &AMDGPU::AV_32RegClass;
17585 break;
17586 default:
17587 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17588 if (!RC)
17589 return std::pair(0U, nullptr);
17590 break;
17591 }
17592 }
17593
17594 // We actually support i128, i16 and f16 as inline parameters
17595 // even if they are not reported as legal
17596 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
17597 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
17598 return std::pair(0U, RC);
17599
17600 auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
17601 if (Kind != '\0') {
17602 if (Kind == 'v') {
17603 RC = &AMDGPU::VGPR_32_Lo256RegClass;
17604 } else if (Kind == 's') {
17605 RC = &AMDGPU::SGPR_32RegClass;
17606 } else if (Kind == 'a') {
17607 RC = &AMDGPU::AGPR_32RegClass;
17608 }
17609
17610 if (RC) {
17611 if (NumRegs > 1) {
17612 if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
17613 return std::pair(0U, nullptr);
17614
17615 uint32_t Width = NumRegs * 32;
17616 // Prohibit constraints for register ranges with a width that does not
17617 // match the required type.
17618 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
17619 return std::pair(0U, nullptr);
17620
17621 MCRegister Reg = RC->getRegister(Idx);
17622 if (SIRegisterInfo::isVGPRClass(RC))
17623 RC = TRI->getVGPRClassForBitWidth(Width);
17624 else if (SIRegisterInfo::isSGPRClass(RC))
17625 RC = TRI->getSGPRClassForBitWidth(Width);
17626 else if (SIRegisterInfo::isAGPRClass(RC))
17627 RC = TRI->getAGPRClassForBitWidth(Width);
17628 if (RC) {
17629 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17630 if (!Reg) {
17631 // The register class does not contain the requested register,
17632 // e.g., because it is an SGPR pair that would violate alignment
17633 // requirements.
17634 return std::pair(0U, nullptr);
17635 }
17636 return std::pair(Reg, RC);
17637 }
17638 }
17639
17640 // Check for lossy scalar/vector conversions.
17641 if (VT.isVector() && VT.getSizeInBits() != 32)
17642 return std::pair(0U, nullptr);
17643 if (Idx < RC->getNumRegs())
17644 return std::pair(RC->getRegister(Idx), RC);
17645 return std::pair(0U, nullptr);
17646 }
17647 }
17648
17649 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17650 if (Ret.first)
17651 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
17652
17653 return Ret;
17654}
17655
17656static bool isImmConstraint(StringRef Constraint) {
17657 if (Constraint.size() == 1) {
17658 switch (Constraint[0]) {
17659 default:
17660 break;
17661 case 'I':
17662 case 'J':
17663 case 'A':
17664 case 'B':
17665 case 'C':
17666 return true;
17667 }
17668 } else if (Constraint == "DA" || Constraint == "DB") {
17669 return true;
17670 }
17671 return false;
17672}
17673
17674SITargetLowering::ConstraintType
17675SITargetLowering::getConstraintType(StringRef Constraint) const {
17676 if (Constraint.size() == 1) {
17677 switch (Constraint[0]) {
17678 default:
17679 break;
17680 case 's':
17681 case 'v':
17682 case 'a':
17683 return C_RegisterClass;
17684 }
17685 } else if (Constraint.size() == 2) {
17686 if (Constraint == "VA")
17687 return C_RegisterClass;
17688 }
17689 if (isImmConstraint(Constraint)) {
17690 return C_Other;
17691 }
17692 return TargetLowering::getConstraintType(Constraint);
17693}
17694
17695static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
17697 Val = Val & maskTrailingOnes<uint64_t>(Size);
17698 }
17699 return Val;
17700}
17701
17702void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
17703 StringRef Constraint,
17704 std::vector<SDValue> &Ops,
17705 SelectionDAG &DAG) const {
17706 if (isImmConstraint(Constraint)) {
17707 uint64_t Val;
17708 if (getAsmOperandConstVal(Op, Val) &&
17709 checkAsmConstraintVal(Op, Constraint, Val)) {
17710 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
17711 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
17712 }
17713 } else {
17714 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
17715 }
17716}
17717
17718bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
17719 unsigned Size = Op.getScalarValueSizeInBits();
17720 if (Size > 64)
17721 return false;
17722
17723 if (Size == 16 && !Subtarget->has16BitInsts())
17724 return false;
17725
17726 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
17727 Val = C->getSExtValue();
17728 return true;
17729 }
17730 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
17731 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17732 return true;
17733 }
17734 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
17735 if (Size != 16 || Op.getNumOperands() != 2)
17736 return false;
17737 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17738 return false;
17739 if (ConstantSDNode *C = V->getConstantSplatNode()) {
17740 Val = C->getSExtValue();
17741 return true;
17742 }
17743 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
17744 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17745 return true;
17746 }
17747 }
17748
17749 return false;
17750}
17751
17752 bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
17753 uint64_t Val) const {
17754 if (Constraint.size() == 1) {
17755 switch (Constraint[0]) {
17756 case 'I':
17757 return AMDGPU::isInlinableIntLiteral(Val);
17758 case 'J':
17759 return isInt<16>(Val);
17760 case 'A':
17761 return checkAsmConstraintValA(Op, Val);
17762 case 'B':
17763 return isInt<32>(Val);
17764 case 'C':
17765 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
17766 AMDGPU::isInlinableIntLiteral(Val);
17767 default:
17768 break;
17769 }
17770 } else if (Constraint.size() == 2) {
17771 if (Constraint == "DA") {
17772 int64_t HiBits = static_cast<int32_t>(Val >> 32);
17773 int64_t LoBits = static_cast<int32_t>(Val);
17774 return checkAsmConstraintValA(Op, HiBits, 32) &&
17775 checkAsmConstraintValA(Op, LoBits, 32);
17776 }
17777 if (Constraint == "DB") {
17778 return true;
17779 }
17780 }
17781 llvm_unreachable("Invalid asm constraint");
17782}
17783
17784 bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
17785 unsigned MaxSize) const {
17786 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
17787 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
17788 if (Size == 16) {
17789 MVT VT = Op.getSimpleValueType();
17790 switch (VT.SimpleTy) {
17791 default:
17792 return false;
17793 case MVT::i16:
17794 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
17795 case MVT::f16:
17796 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
17797 case MVT::bf16:
17798 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
17799 case MVT::v2i16:
17800 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
17801 case MVT::v2f16:
17802 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
17803 case MVT::v2bf16:
17804 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
17805 }
17806 }
17807 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
17808 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
17809 return true;
17810 return false;
17811}
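// For example (illustrative), the 32-bit inline constants accepted here are
// the integers -16..64, 0.0, +/-0.5, +/-1.0, +/-2.0, +/-4.0, and 1/(2*pi)
// when the subtarget has the inv-2pi inline constant.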
17812
17813static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
17814 switch (UnalignedClassID) {
17815 case AMDGPU::VReg_64RegClassID:
17816 return AMDGPU::VReg_64_Align2RegClassID;
17817 case AMDGPU::VReg_96RegClassID:
17818 return AMDGPU::VReg_96_Align2RegClassID;
17819 case AMDGPU::VReg_128RegClassID:
17820 return AMDGPU::VReg_128_Align2RegClassID;
17821 case AMDGPU::VReg_160RegClassID:
17822 return AMDGPU::VReg_160_Align2RegClassID;
17823 case AMDGPU::VReg_192RegClassID:
17824 return AMDGPU::VReg_192_Align2RegClassID;
17825 case AMDGPU::VReg_224RegClassID:
17826 return AMDGPU::VReg_224_Align2RegClassID;
17827 case AMDGPU::VReg_256RegClassID:
17828 return AMDGPU::VReg_256_Align2RegClassID;
17829 case AMDGPU::VReg_288RegClassID:
17830 return AMDGPU::VReg_288_Align2RegClassID;
17831 case AMDGPU::VReg_320RegClassID:
17832 return AMDGPU::VReg_320_Align2RegClassID;
17833 case AMDGPU::VReg_352RegClassID:
17834 return AMDGPU::VReg_352_Align2RegClassID;
17835 case AMDGPU::VReg_384RegClassID:
17836 return AMDGPU::VReg_384_Align2RegClassID;
17837 case AMDGPU::VReg_512RegClassID:
17838 return AMDGPU::VReg_512_Align2RegClassID;
17839 case AMDGPU::VReg_1024RegClassID:
17840 return AMDGPU::VReg_1024_Align2RegClassID;
17841 case AMDGPU::AReg_64RegClassID:
17842 return AMDGPU::AReg_64_Align2RegClassID;
17843 case AMDGPU::AReg_96RegClassID:
17844 return AMDGPU::AReg_96_Align2RegClassID;
17845 case AMDGPU::AReg_128RegClassID:
17846 return AMDGPU::AReg_128_Align2RegClassID;
17847 case AMDGPU::AReg_160RegClassID:
17848 return AMDGPU::AReg_160_Align2RegClassID;
17849 case AMDGPU::AReg_192RegClassID:
17850 return AMDGPU::AReg_192_Align2RegClassID;
17851 case AMDGPU::AReg_256RegClassID:
17852 return AMDGPU::AReg_256_Align2RegClassID;
17853 case AMDGPU::AReg_512RegClassID:
17854 return AMDGPU::AReg_512_Align2RegClassID;
17855 case AMDGPU::AReg_1024RegClassID:
17856 return AMDGPU::AReg_1024_Align2RegClassID;
17857 default:
17858 return -1;
17859 }
17860}
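// For example (illustrative), on subtargets that require even-aligned
// VGPR/AGPR tuples (such as gfx90a), a virtual register in VReg_64RegClass is
// re-constrained to VReg_64_Align2RegClass so that only tuples like v[2:3],
// and not v[3:4], can be assigned to it.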
17861
17862// Figure out which registers should be reserved for stack access. Only after
17863// the function is legalized do we know all of the non-spill stack objects or if
17864// calls are present.
17865 void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
17866 MachineRegisterInfo &MRI = MF.getRegInfo();
17867 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
17868 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
17869 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17870 const SIInstrInfo *TII = ST.getInstrInfo();
17871
17872 if (Info->isEntryFunction()) {
17873 // Callable functions have fixed registers used for stack access.
17874 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
17875 }
17876
17877 // TODO: Move this logic to getReservedRegs()
17878 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
17879 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
17880 Register SReg = ST.isWave32()
17881 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
17882 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
17883 &AMDGPU::SGPR_64RegClass);
17884 Info->setSGPRForEXECCopy(SReg);
17885
17886 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
17887 Info->getStackPtrOffsetReg()));
17888 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
17889 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
17890
17891 // We need to worry about replacing the default register with itself in case
17892 // of MIR testcases missing the MFI.
17893 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
17894 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
17895
17896 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
17897 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
17898
17899 Info->limitOccupancy(MF);
17900
17901 if (ST.isWave32() && !MF.empty()) {
17902 for (auto &MBB : MF) {
17903 for (auto &MI : MBB) {
17904 TII->fixImplicitOperands(MI);
17905 }
17906 }
17907 }
17908
17909 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
17910 // classes if required. Ideally the register class constraints would differ
17911 // per-subtarget, but there's no easy way to achieve that right now. This is
17912 // not a problem for VGPRs because the correctly aligned VGPR class is implied
17913 // from using them as the register class for legal types.
17914 if (ST.needsAlignedVGPRs()) {
17915 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
17916 const Register Reg = Register::index2VirtReg(I);
17917 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
17918 if (!RC)
17919 continue;
17920 int NewClassID = getAlignedAGPRClassID(RC->getID());
17921 if (NewClassID != -1)
17922 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
17923 }
17924 }
17925
17927}
17928
17929 void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
17930 KnownBits &Known,
17931 const APInt &DemandedElts,
17932 const SelectionDAG &DAG,
17933 unsigned Depth) const {
17934 Known.resetAll();
17935 unsigned Opc = Op.getOpcode();
17936 switch (Opc) {
17937 case ISD::INTRINSIC_WO_CHAIN: {
17938 unsigned IID = Op.getConstantOperandVal(0);
17939 switch (IID) {
17940 case Intrinsic::amdgcn_mbcnt_lo:
17941 case Intrinsic::amdgcn_mbcnt_hi: {
17942 const GCNSubtarget &ST =
17943 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
17944 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
17945 // most 31 + src1.
17946 Known.Zero.setBitsFrom(
17947 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
17948 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
17949 Known = KnownBits::add(Known, Known2);
17950 return;
17951 }
17952 }
17953 break;
17954 }
17955 }
17956 AMDGPUTargetLowering::computeKnownBitsForTargetNode(
17957 Op, Known, DemandedElts, DAG, Depth);
17958}
17959
17960 void SITargetLowering::computeKnownBitsForFrameIndex(
17961 const int FI, KnownBits &Known, const MachineFunction &MF) const {
17962 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
17963 
17964 // Set the high bits to zero based on the maximum allowed scratch size per
17965 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
17966 // calculation won't overflow, so assume the sign bit is never set.
17967 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
17968}
17969
17970 static void knownBitsForWorkitemID(const GCNSubtarget &ST,
17971 GISelValueTracking &VT, KnownBits &Known,
17972 unsigned Dim) {
17973 unsigned MaxValue =
17974 ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
17975 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
17976}
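// For example, if getMaxWorkitemID() returns 1023 for a dimension, then
// countl_zero(1023) == 22 and the top 22 bits of the 32-bit workitem id are
// known to be zero.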
17977
17978 static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT,
17979 KnownBits &Known, const APInt &DemandedElts,
17980 unsigned BFEWidth, bool SExt, unsigned Depth) {
17981 const MachineRegisterInfo &MRI = VT.getMachineFunction().getRegInfo();
17982 const MachineOperand &Src1 = MI.getOperand(2);
17983
17984 unsigned Src1Cst = 0;
17985 if (Src1.isImm()) {
17986 Src1Cst = Src1.getImm();
17987 } else if (Src1.isReg()) {
17988 auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
17989 if (!Cst)
17990 return;
17991 Src1Cst = Cst->Value.getZExtValue();
17992 } else {
17993 return;
17994 }
17995
17996 // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
17997 // Width is always [22:16].
17998 const unsigned Offset =
17999 Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
18000 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
18001
18002 if (Width >= BFEWidth) // Ill-formed.
18003 return;
18004
18005 VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
18006 Depth + 1);
18007
18008 Known = Known.extractBits(Width, Offset);
18009
18010 if (SExt)
18011 Known = Known.sext(BFEWidth);
18012 else
18013 Known = Known.zext(BFEWidth);
18014}
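// Worked example (illustrative): for S_BFE_U32 with Src1 = 0x00080010, the
// offset is 0x00080010 & 0x1f = 16 and the width is (0x00080010 >> 16) & 0x3f
// = 8, so bits [23:16] of the source are extracted and zero-extended.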
18015
18016 void SITargetLowering::computeKnownBitsForTargetInstr(
18017 GISelValueTracking &VT, Register R, KnownBits &Known,
18018 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
18019 unsigned Depth) const {
18020 Known.resetAll();
18021 const MachineInstr *MI = MRI.getVRegDef(R);
18022 switch (MI->getOpcode()) {
18023 case AMDGPU::S_BFE_I32:
18024 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18025 /*SExt=*/true, Depth);
18026 case AMDGPU::S_BFE_U32:
18027 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18028 /*SExt=*/false, Depth);
18029 case AMDGPU::S_BFE_I64:
18030 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18031 /*SExt=*/true, Depth);
18032 case AMDGPU::S_BFE_U64:
18033 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18034 /*SExt=*/false, Depth);
18035 case AMDGPU::G_INTRINSIC:
18036 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18037 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
18038 switch (IID) {
18039 case Intrinsic::amdgcn_workitem_id_x:
18040 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
18041 break;
18042 case Intrinsic::amdgcn_workitem_id_y:
18043 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
18044 break;
18045 case Intrinsic::amdgcn_workitem_id_z:
18046 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
18047 break;
18048 case Intrinsic::amdgcn_mbcnt_lo:
18049 case Intrinsic::amdgcn_mbcnt_hi: {
18050 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18051 // most 31 + src1.
18052 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
18053 ? getSubtarget()->getWavefrontSizeLog2()
18054 : 5);
18055 KnownBits Known2;
18056 VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
18057 Depth + 1);
18058 Known = KnownBits::add(Known, Known2);
18059 break;
18060 }
18061 case Intrinsic::amdgcn_groupstaticsize: {
18062 // We can report everything over the maximum size as 0. We can't report
18063 // based on the actual size because we don't know if it's accurate or not
18064 // at any given point.
18065 Known.Zero.setHighBits(
18066 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
18067 break;
18068 }
18069 }
18070 break;
18071 }
18072 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18073 Known.Zero.setHighBits(24);
18074 break;
18075 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18076 Known.Zero.setHighBits(16);
18077 break;
18078 case AMDGPU::G_AMDGPU_SMED3:
18079 case AMDGPU::G_AMDGPU_UMED3: {
18080 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18081
18082 KnownBits Known2;
18083 VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
18084 if (Known2.isUnknown())
18085 break;
18086
18087 KnownBits Known1;
18088 VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
18089 if (Known1.isUnknown())
18090 break;
18091
18092 KnownBits Known0;
18093 VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
18094 if (Known0.isUnknown())
18095 break;
18096
18097 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
18098 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
18099 Known.One = Known0.One & Known1.One & Known2.One;
18100 break;
18101 }
18102 }
18103}
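// For example, G_AMDGPU_BUFFER_LOAD_UBYTE yields an 8-bit value zero-extended
// to 32 bits (bits [31:8] known zero), and for [su]med3 a bit is only known
// if it is known to have the same value in all three operands.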
18104
18105 Align SITargetLowering::computeKnownAlignForTargetInstr(
18106 GISelValueTracking &VT, Register R, const MachineRegisterInfo &MRI,
18107 unsigned Depth) const {
18108 const MachineInstr *MI = MRI.getVRegDef(R);
18109 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
18110 // FIXME: Can this move to generic code? What about the case where the call
18111 // site specifies a lower alignment?
18112 Intrinsic::ID IID = GI->getIntrinsicID();
18113 LLVMContext &Ctx = VT.getMachineFunction().getFunction().getContext();
18114 AttributeList Attrs =
18115 Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
18116 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18117 return *RetAlign;
18118 }
18119 return Align(1);
18120}
18121 
18122 Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
18123 const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
18124 const Align CacheLineAlign = Align(64);
18125
18126 // Pre-GFX10 target did not benefit from loop alignment
18127 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
18128 getSubtarget()->hasInstFwdPrefetchBug())
18129 return PrefAlign;
18130
18131 // On GFX10 I$ is 4 x 64 bytes cache lines.
18132 // By default prefetcher keeps one cache line behind and reads two ahead.
18133 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
18134 // behind and one ahead.
18135 // Therefore we can benefit from aligning loop headers if the loop fits in 192 bytes.
18136 // If the loop fits in 64 bytes it always spans no more than two cache lines and
18137 // does not need an alignment.
18138 // Else if the loop is less than or equal to 128 bytes we do not need to modify the prefetch.
18139 // Else if the loop is less than or equal to 192 bytes we need two lines behind.
18140
18141 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
18142 const MachineBasicBlock *Header = ML->getHeader();
18143 if (Header->getAlignment() != PrefAlign)
18144 return Header->getAlignment(); // Already processed.
18145
18146 unsigned LoopSize = 0;
18147 for (const MachineBasicBlock *MBB : ML->blocks()) {
18148 // If an inner loop block is aligned, assume on average half of the alignment
18149 // size is added as nops.
18150 if (MBB != Header)
18151 LoopSize += MBB->getAlignment().value() / 2;
18152
18153 for (const MachineInstr &MI : *MBB) {
18154 LoopSize += TII->getInstSizeInBytes(MI);
18155 if (LoopSize > 192)
18156 return PrefAlign;
18157 }
18158 }
18159
18160 if (LoopSize <= 64)
18161 return PrefAlign;
18162
18163 if (LoopSize <= 128)
18164 return CacheLineAlign;
18165
18166 // If any of parent loops is surrounded by prefetch instructions do not
18167 // insert new for inner loop, which would reset parent's settings.
18168 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
18169 if (MachineBasicBlock *Exit = P->getExitBlock()) {
18170 auto I = Exit->getFirstNonDebugInstr();
18171 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18172 return CacheLineAlign;
18173 }
18174 }
18175
18176 MachineBasicBlock *Pre = ML->getLoopPreheader();
18177 MachineBasicBlock *Exit = ML->getExitBlock();
18178
18179 if (Pre && Exit) {
18180 auto PreTerm = Pre->getFirstTerminator();
18181 if (PreTerm == Pre->begin() ||
18182 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18183 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18184 .addImm(1); // prefetch 2 lines behind PC
18185
18186 auto ExitHead = Exit->getFirstNonDebugInstr();
18187 if (ExitHead == Exit->end() ||
18188 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18189 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18190 .addImm(2); // prefetch 1 line behind PC
18191 }
18192
18193 return CacheLineAlign;
18194}
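// For example (illustrative): a 56-byte loop keeps the default alignment, a
// 100-byte loop is aligned to the 64-byte cache line without changing the
// prefetch mode, and a 160-byte loop additionally gets S_INST_PREFETCH
// instructions inserted in its preheader and exit block.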
18195
18196 LLVM_ATTRIBUTE_UNUSED
18197 static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
18198 assert(N->getOpcode() == ISD::CopyFromReg);
18199 do {
18200 // Follow the chain until we find an INLINEASM node.
18201 N = N->getOperand(0).getNode();
18202 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
18203 return true;
18204 } while (N->getOpcode() == ISD::CopyFromReg);
18205 return false;
18206}
18207
18208 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
18209 FunctionLoweringInfo *FLI,
18210 UniformityInfo *UA) const {
18211 switch (N->getOpcode()) {
18212 case ISD::CopyFromReg: {
18213 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
18214 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
18215 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18216 Register Reg = R->getReg();
18217
18218 // FIXME: Why does this need to consider isLiveIn?
18219 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18220 return !TRI->isSGPRReg(MRI, Reg);
18221
18222 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
18223 return UA->isDivergent(V);
18224
18225 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
18226 return !TRI->isSGPRReg(MRI, Reg);
18227 }
18228 case ISD::LOAD: {
18229 const LoadSDNode *L = cast<LoadSDNode>(N);
18230 unsigned AS = L->getAddressSpace();
18231 // A flat load may access private memory.
18232 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
18233 }
18234 case ISD::CALLSEQ_END:
18235 return true;
18236 case ISD::INTRINSIC_WO_CHAIN:
18237 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
18238 case ISD::INTRINSIC_W_CHAIN:
18239 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
18258 // Target-specific read-modify-write atomics are sources of divergence.
18259 return true;
18260 default:
18261 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
18262 // Generic read-modify-write atomics are sources of divergence.
18263 return A->readMem() && A->writeMem();
18264 }
18265 return false;
18266 }
18267}
18268
18269 bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
18270 EVT VT) const {
18271 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
18272 case MVT::f32:
18273 return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
18274 case MVT::f64:
18275 case MVT::f16:
18276 return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
18277 default:
18278 return false;
18279 }
18280}
18281
18282 bool SITargetLowering::denormalsEnabledForType(
18283 LLT Ty, const MachineFunction &MF) const {
18284 switch (Ty.getScalarSizeInBits()) {
18285 case 32:
18286 return !denormalModeIsFlushAllF32(MF);
18287 case 64:
18288 case 16:
18289 return !denormalModeIsFlushAllF64F16(MF);
18290 default:
18291 return false;
18292 }
18293}
18294
18295 bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
18296 const APInt &DemandedElts,
18297 const SelectionDAG &DAG,
18298 bool SNaN,
18299 unsigned Depth) const {
18300 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
18301 const MachineFunction &MF = DAG.getMachineFunction();
18302 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
18303 
18304 if (Info->getMode().DX10Clamp)
18305 return true; // Clamped to 0.
18306 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
18307 }
18308
18309 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DemandedElts,
18310 DAG, SNaN, Depth);
18311}
18312
18313// On older subtargets, global FP atomic instructions have a hardcoded FP mode
18314// and do not support FP32 denormals, and only support v2f16/f64 denormals.
18315 static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
18316 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
18317 return true;
18318
18319 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
18320 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
18321 if (DenormMode == DenormalMode::getPreserveSign())
18322 return true;
18323
18324 // TODO: Remove this.
18325 return RMW->getFunction()
18326 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
18327 .getValueAsBool();
18328}
18329
18330 static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
18331 LLVMContext &Ctx = RMW->getContext();
18332 StringRef MemScope =
18333 Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");
18334
18335 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
18336 << "Hardware instruction generated for atomic "
18337 << RMW->getOperationName(RMW->getOperation())
18338 << " operation at memory scope " << MemScope;
18339}
18340
18341static bool isV2F16OrV2BF16(Type *Ty) {
18342 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
18343 Type *EltTy = VT->getElementType();
18344 return VT->getNumElements() == 2 &&
18345 (EltTy->isHalfTy() || EltTy->isBFloatTy());
18346 }
18347
18348 return false;
18349}
18350
18351static bool isV2F16(Type *Ty) {
18352 auto *VT = dyn_cast<FixedVectorType>(Ty);
18353 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
18354}
18355
18356static bool isV2BF16(Type *Ty) {
18357 auto *VT = dyn_cast<FixedVectorType>(Ty);
18358 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
18359}
18360
18361/// \return true if atomicrmw integer ops work for the type.
18362static bool isAtomicRMWLegalIntTy(Type *Ty) {
18363 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
18364 unsigned BW = IT->getBitWidth();
18365 return BW == 32 || BW == 64;
18366 }
18367
18368 return false;
18369}
18370
18371/// \return true if this atomicrmw xchg type can be selected.
18372static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
18373 Type *Ty = RMW->getType();
18374 if (isAtomicRMWLegalIntTy(Ty))
18375 return true;
18376
18377 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
18378 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
18379 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18380 return BW == 32 || BW == 64;
18381 }
18382
18383 if (Ty->isFloatTy() || Ty->isDoubleTy())
18384 return true;
18385
18386 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
18387 return VT->getNumElements() == 2 &&
18388 VT->getElementType()->getPrimitiveSizeInBits() == 16;
18389 }
18390
18391 return false;
18392}
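// For example (LLVM IR), both of the following can be selected directly:
//   %old = atomicrmw xchg ptr %p, i32 %v seq_cst, align 4
//   %old = atomicrmw xchg ptr %p, ptr %q seq_cst, align 8
// 32/64-bit integers, 32/64-bit pointers, float/double and 2 x 16-bit vectors
// all pass this check.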
18393
18394/// \returns true if it's valid to emit a native instruction for \p RMW, based
18395/// on the properties of the target memory.
18396static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
18397 const AtomicRMWInst *RMW,
18398 bool HasSystemScope) {
18399 // The remote/fine-grained access logic is different from the integer
18400 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
18401 // fine-grained access does not work, even for a device local allocation.
18402 //
18403 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
18404 // allocations work.
18405 if (HasSystemScope) {
18406 if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
18407 RMW->hasMetadata("amdgpu.no.remote.memory"))
18408 return true;
18409 if (Subtarget.hasEmulatedSystemScopeAtomics())
18410 return true;
18411 } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18412 return true;
18413
18414 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18415}
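// For example (illustrative IR), a frontend or runtime that knows the target
// allocation is not fine-grained can mark the operation:
//   %old = atomicrmw fadd ptr addrspace(1) %p, float %v syncscope("agent")
//            monotonic, align 4, !amdgpu.no.fine.grained.memory !0
//   ...
//   !0 = !{}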
18416
18417/// \return Action to perform on AtomicRMWInsts for integer operations.
18418 static TargetLowering::AtomicExpansionKind
18419 atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
18420 return isAtomicRMWLegalIntTy(RMW->getType())
18421 ? TargetLowering::AtomicExpansionKind::None
18422 : TargetLowering::AtomicExpansionKind::CmpXChg;
18423 }
18424 
18425/// Return if a flat address space atomicrmw can access private memory.
18426 static bool flatInstrMayAccessPrivate(const Instruction *I) {
18427 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18428 return !MD ||
18430}
18431
18439
18440 TargetLowering::AtomicExpansionKind
18441 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
18442 unsigned AS = RMW->getPointerAddressSpace();
18443 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
18445
18446 // 64-bit flat atomics that dynamically reside in private memory will silently
18447 // be dropped.
18448 //
18449 // Note that we will emit a new copy of the original atomic in the expansion,
18450 // which will be incrementally relegalized.
18451 const DataLayout &DL = RMW->getFunction()->getDataLayout();
18452 if (AS == AMDGPUAS::FLAT_ADDRESS &&
18453 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18454 flatInstrMayAccessPrivate(RMW))
18455 return AtomicExpansionKind::CustomExpand;
18456 
18457 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
18458 OptimizationRemarkEmitter ORE(RMW->getFunction());
18459 ORE.emit([=]() {
18460 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
18461 });
18462 return Kind;
18463 };
18464
18465 auto SSID = RMW->getSyncScopeID();
18466 bool HasSystemScope =
18467 SSID == SyncScope::System ||
18468 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
18469
18470 auto Op = RMW->getOperation();
18471 switch (Op) {
18472 case AtomicRMWInst::Xchg:
18473 // PCIe supports add and xchg for system atomics.
18474 return isAtomicRMWLegalXChgTy(RMW)
18475 ? AtomicExpansionKind::None
18476 : AtomicExpansionKind::CmpXChg;
18477 case AtomicRMWInst::Add:
18478 // PCIe supports add and xchg for system atomics.
18479 return atomicSupportedIfLegalIntType(RMW);
18480 case AtomicRMWInst::Sub:
18481 case AtomicRMWInst::And:
18482 case AtomicRMWInst::Or:
18483 case AtomicRMWInst::Xor:
18484 case AtomicRMWInst::Max:
18485 case AtomicRMWInst::Min:
18492 if (Subtarget->hasEmulatedSystemScopeAtomics())
18493 return atomicSupportedIfLegalIntType(RMW);
18494 
18495 // On most subtargets, for atomicrmw operations other than add/xchg,
18496 // whether or not the instructions will behave correctly depends on where
18497 // the address physically resides and what interconnect is used in the
18498 // system configuration. On some targets the instruction will nop,
18499 // and in others synchronization will only occur at degraded device scope.
18500 //
18501 // If the allocation is known local to the device, the instructions should
18502 // work correctly.
18503 if (RMW->hasMetadata("amdgpu.no.remote.memory"))
18504 return atomicSupportedIfLegalIntType(RMW);
18505 
18506 // If fine-grained remote memory works at device scope, we don't need to
18507 // do anything.
18508 if (!HasSystemScope &&
18509 Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18510 return atomicSupportedIfLegalIntType(RMW);
18511 
18512 // If we are targeting a remote allocated address, it depends what kind of
18513 // allocation the address belongs to.
18514 //
18515 // If the allocation is fine-grained (in host memory, or in PCIe peer
18516 // device memory), the operation will fail depending on the target.
18517 //
18518 // Note fine-grained host memory access does work on APUs or if XGMI is
18519 // used, but we do not know if we are targeting an APU or the system
18520 // configuration from the ISA version/target-cpu.
18521 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
18522 return atomicSupportedIfLegalIntType(RMW);
18523 
18524 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
18525 Op == AtomicRMWInst::Xor) {
18526 // Atomic sub/or/xor do not work over PCI express, but atomic add
18527 // does. InstCombine transforms these with 0 to or, so undo that.
18528 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
18529 ConstVal && ConstVal->isNullValue())
18530 return AtomicExpansionKind::Expand;
18531 }
18532
18533 // If the allocation could be in remote, fine-grained memory, the rmw
18534 // instructions may fail. cmpxchg should work, so emit that. On some
18535 // system configurations, PCIe atomics aren't supported so cmpxchg won't
18536 // even work, so you're out of luck anyway.
18537
18538 // In summary:
18539 //
18540 // Cases that may fail:
18541 // - fine-grained pinned host memory
18542 // - fine-grained migratable host memory
18543 // - fine-grained PCIe peer device
18544 //
18545 // Cases that should work, but may be treated overly conservatively.
18546 // - fine-grained host memory on an APU
18547 // - fine-grained XGMI peer device
18548 return AtomicExpansionKind::CmpXChg;
18549 }
18550
18551 return atomicSupportedIfLegalIntType(RMW);
18552 }
18553 case AtomicRMWInst::FAdd: {
18554 Type *Ty = RMW->getType();
18555
18556 // TODO: Handle REGION_ADDRESS
18557 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18558 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
18559 // is fixed to round-to-nearest-even.
18560 //
18561 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
18562 // round-to-nearest-even.
18563 //
18564 // We ignore the rounding mode problem, even in strictfp. The C++ standard
18565 // suggests it is OK if the floating-point mode may not match the calling
18566 // thread.
18567 if (Ty->isFloatTy()) {
18568 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
18569 : AtomicExpansionKind::CmpXChg;
18570 }
18571
18572 if (Ty->isDoubleTy()) {
18573 // Ignores denormal mode, but we don't consider flushing mandatory.
18574 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
18575 : AtomicExpansionKind::CmpXChg;
18576 }
18577
18578 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18579 return AtomicExpansionKind::None;
18580 
18581 return AtomicExpansionKind::CmpXChg;
18582 }
18583
18584 // LDS atomics respect the denormal mode from the mode register.
18585 //
18586 // Traditionally f32 global/buffer memory atomics would unconditionally
18587 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
18588 // flush.
18589 //
18590 // On targets with flat atomic fadd, denormals would flush depending on
18591 // whether the target address resides in LDS or global memory. We consider
18592 // this flat-maybe-flush as will-flush.
18593 if (Ty->isFloatTy() &&
18594 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18595 !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
18596 return AtomicExpansionKind::CmpXChg;
18597 
18598 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
18599 // safe. The message phrasing also should be better.
18600 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18601 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18602 // gfx942, gfx12
18603 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18604 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18605 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
18606 // gfx90a, gfx942, gfx12
18607 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18608 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18609
18610 // gfx942, gfx12
18611 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18612 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18613 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
18614 // gfx90a, gfx942, gfx12
18615 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18616 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18617
18618 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
18619 // buffer. gfx12 does have the buffer version.
18620 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18621 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18622 }
18623
18624 // global and flat atomic fadd f64: gfx90a, gfx942.
18625 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18626 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18627
18628 if (AS != AMDGPUAS::FLAT_ADDRESS) {
18629 if (Ty->isFloatTy()) {
18630 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
18631 // gfx11+.
18632 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18633 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18634 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
18635 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18636 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18637 } else {
18638 // gfx908
18639 if (RMW->use_empty() &&
18640 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18641 isV2F16(Ty))
18642 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18643 }
18644 }
18645
18646 // flat atomic fadd f32: gfx942, gfx11+.
18647 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
18648 if (Subtarget->hasFlatAtomicFaddF32Inst())
18649 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18650
18651 // If it is in flat address space, and the type is float, we will try to
18652 // expand it, if the target supports global and lds atomic fadd. The
18653 // reason we need that is, in the expansion, we emit the check of
18654 // address space. If it is in global address space, we emit the global
18655 // atomic fadd; if it is in shared address space, we emit the LDS atomic
18656 // fadd.
18657 if (Subtarget->hasLDSFPAtomicAddF32()) {
18658 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18659 return AtomicExpansionKind::Expand;
18660 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18661 return AtomicExpansionKind::Expand;
18662 }
18663 }
18664 }
18665
18666 return AtomicExpansionKind::CmpXChg;
18667 }
18668 case AtomicRMWInst::FMin:
18669 case AtomicRMWInst::FMax: {
18670 Type *Ty = RMW->getType();
18671
18672 // LDS float and double fmin/fmax were always supported.
18673 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18674 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
18675 : AtomicExpansionKind::CmpXChg;
18676 }
18677
18678 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18679 // For flat and global cases:
18680 // float, double in gfx7. Manual claims denormal support.
18681 // Removed in gfx8.
18682 // float, double restored in gfx10.
18683 // double removed again in gfx11, so only f32 for gfx11/gfx12.
18684 //
18685 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
18686 // no f32.
18687 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18688 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18689 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18690 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18691 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18692 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
18693 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
18694 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18695 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18696 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18697 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18698 }
18699 }
18700
18701 return AtomicExpansionKind::CmpXChg;
18702 }
18703 case AtomicRMWInst::Nand:
18704 case AtomicRMWInst::FSub:
18705 default:
18706 return AtomicExpansionKind::CmpXChg;
18707 }
18708
18709 llvm_unreachable("covered atomicrmw op switch");
18710}
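// Example outcome (illustrative): on a subtarget such as gfx90a,
//   %r = atomicrmw fadd ptr addrspace(1) %p, float %v syncscope("agent")
//          monotonic, !amdgpu.no.fine.grained.memory !0
// is selected as a native global atomic fadd (and reported via the remark
// above), while the same operation without the metadata is expanded to a
// cmpxchg loop.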
18711
18718
18725
18726 TargetLowering::AtomicExpansionKind
18727 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
18728 unsigned AddrSpace = CmpX->getPointerAddressSpace();
18729 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
18731
18732 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
18733 return AtomicExpansionKind::None;
18734 
18735 const DataLayout &DL = CmpX->getDataLayout();
18736
18737 Type *ValTy = CmpX->getNewValOperand()->getType();
18738
18739 // If a 64-bit flat atomic may alias private, we need to avoid using the
18740 // atomic in the private case.
18741 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
18743}
18744
18745const TargetRegisterClass *
18746SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
18747 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
18748 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18749 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18750 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18751 : &AMDGPU::SReg_32RegClass;
18752 if (!TRI->isSGPRClass(RC) && !isDivergent)
18753 return TRI->getEquivalentSGPRClass(RC);
18754 if (TRI->isSGPRClass(RC) && isDivergent)
18755 return TRI->getEquivalentVGPRClass(RC);
18756
18757 return RC;
18758}
18759
18760// FIXME: This is a workaround for DivergenceAnalysis not understanding always
18761// uniform values (as produced by the mask results of control flow intrinsics)
18762// used outside of divergent blocks. The phi users need to also be treated as
18763// always uniform.
18764//
18765// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
18766static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
18767 unsigned WaveSize) {
18768 // FIXME: We assume we never cast the mask results of a control flow
18769 // intrinsic.
18770 // Early exit if the type won't be consistent as a compile time hack.
18771 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
18772 if (!IT || IT->getBitWidth() != WaveSize)
18773 return false;
18774
18775 if (!isa<Instruction>(V))
18776 return false;
18777 if (!Visited.insert(V).second)
18778 return false;
18779 bool Result = false;
18780 for (const auto *U : V->users()) {
18781 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
18782 if (V == U->getOperand(1)) {
18783 switch (Intrinsic->getIntrinsicID()) {
18784 default:
18785 Result = false;
18786 break;
18787 case Intrinsic::amdgcn_if_break:
18788 case Intrinsic::amdgcn_if:
18789 case Intrinsic::amdgcn_else:
18790 Result = true;
18791 break;
18792 }
18793 }
18794 if (V == U->getOperand(0)) {
18795 switch (Intrinsic->getIntrinsicID()) {
18796 default:
18797 Result = false;
18798 break;
18799 case Intrinsic::amdgcn_end_cf:
18800 case Intrinsic::amdgcn_loop:
18801 Result = true;
18802 break;
18803 }
18804 }
18805 } else {
18806 Result = hasCFUser(U, Visited, WaveSize);
18807 }
18808 if (Result)
18809 break;
18810 }
18811 return Result;
18812}
18813
18815 const Value *V) const {
18816 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
18817 if (CI->isInlineAsm()) {
18818 // FIXME: This cannot give a correct answer. This should only trigger in
18819 // the case where inline asm returns mixed SGPR and VGPR results, used
18820 // outside the defining block. We don't have a specific result to
18821 // consider, so this assumes if any value is SGPR, the overall register
18822 // also needs to be SGPR.
18823 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
18824 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
18825 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
18826 for (auto &TC : TargetConstraints) {
18827 if (TC.Type == InlineAsm::isOutput) {
18828 ComputeConstraintToUse(TC, SDValue());
18829 const TargetRegisterClass *RC =
18830 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
18831 TC.ConstraintVT)
18832 .second;
18833 if (RC && SIRI->isSGPRClass(RC))
18834 return true;
18835 }
18836 }
18837 }
18838 }
18839 SmallPtrSet<const Value *, 16> Visited;
18840 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
18841}
18842
18843 bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
18844 for (SDUse &Use : N->uses()) {
18845 if (MemSDNode *M = dyn_cast<MemSDNode>(Use.getUser())) {
18846 if (getBasePtrIndex(M) == Use.getOperandNo())
18847 return true;
18848 }
18849 }
18850 return false;
18851}
18852
18853 bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
18854 SDValue N1) const {
18855 if (!N0.hasOneUse())
18856 return false;
18857 // Take care of the opportunity to keep N0 uniform
18858 if (N0->isDivergent() || !N1->isDivergent())
18859 return true;
18860 // Check if we have a good chance to form the memory access pattern with the
18861 // base and offset
18862 return (DAG.isBaseWithConstantOffset(N0) &&
18864}
18865
18866 bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
18867 Register N0, Register N1) const {
18868 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
18869}
18870
18871 MachineMemOperand::Flags
18872 SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
18873 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
18874 MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
18875 if (I.getMetadata("amdgpu.noclobber"))
18876 Flags |= MONoClobber;
18877 if (I.getMetadata("amdgpu.last.use"))
18878 Flags |= MOLastUse;
18879 return Flags;
18880}
18881
18882 bool SITargetLowering::checkForPhysRegDependency(
18883 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
18884 const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const {
18885 if (User->getOpcode() != ISD::CopyToReg)
18886 return false;
18887 if (!Def->isMachineOpcode())
18888 return false;
18889 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
18890 if (!MDef)
18891 return false;
18892
18893 unsigned ResNo = User->getOperand(Op).getResNo();
18894 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
18895 return false;
18896 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
18897 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
18898 PhysReg = AMDGPU::SCC;
18899 const TargetRegisterClass *RC =
18900 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
18901 Cost = RC->getCopyCost();
18902 return true;
18903 }
18904 return false;
18905}
18906
18907 void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
18908 Instruction *AI) const {
18909 // Given: atomicrmw fadd ptr %addr, float %val ordering
18910 //
18911 // With this expansion we produce the following code:
18912 // [...]
18913 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
18914 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
18915 //
18916 // atomicrmw.shared:
18917 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
18918 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
18919 // float %val ordering
18920 // br label %atomicrmw.phi
18921 //
18922 // atomicrmw.check.private:
18923 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
18924 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
18925 //
18926 // atomicrmw.private:
18927 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
18928 // %loaded.private = load float, ptr addrspace(5) %cast.private
18929 // %val.new = fadd float %loaded.private, %val
18930 // store float %val.new, ptr addrspace(5) %cast.private
18931 // br label %atomicrmw.phi
18932 //
18933 // atomicrmw.global:
18934 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
18935 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
18936 // float %val ordering
18937 // br label %atomicrmw.phi
18938 //
18939 // atomicrmw.phi:
18940 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
18941 // [ %loaded.private, %atomicrmw.private ],
18942 // [ %loaded.global, %atomicrmw.global ]
18943 // br label %atomicrmw.end
18944 //
18945 // atomicrmw.end:
18946 // [...]
18947 //
18948 //
18949 // For 64-bit atomics which may reside in private memory, we perform a simpler
18950 // version that only inserts the private check, and uses the flat operation.
18951
18952 IRBuilder<> Builder(AI);
18953 LLVMContext &Ctx = Builder.getContext();
18954
18955 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
18956 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
18957 : AtomicCmpXchgInst::getPointerOperandIndex();
18958 Value *Addr = AI->getOperand(PtrOpIdx);
18959
18960 /// TODO: Only need to check private, then emit flat-known-not private (no
18961 /// need for shared block, or cast to global).
18962 auto *CX = dyn_cast<AtomicCmpXchgInst>(AI);
18963 
18964 Align Alignment;
18965 if (RMW)
18966 Alignment = RMW->getAlign();
18967 else if (CX)
18968 Alignment = CX->getAlign();
18969 else
18970 llvm_unreachable("unhandled atomic operation");
18971
18972 // FullFlatEmulation is true if we need to issue the private, shared, and
18973 // global cases.
18974 //
18975 // If this is false, we are only dealing with the flat-targeting-private case,
18976 // where we only insert a check for private and still use the flat instruction
18977 // for global and shared.
18978
18979 bool FullFlatEmulation =
18980 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
18981 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
18982 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
18983 RMW->getType()->isDoubleTy()));
18984
18985 // If the return value isn't used, do not introduce a false use in the phi.
18986 bool ReturnValueIsUsed = !AI->use_empty();
18987
18988 BasicBlock *BB = Builder.GetInsertBlock();
18989 Function *F = BB->getParent();
18990 BasicBlock *ExitBB =
18991 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
18992 BasicBlock *SharedBB = nullptr;
18993
18994 BasicBlock *CheckPrivateBB = BB;
18995 if (FullFlatEmulation) {
18996 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
18997 CheckPrivateBB =
18998 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
18999 }
19000
19001 BasicBlock *PrivateBB =
19002 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
19003 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
19004 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
19005
19006 std::prev(BB->end())->eraseFromParent();
19007 Builder.SetInsertPoint(BB);
19008
19009 Value *LoadedShared = nullptr;
19010 if (FullFlatEmulation) {
19011 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
19012 {Addr}, nullptr, "is.shared");
19013 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
19014 Builder.SetInsertPoint(SharedBB);
19015 Value *CastToLocal = Builder.CreateAddrSpaceCast(
19016 Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
19017 
19018 Instruction *Clone = AI->clone();
19019 Clone->insertInto(SharedBB, SharedBB->end());
19020 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
19021 LoadedShared = Clone;
19022
19023 Builder.CreateBr(PhiBB);
19024 Builder.SetInsertPoint(CheckPrivateBB);
19025 }
19026
19027 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19028 {Addr}, nullptr, "is.private");
19029 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19030
19031 Builder.SetInsertPoint(PrivateBB);
19032
19033 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19034 Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
19035 
19036 Value *LoadedPrivate;
19037 if (RMW) {
19038 LoadedPrivate = Builder.CreateAlignedLoad(
19039 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
19040
19041 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
19042 LoadedPrivate, RMW->getValOperand());
19043
19044 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19045 } else {
19046 auto [ResultLoad, Equal] =
19047 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
19048 CX->getNewValOperand(), CX->getAlign());
19049
19050 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
19051 ResultLoad, 0);
19052 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19053 }
19054
19055 Builder.CreateBr(PhiBB);
19056
19057 Builder.SetInsertPoint(GlobalBB);
19058
19059 // Continue using a flat instruction if we only emitted the check for private.
19060 Instruction *LoadedGlobal = AI;
19061 if (FullFlatEmulation) {
19062 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19063 Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
19064 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
19065 }
19066
19067 AI->removeFromParent();
19068 AI->insertInto(GlobalBB, GlobalBB->end());
19069
19070 // The new atomicrmw may go through another round of legalization later.
19071 if (!FullFlatEmulation) {
19072 // We inserted the runtime check already, make sure we do not try to
19073 // re-expand this.
19074 // TODO: Should union with any existing metadata.
19075 MDBuilder MDB(F->getContext());
19076 MDNode *RangeNotPrivate =
19077 MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
19078 APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
19079 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19080 RangeNotPrivate);
19081 }
19082
19083 Builder.CreateBr(PhiBB);
19084
19085 Builder.SetInsertPoint(PhiBB);
19086
19087 if (ReturnValueIsUsed) {
19088 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
19089 AI->replaceAllUsesWith(Loaded);
19090 if (FullFlatEmulation)
19091 Loaded->addIncoming(LoadedShared, SharedBB);
19092 Loaded->addIncoming(LoadedPrivate, PrivateBB);
19093 Loaded->addIncoming(LoadedGlobal, GlobalBB);
19094 Loaded->takeName(AI);
19095 }
19096
19097 Builder.CreateBr(ExitBB);
19098}
19099
19100 static void convertScratchAtomicToFlatAtomic(Instruction *I,
19101 unsigned PtrOpIdx) {
19102 Value *PtrOp = I->getOperand(PtrOpIdx);
19105
19106 Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
19107 Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
19108 I->getIterator());
19109 I->setOperand(PtrOpIdx, ASCast);
19110}
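// For example (illustrative), a private-address atomic such as
//   %old = atomicrmw add ptr addrspace(5) %p, i32 1 monotonic
// is rewritten to operate on a flat pointer:
//   %flat = addrspacecast ptr addrspace(5) %p to ptr
//   %old = atomicrmw add ptr %flat, i32 1 monotonic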
19111
19112 void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
19113 AtomicRMWInst::BinOp Op = AI->getOperation();
19114 
19115 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
19116 Op == AtomicRMWInst::Xor) {
19120 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
19121 ConstVal && ConstVal->isNullValue()) {
19122 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
19123 AI->setOperation(AtomicRMWInst::Add);
19124 
19125 // We may still need the private-alias-flat handling below.
19126
19127 // TODO: Skip this for cases where we cannot access remote memory.
19128 }
19129 }
19130
19131 // The non-flat expansions should only perform the de-canonicalization of
19132 // identity values.
19133 if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
19134 return;
19135
19137}
19138
19145
19149
19150 llvm_unreachable(
19151 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19152}
19153
19154 void SITargetLowering::emitExpandAtomicStore(StoreInst *SI) const {
19155 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19156 return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
19157
19158 llvm_unreachable(
19159 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19160}
19161
19162LoadInst *
19163 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
19164 IRBuilder<> Builder(AI);
19165 auto Order = AI->getOrdering();
19166
19167 // The optimization removes the store aspect of the atomicrmw. Therefore, the
19168 // cache must be flushed if the atomic ordering has release semantics. This is
19169 // not necessarily a fence; a release fence just happens to perform that flush.
19170 // So avoid replacing an atomicrmw that has release semantics.
19171 if (isReleaseOrStronger(Order))
19172 return nullptr;
19173
19174 LoadInst *LI = Builder.CreateAlignedLoad(
19175 AI->getType(), AI->getPointerOperand(), AI->getAlign());
19176 LI->setAtomic(Order, AI->getSyncScopeID());
19177 LI->copyMetadata(*AI);
19178 LI->takeName(AI);
19179 AI->replaceAllUsesWith(LI);
19180 AI->eraseFromParent();
19181 return LI;
19182}
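// For example (illustrative), an idempotent update such as
//   %old = atomicrmw or ptr %p, i32 0 acquire, align 4
// is rewritten here to
//   %old = load atomic i32, ptr %p acquire, align 4
// while orderings with release semantics are left untouched by this hook.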
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:167
#define LLVM_DEBUG(...)
Definition Debug.h:114
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1120
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:6057
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1497
bool isNegative() const
Definition APFloat.h:1449
bool isNormal() const
Definition APFloat.h:1453
APInt bitcastToAPInt() const
Definition APFloat.h:1353
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1138
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1098
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1079
bool isInfinity() const
Definition APFloat.h:1446
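A minimal sketch (not from this file) exercising the APFloat factories and queries listed above; the function name is illustrative.
#include "llvm/ADT/APFloat.h"
using namespace llvm;

void apFloatSketch() {
  APFloat Inf = APFloat::getInf(APFloat::IEEEsingle());            // +infinity
  APFloat NegZero = APFloat::getZero(APFloat::IEEEsingle(), /*Negative=*/true);
  APFloat QNaN = APFloat::getQNaN(APFloat::IEEEsingle());

  bool IsInf = Inf.isInfinity();                                   // true
  bool IsNeg = NegZero.isNegative();                               // true
  APInt Bits = QNaN.bitcastToAPInt();                              // raw f32 encoding

  // Convert to f16; LosesInfo reports whether precision was lost.
  bool LosesInfo = false;
  Inf.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
  (void)IsInf; (void)IsNeg; (void)Bits; (void)LosesInfo;
}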
Class for arbitrary precision integers.
Definition APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1385
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:466
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
Definition APInt.h:366
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1237
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1221
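A small sketch (illustrative only) of the APInt bit-manipulation helpers listed above.
#include "llvm/ADT/APInt.h"
using namespace llvm;

void apIntSketch() {
  APInt Mask = APInt::getBitsSet(32, 8, 16);    // bits [8, 16) set: 0x0000ff00
  APInt High = APInt::getHighBitsSet(32, 4);    // top four bits:   0xf0000000

  unsigned TZ = Mask.countr_zero();             // 8 trailing zero bits
  bool Single = Mask.isOneBitSet(8);            // false: more than one bit set

  APInt V(32, 0);
  V.setHighBits(16);                            // now 0xffff0000
  V.setBitsFrom(4);                             // sets bits [4, 32): 0xfffffff0
  (void)High; (void)TZ; (void)Single;
}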
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition Function.cpp:339
const Function * getParent() const
Definition Argument.h:44
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v). minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v). maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
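A hedged sketch of creating and inspecting an atomicrmw with the IRBuilder and AtomicRMWInst API listed above; the function name and the choice of fadd/seq_cst are illustrative, not taken from this file.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

AtomicRMWInst *emitAtomicFAdd(IRBuilder<> &B, Value *Ptr, Value *Val) {
  // An empty MaybeAlign() lets the builder derive the ABI alignment from the
  // data layout of the insertion block's module.
  AtomicRMWInst *RMW = B.CreateAtomicRMW(AtomicRMWInst::FAdd, Ptr, Val,
                                         MaybeAlign(),
                                         AtomicOrdering::SequentiallyConsistent);
  AtomicRMWInst::BinOp Op = RMW->getOperation();   // AtomicRMWInst::FAdd
  unsigned AS = RMW->getPointerAddressSpace();     // address space of Ptr
  (void)Op; (void)AS;
  return RMW;
}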
bool isCompareAndSwap() const
Returns true if this SDNode represents a cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:472
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
BitVector & set()
Definition BitVector.h:351
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
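A sketch of the usual CCState pattern for analyzing formal arguments; the helper name is hypothetical and the assign function is simply supplied by the caller.
#include "llvm/CodeGen/CallingConvLower.h"
using namespace llvm;

static void analyzeArgsSketch(MachineFunction &MF, LLVMContext &Ctx,
                              CallingConv::ID CC, bool IsVarArg,
                              const SmallVectorImpl<ISD::InputArg> &Ins,
                              CCAssignFn *AssignFn) {
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, IsVarArg, MF, ArgLocs, Ctx);
  CCInfo.AnalyzeFormalArguments(Ins, AssignFn);

  for (const CCValAssign &VA : ArgLocs) {
    if (VA.isRegLoc()) {
      Register Reg = VA.getLocReg();        // argument passed in a register
      (void)Reg;
    } else {
      int64_t Off = VA.getLocMemOffset();   // argument passed on the stack
      (void)Off;
    }
  }
}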
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ ICMP_NE
not equal
Definition InstrTypes.h:700
bool isSigned() const
Definition InstrTypes.h:932
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:772
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:778
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:214
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isBigEndian() const
Definition DataLayout.h:199
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLoweringInfo::isSDNodeSourceOfDivergence to get the Value corresponding to the given virtual register.
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
iterator_range< arg_iterator > args()
Definition Function.h:890
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition Function.cpp:803
Argument * getArg(unsigned i) const
Definition Function.h:884
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMadF16() const
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasPrivateSegmentBuffer() const
const MachineFunction & getMachineFunction() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2780
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
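A tiny sketch of the LLT constructors and queries above; the header path assumes a recent LLVM tree where LLT lives under CodeGenTypes, and the names are illustrative.
#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;

void lltSketch() {
  LLT S32 = LLT::scalar(32);                       // plain 32-bit scalar
  LLT Flat = LLT::pointer(/*AddressSpace=*/0, /*SizeInBits=*/64);

  unsigned ScalarBits = S32.getScalarSizeInBits(); // 32
  TypeSize PtrBits = Flat.getSizeInBits();         // 64 bits
  LLT S16 = S32.changeElementSize(16);             // same shape, 16-bit element
  (void)ScalarBits; (void)PtrBits; (void)S16;
}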
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:33
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96
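A short sketch of attaching !range metadata built with MDBuilder::createRange; the [0, 1024) bounds are purely illustrative.
#include "llvm/ADT/APInt.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/MDBuilder.h"
using namespace llvm;

void attachRangeSketch(LLVMContext &Ctx, LoadInst *LI) {
  MDBuilder MDB(Ctx);
  // The loaded value is asserted to lie in [0, 1024).
  MDNode *Range = MDB.createRange(APInt(32, 0), APInt(32, 1024));
  LI->setMetadata(LLVMContext::MD_range, Range);
}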
Metadata node.
Definition Metadata.h:1077
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1445
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
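A brief sketch of the MVT queries listed above; names are illustrative.
#include "llvm/CodeGenTypes/MachineValueType.h"
using namespace llvm;

void mvtSketch() {
  MVT V4F32 = MVT::getVectorVT(MVT::f32, 4);         // v4f32
  MVT I64 = MVT::getIntegerVT(64);                   // i64

  bool IsVec = V4F32.isVector();                     // true
  unsigned NumElts = V4F32.getVectorNumElements();   // 4
  MVT Elt = V4F32.getScalarType();                   // f32
  TypeSize Bits = V4F32.getSizeInBits();             // 128 bits
  (void)I64; (void)IsVec; (void)NumElts; (void)Elt; (void)Bits;
}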
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
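A sketch of allocating a MachineMemOperand with the LLT-based overload above, describing a dereferenceable 4-byte load; the helper name is hypothetical, and the flags used are documented further below under MachineMemOperand.
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
using namespace llvm;

MachineMemOperand *makeLoadMMO(MachineFunction &MF, MachinePointerInfo PtrInfo) {
  return MF.getMachineMemOperand(PtrInfo,
                                 MachineMemOperand::MOLoad |
                                     MachineMemOperand::MODereferenceable,
                                 LLT::scalar(32), Align(4));
}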
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
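A generic BuildMI pattern using the MachineInstrBuilder helpers above; the opcode descriptor, registers, and the trailing immediate are placeholders supplied by the caller, not anything this file emits.
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;

static void buildInstrSketch(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator I, const DebugLoc &DL,
                             const MCInstrDesc &Desc, Register Dst,
                             Register Src) {
  BuildMI(MBB, I, DL, Desc, Dst)
      .addReg(Src)   // source register operand
      .addImm(0);    // extra immediate operand, if the opcode takes one
}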
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition ModRef.h:221
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:215
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition ModRef.h:218
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition Register.h:67
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:78
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if N can be combined with an fmul to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store expansion using a target-specific method.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load expansion using a target-specific method.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion using a target-specific method.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
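A sketch combining getSetCC with getSelect (listed further below): compare a value against zero and pick between two operands. MVT::i1 is used as the condition type only for brevity; real lowering code would query getSetCCResultType() instead, and the function name is illustrative.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

static SDValue selectOnNonZero(SelectionDAG &DAG, const SDLoc &DL, SDValue LHS,
                               SDValue TrueV, SDValue FalseV) {
  EVT VT = LHS.getValueType();
  SDValue Zero = DAG.getConstant(0, DL, VT);
  // (setcc ne LHS, 0), then (select cond, TrueV, FalseV).
  SDValue Cond = DAG.getSetCC(DL, MVT::i1, LHS, Zero, ISD::SETNE);
  return DAG.getSelect(DL, TrueV.getValueType(), Cond, TrueV, FalseV);
}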
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
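A minimal sketch of that equivalence (emitNot is a hypothetical wrapper, not from this file): getNOT builds the same pattern one would get by XOR'ing with an all-ones constant of the same type.
  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;

  // Hypothetical sketch: the short form and the spelled-out XOR form.
  static SDValue emitNot(SelectionDAG &DAG, const SDLoc &DL, SDValue Val, EVT VT) {
    SDValue Short = DAG.getNOT(DL, Val, VT);
    SDValue Long  = DAG.getNode(ISD::XOR, DL, VT, Val,
                                DAG.getAllOnesConstant(DL, VT));
    (void)Long; // both express (Val XOR -1)
    return Short;
  }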
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns the sum of the base pointer and offset.

SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
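A hedged sketch of chaining a load into a store with these helpers (loadThenStore and all of its operands are placeholders, not from this file); the store is chained on result #1 of the load node, which carries the output chain.
  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;

  // Hypothetical sketch: load an i32, then store the loaded value back.
  static SDValue loadThenStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
                               SDValue Ptr, MachinePointerInfo PtrInfo) {
    SDValue Load = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo);
    // Load.getValue(1) is the output chain of the load node.
    return DAG.getStore(Load.getValue(1), DL, Load, Ptr, PtrInfo, Align(4));
  }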
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, which starts a new call frame in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
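As a small, hedged example of this extend-or-truncate family (the toI32 wrapper is hypothetical), forcing an arbitrary-width integer value to i32:
  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;

  // Hypothetical sketch: zero-extend if Op is narrower than i32, truncate if wider.
  static SDValue toI32(SelectionDAG &DAG, const SDLoc &DL, SDValue Op) {
    return DAG.getZExtOrTrunc(Op, DL, MVT::i32);
  }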
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
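For illustration (addrPlus16 is a hypothetical wrapper, not from this file), addressing a fixed 16 bytes past a pointer within the same object might look like:
  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;

  // Hypothetical sketch: in-object pointer arithmetic via getObjectPtrOffset.
  static SDValue addrPlus16(SelectionDAG &DAG, const SDLoc &SL, SDValue Ptr) {
    return DAG.getObjectPtrOffset(SL, Ptr, TypeSize::getFixed(16));
  }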
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
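A hedged sketch of the insert() contract above (markVisited is a hypothetical helper): the returned bool is false when the pointer was already present, which is how visited-sets avoid re-walking DAG nodes.
  #include "llvm/ADT/SmallPtrSet.h"
  #include "llvm/CodeGen/SelectionDAGNodes.h"
  using namespace llvm;

  // Hypothetical sketch: returns true only the first time N is seen.
  static bool markVisited(SmallPtrSet<const SDNode *, 8> &Visited, const SDNode *N) {
    return Visited.insert(N).second;
  }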
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition StringRef.h:862
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:154
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction in which the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a type is legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:420
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition Type.h:258
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:107
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI void set(Value *Val)
Definition Value.h:905
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
const Use & getOperandUse(unsigned i) const
Definition User.h:245
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1101
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:396
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isZero() const
Definition TypeSize.h:154
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:134
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating-point types.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ GlobalAddress
Definition ISDOpcodes.h:88
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:656
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:773
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:528
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:535
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined; 0 Round to 0; 1 Round to nearest, ties to even; 2 Round to ...
Definition ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
Offsets
Offsets in bytes from the start of the input buffer.
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
Definition MathExtras.h:54
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:310
@ Offset
Definition DWP.cpp:477
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition Analysis.cpp:241
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:833
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
@ Done
Definition Threading.h:60
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition MathExtras.h:232
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:289
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2118
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition MathExtras.h:557
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
Definition ModRef.h:296
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:396
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition bit.h:186
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:282
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1714
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant bit, stopping at the first 1.
Definition bit.h:222
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition Analysis.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164
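A hedged sketch exercising a few of the bit/MathExtras helpers listed here (mathHelpers is a hypothetical function); the expected results are noted in the comments.
  #include "llvm/ADT/bit.h"
  #include "llvm/Support/MathExtras.h"
  using namespace llvm;

  // Hypothetical sketch of Hi_32/Lo_32, isPowerOf2_32, Log2_32 and countr_zero.
  static void mathHelpers() {
    uint64_t V = 0x0000000100000008ULL;
    uint32_t Hi = Hi_32(V);        // 0x1
    uint32_t Lo = Lo_32(V);        // 0x8
    bool P2 = isPowerOf2_32(Lo);   // true
    unsigned L2 = Log2_32(Lo);     // 3
    int TZ = countr_zero(Lo);      // 3
    (void)Hi; (void)P2; (void)L2; (void)TZ;
  }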
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
unsigned getUndefRegState(bool B)
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:155
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition MathExtras.h:241
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
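A hedged sketch of the isa<>/cast<>/dyn_cast<> idiom these entries describe (isZeroConstant and isLoadNode are hypothetical helpers operating on SelectionDAG nodes):
  #include "llvm/CodeGen/SelectionDAGNodes.h"
  using namespace llvm;

  // Hypothetical sketch: dyn_cast returns null on a type mismatch; isa only tests.
  static bool isZeroConstant(SDValue V) {
    if (auto *C = dyn_cast<ConstantSDNode>(V))
      return C->isZero();
    return false;
  }
  static bool isLoadNode(const SDNode *N) { return isa<LoadSDNode>(N); }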
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1740
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1879
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:212
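A hedged sketch of the alignment helpers above (alignExamples is hypothetical), with the expected results noted in comments.
  #include "llvm/Support/Alignment.h"
  using namespace llvm;

  // Hypothetical sketch of alignTo and commonAlignment.
  static void alignExamples() {
    Align A(16);
    uint64_t Padded = alignTo(20, A);      // 32
    Align Common = commonAlignment(A, 8);  // Align(8): limited by the 8-byte offset
    (void)Padded; (void)Common;
  }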
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:86
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853
#define N
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:304
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
Definition APFloat.cpp:264
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:85
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition SCCPSolver.h:42
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:113
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
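A hedged sketch tying together several of the EVT helpers listed above (evtExamples is a hypothetical function; Ctx is a placeholder LLVMContext reference).
  #include "llvm/CodeGen/ValueTypes.h"
  #include "llvm/IR/LLVMContext.h"
  using namespace llvm;

  // Hypothetical sketch: build a v4i32 EVT and query it.
  static void evtExamples(LLVMContext &Ctx) {
    EVT EltVT = EVT::getIntegerVT(Ctx, 32);                 // i32
    EVT VecVT = EVT::getVectorVT(Ctx, EltVT, 4);            // v4i32
    EVT Scalar = VecVT.getScalarType();                     // i32 again
    uint64_t Bits = VecVT.getSizeInBits().getFixedValue();  // 128
    bool FP = VecVT.isFloatingPoint();                      // false
    (void)Scalar; (void)Bits; (void)FP;
  }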
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
unsigned getOrigArgIndex() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:165
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:218
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:173
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:340
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:241
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs