1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "AMDGPUTargetMachine.h"
19#include "GCNSubtarget.h"
22#include "SIRegisterInfo.h"
23#include "llvm/ADT/APInt.h"
25#include "llvm/ADT/Statistic.h"
39#include "llvm/IR/IRBuilder.h"
41#include "llvm/IR/IntrinsicsAMDGPU.h"
42#include "llvm/IR/IntrinsicsR600.h"
43#include "llvm/IR/MDBuilder.h"
46#include "llvm/Support/ModRef.h"
48#include <optional>
49
50using namespace llvm;
51using namespace llvm::SDPatternMatch;
52
53#define DEBUG_TYPE "si-lower"
54
55STATISTIC(NumTailCalls, "Number of tail calls");
56
57static cl::opt<bool>
58 DisableLoopAlignment("amdgpu-disable-loop-alignment",
59 cl::desc("Do not align and prefetch loops"),
60 cl::init(false));
61
63 "amdgpu-use-divergent-register-indexing", cl::Hidden,
64 cl::desc("Use indirect register addressing for divergent indexes"),
65 cl::init(false));
66
67// TODO: This option should be removed once we switch to always using PTRADD in
68// the SelectionDAG.
70 "amdgpu-use-sdag-ptradd", cl::Hidden,
71 cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the "
72 "SelectionDAG ISel"),
73 cl::init(false));
74
75static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
76 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
77 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
78}
79
80static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
81 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
82 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
83}
84
85static unsigned findFirstFreeSGPR(CCState &CCInfo) {
86 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
87 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
88 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
89 return AMDGPU::SGPR0 + Reg;
90 }
91 }
92 llvm_unreachable("Cannot allocate sgpr");
93}
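// Usage sketch (illustrative, not part of the original file): callers that
// need an extra SGPR typically reserve the returned register immediately,
// e.g.
//   unsigned Reg = findFirstFreeSGPR(CCInfo);
//   CCInfo.AllocateReg(Reg);
// so that later calling-convention allocations skip it.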
94
95SITargetLowering::SITargetLowering(const TargetMachine &TM,
96 const GCNSubtarget &STI)
97 : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
98 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
99 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
100
101 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
102 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
103
104 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
105
106 const SIRegisterInfo *TRI = STI.getRegisterInfo();
107 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
108
109 addRegisterClass(MVT::f64, V64RegClass);
110 addRegisterClass(MVT::v2f32, V64RegClass);
111 addRegisterClass(MVT::Untyped, V64RegClass);
112
113 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
114 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
115
116 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
117 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
118
119 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
120 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
121
122 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
123 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
124
125 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
126 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
127
128 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
129 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
130
131 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
132 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
133
134 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
135 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
136
137 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
138 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
139
140 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
141 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
142
143 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
144 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
145
146 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
147 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
148
149 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
150 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
151
152 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
153 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
154
155 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
156 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
157
158 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
159 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
160
161 if (Subtarget->has16BitInsts()) {
162 if (Subtarget->useRealTrue16Insts()) {
163 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
164 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
165 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
166 } else {
167 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
168 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
169 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
170 }
171
172 // Unless there are also VOP3P operations, no operations are really legal.
173 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
174 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
175 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
176 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
177 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
178 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
179 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
180 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
181 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
182 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
183 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
184 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
185 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
186 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
187 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
188 }
189
190 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
191 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
192
193 computeRegisterProperties(Subtarget->getRegisterInfo());
194
195 // The boolean content concept here is too inflexible. Compares only ever
196 // really produce a 1-bit result. Any copy/extend from these will turn into a
197 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
198 // it's what most targets use.
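  // Illustrative consequence (assumption, not stated above): with
  // ZeroOrOneBooleanContent, (zext (setcc ...)) can be materialized as a
  // select of the constants 1 and 0, i.e. a single conditional move per lane.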
199 setBooleanContents(ZeroOrOneBooleanContent);
200 setBooleanVectorContents(ZeroOrOneBooleanContent);
201
202 // We need to custom lower vector stores from local memory
203 setOperationAction(ISD::LOAD,
204 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
205 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
206 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
207 MVT::i1, MVT::v32i32},
208 Custom);
209
210 setOperationAction(ISD::STORE,
211 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
212 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
213 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
214 MVT::i1, MVT::v32i32},
215 Custom);
216
217 if (isTypeLegal(MVT::bf16)) {
218 for (unsigned Opc :
220 ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
221 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
222 ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
223 ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
224 ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
225 ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
226 ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
227 ISD::SETCC}) {
228 // FIXME: The promoted-to type shouldn't need to be explicit
229 setOperationAction(Opc, MVT::bf16, Promote);
230 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
231 }
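  // Rough effect of the promotions above (assumption, not from the source):
  // an op such as (fadd bf16:a, bf16:b) is legalized approximately as
  //   (fp_round (fadd (fp_extend a), (fp_extend b)))
  // with the arithmetic done in f32.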
232
234
236 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
237
238 setOperationAction(ISD::FABS, MVT::bf16, Legal);
239 setOperationAction(ISD::FNEG, MVT::bf16, Legal);
241
242 // We only need to custom lower because we can't specify an action for bf16
243 // sources.
246 }
247
248 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
249 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
250 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
251 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
252 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
253 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
254 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
255 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
256 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
257 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
258 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
259 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
260 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
261 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
262 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
263 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
264
265 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
266 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
267 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
268 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
269 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
270 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
271 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
272
273 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
274
278 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
279
280 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
281
283 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
284
286 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
287 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
288
290 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
291 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
292 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
293 Expand);
295 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
296 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
297 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
298 Expand);
299
301 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
302 MVT::v3i16, MVT::v4i16, MVT::Other},
303 Custom);
304
305 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
306 setOperationAction(ISD::BR_CC,
307 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
308
310
312
314 Expand);
315
316#if 0
318#endif
319
320 // We only support LOAD/STORE and vector manipulation ops for vectors
321 // with > 4 elements.
322 for (MVT VT :
323 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
324 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
325 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
326 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
327 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
328 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
329 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
330 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
331 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
332 switch (Op) {
333 case ISD::LOAD:
334 case ISD::STORE:
336 case ISD::BITCAST:
337 case ISD::UNDEF:
341 case ISD::IS_FPCLASS:
342 break;
347 break;
348 default:
350 break;
351 }
352 }
353 }
354
355 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
356
357 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
358 // is expanded to avoid having two separate loops in case the index is a VGPR.
359
360 // Most operations are naturally 32-bit vector operations. We only support
361 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
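  // Rough effect of the promotions set up in the loops below (assumption, not
  // stated here): each promoted 64-bit-element operation is rewritten in terms
  // of the bitcast 32-bit-element vector, so only 32-bit vector pieces reach
  // instruction selection.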
362 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
364 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
365
367 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
368
370 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
371
373 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
374 }
375
376 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
378 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
379
381 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
382
384 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
385
387 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
388 }
389
390 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
392 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
393
395 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
396
398 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
399
401 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
402 }
403
404 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
406 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
407
409 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
410
412 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
413
415 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
416 }
417
418 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
420 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
421
423 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
424
426 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
427
429 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
430 }
431
433 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
434 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
435 Custom);
436
437 if (Subtarget->hasPkMovB32()) {
438 // TODO: 16-bit element vectors should be legal with even aligned elements.
439 // TODO: Can be legal with wider source types than the result with
440 // subregister extracts.
441 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
442 }
443
445 // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
446 // instead lower to cndmask in SITargetLowering::LowerSELECT().
448 // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
449 // alignbit.
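  // Illustrative example (assumption): (or (srl x, c), (shl x, (sub 32, c)))
  // can be matched as (rotr x, c) and then selected to v_alignbit_b32 for
  // each 32-bit lane.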
450 setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
451
452 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
453 Custom);
454
455 // Avoid stack access for these.
456 // TODO: Generalize to more vector types.
458 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
459 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
460 Custom);
461
462 // Deal with vec3 vector operations when widened to vec4.
464 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
465
466 // Deal with vec5/6/7 vector operations when widened to vec8.
468 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
469 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
470 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
471 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
472 Custom);
473
474 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
475 // and output demarshalling
476 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
477
478 // We can't return success/failure, only the old value,
479 // let LLVM add the comparison
480 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64},
481 Expand);
482
483 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
484
485 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
486
487 // FIXME: This should be narrowed to i32, but that only happens if i64 is
488 // illegal.
489 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
490 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
491
492 // This is s_memtime on SI and s_memrealtime on VI.
493 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
494
495 if (Subtarget->hasSMemRealTime() ||
496 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
497 setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
498 setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom);
499
500 if (Subtarget->has16BitInsts()) {
501 setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
502 setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
503 } else {
504 setOperationAction(ISD::FSQRT, MVT::f16, Custom);
505 }
506
507 if (Subtarget->hasMadMacF32Insts())
509
510 if (!Subtarget->hasBFI())
511 // fcopysign can be done in a single instruction with BFI.
512 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
513
514 if (!Subtarget->hasBCNT(32))
516
517 if (!Subtarget->hasBCNT(64))
519
520 if (Subtarget->hasFFBH())
522
523 if (Subtarget->hasFFBL())
525
526 // We only really have 32-bit BFE instructions (and 16-bit on VI).
527 //
528 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
529 // effort to match them now. We want this to be false for i64 cases when the
530 // extraction isn't restricted to the upper or lower half. Ideally we would
531 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
532 // span the midpoint are probably relatively rare, so don't worry about them
533 // for now.
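  // Illustrative example (assumption): an i32 pattern like
  // (and (srl x, 8), 0xff) can stay in bitfield-extract form and select
  // s_bfe_u32 / v_bfe_u32, while the corresponding i64 extracts are not
  // matched to the scalar 64-bit BFE, as explained above.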
534 if (Subtarget->hasBFE())
535 setHasExtractBitsInsn(true);
536
537 // Clamp modifier on add/sub
538 if (Subtarget->hasIntClamp())
540
541 if (Subtarget->hasAddNoCarry())
542 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
543 Legal);
544
546 {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
547 {MVT::f32, MVT::f64}, Custom);
548
549 // These are really only legal for ieee_mode functions. We should be avoiding
550 // them for functions that don't have ieee_mode enabled, so just say they are
551 // legal.
552 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
553 {MVT::f32, MVT::f64}, Legal);
554
555 if (Subtarget->haveRoundOpsF64())
556 setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64,
557 Legal);
558 else
559 setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
560 MVT::f64, Custom);
561
562 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
563 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
564 Legal);
565 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
566
567 setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
569
570 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
571 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
572
573 // Custom lower these because we can't specify a rule based on an illegal
574 // source bf16.
575 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom);
576 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom);
577
578 if (Subtarget->has16BitInsts()) {
581 MVT::i16, Legal);
582
583 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
584
586 MVT::i16, Expand);
587
591 ISD::CTPOP},
592 MVT::i16, Promote);
593
594 setOperationAction(ISD::LOAD, MVT::i16, Custom);
595
596 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
597
598 setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
599 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
600 setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
601 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
602
606
608
609 // F16 - Constant Actions.
612
613 // F16 - Load/Store Actions.
614 setOperationAction(ISD::LOAD, MVT::f16, Promote);
615 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
616 setOperationAction(ISD::STORE, MVT::f16, Promote);
617 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
618
619 // BF16 - Load/Store Actions.
620 setOperationAction(ISD::LOAD, MVT::bf16, Promote);
621 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
622 setOperationAction(ISD::STORE, MVT::bf16, Promote);
623 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
624
625 // F16 - VOP1 Actions.
627 ISD::FSIN, ISD::FROUND},
628 MVT::f16, Custom);
629
630 // BF16 - VOP1 Actions.
631 if (Subtarget->hasBF16TransInsts())
632 setOperationAction({ISD::FCOS, ISD::FSIN, ISD::FDIV}, MVT::bf16, Custom);
633
636
637 // F16 - VOP2 Actions.
638 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
639 Expand);
640 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);
641 setOperationAction(ISD::FFREXP, MVT::f16, Custom);
643
644 // F16 - VOP3 Actions.
646 if (STI.hasMadF16())
648
649 for (MVT VT :
650 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
651 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
652 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
653 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
654 switch (Op) {
655 case ISD::LOAD:
656 case ISD::STORE:
658 case ISD::BITCAST:
659 case ISD::UNDEF:
664 case ISD::IS_FPCLASS:
665 break;
669 break;
670 default:
672 break;
673 }
674 }
675 }
676
677 // v_perm_b32 can handle either of these.
678 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
680
681 // XXX - Do these do anything? Vector constants turn into build_vector.
682 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
683
684 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
685 Legal);
686
687 setOperationAction(ISD::STORE, MVT::v2i16, Promote);
688 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
689 setOperationAction(ISD::STORE, MVT::v2f16, Promote);
690 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
691
692 setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
693 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
694 setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
695 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
696
697 setOperationAction(ISD::AND, MVT::v2i16, Promote);
698 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
699 setOperationAction(ISD::OR, MVT::v2i16, Promote);
700 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
701 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
702 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
703
704 setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
705 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
706 setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
707 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
708 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
709 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
710
711 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
712 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
713 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
714 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
715 setOperationAction(ISD::STORE, MVT::v4bf16, Promote);
716 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
717
718 setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
719 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
720 setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
721 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
722 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
723 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
724
725 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
726 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
727 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
728 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
729
730 setOperationAction(ISD::STORE, MVT::v8i16, Promote);
731 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
732 setOperationAction(ISD::STORE, MVT::v8f16, Promote);
733 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
734 setOperationAction(ISD::STORE, MVT::v8bf16, Promote);
735 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
736
737 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
738 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
739 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
740 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
741 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
742 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
743
744 setOperationAction(ISD::STORE, MVT::v16i16, Promote);
745 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
746 setOperationAction(ISD::STORE, MVT::v16f16, Promote);
747 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
748 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
749 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
750
751 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
752 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
753 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
754 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
755 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
756 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
757
758 setOperationAction(ISD::STORE, MVT::v32i16, Promote);
759 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
760 setOperationAction(ISD::STORE, MVT::v32f16, Promote);
761 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
762 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
763 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
764
766 MVT::v2i32, Expand);
767 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
768
770 MVT::v4i32, Expand);
771
773 MVT::v8i32, Expand);
774
775 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
776 Subtarget->hasVOP3PInsts() ? Legal : Custom);
777
778 setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
779 // This isn't really legal, but this avoids the legalizer unrolling it (and
780 // allows matching fneg (fabs x) patterns)
781 setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);
782
783 // Can do this in one BFI plus a constant materialize.
785 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
786 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
787 MVT::v32f16, MVT::v32bf16},
788 Custom);
789
791 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
792 MVT::f16, Custom);
793 setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
794
795 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
796 ISD::FMAXIMUMNUM},
797 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
798 Custom);
799
800 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
801 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
802 Expand);
803
804 for (MVT Vec16 :
805 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
806 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
809 Vec16, Custom);
811 }
812 }
813
814 if (Subtarget->hasVOP3PInsts()) {
818 MVT::v2i16, Legal);
819
820 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
821 ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
822 MVT::v2f16, Legal);
823
825 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
826
828 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
829 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
830 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
831 Custom);
832
833 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
834 // Split vector operations.
839 VT, Custom);
840
841 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
842 // Split vector operations.
844 VT, Custom);
845
847 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
848 {MVT::v2f16, MVT::v4f16}, Custom);
849
850 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
851 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
852 Custom);
853
854 if (Subtarget->hasPackedFP32Ops()) {
856 MVT::v2f32, Legal);
858 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
859 Custom);
860 }
861 }
862
863 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);
864
865 if (Subtarget->has16BitInsts()) {
867 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
869 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
870 } else {
871 // Legalization hack.
872 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
873
874 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
875 }
876
878 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
879 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
880 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
881 MVT::v32f16, MVT::v32bf16},
882 Custom);
883
885
886 if (Subtarget->hasVectorMulU64())
888 else if (Subtarget->hasScalarSMulU64())
890
891 if (Subtarget->hasMad64_32())
893
894 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
895 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
896
897 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
898 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
899 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
900 } else {
901 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
902 if (Subtarget->hasMinimum3Maximum3F32())
903 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);
904
905 if (Subtarget->hasMinimum3Maximum3PKF16()) {
906 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal);
907
908 // If only the vector form is available, we need to widen to a vector.
909 if (!Subtarget->hasMinimum3Maximum3F16())
910 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16, Custom);
911 }
912 }
913
914 if (Subtarget->hasVOP3PInsts()) {
915 // We want to break these into v2f16 pieces, not scalarize.
916 setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
917 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
918 Custom);
919 }
920
921 if (Subtarget->hasIntMinMax64())
923 Legal);
924
926 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
927 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
928 MVT::i8},
929 Custom);
930
932 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
933 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
934 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
935 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
936 Custom);
937
939 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
940 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
941 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
942 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
943 Custom);
944
945 setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
947 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
948 setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
949 setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);
950
951 // TODO: Could move this to custom lowering, could benefit from combines on
952 // extract of relevant bits.
953 setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
954
956
957 if (Subtarget->hasBF16ConversionInsts()) {
958 setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
960 }
961
962 if (Subtarget->hasBF16PackedInsts()) {
964 {ISD::FADD, ISD::FMUL, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMA},
965 MVT::v2bf16, Legal);
966 }
967
968 if (Subtarget->hasBF16TransInsts()) {
969 setOperationAction({ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, MVT::bf16, Legal);
970 }
971
972 if (Subtarget->hasCvtPkF16F32Inst()) {
974 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
975 Custom);
976 }
977
979 ISD::PTRADD,
981 ISD::SUB,
983 ISD::MUL,
984 ISD::FADD,
985 ISD::FSUB,
986 ISD::FDIV,
987 ISD::FMUL,
988 ISD::FMINNUM,
989 ISD::FMAXNUM,
990 ISD::FMINNUM_IEEE,
991 ISD::FMAXNUM_IEEE,
992 ISD::FMINIMUM,
993 ISD::FMAXIMUM,
994 ISD::FMINIMUMNUM,
995 ISD::FMAXIMUMNUM,
996 ISD::FMA,
997 ISD::SMIN,
998 ISD::SMAX,
999 ISD::UMIN,
1000 ISD::UMAX,
1001 ISD::SETCC,
1003 ISD::SMIN,
1004 ISD::SMAX,
1005 ISD::UMIN,
1006 ISD::UMAX,
1007 ISD::AND,
1008 ISD::OR,
1009 ISD::XOR,
1010 ISD::SHL,
1011 ISD::SRL,
1012 ISD::SRA,
1013 ISD::FSHR,
1023
1024 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1026
1027 // All memory operations. Some folding on the pointer operand is done to help
1028 // match the constant offsets in the addressing modes.
1029 setTargetDAGCombine({ISD::LOAD,
1030 ISD::STORE,
1031 ISD::ATOMIC_LOAD,
1032 ISD::ATOMIC_STORE,
1033 ISD::ATOMIC_CMP_SWAP,
1034 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
1035 ISD::ATOMIC_SWAP,
1036 ISD::ATOMIC_LOAD_ADD,
1037 ISD::ATOMIC_LOAD_SUB,
1038 ISD::ATOMIC_LOAD_AND,
1039 ISD::ATOMIC_LOAD_OR,
1040 ISD::ATOMIC_LOAD_XOR,
1041 ISD::ATOMIC_LOAD_NAND,
1042 ISD::ATOMIC_LOAD_MIN,
1043 ISD::ATOMIC_LOAD_MAX,
1044 ISD::ATOMIC_LOAD_UMIN,
1045 ISD::ATOMIC_LOAD_UMAX,
1046 ISD::ATOMIC_LOAD_FADD,
1047 ISD::ATOMIC_LOAD_FMIN,
1048 ISD::ATOMIC_LOAD_FMAX,
1049 ISD::ATOMIC_LOAD_UINC_WRAP,
1050 ISD::ATOMIC_LOAD_UDEC_WRAP,
1053
1054 // FIXME: In other contexts we pretend this is a per-function property.
1056
1058}
1059
1060const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1061
1062ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
1063 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1064 return RCRegs;
1065}
1066
1067//===----------------------------------------------------------------------===//
1068// TargetLowering queries
1069//===----------------------------------------------------------------------===//
1070
1071// v_mad_mix* support a conversion from f16 to f32.
1072//
1073// There is only one special case when denormals are enabled that we don't
1074// currently handle where this is OK to use.
1075bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1076 EVT DestVT, EVT SrcVT) const {
1077 return DestVT.getScalarType() == MVT::f32 &&
1078 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1079 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1080 SrcVT.getScalarType() == MVT::f16) ||
1081 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1082 SrcVT.getScalarType() == MVT::bf16)) &&
1083 // TODO: This probably only requires no input flushing?
1084 denormalModeIsFlushAllF32(DAG.getMachineFunction());
1085}
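// Illustrative example (assumption, not from the source): with fma-mix
// available and f32 denormals flushed, a node such as
//   (fma (fp_extend f16:a), (fp_extend f16:b), f32:c)
// keeps the extensions folded and can select v_fma_mix_f32 rather than
// emitting separate f16-to-f32 conversions.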
1086
1087bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
1088 LLT DestTy, LLT SrcTy) const {
1089 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1090 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1091 DestTy.getScalarSizeInBits() == 32 &&
1092 SrcTy.getScalarSizeInBits() == 16 &&
1093 // TODO: This probably only requires no input flushing?
1094 denormalModeIsFlushAllF32(*MI.getMF());
1095}
1096
1097bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
1098 // SI has some legal vector types, but no legal vector operations. Say no
1099 // shuffles are legal in order to prefer scalarizing some vector operations.
1100 return false;
1101}
1102
1103MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
1104 CallingConv::ID CC,
1105 EVT VT) const {
1106 if (CC == CallingConv::AMDGPU_KERNEL)
1107 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1108
1109 if (VT.isVector()) {
1110 EVT ScalarVT = VT.getScalarType();
1111 unsigned Size = ScalarVT.getSizeInBits();
1112 if (Size == 16) {
1113 if (Subtarget->has16BitInsts()) {
1114 if (VT.isInteger())
1115 return MVT::v2i16;
1116 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1117 }
1118 return VT.isInteger() ? MVT::i32 : MVT::f32;
1119 }
1120
1121 if (Size < 16)
1122 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1123 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1124 }
1125
1126 if (VT.getSizeInBits() > 32)
1127 return MVT::i32;
1128
1129 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1130}
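// Illustrative mapping (assumption): for a non-kernel calling convention on a
// subtarget with 16-bit instructions, a v4f16 argument is passed as two v2f16
// registers, while a v3i32 argument stays as three i32 registers; the piece
// counts come from getNumRegistersForCallingConv below.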
1131
1132unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
1133 CallingConv::ID CC,
1134 EVT VT) const {
1135 if (CC == CallingConv::AMDGPU_KERNEL)
1136 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1137
1138 if (VT.isVector()) {
1139 unsigned NumElts = VT.getVectorNumElements();
1140 EVT ScalarVT = VT.getScalarType();
1141 unsigned Size = ScalarVT.getSizeInBits();
1142
1143 // FIXME: Should probably promote 8-bit vectors to i16.
1144 if (Size == 16 && Subtarget->has16BitInsts())
1145 return (NumElts + 1) / 2;
1146
1147 if (Size <= 32)
1148 return NumElts;
1149
1150 if (Size > 32)
1151 return NumElts * ((Size + 31) / 32);
1152 } else if (VT.getSizeInBits() > 32)
1153 return (VT.getSizeInBits() + 31) / 32;
1154
1155 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1156}
1157
1158unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
1159 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1160 unsigned &NumIntermediates, MVT &RegisterVT) const {
1161 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1162 unsigned NumElts = VT.getVectorNumElements();
1163 EVT ScalarVT = VT.getScalarType();
1164 unsigned Size = ScalarVT.getSizeInBits();
1165 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1166 // support, but unless we can properly handle 3-vectors, it will still be
1167 // inconsistent.
1168 if (Size == 16 && Subtarget->has16BitInsts()) {
1169 if (ScalarVT == MVT::bf16) {
1170 RegisterVT = MVT::i32;
1171 IntermediateVT = MVT::v2bf16;
1172 } else {
1173 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1174 IntermediateVT = RegisterVT;
1175 }
1176 NumIntermediates = (NumElts + 1) / 2;
1177 return NumIntermediates;
1178 }
1179
1180 if (Size == 32) {
1181 RegisterVT = ScalarVT.getSimpleVT();
1182 IntermediateVT = RegisterVT;
1183 NumIntermediates = NumElts;
1184 return NumIntermediates;
1185 }
1186
1187 if (Size < 16 && Subtarget->has16BitInsts()) {
1188 // FIXME: Should probably form v2i16 pieces
1189 RegisterVT = MVT::i16;
1190 IntermediateVT = ScalarVT;
1191 NumIntermediates = NumElts;
1192 return NumIntermediates;
1193 }
1194
1195 if (Size != 16 && Size <= 32) {
1196 RegisterVT = MVT::i32;
1197 IntermediateVT = ScalarVT;
1198 NumIntermediates = NumElts;
1199 return NumIntermediates;
1200 }
1201
1202 if (Size > 32) {
1203 RegisterVT = MVT::i32;
1204 IntermediateVT = RegisterVT;
1205 NumIntermediates = NumElts * ((Size + 31) / 32);
1206 return NumIntermediates;
1207 }
1208 }
1209
1210 return TargetLoweringBase::getVectorTypeBreakdownForCallingConv(
1211 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1212}
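// Example breakdown (assumption): for a non-kernel call, a v5f16 value with
// 16-bit instructions available is split into NumIntermediates = 3 pieces of
// v2f16, with only half of the last piece used.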
1213
1214static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
1215 const DataLayout &DL, Type *Ty,
1216 unsigned MaxNumLanes) {
1217 assert(MaxNumLanes != 0);
1218
1219 LLVMContext &Ctx = Ty->getContext();
1220 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1221 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1222 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1223 NumElts);
1224 }
1225
1226 return TLI.getValueType(DL, Ty);
1227}
1228
1229// Peek through TFE struct returns to only use the data size.
1230static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
1231 const DataLayout &DL, Type *Ty,
1232 unsigned MaxNumLanes) {
1233 auto *ST = dyn_cast<StructType>(Ty);
1234 if (!ST)
1235 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1236
1237 // TFE intrinsics return an aggregate type.
1238 assert(ST->getNumContainedTypes() == 2 &&
1239 ST->getContainedType(1)->isIntegerTy(32));
1240 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1241}
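// Illustrative example (assumption): a TFE image load declared to return
// {<4 x float>, i32} reports a memVT of v4f32 here; the trailing i32 status
// word is not counted as loaded data.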
1242
1243/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1244/// in-memory representation. This return value is a custom type because there
1245/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1246/// could cause issues during codegen, these address space 7 pointers will be
1247/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1248/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1249/// for cost modeling, to work. (This also sets us up decently for doing the
1250/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1252 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1253 return MVT::amdgpuBufferFatPointer;
1255 DL.getPointerSizeInBits(AS) == 192)
1256 return MVT::amdgpuBufferStridedPointer;
1258}
1259/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1260/// v8i32 when padding is added.
1261/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1262/// also v8i32 with padding.
1264 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1265 DL.getPointerSizeInBits(AS) == 160) ||
1267 DL.getPointerSizeInBits(AS) == 192))
1268 return MVT::v8i32;
1270}
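// Illustrative results (assumption): for a buffer fat pointer (address space
// 7) the hooks above report MVT::amdgpuBufferFatPointer as the value type but
// v8i32 as the in-memory type, so loads and stores of p7 round-trip through a
// 256-bit representation.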
1271
1272static unsigned getIntrMemWidth(unsigned IntrID) {
1273 switch (IntrID) {
1274 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1275 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1276 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1277 return 8;
1278 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1279 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1280 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1281 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1282 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1283 return 32;
1284 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1285 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1286 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1287 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1288 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1289 return 64;
1290 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1291 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1292 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1293 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1294 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1295 return 128;
1296 default:
1297 llvm_unreachable("Unknown width");
1298 }
1299}
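// Example (assumption):
// getIntrMemWidth(Intrinsic::amdgcn_global_load_async_to_lds_b64) returns 64,
// which callers below turn into an i64 memVT for the memory operand.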
1300
1301static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad,
1302 TargetLowering::IntrinsicInfo &Info) {
1303 Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2);
1304 unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
1305 switch (AtomicOrderingCABI(Ord)) {
1308 break;
1311 break;
1314 break;
1315 default:
1317 break;
1318 }
1319
1320 Info.flags =
1322 Info.flags |= MOCooperative;
1323
1324 MDNode *ScopeMD = cast<MDNode>(
1325 cast<MetadataAsValue>(CI.getArgOperand(IsLoad ? 2 : 3))->getMetadata());
1326 StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
1327 Info.ssid = CI.getContext().getOrInsertSyncScopeID(Scope);
1328}
1329
1330bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
1331 const CallInst &CI,
1332 MachineFunction &MF,
1333 unsigned IntrID) const {
1334 Info.flags = MachineMemOperand::MONone;
1335 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1336 Info.flags |= MachineMemOperand::MOInvariant;
1337 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1339 Info.flags |= getTargetMMOFlags(CI);
1340
1341 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1343 AttributeSet Attr =
1345 MemoryEffects ME = Attr.getMemoryEffects();
1346 if (ME.doesNotAccessMemory())
1347 return false;
1348
1349 // TODO: Should images get their own address space?
1350 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1351
1352 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1353 if (RsrcIntr->IsImage) {
1354 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1356 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1357 Info.align.reset();
1358 }
1359
1360 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1361 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1362 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1363 // We conservatively set the memory operand of a buffer intrinsic to the
1364 // base resource pointer, so that we can access alias information about
1365 // those pointers. Cases like "this points at the same value
1366 // but with a different offset" are handled in
1367 // areMemAccessesTriviallyDisjoint.
1368 Info.ptrVal = RsrcArg;
1369 }
1370
1371 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1372 if (!IsSPrefetch) {
1373 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1374 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1375 Info.flags |= MachineMemOperand::MOVolatile;
1376 }
1377
1379 if (ME.onlyReadsMemory()) {
1380 if (RsrcIntr->IsImage) {
1381 unsigned MaxNumLanes = 4;
1382
1383 if (!BaseOpcode->Gather4) {
1384 // If this isn't a gather, we may have excess loaded elements in the
1385 // IR type. Check the dmask for the real number of elements loaded.
1386 unsigned DMask =
1387 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1388 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1389 }
1390
1391 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1392 CI.getType(), MaxNumLanes);
1393 } else {
1394 Info.memVT =
1396 std::numeric_limits<unsigned>::max());
1397 }
1398
1399 // FIXME: What does alignment mean for an image?
1400 Info.opc = ISD::INTRINSIC_W_CHAIN;
1401 Info.flags |= MachineMemOperand::MOLoad;
1402 } else if (ME.onlyWritesMemory()) {
1403 Info.opc = ISD::INTRINSIC_VOID;
1404
1405 Type *DataTy = CI.getArgOperand(0)->getType();
1406 if (RsrcIntr->IsImage) {
1407 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1408 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1409 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1410 DMaskLanes);
1411 } else
1412 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1413
1414 Info.flags |= MachineMemOperand::MOStore;
1415 } else {
1416 // Atomic, NoReturn Sampler or prefetch
1417 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1419 Info.flags |=
1421
1422 if (!IsSPrefetch)
1423 Info.flags |= MachineMemOperand::MOStore;
1424
1425 switch (IntrID) {
1426 default:
1427 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1428 // Fake memory access type for no return sampler intrinsics
1429 Info.memVT = MVT::i32;
1430 } else {
1431 // XXX - Should this be volatile without known ordering?
1432 Info.flags |= MachineMemOperand::MOVolatile;
1433 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1434 }
1435 break;
1436 case Intrinsic::amdgcn_raw_buffer_load_lds:
1437 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1438 case Intrinsic::amdgcn_struct_buffer_load_lds:
1439 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1440 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1441 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1442 Info.ptrVal = CI.getArgOperand(1);
1443 return true;
1444 }
1445 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1446 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1447 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1448 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1449 Info.memVT =
1451 std::numeric_limits<unsigned>::max());
1452 Info.flags &= ~MachineMemOperand::MOStore;
1453 return true;
1454 }
1455 }
1456 }
1457 return true;
1458 }
1459
1460 switch (IntrID) {
1461 case Intrinsic::amdgcn_ds_ordered_add:
1462 case Intrinsic::amdgcn_ds_ordered_swap: {
1463 Info.opc = ISD::INTRINSIC_W_CHAIN;
1464 Info.memVT = MVT::getVT(CI.getType());
1465 Info.ptrVal = CI.getOperand(0);
1466 Info.align.reset();
1468
1469 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1470 if (!Vol->isZero())
1471 Info.flags |= MachineMemOperand::MOVolatile;
1472
1473 return true;
1474 }
1475 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1476 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1477 Info.opc = ISD::INTRINSIC_W_CHAIN;
1478 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1479 Info.ptrVal = nullptr;
1480 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1482 return true;
1483 }
1484 case Intrinsic::amdgcn_ds_append:
1485 case Intrinsic::amdgcn_ds_consume: {
1486 Info.opc = ISD::INTRINSIC_W_CHAIN;
1487 Info.memVT = MVT::getVT(CI.getType());
1488 Info.ptrVal = CI.getOperand(0);
1489 Info.align.reset();
1491
1492 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1493 if (!Vol->isZero())
1494 Info.flags |= MachineMemOperand::MOVolatile;
1495
1496 return true;
1497 }
1498 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1499 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1500 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1503 Info.memVT = MVT::getVT(CI.getType());
1504 Info.ptrVal = CI.getOperand(0);
1505 Info.memVT = MVT::i64;
1506 Info.size = 8;
1507 Info.align.reset();
1509 return true;
1510 }
1511 case Intrinsic::amdgcn_global_atomic_csub: {
1512 Info.opc = ISD::INTRINSIC_W_CHAIN;
1513 Info.memVT = MVT::getVT(CI.getType());
1514 Info.ptrVal = CI.getOperand(0);
1515 Info.align.reset();
1518 return true;
1519 }
1520 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1521 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1522 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1523 Info.opc = ISD::INTRINSIC_W_CHAIN;
1524 Info.memVT =
1525 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1526 ? CI.getType()
1528 ->getElementType(0)); // XXX: what is correct VT?
1529
1530 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1531 Info.align.reset();
1532 Info.flags |=
1534 return true;
1535 }
1536 case Intrinsic::amdgcn_global_atomic_fmin_num:
1537 case Intrinsic::amdgcn_global_atomic_fmax_num:
1538 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1539 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1540 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1541 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1542 Info.opc = ISD::INTRINSIC_W_CHAIN;
1543 Info.memVT = MVT::getVT(CI.getType());
1544 Info.ptrVal = CI.getOperand(0);
1545 Info.align.reset();
1549 return true;
1550 }
1551 case Intrinsic::amdgcn_flat_load_monitor_b32:
1552 case Intrinsic::amdgcn_flat_load_monitor_b64:
1553 case Intrinsic::amdgcn_flat_load_monitor_b128:
1554 case Intrinsic::amdgcn_global_load_monitor_b32:
1555 case Intrinsic::amdgcn_global_load_monitor_b64:
1556 case Intrinsic::amdgcn_global_load_monitor_b128:
1557 case Intrinsic::amdgcn_cluster_load_b32:
1558 case Intrinsic::amdgcn_cluster_load_b64:
1559 case Intrinsic::amdgcn_cluster_load_b128:
1560 case Intrinsic::amdgcn_ds_load_tr6_b96:
1561 case Intrinsic::amdgcn_ds_load_tr4_b64:
1562 case Intrinsic::amdgcn_ds_load_tr8_b64:
1563 case Intrinsic::amdgcn_ds_load_tr16_b128:
1564 case Intrinsic::amdgcn_global_load_tr6_b96:
1565 case Intrinsic::amdgcn_global_load_tr4_b64:
1566 case Intrinsic::amdgcn_global_load_tr_b64:
1567 case Intrinsic::amdgcn_global_load_tr_b128:
1568 case Intrinsic::amdgcn_ds_read_tr4_b64:
1569 case Intrinsic::amdgcn_ds_read_tr6_b96:
1570 case Intrinsic::amdgcn_ds_read_tr8_b64:
1571 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1572 Info.opc = ISD::INTRINSIC_W_CHAIN;
1573 Info.memVT = MVT::getVT(CI.getType());
1574 Info.ptrVal = CI.getOperand(0);
1575 Info.align.reset();
1576 Info.flags |= MachineMemOperand::MOLoad;
1577 return true;
1578 }
1579 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1580 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1581 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1582 Info.opc = ISD::INTRINSIC_W_CHAIN;
1583 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1584 Info.ptrVal = CI.getOperand(0);
1585 Info.align.reset();
1586 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/true, Info);
1587 return true;
1588 }
1589 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1590 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1591 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1592 Info.opc = ISD::INTRINSIC_VOID;
1593 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1594 Info.ptrVal = CI.getArgOperand(0);
1595 Info.align.reset();
1596 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/false, Info);
1597 return true;
1598 }
1599 case Intrinsic::amdgcn_ds_gws_init:
1600 case Intrinsic::amdgcn_ds_gws_barrier:
1601 case Intrinsic::amdgcn_ds_gws_sema_v:
1602 case Intrinsic::amdgcn_ds_gws_sema_br:
1603 case Intrinsic::amdgcn_ds_gws_sema_p:
1604 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1605 Info.opc = ISD::INTRINSIC_VOID;
1606
1607 const GCNTargetMachine &TM =
1608 static_cast<const GCNTargetMachine &>(getTargetMachine());
1609
1611 Info.ptrVal = MFI->getGWSPSV(TM);
1612
1613 // This is an abstract access, but we need to specify a type and size.
1614 Info.memVT = MVT::i32;
1615 Info.size = 4;
1616 Info.align = Align(4);
1617
1618 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1619 Info.flags |= MachineMemOperand::MOLoad;
1620 else
1621 Info.flags |= MachineMemOperand::MOStore;
1622 return true;
1623 }
1624 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1625 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1626 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1627 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1628 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1629 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1630 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1631 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1632 Info.opc = ISD::INTRINSIC_VOID;
1633 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1634 Info.ptrVal = CI.getArgOperand(1);
1636 return true;
1637 }
1638 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1639 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1640 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1641 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1642 Info.opc = ISD::INTRINSIC_VOID;
1643 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1644 Info.ptrVal = CI.getArgOperand(0);
1646 return true;
1647 }
1648 case Intrinsic::amdgcn_load_to_lds:
1649 case Intrinsic::amdgcn_global_load_lds: {
1650 Info.opc = ISD::INTRINSIC_VOID;
1651 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1652 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1653 Info.ptrVal = CI.getArgOperand(1);
1655 return true;
1656 }
1657 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1658 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1659 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1660 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1661 Info.opc = ISD::INTRINSIC_W_CHAIN;
1662
1663 const GCNTargetMachine &TM =
1664 static_cast<const GCNTargetMachine &>(getTargetMachine());
1665
1667 Info.ptrVal = MFI->getGWSPSV(TM);
1668
1669 // This is an abstract access, but we need to specify a type and size.
1670 Info.memVT = MVT::i32;
1671 Info.size = 4;
1672 Info.align = Align(4);
1673
1675 return true;
1676 }
1677 case Intrinsic::amdgcn_s_prefetch_data:
1678 case Intrinsic::amdgcn_flat_prefetch:
1679 case Intrinsic::amdgcn_global_prefetch: {
1680 Info.opc = ISD::INTRINSIC_VOID;
1681 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1682 Info.ptrVal = CI.getArgOperand(0);
1683 Info.flags |= MachineMemOperand::MOLoad;
1684 return true;
1685 }
1686 default:
1687 return false;
1688 }
1689}
1690
1691void SITargetLowering::CollectTargetIntrinsicOperands(
1692 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1693 switch (I.getIntrinsicID()) {
1694 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1695 // The DAG's ValueType loses the addrspaces.
1696 // Add them as 2 extra Constant operands "from" and "to".
1697 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1698 unsigned DstAS = I.getType()->getPointerAddressSpace();
1699 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1700 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1701 break;
1702 }
1703 default:
1704 break;
1705 }
1706}
1707
1708bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
1709 SmallVectorImpl<Value *> &Ops,
1710 Type *&AccessTy) const {
1711 Value *Ptr = nullptr;
1712 switch (II->getIntrinsicID()) {
1713 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1714 case Intrinsic::amdgcn_cluster_load_b128:
1715 case Intrinsic::amdgcn_cluster_load_b64:
1716 case Intrinsic::amdgcn_cluster_load_b32:
1717 case Intrinsic::amdgcn_ds_append:
1718 case Intrinsic::amdgcn_ds_consume:
1719 case Intrinsic::amdgcn_ds_load_tr8_b64:
1720 case Intrinsic::amdgcn_ds_load_tr16_b128:
1721 case Intrinsic::amdgcn_ds_load_tr4_b64:
1722 case Intrinsic::amdgcn_ds_load_tr6_b96:
1723 case Intrinsic::amdgcn_ds_read_tr4_b64:
1724 case Intrinsic::amdgcn_ds_read_tr6_b96:
1725 case Intrinsic::amdgcn_ds_read_tr8_b64:
1726 case Intrinsic::amdgcn_ds_read_tr16_b64:
1727 case Intrinsic::amdgcn_ds_ordered_add:
1728 case Intrinsic::amdgcn_ds_ordered_swap:
1729 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1730 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1731 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1732 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1733 case Intrinsic::amdgcn_flat_load_monitor_b128:
1734 case Intrinsic::amdgcn_flat_load_monitor_b32:
1735 case Intrinsic::amdgcn_flat_load_monitor_b64:
1736 case Intrinsic::amdgcn_global_atomic_csub:
1737 case Intrinsic::amdgcn_global_atomic_fmax_num:
1738 case Intrinsic::amdgcn_global_atomic_fmin_num:
1739 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1740 case Intrinsic::amdgcn_global_load_monitor_b128:
1741 case Intrinsic::amdgcn_global_load_monitor_b32:
1742 case Intrinsic::amdgcn_global_load_monitor_b64:
1743 case Intrinsic::amdgcn_global_load_tr_b64:
1744 case Intrinsic::amdgcn_global_load_tr_b128:
1745 case Intrinsic::amdgcn_global_load_tr4_b64:
1746 case Intrinsic::amdgcn_global_load_tr6_b96:
1747 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1748 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1749 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1750 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1751 Ptr = II->getArgOperand(0);
1752 break;
1753 case Intrinsic::amdgcn_load_to_lds:
1754 case Intrinsic::amdgcn_global_load_lds:
1755 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1756 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1757 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1758 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1759 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1760 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1761 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1762 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1763 Ptr = II->getArgOperand(1);
1764 break;
1765 default:
1766 return false;
1767 }
1768 AccessTy = II->getType();
1769 Ops.push_back(Ptr);
1770 return true;
1771}
1772
1774 unsigned AddrSpace) const {
1775 if (!Subtarget->hasFlatInstOffsets()) {
1776 // Flat instructions do not have offsets, and only have the register
1777 // address.
1778 return AM.BaseOffs == 0 && AM.Scale == 0;
1779 }
1780
1781 decltype(SIInstrFlags::FLAT) FlatVariant =
1782     AddrSpace == AMDGPUAS::PRIVATE_ADDRESS  ? SIInstrFlags::FlatScratch
1783     : AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ? SIInstrFlags::FlatGlobal
1784                                             : SIInstrFlags::FLAT;
1785
1786 return AM.Scale == 0 &&
1787 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1788 AM.BaseOffs, AddrSpace, FlatVariant));
1789}
1790
1792 if (Subtarget->hasFlatGlobalInsts())
1793   return isLegalFlatAddressingMode(AM, AMDGPUAS::GLOBAL_ADDRESS);
1794
1795 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1796 // Assume that we will use FLAT for all global memory accesses
1797 // on VI.
1798 // FIXME: This assumption is currently wrong. On VI we still use
1799 // MUBUF instructions for the r + i addressing mode. As currently
1800 // implemented, the MUBUF instructions only work on buffer < 4GB.
1801 // It may be possible to support > 4GB buffers with MUBUF instructions,
1802 // by setting the stride value in the resource descriptor which would
1803 // increase the size limit to (stride * 4GB). However, this is risky,
1804 // because it has never been validated.
1805   return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS);
1806 }
1807
1808 return isLegalMUBUFAddressingMode(AM);
1809}
1810
1811bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1812 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1813 // additionally can do r + r + i with addr64. 32-bit has more addressing
1814 // mode options. Depending on the resource constant, it can also do
1815 // (i64 r0) + (i32 r1) * (i14 i).
1816 //
1817 // Private arrays end up using a scratch buffer most of the time, so also
1818 // assume those use MUBUF instructions. Scratch loads / stores are currently
1819 // implemented as mubuf instructions with offen bit set, so slightly
1820 // different than the normal addr64.
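// For example (illustrative): an access of the form "base + 4 * idx + 16"
// reaches this query as {HasBaseReg = true, Scale = 4, BaseOffs = 16}, which
// the switch below rejects because only Scale values of 0, 1 and 2 are
// accepted.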
1821 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1822 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1823 return false;
1824
1825 // FIXME: Since we can split immediate into soffset and immediate offset,
1826 // would it make sense to allow any immediate?
1827
1828 switch (AM.Scale) {
1829 case 0: // r + i or just i, depending on HasBaseReg.
1830 return true;
1831 case 1:
1832 return true; // We have r + r or r + i.
1833 case 2:
1834 if (AM.HasBaseReg) {
1835 // Reject 2 * r + r.
1836 return false;
1837 }
1838
1839 // Allow 2 * r as r + r,
1840 // or 2 * r + i as r + r + i.
1841 return true;
1842 default: // Don't allow n * r
1843 return false;
1844 }
1845}
1846
1848 const AddrMode &AM, Type *Ty,
1849 unsigned AS,
1850 Instruction *I) const {
1851 // No global is ever allowed as a base.
1852 if (AM.BaseGV)
1853 return false;
1854
1855 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1856 return isLegalGlobalAddressingMode(AM);
1857
1858 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1862 // If the offset isn't a multiple of 4, it probably isn't going to be
1863 // correctly aligned.
1864 // FIXME: Can we get the real alignment here?
1865 if (AM.BaseOffs % 4 != 0)
1866 return isLegalMUBUFAddressingMode(AM);
1867
1868 if (!Subtarget->hasScalarSubwordLoads()) {
1869 // There are no SMRD extloads, so if we have to do a small type access we
1870 // will use a MUBUF load.
1871 // FIXME?: We also need to do this if unaligned, but we don't know the
1872 // alignment here.
1873 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1874 return isLegalGlobalAddressingMode(AM);
1875 }
1876
1877 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1878 // SMRD instructions have an 8-bit, dword offset on SI.
1879 if (!isUInt<8>(AM.BaseOffs / 4))
1880 return false;
1881 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1882 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1883 // in 8-bits, it can use a smaller encoding.
1884 if (!isUInt<32>(AM.BaseOffs / 4))
1885 return false;
1886 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1887 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1888 if (!isUInt<20>(AM.BaseOffs))
1889 return false;
1890 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1891 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1892 // for S_BUFFER_* instructions).
1893 if (!isInt<21>(AM.BaseOffs))
1894 return false;
1895 } else {
1896 // On GFX12, all offsets are signed 24-bit in bytes.
1897 if (!isInt<24>(AM.BaseOffs))
1898 return false;
1899 }
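// Illustrative examples of the limits above: a scalar load at BaseOffs == 512
// (128 dwords) is encodable on every generation, including the 8-bit dword
// form on SI, while BaseOffs == 2 MiB only fits the 32-bit literal form on CI
// and the signed 24-bit byte offset on GFX12.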
1900
1901 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1902      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1903     AM.BaseOffs < 0) {
1904 // Scalar (non-buffer) loads can only use a negative offset if
1905 // soffset+offset is non-negative. Since the compiler can only prove that
1906 // in a few special cases, it is safer to claim that negative offsets are
1907 // not supported.
1908 return false;
1909 }
1910
1911 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1912 return true;
1913
1914 if (AM.Scale == 1 && AM.HasBaseReg)
1915 return true;
1916
1917 return false;
1918 }
1919
1920 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1921 return Subtarget->enableFlatScratch()
1922            ? isLegalFlatAddressingMode(AM, AMDGPUAS::PRIVATE_ADDRESS)
1923            : isLegalMUBUFAddressingMode(AM);
1924
1925 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1926 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1927 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1928 // field.
1929 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1930 // an 8-bit dword offset but we don't know the alignment here.
1931 if (!isUInt<16>(AM.BaseOffs))
1932 return false;
1933
1934 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1935 return true;
1936
1937 if (AM.Scale == 1 && AM.HasBaseReg)
1938 return true;
1939
1940 return false;
1941 }
1942
1943 if (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
1944 // For an unknown address space, this usually means that this is for some
1945 // reason being used for pure arithmetic, and not based on some addressing
1946 // computation. We don't have instructions that compute pointers with any
1947 // addressing modes, so treat them as having no offset like flat
1948 // instructions.
1949   return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS);
1950 }
1951
1952 // Assume a user alias of global for unknown address spaces.
1953 return isLegalGlobalAddressingMode(AM);
1954}
1955
1957 const MachineFunction &MF) const {
1958 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1959   return (MemVT.getSizeInBits() <= 4 * 32);
1960 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1961 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1962 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1963 }
1964 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
1965   return (MemVT.getSizeInBits() <= 2 * 32);
1966 return true;
1967}
1968
1970 unsigned Size, unsigned AddrSpace, Align Alignment,
1971 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1972 if (IsFast)
1973 *IsFast = 0;
1974
1975 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1976 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1977 // Check if alignment requirements for ds_read/write instructions are
1978 // disabled.
1979 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1980 return false;
1981
1982 Align RequiredAlignment(
1983 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
1984 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1985 Alignment < RequiredAlignment)
1986 return false;
1987
1988 // Either the alignment requirements are "enabled", or there is an
1989 // unaligned-LDS-access-related hardware bug even though the alignment
1990 // requirements are "disabled". In either case, we need to check for proper
1991 // alignment requirements.
1992 //
1993 switch (Size) {
1994 case 64:
1995 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1996 // address is negative, then the instruction is incorrectly treated as
1997 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1998 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1999 // load later in the SILoadStoreOptimizer.
2000 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
2001 return false;
2002
2003 // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
2004 // can do a 4-byte-aligned, 8-byte access in a single operation using
2005 // ds_read2/write2_b32 with adjacent offsets.
2006 RequiredAlignment = Align(4);
2007
2008 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2009 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
2010 // ds_write2_b32 depending on the alignment. In either case with either
2011 // alignment there is no faster way of doing this.
2012
2013 // The numbers returned here and below are not additive, it is a 'speed
2014 // rank'. They are just meant to be compared to decide if a certain way
2015 // of lowering an operation is faster than another. For that purpose a
2016 // naturally aligned operation gets its bitsize to indicate that "it
2017 // operates with a speed comparable to an N-bit wide load". With the full
2018 // alignment ds128 is slower than ds96 for example. If underaligned it
2019 // is comparable to a speed of a single dword access, which would then
2020 // mean 32 < 128 and it is faster to issue a wide load regardless.
2021 // 1 is simply "slow, don't do it". I.e. when comparing an aligned load to
2022 // a wider load which will no longer be aligned, the latter is slower.
2023 if (IsFast)
2024 *IsFast = (Alignment >= RequiredAlignment) ? 64
2025 : (Alignment < Align(4)) ? 32
2026 : 1;
2027 return true;
2028 }
2029
2030 break;
2031 case 96:
2032 if (!Subtarget->hasDS96AndDS128())
2033 return false;
2034
2035 // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
2036 // gfx8 and older.
2037
2038 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2039 // Naturally aligned access is fastest. However, also report it is Fast
2040 // if memory is aligned less than DWORD. A narrow load or store will
2041 // be equally slow as a single ds_read_b96/ds_write_b96, but there will
2042 // be more of them, so overall we will pay less penalty issuing a single
2043 // instruction.
2044
2045 // See comment on the values above.
2046 if (IsFast)
2047 *IsFast = (Alignment >= RequiredAlignment) ? 96
2048 : (Alignment < Align(4)) ? 32
2049 : 1;
2050 return true;
2051 }
2052
2053 break;
2054 case 128:
2055 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2056 return false;
2057
2058 // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
2059 // gfx8 and older, but we can do an 8-byte-aligned, 16-byte access in a
2060 // single operation using ds_read2/write2_b64.
2061 RequiredAlignment = Align(8);
2062
2063 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2064 // Naturally aligned access is fastest. However, also report it is Fast
2065 // if memory is aligned less than DWORD. A narrow load or store will
2066 // be equally slow as a single ds_read_b128/ds_write_b128, but there
2067 // will be more of them, so overall we will pay less penalty issuing a
2068 // single instruction.
2069
2070 // See comment on the values above.
2071 if (IsFast)
2072 *IsFast = (Alignment >= RequiredAlignment) ? 128
2073 : (Alignment < Align(4)) ? 32
2074 : 1;
2075 return true;
2076 }
2077
2078 break;
2079 default:
2080 if (Size > 32)
2081 return false;
2082
2083 break;
2084 }
2085
2086 // See comment on the values above.
2087 // Note that we have a single-dword or sub-dword here, so if underaligned
2088 // it is a slowest possible access, hence returned value is 0.
2089 if (IsFast)
2090 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2091
2092 return Alignment >= RequiredAlignment ||
2093 Subtarget->hasUnalignedDSAccessEnabled();
2094 }
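// For example (illustrative, using the LDS speed ranks above): a 128-bit LDS
// access on a target with unaligned DS access enabled reports 128 when 8-byte
// aligned, 1 when only 4-byte aligned, and 32 when aligned below 4 bytes.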
2095
2096 // FIXME: We have to be conservative here and assume that flat operations
2097 // will access scratch. If we had access to the IR function, then we
2098 // could determine if any private memory was used in the function.
2099 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2100 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2101 bool AlignedBy4 = Alignment >= Align(4);
2102 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2103 if (IsFast)
2104 *IsFast = AlignedBy4 ? Size : 1;
2105 return true;
2106 }
2107
2108 if (IsFast)
2109 *IsFast = AlignedBy4;
2110
2111 return AlignedBy4;
2112 }
2113
2114 // So long as they are correct, wide global memory operations perform better
2115 // than multiple smaller memory ops -- even when misaligned
2116 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
2117 if (IsFast)
2118 *IsFast = Size;
2119
2120 return Alignment >= Align(4) ||
2121 Subtarget->hasUnalignedBufferAccessEnabled();
2122 }
2123
2124 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
2125 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
2126 // out-of-bounds behavior, but in the edge case where an access starts
2127 // out-of-bounds and then enters in-bounds, the entire access would be treated
2128 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2129 // natural alignment of buffer accesses.
2130 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2131 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2132 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2133 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2134 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
2135 return false;
2136 }
2137
2138 // Smaller-than-dword values must be aligned.
2139 if (Size < 32)
2140 return false;
2141
2142 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2143 // byte-address are ignored, thus forcing Dword alignment.
2144 // This applies to private, global, and constant memory.
2145 if (IsFast)
2146 *IsFast = 1;
2147
2148 return Size >= 32 && Alignment >= Align(4);
2149}
2150
2152 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2153 unsigned *IsFast) const {
2154 return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
2155                                           Alignment, Flags, IsFast);
2156}
2157
2159 LLVMContext &Context, const MemOp &Op,
2160 const AttributeList &FuncAttributes) const {
2161 // FIXME: Should account for address space here.
2162
2163 // The default fallback uses the private pointer size as a guess for a type to
2164 // use. Make sure we switch these to 64-bit accesses.
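// For example (illustrative): the code below answers v4i32 for a 16-byte
// memcpy whose destination is known to be at least 4-byte aligned, and v2i32
// for an 8-byte one.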
2165
2166 if (Op.size() >= 16 &&
2167 Op.isDstAligned(Align(4))) // XXX: Should only do for global
2168 return MVT::v4i32;
2169
2170 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2171 return MVT::v2i32;
2172
2173 // Use the default.
2174 return MVT::Other;
2175}
2176
2178 const MemSDNode *MemNode = cast<MemSDNode>(N);
2179 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2180}
2181
2186
2188 unsigned DestAS) const {
2189 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2190 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2191 Subtarget->hasGloballyAddressableScratch()) {
2192 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2193 return false;
2194 }
2195
2196 // Flat -> private/local is a simple truncate.
2197 // Flat -> global is no-op
2198 return true;
2199 }
2200
2201 const GCNTargetMachine &TM =
2202 static_cast<const GCNTargetMachine &>(getTargetMachine());
2203 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2204}
2205
2213
2215 Type *Ty) const {
2216 // FIXME: Could be smarter if called for vector constants.
2217 return true;
2218}
2219
2221 unsigned Index) const {
2223 return false;
2224
2225 // TODO: Add more cases that are cheap.
2226 return Index == 0;
2227}
2228
2229bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2230 // TODO: This should be more aggressive, particular for 16-bit element
2231 // vectors. However there are some mixed improvements and regressions.
2232 EVT EltTy = VT.getVectorElementType();
2233 return EltTy.getSizeInBits() % 32 == 0;
2234}
2235
2237 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2238 switch (Op) {
2239 case ISD::LOAD:
2240 case ISD::STORE:
2241 return true;
2242 default:
2243 return false;
2244 }
2245 }
2246
2247 // SimplifySetCC uses this function to determine whether or not it should
2248 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2249 if (VT == MVT::i1 && Op == ISD::SETCC)
2250 return false;
2251
2253}
2254
2255SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2256 const SDLoc &SL,
2257 SDValue Chain,
2258 uint64_t Offset) const {
2259 const DataLayout &DL = DAG.getDataLayout();
2263
2264 auto [InputPtrReg, RC, ArgTy] =
2265 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2266
2267 // We may not have the kernarg segment argument if we have no kernel
2268 // arguments.
2269 if (!InputPtrReg)
2270 return DAG.getConstant(Offset, SL, PtrVT);
2271
2273 SDValue BasePtr = DAG.getCopyFromReg(
2274 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2275
2276 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2277}
2278
2279SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2280 const SDLoc &SL) const {
2283 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2284}
2285
2286SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2287 const SDLoc &SL) const {
2288
2290 std::optional<uint32_t> KnownSize =
2292 if (KnownSize.has_value())
2293 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2294 return SDValue();
2295}
2296
2297SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2298 const SDLoc &SL, SDValue Val,
2299 bool Signed,
2300 const ISD::InputArg *Arg) const {
2301 // First, if it is a widened vector, narrow it.
2302 if (VT.isVector() &&
2304 EVT NarrowedVT =
2307 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2308 DAG.getConstant(0, SL, MVT::i32));
2309 }
2310
2311 // Then convert the vector elements or scalar value.
2312 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2313 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2314 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2315 }
2316
2317 if (MemVT.isFloatingPoint())
2318 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2319 else if (Signed)
2320 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2321 else
2322 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2323
2324 return Val;
2325}
2326
2327SDValue SITargetLowering::lowerKernargMemParameter(
2328 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2329 uint64_t Offset, Align Alignment, bool Signed,
2330 const ISD::InputArg *Arg) const {
2331 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2332
2333 // Try to avoid using an extload by loading earlier than the argument address,
2334 // and extracting the relevant bits. The load should hopefully be merged with
2335 // the previous argument.
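// For example (illustrative): an i16 argument at byte offset 6 loads the
// dword at offset 4 and shifts it right by OffsetDiff * 8 == 16 bits before
// truncating back to 16 bits.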
2336 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2337 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2338 int64_t AlignDownOffset = alignDown(Offset, 4);
2339 int64_t OffsetDiff = Offset - AlignDownOffset;
2340
2341 EVT IntVT = MemVT.changeTypeToInteger();
2342
2343 // TODO: If we passed in the base kernel offset we could have a better
2344 // alignment than 4, but we don't really need it.
2345 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2346 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2349
2350 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2351 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2352
2353 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2354 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2355 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2356
2357 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2358 }
2359
2360 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2361 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2364
2365 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2366 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2367}
2368
2369/// Coerce an argument which was passed in a different ABI type to the original
2370/// expected value type.
2371SDValue SITargetLowering::convertABITypeToValueType(SelectionDAG &DAG,
2372 SDValue Val,
2373 CCValAssign &VA,
2374 const SDLoc &SL) const {
2375 EVT ValVT = VA.getValVT();
2376
2377 // If this is an 8 or 16-bit value, it is really passed promoted
2378 // to 32 bits. Insert an assert[sz]ext to capture this, then
2379 // truncate to the right size.
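// For example (illustrative): an i8 value passed sign-extended in a 32-bit
// location arrives here as i32; the SExt case below emits an AssertSext
// annotated with i8 followed by a truncate back to i8.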
2380 switch (VA.getLocInfo()) {
2381 case CCValAssign::Full:
2382 return Val;
2383 case CCValAssign::BCvt:
2384 return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);
2385 case CCValAssign::SExt:
2386 Val = DAG.getNode(ISD::AssertSext, SL, VA.getLocVT(), Val,
2387 DAG.getValueType(ValVT));
2388 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2389 case CCValAssign::ZExt:
2390 Val = DAG.getNode(ISD::AssertZext, SL, VA.getLocVT(), Val,
2391 DAG.getValueType(ValVT));
2392 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2393 case CCValAssign::AExt:
2394 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2395 default:
2396 llvm_unreachable("Unknown loc info!");
2397 }
2398}
2399
2400SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2401 CCValAssign &VA, const SDLoc &SL,
2402 SDValue Chain,
2403 const ISD::InputArg &Arg) const {
2404 MachineFunction &MF = DAG.getMachineFunction();
2405 MachineFrameInfo &MFI = MF.getFrameInfo();
2406
2407 if (Arg.Flags.isByVal()) {
2408 unsigned Size = Arg.Flags.getByValSize();
2409 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2410 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2411 }
2412
2413 unsigned ArgOffset = VA.getLocMemOffset();
2414 unsigned ArgSize = VA.getValVT().getStoreSize();
2415
2416 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2417
2418 // Create load nodes to retrieve arguments from the stack.
2419 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2420
2421 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2422 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
2423 MVT MemVT = VA.getValVT();
2424
2425 switch (VA.getLocInfo()) {
2426 default:
2427 break;
2428 case CCValAssign::BCvt:
2429 MemVT = VA.getLocVT();
2430 break;
2431 case CCValAssign::SExt:
2432 ExtType = ISD::SEXTLOAD;
2433 break;
2434 case CCValAssign::ZExt:
2435 ExtType = ISD::ZEXTLOAD;
2436 break;
2437 case CCValAssign::AExt:
2438 ExtType = ISD::EXTLOAD;
2439 break;
2440 }
2441
2442 SDValue ArgValue = DAG.getExtLoad(
2443 ExtType, SL, VA.getLocVT(), Chain, FIN,
2445
2446 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2447 if (ConvertedVal == ArgValue)
2448 return ConvertedVal;
2449
2450 return DAG.getMergeValues({ConvertedVal, ArgValue.getValue(1)}, SL);
2451}
2452
2453SDValue SITargetLowering::lowerWorkGroupId(
2454 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2457 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
2458 if (!Subtarget->hasClusters())
2459 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2460
2461 // Clusters are supported. Return the global position in the grid. If clusters
2462 // are enabled, WorkGroupIdPV returns the cluster ID, not the workgroup ID.
2463
2464 // WorkGroupIdXYZ = ClusterId == 0 ?
2465 // ClusterIdXYZ :
2466 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
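// For example (illustrative numbers): with 4 workgroups per cluster in X
// (ClusterMaxIdX == 3), cluster id 2 and cluster-local workgroup id 1 give a
// global workgroup id of 2 * (3 + 1) + 1 == 9.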
2467 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2468 SDLoc SL(ClusterIdXYZ);
2469 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2470 SDValue One = DAG.getConstant(1, SL, VT);
2471 SDValue ClusterSizeXYZ = DAG.getNode(ISD::ADD, SL, VT, ClusterMaxIdXYZ, One);
2472 SDValue ClusterWorkGroupIdXYZ =
2473 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2474 SDValue GlobalIdXYZ =
2475 DAG.getNode(ISD::ADD, SL, VT, ClusterWorkGroupIdXYZ,
2476 DAG.getNode(ISD::MUL, SL, VT, ClusterIdXYZ, ClusterSizeXYZ));
2477
2478 switch (MFI.getClusterDims().getKind()) {
2481 return GlobalIdXYZ;
2483 return ClusterIdXYZ;
2485 using namespace AMDGPU::Hwreg;
2486 SDValue ClusterIdField =
2487 DAG.getTargetConstant(HwregEncoding::encode(ID_IB_STS2, 6, 4), SL, VT);
2488 SDNode *GetReg =
2489 DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2490 SDValue ClusterId(GetReg, 0);
2491 SDValue Zero = DAG.getConstant(0, SL, VT);
2492 return DAG.getNode(ISD::SELECT_CC, SL, VT, ClusterId, Zero, ClusterIdXYZ,
2493 GlobalIdXYZ, DAG.getCondCode(ISD::SETEQ));
2494 }
2495 }
2496
2497 llvm_unreachable("nothing should reach here");
2498}
2499
2500SDValue SITargetLowering::getPreloadedValue(
2501 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2503 const ArgDescriptor *Reg = nullptr;
2504 const TargetRegisterClass *RC;
2505 LLT Ty;
2506
2508 const ArgDescriptor WorkGroupIDX =
2509 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2510 // If GridZ is not programmed in an entry function then the hardware will set
2511 // it to all zeros, so there is no need to mask the GridY value in the low
2512 // order bits.
2513 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2514 AMDGPU::TTMP7,
2515 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2516 const ArgDescriptor WorkGroupIDZ =
2517 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2518 const ArgDescriptor ClusterWorkGroupIDX =
2519 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
2520 const ArgDescriptor ClusterWorkGroupIDY =
2521 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
2522 const ArgDescriptor ClusterWorkGroupIDZ =
2523 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
2524 const ArgDescriptor ClusterWorkGroupMaxIDX =
2525 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
2526 const ArgDescriptor ClusterWorkGroupMaxIDY =
2527 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
2528 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2529 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
2530 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2531 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
2532
2533 auto LoadConstant = [&](unsigned N) {
2534 return DAG.getConstant(N, SDLoc(), VT);
2535 };
2536
2537 if (Subtarget->hasArchitectedSGPRs() &&
2539 AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims();
2540 bool HasFixedDims = ClusterDims.isFixedDims();
2541
2542 switch (PVID) {
2544 Reg = &WorkGroupIDX;
2545 RC = &AMDGPU::SReg_32RegClass;
2546 Ty = LLT::scalar(32);
2547 break;
2549 Reg = &WorkGroupIDY;
2550 RC = &AMDGPU::SReg_32RegClass;
2551 Ty = LLT::scalar(32);
2552 break;
2554 Reg = &WorkGroupIDZ;
2555 RC = &AMDGPU::SReg_32RegClass;
2556 Ty = LLT::scalar(32);
2557 break;
2559 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
2560 return LoadConstant(0);
2561 Reg = &ClusterWorkGroupIDX;
2562 RC = &AMDGPU::SReg_32RegClass;
2563 Ty = LLT::scalar(32);
2564 break;
2566 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
2567 return LoadConstant(0);
2568 Reg = &ClusterWorkGroupIDY;
2569 RC = &AMDGPU::SReg_32RegClass;
2570 Ty = LLT::scalar(32);
2571 break;
2573 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
2574 return LoadConstant(0);
2575 Reg = &ClusterWorkGroupIDZ;
2576 RC = &AMDGPU::SReg_32RegClass;
2577 Ty = LLT::scalar(32);
2578 break;
2580 if (HasFixedDims)
2581 return LoadConstant(ClusterDims.getDims()[0] - 1);
2582 Reg = &ClusterWorkGroupMaxIDX;
2583 RC = &AMDGPU::SReg_32RegClass;
2584 Ty = LLT::scalar(32);
2585 break;
2587 if (HasFixedDims)
2588 return LoadConstant(ClusterDims.getDims()[1] - 1);
2589 Reg = &ClusterWorkGroupMaxIDY;
2590 RC = &AMDGPU::SReg_32RegClass;
2591 Ty = LLT::scalar(32);
2592 break;
2594 if (HasFixedDims)
2595 return LoadConstant(ClusterDims.getDims()[2] - 1);
2596 Reg = &ClusterWorkGroupMaxIDZ;
2597 RC = &AMDGPU::SReg_32RegClass;
2598 Ty = LLT::scalar(32);
2599 break;
2601 Reg = &ClusterWorkGroupMaxFlatID;
2602 RC = &AMDGPU::SReg_32RegClass;
2603 Ty = LLT::scalar(32);
2604 break;
2605 default:
2606 break;
2607 }
2608 }
2609
2610 if (!Reg)
2611 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2612 if (!Reg) {
2614 // It's possible for a kernarg intrinsic call to appear in a kernel with
2615 // no allocated segment, in which case we do not add the user sgpr
2616 // argument, so just return null.
2617 return DAG.getConstant(0, SDLoc(), VT);
2618 }
2619
2620 // It's undefined behavior if a function marked with the amdgpu-no-*
2621 // attributes uses the corresponding intrinsic.
2622 return DAG.getPOISON(VT);
2623 }
2624
2625 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2626}
2627
2629 CallingConv::ID CallConv,
2630 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2631 FunctionType *FType,
2633 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2634 const ISD::InputArg *Arg = &Ins[I];
2635
2636 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2637 "vector type argument should have been split");
2638
2639 // First check if it's a PS input addr.
2640 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2641 PSInputNum <= 15) {
2642 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2643
2644 // Inconveniently only the first part of the split is marked as isSplit,
2645 // so skip to the end. We only want to increment PSInputNum once for the
2646 // entire split argument.
2647 if (Arg->Flags.isSplit()) {
2648 while (!Arg->Flags.isSplitEnd()) {
2649 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2650 "unexpected vector split in ps argument type");
2651 if (!SkipArg)
2652 Splits.push_back(*Arg);
2653 Arg = &Ins[++I];
2654 }
2655 }
2656
2657 if (SkipArg) {
2658 // We can safely skip PS inputs.
2659 Skipped.set(Arg->getOrigArgIndex());
2660 ++PSInputNum;
2661 continue;
2662 }
2663
2664 Info->markPSInputAllocated(PSInputNum);
2665 if (Arg->Used)
2666 Info->markPSInputEnabled(PSInputNum);
2667
2668 ++PSInputNum;
2669 }
2670
2671 Splits.push_back(*Arg);
2672 }
2673}
2674
2675// Allocate special inputs passed in VGPRs.
2677 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2678 SIMachineFunctionInfo &Info) const {
2679 const LLT S32 = LLT::scalar(32);
2681
2682 if (Info.hasWorkItemIDX()) {
2683 Register Reg = AMDGPU::VGPR0;
2684 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2685
2686 CCInfo.AllocateReg(Reg);
2687 unsigned Mask =
2688 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2689 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2690 }
2691
2692 if (Info.hasWorkItemIDY()) {
2693 assert(Info.hasWorkItemIDX());
2694 if (Subtarget->hasPackedTID()) {
2695 Info.setWorkItemIDY(
2696 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2697 } else {
2698 unsigned Reg = AMDGPU::VGPR1;
2699 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2700
2701 CCInfo.AllocateReg(Reg);
2702 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2703 }
2704 }
2705
2706 if (Info.hasWorkItemIDZ()) {
2707 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2708 if (Subtarget->hasPackedTID()) {
2709 Info.setWorkItemIDZ(
2710 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2711 } else {
2712 unsigned Reg = AMDGPU::VGPR2;
2713 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2714
2715 CCInfo.AllocateReg(Reg);
2716 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2717 }
2718 }
2719}
2720
2721 // Try to allocate a VGPR at the end of the argument list, or if no argument
2722 // VGPRs are left, allocate a stack slot.
2723 // If \p Mask is given it indicates the bitfield position in the register.
2724 // If \p Arg is given, reuse it with the new \p Mask instead of allocating a new one.
2725static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2726 ArgDescriptor Arg = ArgDescriptor()) {
2727 if (Arg.isSet())
2728 return ArgDescriptor::createArg(Arg, Mask);
2729
2730 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2731 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2732 if (RegIdx == ArgVGPRs.size()) {
2733 // Spill to stack required.
2734 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2735
2736 return ArgDescriptor::createStack(Offset, Mask);
2737 }
2738
2739 unsigned Reg = ArgVGPRs[RegIdx];
2740 Reg = CCInfo.AllocateReg(Reg);
2741 assert(Reg != AMDGPU::NoRegister);
2742
2743 MachineFunction &MF = CCInfo.getMachineFunction();
2744 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2745 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2746 return ArgDescriptor::createRegister(Reg, Mask);
2747}
2748
2750 const TargetRegisterClass *RC,
2751 unsigned NumArgRegs) {
2752 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2753 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2754 if (RegIdx == ArgSGPRs.size())
2755 report_fatal_error("ran out of SGPRs for arguments");
2756
2757 unsigned Reg = ArgSGPRs[RegIdx];
2758 Reg = CCInfo.AllocateReg(Reg);
2759 assert(Reg != AMDGPU::NoRegister);
2760
2761 MachineFunction &MF = CCInfo.getMachineFunction();
2762 MF.addLiveIn(Reg, RC);
2764}
2765
2766// If this has a fixed position, we still should allocate the register in the
2767// CCInfo state. Technically we could get away with this for values passed
2768// outside of the normal argument range.
2770 const TargetRegisterClass *RC,
2771 MCRegister Reg) {
2772 Reg = CCInfo.AllocateReg(Reg);
2773 assert(Reg != AMDGPU::NoRegister);
2774 MachineFunction &MF = CCInfo.getMachineFunction();
2775 MF.addLiveIn(Reg, RC);
2776}
2777
2778static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2779 if (Arg) {
2780 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2781 Arg.getRegister());
2782 } else
2783 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2784}
2785
2786static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2787 if (Arg) {
2788 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2789 Arg.getRegister());
2790 } else
2791 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2792}
2793
2794/// Allocate implicit function VGPR arguments at the end of allocated user
2795/// arguments.
2797 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2798 SIMachineFunctionInfo &Info) const {
2799 const unsigned Mask = 0x3ff;
2800 ArgDescriptor Arg;
2801
2802 if (Info.hasWorkItemIDX()) {
2803 Arg = allocateVGPR32Input(CCInfo, Mask);
2804 Info.setWorkItemIDX(Arg);
2805 }
2806
2807 if (Info.hasWorkItemIDY()) {
2808 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2809 Info.setWorkItemIDY(Arg);
2810 }
2811
2812 if (Info.hasWorkItemIDZ())
2813 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2814}
2815
2816/// Allocate implicit function VGPR arguments in fixed registers.
2818 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2819 SIMachineFunctionInfo &Info) const {
2820 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2821 if (!Reg)
2822 report_fatal_error("failed to allocate VGPR for implicit arguments");
2823
2824 const unsigned Mask = 0x3ff;
2825 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2826 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2827 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
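// With this packed layout (implied by the masks above), bits [9:0] of the
// VGPR hold the workitem X id, bits [19:10] the Y id, and bits [29:20] the
// Z id.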
2828}
2829
2831 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2832 SIMachineFunctionInfo &Info) const {
2833 auto &ArgInfo = Info.getArgInfo();
2834 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2835
2836 // TODO: Unify handling with private memory pointers.
2837 if (UserSGPRInfo.hasDispatchPtr())
2838 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2839
2840 if (UserSGPRInfo.hasQueuePtr())
2841 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2842
2843 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2844 // constant offset from the kernarg segment.
2845 if (Info.hasImplicitArgPtr())
2846 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2847
2848 if (UserSGPRInfo.hasDispatchID())
2849 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2850
2851 // flat_scratch_init is not applicable for non-kernel functions.
2852
2853 if (Info.hasWorkGroupIDX())
2854 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2855
2856 if (Info.hasWorkGroupIDY())
2857 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2858
2859 if (Info.hasWorkGroupIDZ())
2860 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2861
2862 if (Info.hasLDSKernelId())
2863 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2864}
2865
2866// Allocate special inputs passed in user SGPRs.
2868 MachineFunction &MF,
2869 const SIRegisterInfo &TRI,
2870 SIMachineFunctionInfo &Info) const {
2871 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2872 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2873 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2874 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2875 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2876 }
2877
2878 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2879 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2880 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2881 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2882 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2883 }
2884
2885 if (UserSGPRInfo.hasDispatchPtr()) {
2886 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2887 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2888 CCInfo.AllocateReg(DispatchPtrReg);
2889 }
2890
2891 if (UserSGPRInfo.hasQueuePtr()) {
2892 Register QueuePtrReg = Info.addQueuePtr(TRI);
2893 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2894 CCInfo.AllocateReg(QueuePtrReg);
2895 }
2896
2897 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2899 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2900 CCInfo.AllocateReg(InputPtrReg);
2901
2902 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2903 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2904 }
2905
2906 if (UserSGPRInfo.hasDispatchID()) {
2907 Register DispatchIDReg = Info.addDispatchID(TRI);
2908 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2909 CCInfo.AllocateReg(DispatchIDReg);
2910 }
2911
2912 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2913 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2914 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2915 CCInfo.AllocateReg(FlatScratchInitReg);
2916 }
2917
2918 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2919 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2920 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2921 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2922 }
2923
2924 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2925 // these from the dispatch pointer.
2926}
2927
2928 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2929// sequential starting from the first argument.
2931 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2933 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2934 Function &F = MF.getFunction();
2935 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2936 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2937 bool InPreloadSequence = true;
2938 unsigned InIdx = 0;
2939 bool AlignedForImplictArgs = false;
2940 unsigned ImplicitArgOffset = 0;
2941 for (auto &Arg : F.args()) {
2942 if (!InPreloadSequence || !Arg.hasInRegAttr())
2943 break;
2944
2945 unsigned ArgIdx = Arg.getArgNo();
2946 // Don't preload non-original args or parts not in the current preload
2947 // sequence.
2948 if (InIdx < Ins.size() &&
2949 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2950 break;
2951
2952 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2953 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2954 InIdx++) {
2955 assert(ArgLocs[ArgIdx].isMemLoc());
2956 auto &ArgLoc = ArgLocs[InIdx];
2957 const Align KernelArgBaseAlign = Align(16);
2958 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2959 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2960 unsigned NumAllocSGPRs =
2961 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2962
2963 // Fix alignment for hidden arguments.
2964 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2965 if (!AlignedForImplictArgs) {
2966 ImplicitArgOffset =
2967 alignTo(LastExplicitArgOffset,
2968 Subtarget->getAlignmentForImplicitArgPtr()) -
2969 LastExplicitArgOffset;
2970 AlignedForImplictArgs = true;
2971 }
2972 ArgOffset += ImplicitArgOffset;
2973 }
2974
2975 // Arg is preloaded into the previous SGPR.
2976 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2977 assert(InIdx >= 1 && "No previous SGPR");
2978 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2979 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2980 continue;
2981 }
2982
2983 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2984 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
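// For example (illustrative): if the previous explicit argument ended at
// offset 16 and this one starts at offset 24, the 8 bytes of padding cost
// 2 otherwise-unused user SGPRs.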
2985 // Check for free user SGPRs for preloading.
2986 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2987 InPreloadSequence = false;
2988 break;
2989 }
2990
2991 // Preload this argument.
2992 const TargetRegisterClass *RC =
2993 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2994 SmallVectorImpl<MCRegister> *PreloadRegs =
2995 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2996
2997 if (PreloadRegs->size() > 1)
2998 RC = &AMDGPU::SGPR_32RegClass;
2999 for (auto &Reg : *PreloadRegs) {
3000 assert(Reg);
3001 MF.addLiveIn(Reg, RC);
3002 CCInfo.AllocateReg(Reg);
3003 }
3004
3005 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3006 }
3007 }
3008}
3009
3011 const SIRegisterInfo &TRI,
3012 SIMachineFunctionInfo &Info) const {
3013 // Always allocate this last since it is a synthetic preload.
3014 if (Info.hasLDSKernelId()) {
3015 Register Reg = Info.addLDSKernelId();
3016 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3017 CCInfo.AllocateReg(Reg);
3018 }
3019}
3020
3021// Allocate special input registers that are initialized per-wave.
3024 CallingConv::ID CallConv,
3025 bool IsShader) const {
3026 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3027 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
3028 // Note: user SGPRs are handled by the front-end for graphics shaders
3029 // Pad up the used user SGPRs with dead inputs.
3030
3031 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
3032 // before enabling architected SGPRs for workgroup IDs.
3033 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
3034
3035 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3036 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
3037 // rely on it to reach 16 since if we end up having no stack usage, it will
3038 // not really be added.
3039 unsigned NumRequiredSystemSGPRs =
3040 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3041 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3042 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3043 Register Reg = Info.addReservedUserSGPR();
3044 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3045 CCInfo.AllocateReg(Reg);
3046 }
3047 }
3048
3049 if (!HasArchitectedSGPRs) {
3050 if (Info.hasWorkGroupIDX()) {
3051 Register Reg = Info.addWorkGroupIDX();
3052 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3053 CCInfo.AllocateReg(Reg);
3054 }
3055
3056 if (Info.hasWorkGroupIDY()) {
3057 Register Reg = Info.addWorkGroupIDY();
3058 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3059 CCInfo.AllocateReg(Reg);
3060 }
3061
3062 if (Info.hasWorkGroupIDZ()) {
3063 Register Reg = Info.addWorkGroupIDZ();
3064 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3065 CCInfo.AllocateReg(Reg);
3066 }
3067 }
3068
3069 if (Info.hasWorkGroupInfo()) {
3070 Register Reg = Info.addWorkGroupInfo();
3071 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3072 CCInfo.AllocateReg(Reg);
3073 }
3074
3075 if (Info.hasPrivateSegmentWaveByteOffset()) {
3076 // Scratch wave offset passed in system SGPR.
3077 unsigned PrivateSegmentWaveByteOffsetReg;
3078
3079 if (IsShader) {
3080 PrivateSegmentWaveByteOffsetReg =
3081 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3082
3083 // This is true if the scratch wave byte offset doesn't have a fixed
3084 // location.
3085 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3086 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
3087 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3088 }
3089 } else
3090 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3091
3092 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3093 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
3094 }
3095
3096 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
3097 Info.getNumPreloadedSGPRs() >= 16);
3098}
3099
3101 MachineFunction &MF,
3102 const SIRegisterInfo &TRI,
3104 // Now that we've figured out where the scratch register inputs are, see if
3105 // we should reserve the arguments and use them directly.
3106 MachineFrameInfo &MFI = MF.getFrameInfo();
3107 bool HasStackObjects = MFI.hasStackObjects();
3108 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3109
3110 // Record that we know we have non-spill stack objects so we don't need to
3111 // check all stack objects later.
3112 if (HasStackObjects)
3113 Info.setHasNonSpillStackObjects(true);
3114
3115 // Everything live out of a block is spilled with fast regalloc, so it's
3116 // almost certain that spilling will be required.
3117 if (TM.getOptLevel() == CodeGenOptLevel::None)
3118 HasStackObjects = true;
3119
3120 // For now assume stack access is needed in any callee functions, so we need
3121 // the scratch registers to pass in.
3122 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
3123
3124 if (!ST.enableFlatScratch()) {
3125 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
3126 // If we have stack objects, we unquestionably need the private buffer
3127 // resource. For the Code Object V2 ABI, this will be the first 4 user
3128 // SGPR inputs. We can reserve those and use them directly.
3129
3130 Register PrivateSegmentBufferReg =
3132 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3133 } else {
3134 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
3135 // We tentatively reserve the last available registers (skipping those
3136 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
3137 // we'll replace these with the ones immediately after those which were
3138 // really allocated. In the prologue copies will be inserted from the
3139 // argument to these reserved registers.
3140
3141 // Without HSA, relocations are used for the scratch pointer and the
3142 // buffer resource setup is always inserted in the prologue. Scratch wave
3143 // offset is still in an input SGPR.
3144 Info.setScratchRSrcReg(ReservedBufferReg);
3145 }
3146 }
3147
3149
3150 // For entry functions we have to set up the stack pointer if we use it,
3151 // whereas non-entry functions get this "for free". This means there is no
3152 // intrinsic advantage to using S32 over S34 in cases where we do not have
3153 // calls but do need a frame pointer (i.e. if we are requested to have one
3154 // because frame pointer elimination is disabled). To keep things simple we
3155 // only ever use S32 as the call ABI stack pointer, and so using it does not
3156 // imply we need a separate frame pointer.
3157 //
3158 // Try to use s32 as the SP, but move it if it would interfere with input
3159 // arguments. This won't work with calls though.
3160 //
3161 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
3162 // registers.
3163 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
3164 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3165 } else {
3167
3168 if (MFI.hasCalls())
3169 report_fatal_error("call in graphics shader with too many input SGPRs");
3170
3171 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3172 if (!MRI.isLiveIn(Reg)) {
3173 Info.setStackPtrOffsetReg(Reg);
3174 break;
3175 }
3176 }
3177
3178 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3179 report_fatal_error("failed to find register for SP");
3180 }
3181
3182 // hasFP should be accurate for entry functions even before the frame is
3183 // finalized, because it does not rely on the known stack size, only
3184 // properties like whether variable sized objects are present.
3185 if (ST.getFrameLowering()->hasFP(MF)) {
3186 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3187 }
3188}
3189
3192 return !Info->isEntryFunction();
3193}
3194
3196
3198 MachineBasicBlock *Entry,
3199 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
3201
3202 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3203 if (!IStart)
3204 return;
3205
3206 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3207 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
3208 MachineBasicBlock::iterator MBBI = Entry->begin();
3209 for (const MCPhysReg *I = IStart; *I; ++I) {
3210 const TargetRegisterClass *RC = nullptr;
3211 if (AMDGPU::SReg_64RegClass.contains(*I))
3212 RC = &AMDGPU::SGPR_64RegClass;
3213 else if (AMDGPU::SReg_32RegClass.contains(*I))
3214 RC = &AMDGPU::SGPR_32RegClass;
3215 else
3216 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3217
3218 Register NewVR = MRI->createVirtualRegister(RC);
3219 // Create copy from CSR to a virtual register.
3220 Entry->addLiveIn(*I);
3221 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
3222 .addReg(*I);
3223
3224 // Insert the copy-back instructions right before the terminator.
3225 for (auto *Exit : Exits)
3226 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
3227 TII->get(TargetOpcode::COPY), *I)
3228 .addReg(NewVR);
3229 }
3230}
3231
3233 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3234 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3235 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3237
3239 const Function &Fn = MF.getFunction();
3242 bool IsError = false;
3243
3244 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
3246 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3247 IsError = true;
3248 }
3249
3252 BitVector Skipped(Ins.size());
3253 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3254 *DAG.getContext());
3255
3256 bool IsGraphics = AMDGPU::isGraphics(CallConv);
3257 bool IsKernel = AMDGPU::isKernel(CallConv);
3258 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
3259
3260 if (IsGraphics) {
3261 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3262 assert(!UserSGPRInfo.hasDispatchPtr() &&
3263 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3264 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3265 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3266 (void)UserSGPRInfo;
3267 if (!Subtarget->enableFlatScratch())
3268 assert(!UserSGPRInfo.hasFlatScratchInit());
3269 if ((CallConv != CallingConv::AMDGPU_CS &&
3270 CallConv != CallingConv::AMDGPU_Gfx &&
3271 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3272 !Subtarget->hasArchitectedSGPRs())
3273 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3274 !Info->hasWorkGroupIDZ());
3275 }
3276
3277 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3278
3279 if (CallConv == CallingConv::AMDGPU_PS) {
3280 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3281
3282 // At least one interpolation mode must be enabled or else the GPU will
3283 // hang.
3284 //
3285 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3286 // set PSInputAddr, the user wants to enable some bits after the compilation
3287 // based on run-time states. Since we can't know what the final PSInputEna
3288 // will look like, we shouldn't do anything here and the user should take
3289 // responsibility for the correct programming.
3290 //
3291 // Otherwise, the following restrictions apply:
3292 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3293 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3294 // enabled too.
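// For example (illustrative): a pixel shader whose PSInputAddr only has
// POS_W_FLOAT (bit 11) set has (PSInputAddr & 0x7F) == 0, so the check below
// force-enables input 0 and reserves VGPR0/VGPR1 for it.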
3295 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3296 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3297 CCInfo.AllocateReg(AMDGPU::VGPR0);
3298 CCInfo.AllocateReg(AMDGPU::VGPR1);
3299 Info->markPSInputAllocated(0);
3300 Info->markPSInputEnabled(0);
3301 }
3302 if (Subtarget->isAmdPalOS()) {
3303 // For isAmdPalOS, the user does not enable some bits after compilation
3304 // based on run-time states; the register values being generated here are
3305 // the final ones set in hardware. Therefore we need to apply the
3306 // workaround to PSInputAddr and PSInputEnable together. (The case where
3307 // a bit is set in PSInputAddr but not PSInputEnable is where the
3308 // frontend set up an input arg for a particular interpolation mode, but
3309 // nothing uses that input arg. Really we should have an earlier pass
3310 // that removes such an arg.)
3311 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3312 if ((PsInputBits & 0x7F) == 0 ||
3313 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3314 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
3315 }
3316 } else if (IsKernel) {
3317 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3318 } else {
3319 Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3320 Ins.end());
3321 }
3322
3323 if (IsKernel)
3324 analyzeFormalArgumentsCompute(CCInfo, Ins);
3325
3326 if (IsEntryFunc) {
3327 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
3328 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
3329 if (IsKernel && Subtarget->hasKernargPreload())
3330 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
3331
3332 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
3333 } else if (!IsGraphics) {
3334 // For the fixed ABI, pass workitem IDs in the last argument register.
3335 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3336
3337 // FIXME: Sink this into allocateSpecialInputSGPRs
3338 if (!Subtarget->enableFlatScratch())
3339 CCInfo.AllocateReg(Info->getScratchRSrcReg());
3340
3341 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3342 }
3343
3344 if (!IsKernel) {
3345 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3346 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3347
3348 // This assumes the registers are allocated by CCInfo in ascending order
3349 // with no gaps.
3350 Info->setNumWaveDispatchSGPRs(
3351 CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3352 Info->setNumWaveDispatchVGPRs(
3353 CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3354 } else if (Info->getNumKernargPreloadedSGPRs()) {
3355 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3356 }
3357
3359
3360 if (IsWholeWaveFunc) {
3362 {MVT::i1, MVT::Other}, Chain);
3363 InVals.push_back(Setup.getValue(0));
3364 Chains.push_back(Setup.getValue(1));
3365 }
3366
3367 // FIXME: This is the minimum kernel argument alignment. We should improve
3368 // this to the maximum alignment of the arguments.
3369 //
3370 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3371 // kern arg offset.
3372 const Align KernelArgBaseAlign = Align(16);
3373
3374 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3375 ++i) {
3376 const ISD::InputArg &Arg = Ins[i];
3377 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3378 InVals.push_back(DAG.getPOISON(Arg.VT));
3379 continue;
3380 }
3381
3382 CCValAssign &VA = ArgLocs[ArgIdx++];
3383 MVT VT = VA.getLocVT();
3384
3385 if (IsEntryFunc && VA.isMemLoc()) {
3386 VT = Ins[i].VT;
3387 EVT MemVT = VA.getLocVT();
3388
3389 const uint64_t Offset = VA.getLocMemOffset();
3390 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
3391
3392 if (Arg.Flags.isByRef()) {
3393 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3394
3395 const GCNTargetMachine &TM =
3396 static_cast<const GCNTargetMachine &>(getTargetMachine());
3397 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3398 Arg.Flags.getPointerAddrSpace())) {
3399 Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS,
3400 Arg.Flags.getPointerAddrSpace());
3401 }
3402
3403 InVals.push_back(Ptr);
3404 continue;
3405 }
3406
3407 SDValue NewArg;
3408 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3409 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3410 // In this case the argument is packed into the previous preload SGPR.
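// For example, an i16 argument at kernarg offset 2 was preloaded into bits
// [31:16] of the SGPR covering dword offset 0, so OffsetDiff is 2 and the
// extraction below shifts right by 16 before truncating.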
3411 int64_t AlignDownOffset = alignDown(Offset, 4);
3412 int64_t OffsetDiff = Offset - AlignDownOffset;
3413 EVT IntVT = MemVT.changeTypeToInteger();
3414
3415 const SIMachineFunctionInfo *Info =
3418 Register Reg =
3419 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3420
3421 assert(Reg);
3422 Register VReg = MRI.getLiveInVirtReg(Reg);
3423 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3424
3425 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3426 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3427
3428 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3429 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3430 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3431 Ins[i].Flags.isSExt(), &Ins[i]);
3432
3433 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3434 } else {
3435 const SIMachineFunctionInfo *Info =
3438 const SmallVectorImpl<MCRegister> &PreloadRegs =
3439 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3440
3441 SDValue Copy;
3442 if (PreloadRegs.size() == 1) {
3443 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3444 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3445 NewArg = DAG.getCopyFromReg(
3446 Chain, DL, VReg,
3447 EVT::getIntegerVT(*DAG.getContext(),
3448 TRI->getRegSizeInBits(*RC)));
3449
3450 } else {
3451 // If the kernarg alignment does not match the alignment of the SGPR
3452 // tuple RC that can accommodate this argument, it will be built up
3453 // via copies from the individual SGPRs that the argument was
3454 // preloaded to.
3455 SmallVector<SDValue, 4> Elts;
3456 for (auto Reg : PreloadRegs) {
3457 Register VReg = MRI.getLiveInVirtReg(Reg);
3458 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3459 Elts.push_back(Copy);
3460 }
3461 NewArg =
3462 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3463 PreloadRegs.size()),
3464 DL, Elts);
3465 }
3466
3467 // If the argument was preloaded to multiple consecutive 32-bit
3468 // registers because of misalignment between addressable SGPR tuples
3469 // and the argument size, we can still assume, because of kernarg
3470 // segment alignment restrictions, that NewArg's size is the same as
3471 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
3472 // truncate since we cannot preload to less than a single SGPR and the
3473 // MemVT may be smaller.
3474 EVT MemVTInt =
3475 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
3476 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3477 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3478
3479 NewArg = DAG.getBitcast(MemVT, NewArg);
3480 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3481 Ins[i].Flags.isSExt(), &Ins[i]);
3482 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3483 }
3484 } else {
3485 // Hidden arguments that are in the kernel signature must be preloaded
3486 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3487 // the argument list and is not preloaded.
3488 if (Arg.isOrigArg()) {
3489 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3490 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3491 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
3492 *OrigArg->getParent(),
3493 "hidden argument in kernel signature was not preloaded",
3494 DL.getDebugLoc()));
3495 }
3496 }
3497
3498 NewArg =
3499 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3500 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3501 }
3502 Chains.push_back(NewArg.getValue(1));
3503
3504 auto *ParamTy =
3505 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3506 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3507 ParamTy &&
3508 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3509 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3510 // On SI local pointers are just offsets into LDS, so they are always
3511 // less than 16-bits. On CI and newer they could potentially be
3512 // real pointers, so we can't guarantee their size.
3513 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3514 DAG.getValueType(MVT::i16));
3515 }
3516
3517 InVals.push_back(NewArg);
3518 continue;
3519 }
3520 if (!IsEntryFunc && VA.isMemLoc()) {
3521 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3522 InVals.push_back(Val);
3523 if (!Arg.Flags.isByVal())
3524 Chains.push_back(Val.getValue(1));
3525 continue;
3526 }
3527
3528 assert(VA.isRegLoc() && "Parameter must be in a register!");
3529
3530 Register Reg = VA.getLocReg();
3531 const TargetRegisterClass *RC = nullptr;
3532 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3533 RC = &AMDGPU::VGPR_32RegClass;
3534 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3535 RC = &AMDGPU::SGPR_32RegClass;
3536 else
3537 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3538
3539 Reg = MF.addLiveIn(Reg, RC);
3540 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3541
3542 if (Arg.Flags.isSRet()) {
3543 // The return object should be reasonably addressable.
3544
3545 // FIXME: This helps when the return is a real sret. If it is an
3546 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3547 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3548 unsigned NumBits =
3549 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
3550 Val = DAG.getNode(
3551 ISD::AssertZext, DL, VT, Val,
3552 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3553 }
3554
3555 Val = convertABITypeToValueType(DAG, Val, VA, DL);
3556 InVals.push_back(Val);
3557 }
3558
3559 // Start adding system SGPRs.
3560 if (IsEntryFunc)
3561 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3562
3563 // DAG.getPass() returns nullptr when using new pass manager.
3564 // TODO: Use DAG.getMFAM() to access analysis result.
3565 if (DAG.getPass()) {
3566 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3567 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3568 }
3569
3570 unsigned StackArgSize = CCInfo.getStackSize();
3571 Info->setBytesInStackArgArea(StackArgSize);
3572
3573 return Chains.empty() ? Chain
3574 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3575}
3576
3577// TODO: If return values can't fit in registers, we should return as many as
3578// possible in registers before passing on stack.
3579 bool SITargetLowering::CanLowerReturn(
3580 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3581 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3582 const Type *RetTy) const {
3583 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3584 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3585 // for shaders. Vector types should be explicitly handled by CC.
3586 if (AMDGPU::isEntryFunctionCC(CallConv))
3587 return true;
3588
3589 SmallVector<CCValAssign, 16> RVLocs;
3590 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3591 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3592 return false;
3593
3594 // We must use the stack if return would require unavailable registers.
3595 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3596 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3597 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3598 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3599 return false;
3600
3601 return true;
3602}
3603
3604 SDValue
3605 SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3606 bool isVarArg,
3607 const SmallVectorImpl<ISD::OutputArg> &Outs,
3608 const SmallVectorImpl<SDValue> &OutVals,
3609 const SDLoc &DL, SelectionDAG &DAG) const {
3610 MachineFunction &MF = DAG.getMachineFunction();
3611 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3612 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3613
3614 if (AMDGPU::isKernel(CallConv)) {
3615 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3616 OutVals, DL, DAG);
3617 }
3618
3619 bool IsShader = AMDGPU::isShader(CallConv);
3620
3621 Info->setIfReturnsVoid(Outs.empty());
3622 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3623
3624 // CCValAssign - represent the assignment of the return value to a location.
3625 SmallVector<CCValAssign, 48> RVLocs;
3626
3627 // CCState - Info about the registers and stack slots.
3628 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3629 *DAG.getContext());
3630
3631 // Analyze outgoing return values.
3632 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3633
3634 SDValue Glue;
3635 SmallVector<SDValue, 48> RetOps;
3636 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3637
3638 SDValue ReadFirstLane =
3639 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3640 // Copy the result values into the output registers.
3641 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3642 ++I, ++RealRVLocIdx) {
3643 CCValAssign &VA = RVLocs[I];
3644 assert(VA.isRegLoc() && "Can only return in registers!");
3645 // TODO: Partially return in registers if return values don't fit.
3646 SDValue Arg = OutVals[RealRVLocIdx];
3647
3648 // Copied from other backends.
3649 switch (VA.getLocInfo()) {
3650 case CCValAssign::Full:
3651 break;
3652 case CCValAssign::BCvt:
3653 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3654 break;
3655 case CCValAssign::SExt:
3656 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3657 break;
3658 case CCValAssign::ZExt:
3659 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3660 break;
3661 case CCValAssign::AExt:
3662 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3663 break;
3664 default:
3665 llvm_unreachable("Unknown loc info!");
3666 }
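// Outgoing values assigned to SGPRs must be wave-uniform; wrap them in a
// readfirstlane in case the value was actually produced in a VGPR.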
3667 if (TRI->isSGPRPhysReg(VA.getLocReg()))
3668 Arg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VA.getLocVT(),
3669 ReadFirstLane, Arg);
3670 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3671 Glue = Chain.getValue(1);
3672 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3673 }
3674
3675 // FIXME: Does sret work properly?
3676 if (!Info->isEntryFunction()) {
3677 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3678 const MCPhysReg *I =
3679 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3680 if (I) {
3681 for (; *I; ++I) {
3682 if (AMDGPU::SReg_64RegClass.contains(*I))
3683 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3684 else if (AMDGPU::SReg_32RegClass.contains(*I))
3685 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3686 else
3687 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3688 }
3689 }
3690 }
3691
3692 // Update chain and glue.
3693 RetOps[0] = Chain;
3694 if (Glue.getNode())
3695 RetOps.push_back(Glue);
3696
3697 unsigned Opc = AMDGPUISD::ENDPGM;
3698 if (!IsWaveEnd)
3699 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3700 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3701 : AMDGPUISD::RET_GLUE;
3702 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3703}
3704
3705 SDValue SITargetLowering::LowerCallResult(
3706 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3707 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3708 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3709 SDValue ThisVal) const {
3710 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3711
3712 // Assign locations to each value returned by this call.
3713 SmallVector<CCValAssign, 16> RVLocs;
3714 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3715 *DAG.getContext());
3716 CCInfo.AnalyzeCallResult(Ins, RetCC);
3717
3718 // Copy all of the result registers out of their specified physreg.
3719 for (CCValAssign VA : RVLocs) {
3720 SDValue Val;
3721
3722 if (VA.isRegLoc()) {
3723 Val =
3724 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3725 Chain = Val.getValue(1);
3726 InGlue = Val.getValue(2);
3727 } else if (VA.isMemLoc()) {
3728 report_fatal_error("TODO: return values in memory");
3729 } else
3730 llvm_unreachable("unknown argument location type");
3731
3732 switch (VA.getLocInfo()) {
3733 case CCValAssign::Full:
3734 break;
3735 case CCValAssign::BCvt:
3736 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3737 break;
3738 case CCValAssign::ZExt:
3739 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3740 DAG.getValueType(VA.getValVT()));
3741 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3742 break;
3743 case CCValAssign::SExt:
3744 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3745 DAG.getValueType(VA.getValVT()));
3746 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3747 break;
3748 case CCValAssign::AExt:
3749 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3750 break;
3751 default:
3752 llvm_unreachable("Unknown loc info!");
3753 }
3754
3755 InVals.push_back(Val);
3756 }
3757
3758 return Chain;
3759}
3760
3761// Add code to pass special inputs required depending on used features separate
3762// from the explicit user arguments present in the IR.
3763 void SITargetLowering::passSpecialInputs(
3764 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3765 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3766 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3767 // If we don't have a call site, this was a call inserted by
3768 // legalization. These can never use special inputs.
3769 if (!CLI.CB)
3770 return;
3771
3772 SelectionDAG &DAG = CLI.DAG;
3773 const SDLoc &DL = CLI.DL;
3774 const Function &F = DAG.getMachineFunction().getFunction();
3775
3776 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3777 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3778
3779 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3780 &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
3781 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3782 // DAG.getPass() returns nullptr when using new pass manager.
3783 // TODO: Use DAG.getMFAM() to access analysis result.
3784 if (DAG.getPass()) {
3785 auto &ArgUsageInfo =
3786 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3787 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3788 }
3789 }
3790
3791 // TODO: Unify with private memory register handling. This is complicated by
3792 // the fact that at least in kernels, the input argument is not necessarily
3793 // in the same location as the input.
3794 // clang-format off
3795 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3796 std::array<StringLiteral, 2>> ImplicitAttrs[] = {
3797 {AMDGPUFunctionArgInfo::DISPATCH_PTR, {"amdgpu-no-dispatch-ptr", ""}},
3798 {AMDGPUFunctionArgInfo::QUEUE_PTR, {"amdgpu-no-queue-ptr", ""}},
3799 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, {"amdgpu-no-implicitarg-ptr", ""}},
3800 {AMDGPUFunctionArgInfo::DISPATCH_ID, {"amdgpu-no-dispatch-id", ""}},
3801 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"}},
3802 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"}},
3803 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, {"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"}},
3804 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, {"amdgpu-no-lds-kernel-id", ""}},
3805 };
3806 // clang-format on
3807
3808 for (auto [InputID, Attrs] : ImplicitAttrs) {
3809 // If the callee does not use the attribute value, skip copying the value.
3810 if (all_of(Attrs, [&](StringRef Attr) {
3811 return Attr.empty() || CLI.CB->hasFnAttr(Attr);
3812 }))
3813 continue;
3814
3815 const auto [OutgoingArg, ArgRC, ArgTy] =
3816 CalleeArgInfo->getPreloadedValue(InputID);
3817 if (!OutgoingArg)
3818 continue;
3819
3820 const auto [IncomingArg, IncomingArgRC, Ty] =
3821 CallerArgInfo.getPreloadedValue(InputID);
3822 assert(IncomingArgRC == ArgRC);
3823
3824 // All special arguments are ints for now.
3825 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3826 SDValue InputReg;
3827
3828 if (IncomingArg) {
3829 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3830 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3831 // The implicit arg ptr is special because it doesn't have a corresponding
3832 // input for kernels, and is computed from the kernarg segment pointer.
3833 InputReg = getImplicitArgPtr(DAG, DL);
3834 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3835 std::optional<uint32_t> Id =
3836 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
3837 if (Id.has_value()) {
3838 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3839 } else {
3840 InputReg = DAG.getPOISON(ArgVT);
3841 }
3842 } else {
3843 // We may have proven the input wasn't needed, although the ABI is
3844 // requiring it. We just need to allocate the register appropriately.
3845 InputReg = DAG.getPOISON(ArgVT);
3846 }
3847
3848 if (OutgoingArg->isRegister()) {
3849 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3850 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3851 report_fatal_error("failed to allocate implicit input argument");
3852 } else {
3853 unsigned SpecialArgOffset =
3854 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3855 SDValue ArgStore =
3856 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3857 MemOpChains.push_back(ArgStore);
3858 }
3859 }
3860
3861 // Pack workitem IDs into a single register or pass it as is if already
3862 // packed.
3863
3864 auto [OutgoingArg, ArgRC, Ty] =
3865 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3866 if (!OutgoingArg)
3867 std::tie(OutgoingArg, ArgRC, Ty) =
3868 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3869 if (!OutgoingArg)
3870 std::tie(OutgoingArg, ArgRC, Ty) =
3871 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3872 if (!OutgoingArg)
3873 return;
3874
3875 const ArgDescriptor *IncomingArgX = std::get<0>(
3876 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X));
3877 const ArgDescriptor *IncomingArgY = std::get<0>(
3878 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
3879 const ArgDescriptor *IncomingArgZ = std::get<0>(
3880 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z));
3881
3882 SDValue InputReg;
3883 SDLoc SL;
3884
3885 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3886 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3887 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3888
3889 // If incoming ids are not packed we need to pack them.
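// Packed workitem IDs occupy a single VGPR: X in bits [9:0], Y in bits
// [19:10] and Z in bits [29:20], which is why Y and Z are shifted by 10 and
// 20 below before being ORed in.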
3890 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3891 NeedWorkItemIDX) {
3892 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3893 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3894 } else {
3895 InputReg = DAG.getConstant(0, DL, MVT::i32);
3896 }
3897 }
3898
3899 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3900 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3901 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3902 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3903 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3904 InputReg = InputReg.getNode()
3905 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3906 : Y;
3907 }
3908
3909 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3910 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3911 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3912 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3913 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3914 InputReg = InputReg.getNode()
3915 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3916 : Z;
3917 }
3918
3919 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3920 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3921 // We're in a situation where the outgoing function requires the workitem
3922 // ID, but the calling function does not have it (e.g. a graphics function
3923 // calling a C calling convention function). This is illegal, but we need
3924 // to produce something.
3925 InputReg = DAG.getPOISON(MVT::i32);
3926 } else {
3927 // Workitem IDs are already packed; any of the present incoming arguments
3928 // will carry all required fields.
3929 ArgDescriptor IncomingArg =
3930 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3931 : IncomingArgY ? *IncomingArgY
3932 : *IncomingArgZ,
3933 ~0u);
3934 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3935 }
3936 }
3937
3938 if (OutgoingArg->isRegister()) {
3939 if (InputReg)
3940 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3941
3942 CCInfo.AllocateReg(OutgoingArg->getRegister());
3943 } else {
3944 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3945 if (InputReg) {
3946 SDValue ArgStore =
3947 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3948 MemOpChains.push_back(ArgStore);
3949 }
3950 }
3951}
3952
3953 bool SITargetLowering::isEligibleForTailCallOptimization(
3954 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3955 const SmallVectorImpl<ISD::OutputArg> &Outs,
3956 const SmallVectorImpl<SDValue> &OutVals,
3957 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3958 if (AMDGPU::isChainCC(CalleeCC))
3959 return true;
3960
3961 if (!AMDGPU::mayTailCallThisCC(CalleeCC))
3962 return false;
3963
3964 // For a divergent call target, we need to do a waterfall loop over the
3965 // possible callees which precludes us from using a simple jump.
3966 if (Callee->isDivergent())
3967 return false;
3968
3969 MachineFunction &MF = DAG.getMachineFunction();
3970 const Function &CallerF = MF.getFunction();
3971 CallingConv::ID CallerCC = CallerF.getCallingConv();
3972 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
3973 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3974
3975 // Kernels aren't callable, and don't have a live in return address so it
3976 // doesn't make sense to do a tail call with entry functions.
3977 if (!CallerPreserved)
3978 return false;
3979
3980 bool CCMatch = CallerCC == CalleeCC;
3981
3982 if (MF.getTarget().Options.GuaranteedTailCallOpt) {
3983 if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
3984 return true;
3985 return false;
3986 }
3987
3988 // TODO: Can we handle var args?
3989 if (IsVarArg)
3990 return false;
3991
3992 for (const Argument &Arg : CallerF.args()) {
3993 if (Arg.hasByValAttr())
3994 return false;
3995 }
3996
3997 LLVMContext &Ctx = *DAG.getContext();
3998
3999 // Check that the call results are passed in the same way.
4000 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
4001 CCAssignFnForCall(CalleeCC, IsVarArg),
4002 CCAssignFnForCall(CallerCC, IsVarArg)))
4003 return false;
4004
4005 // The callee has to preserve all registers the caller needs to preserve.
4006 if (!CCMatch) {
4007 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4008 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4009 return false;
4010 }
4011
4012 // Nothing more to check if the callee is taking no arguments.
4013 if (Outs.empty())
4014 return true;
4015
4016 SmallVector<CCValAssign, 16> ArgLocs;
4017 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4018
4019 // FIXME: We are not allocating special input registers, so we will be
4020 // deciding based on incorrect register assignments.
4021 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
4022
4023 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4024 // If the stack arguments for this call do not fit into our own save area then
4025 // the call cannot be made tail.
4026 // TODO: Is this really necessary?
4027 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
4028 return false;
4029
4030 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
4031 // FIXME: What about inreg arguments that end up passed in memory?
4032 if (!CCVA.isRegLoc())
4033 continue;
4034
4035 // If we are passing an argument in an SGPR, and the value is divergent,
4036 // this call requires a waterfall loop.
4037 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4038 LLVM_DEBUG(
4039 dbgs() << "Cannot tail call due to divergent outgoing argument in "
4040 << printReg(CCVA.getLocReg(), TRI) << '\n');
4041 return false;
4042 }
4043 }
4044
4045 const MachineRegisterInfo &MRI = MF.getRegInfo();
4046 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
4047}
4048
4049 bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
4050 if (!CI->isTailCall())
4051 return false;
4052
4053 const Function *ParentFn = CI->getParent()->getParent();
4054 if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
4055 return false;
4056 return true;
4057}
4058
4059namespace {
4060// Chain calls have special arguments that we need to handle. These are
4061// tagging along at the end of the arguments list(s), after the SGPR and VGPR
4062// arguments (index 0 and 1 respectively).
4063enum ChainCallArgIdx {
4064 Exec = 2,
4065 Flags,
4066 NumVGPRs,
4067 FallbackExec,
4068 FallbackCallee
4069};
4070} // anonymous namespace
4071
4072// The wave scratch offset register is used as the global base pointer.
4073 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
4074 SmallVectorImpl<SDValue> &InVals) const {
4075 CallingConv::ID CallConv = CLI.CallConv;
4076 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
4077
4078 SelectionDAG &DAG = CLI.DAG;
4079
4080 const SDLoc &DL = CLI.DL;
4081 SDValue Chain = CLI.Chain;
4082 SDValue Callee = CLI.Callee;
4083
4084 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
4085 bool UsesDynamicVGPRs = false;
4086 if (IsChainCallConv) {
4087 // The last arguments should be the value that we need to put in EXEC,
4088 // followed by the flags and any other arguments with special meanings.
4089 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
4090 // we don't treat them like the "real" arguments.
4091 auto RequestedExecIt =
4092 llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
4093 return Arg.OrigArgIndex == 2;
4094 });
4095 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
4096
4097 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
4098 CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
4099 CLI.OutVals.end());
4100 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
4101
4102 assert(CLI.Outs.back().OrigArgIndex < 2 &&
4103 "Haven't popped all the special args");
4104
4105 TargetLowering::ArgListEntry RequestedExecArg =
4106 CLI.Args[ChainCallArgIdx::Exec];
4107 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
4108 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
4109
4110 // Convert constants into TargetConstants, so they become immediate operands
4111 // instead of being selected into S_MOV.
4112 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
4113 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
4114 ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
4115 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
4116 } else
4117 ChainCallSpecialArgs.push_back(Arg.Node);
4118 };
4119
4120 PushNodeOrTargetConstant(RequestedExecArg);
4121
4122 // Process any other special arguments depending on the value of the flags.
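// Flag bit 0 selects dynamic-VGPR mode, which expects three extra operands:
// the VGPR count, a fallback EXEC mask and a fallback callee (see
// ChainCallArgIdx above). A flags value of zero allows no extra operands.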
4123 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
4124
4125 const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
4126 if (FlagsValue.isZero()) {
4127 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
4128 return lowerUnhandledCall(CLI, InVals,
4129 "no additional args allowed if flags == 0");
4130 } else if (FlagsValue.isOneBitSet(0)) {
4131 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4132 return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
4133 }
4134
4135 if (!Subtarget->isWave32()) {
4136 return lowerUnhandledCall(
4137 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
4138 }
4139
4140 UsesDynamicVGPRs = true;
4141 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
4142 CLI.Args.end(), PushNodeOrTargetConstant);
4143 }
4144 }
4145
4146 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
4147 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
4148 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
4149 bool &IsTailCall = CLI.IsTailCall;
4150 bool IsVarArg = CLI.IsVarArg;
4151 bool IsSibCall = false;
4152 MachineFunction &MF = DAG.getMachineFunction();
4153
4154 if (Callee.isUndef() || isNullConstant(Callee)) {
4155 if (!CLI.IsTailCall) {
4156 for (ISD::InputArg &Arg : CLI.Ins)
4157 InVals.push_back(DAG.getPOISON(Arg.VT));
4158 }
4159
4160 return Chain;
4161 }
4162
4163 if (IsVarArg) {
4164 return lowerUnhandledCall(CLI, InVals,
4165 "unsupported call to variadic function ");
4166 }
4167
4168 if (!CLI.CB)
4169 return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
4170
4171 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
4172 return lowerUnhandledCall(CLI, InVals,
4173 "unsupported required tail call to function ");
4174 }
4175
4176 if (IsTailCall) {
4177 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
4178 Outs, OutVals, Ins, DAG);
4179 if (!IsTailCall &&
4180 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
4181 report_fatal_error("failed to perform tail call elimination on a call "
4182 "site marked musttail or on llvm.amdgcn.cs.chain");
4183 }
4184
4185 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4186
4187 // A sibling call is one where we're under the usual C ABI and not planning
4188 // to change that but can still do a tail call:
4189 if (!TailCallOpt && IsTailCall)
4190 IsSibCall = true;
4191
4192 if (IsTailCall)
4193 ++NumTailCalls;
4194 }
4195
4196 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4197 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
4198 SmallVector<SDValue, 8> MemOpChains;
4199
4200 // Analyze operands of the call, assigning locations to each operand.
4201 SmallVector<CCValAssign, 16> ArgLocs;
4202 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4203 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
4204
4205 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
4207 // With a fixed ABI, allocate fixed registers before user arguments.
4208 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
4209 }
4210
4211 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
4212
4213 // Get a count of how many bytes are to be pushed on the stack.
4214 unsigned NumBytes = CCInfo.getStackSize();
4215
4216 if (IsSibCall) {
4217 // Since we're not changing the ABI to make this a tail call, the memory
4218 // operands are already available in the caller's incoming argument space.
4219 NumBytes = 0;
4220 }
4221
4222 // FPDiff is the byte offset of the call's argument area from the callee's.
4223 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4224 // by this amount for a tail call. In a sibling call it must be 0 because the
4225 // caller will deallocate the entire stack and the callee still expects its
4226 // arguments to begin at SP+0. Completely unused for non-tail calls.
4227 int32_t FPDiff = 0;
4228 MachineFrameInfo &MFI = MF.getFrameInfo();
4229 auto *TRI = Subtarget->getRegisterInfo();
4230
4231 // Adjust the stack pointer for the new arguments...
4232 // These operations are automatically eliminated by the prolog/epilog pass
4233 if (!IsSibCall)
4234 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
4235
4236 if (!IsSibCall || IsChainCallConv) {
4237 if (!Subtarget->enableFlatScratch()) {
4238 SmallVector<SDValue, 4> CopyFromChains;
4239
4240 // In the HSA case, this should be an identity copy.
4241 SDValue ScratchRSrcReg =
4242 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
4243 RegsToPass.emplace_back(IsChainCallConv
4244 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4245 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4246 ScratchRSrcReg);
4247 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
4248 Chain = DAG.getTokenFactor(DL, CopyFromChains);
4249 }
4250 }
4251
4252 const unsigned NumSpecialInputs = RegsToPass.size();
4253
4254 MVT PtrVT = MVT::i32;
4255
4256 // Walk the register/memloc assignments, inserting copies/loads.
4257 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4258 CCValAssign &VA = ArgLocs[i];
4259 SDValue Arg = OutVals[i];
4260
4261 // Promote the value if needed.
4262 switch (VA.getLocInfo()) {
4263 case CCValAssign::Full:
4264 break;
4265 case CCValAssign::BCvt:
4266 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4267 break;
4268 case CCValAssign::ZExt:
4269 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4270 break;
4271 case CCValAssign::SExt:
4272 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
4273 break;
4274 case CCValAssign::AExt:
4275 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
4276 break;
4277 case CCValAssign::FPExt:
4278 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
4279 break;
4280 default:
4281 llvm_unreachable("Unknown loc info!");
4282 }
4283
4284 if (VA.isRegLoc()) {
4285 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
4286 } else {
4287 assert(VA.isMemLoc());
4288
4289 SDValue DstAddr;
4290 MachinePointerInfo DstInfo;
4291
4292 unsigned LocMemOffset = VA.getLocMemOffset();
4293 int32_t Offset = LocMemOffset;
4294
4295 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
4296 MaybeAlign Alignment;
4297
4298 if (IsTailCall) {
4299 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4300 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4301 : VA.getValVT().getStoreSize();
4302
4303 // FIXME: We can have better than the minimum byval required alignment.
4304 Alignment =
4305 Flags.isByVal()
4306 ? Flags.getNonZeroByValAlign()
4307 : commonAlignment(Subtarget->getStackAlignment(), Offset);
4308
4309 Offset = Offset + FPDiff;
4310 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
4311
4312 DstAddr = DAG.getFrameIndex(FI, PtrVT);
4313 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4314
4315 // Make sure any stack arguments overlapping with where we're storing
4316 // are loaded before this eventual operation. Otherwise they'll be
4317 // clobbered.
4318
4319 // FIXME: Why is this really necessary? This seems to just result in a
4320 // lot of code to copy the stack and write them back to the same
4321 // locations, which are supposed to be immutable?
4322 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
4323 } else {
4324 // Stores to the argument stack area are relative to the stack pointer.
4325 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
4326 MVT::i32);
4327 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
4328 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
4329 Alignment =
4330 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
4331 }
4332
4333 if (Outs[i].Flags.isByVal()) {
4334 SDValue SizeNode =
4335 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4336 SDValue Cpy =
4337 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
4338 Outs[i].Flags.getNonZeroByValAlign(),
4339 /*isVol = */ false, /*AlwaysInline = */ true,
4340 /*CI=*/nullptr, std::nullopt, DstInfo,
4341 MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
4342
4343 MemOpChains.push_back(Cpy);
4344 } else {
4345 SDValue Store =
4346 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4347 MemOpChains.push_back(Store);
4348 }
4349 }
4350 }
4351
4352 if (!MemOpChains.empty())
4353 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4354
4355 SDValue ReadFirstLaneID =
4356 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4357
4358 SDValue TokenGlue;
4359 if (CLI.ConvergenceControlToken) {
4360 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4361 CLI.ConvergenceControlToken);
4362 }
4363
4364 // Build a sequence of copy-to-reg nodes chained together with token chain
4365 // and flag operands which copy the outgoing args into the appropriate regs.
4366 SDValue InGlue;
4367
4368 unsigned ArgIdx = 0;
4369 for (auto [Reg, Val] : RegsToPass) {
4370 if (ArgIdx++ >= NumSpecialInputs &&
4371 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4372 // For chain calls, the inreg arguments are required to be
4373 // uniform. Speculatively insert a readfirstlane in case we cannot prove
4374 // they are uniform.
4375 //
4376 // For other calls, if an inreg argument is known to be uniform,
4377 // speculatively insert a readfirstlane in case it is in a VGPR.
4378 //
4379 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4380 // value, so let that continue to produce invalid code.
4381
4382 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4383 if (TokenGlue)
4384 ReadfirstlaneArgs.push_back(TokenGlue);
4385 Val = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Val.getValueType(),
4386 ReadfirstlaneArgs);
4387 }
4388
4389 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
4390 InGlue = Chain.getValue(1);
4391 }
4392
4393 // We don't usually want to end the call-sequence here because we would tidy
4394 // the frame up *after* the call, however in the ABI-changing tail-call case
4395 // we've carefully laid out the parameters so that when sp is reset they'll be
4396 // in the correct location.
4397 if (IsTailCall && !IsSibCall) {
4398 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
4399 InGlue = Chain.getValue(1);
4400 }
4401
4402 std::vector<SDValue> Ops({Chain});
4403
4404 // Add a redundant copy of the callee global which will not be legalized, as
4405 // we need direct access to the callee later.
4406 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
4407 const GlobalValue *GV = GSD->getGlobal();
4408 Ops.push_back(Callee);
4409 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
4410 } else {
4411 if (IsTailCall) {
4412 // isEligibleForTailCallOptimization considered whether the call target is
4413 // divergent, but we may still end up with a uniform value in a VGPR.
4414 // Insert a readfirstlane just in case.
4415 SDValue ReadFirstLaneID =
4416 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4417
4418 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4419 if (TokenGlue)
4420 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
4421 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
4422 ReadfirstlaneArgs);
4423 }
4424
4425 Ops.push_back(Callee);
4426 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
4427 }
4428
4429 if (IsTailCall) {
4430 // Each tail call may have to adjust the stack by a different amount, so
4431 // this information must travel along with the operation for eventual
4432 // consumption by emitEpilogue.
4433 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4434 }
4435
4436 if (IsChainCallConv)
4437 llvm::append_range(Ops, ChainCallSpecialArgs);
4438
4439 // Add argument registers to the end of the list so that they are known live
4440 // into the call.
4441 for (auto &[Reg, Val] : RegsToPass)
4442 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
4443
4444 // Add a register mask operand representing the call-preserved registers.
4445 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4446 assert(Mask && "Missing call preserved mask for calling convention");
4447 Ops.push_back(DAG.getRegisterMask(Mask));
4448
4449 if (SDValue Token = CLI.ConvergenceControlToken) {
4450 SmallVector<SDValue, 2> GlueOps;
4451 GlueOps.push_back(Token);
4452 if (InGlue)
4453 GlueOps.push_back(InGlue);
4454
4455 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
4456 MVT::Glue, GlueOps),
4457 0);
4458 }
4459
4460 if (InGlue)
4461 Ops.push_back(InGlue);
4462
4463 // If we're doing a tail call, use a TC_RETURN here rather than an
4464 // actual call instruction.
4465 if (IsTailCall) {
4466 MFI.setHasTailCall();
4467 unsigned OPC = AMDGPUISD::TC_RETURN;
4468 switch (CallConv) {
4469 case CallingConv::AMDGPU_Gfx:
4470 OPC = AMDGPUISD::TC_RETURN_GFX;
4471 break;
4472 case CallingConv::AMDGPU_CS_Chain:
4473 case CallingConv::AMDGPU_CS_ChainPreserve:
4474 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4475 : AMDGPUISD::TC_RETURN_CHAIN;
4476 break;
4477 }
4478
4479 // If the caller is a whole wave function, we need to use a special opcode
4480 // so we can patch up EXEC.
4481 if (Info->isWholeWaveFunction())
4483
4484 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4485 }
4486
4487 // Returns a chain and a flag for retval copy to use.
4488 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4489 Chain = Call.getValue(0);
4490 InGlue = Call.getValue(1);
4491
4492 uint64_t CalleePopBytes = NumBytes;
4493 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4494 if (!Ins.empty())
4495 InGlue = Chain.getValue(1);
4496
4497 // Handle result values, copying them out of physregs into vregs that we
4498 // return.
4499 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4500 InVals, /*IsThisReturn=*/false, SDValue());
4501}
4502
4503// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4504// except for:
4505 // 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4506 // 2. Scale size, where scale = wave-reduction(alloca-size) * wave-size
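// Scratch addressing is swizzled per lane, so reserving alloca-size bytes
// for every lane requires advancing the wave-uniform stack pointer by
// alloca-size * wave-size bytes.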
4508 SelectionDAG &DAG) const {
4509 const MachineFunction &MF = DAG.getMachineFunction();
4510 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4511
4512 SDLoc dl(Op);
4513 EVT VT = Op.getValueType();
4514 SDValue Chain = Op.getOperand(0);
4515 Register SPReg = Info->getStackPtrOffsetReg();
4516
4517 // Chain the dynamic stack allocation so that it doesn't modify the stack
4518 // pointer when other instructions are using the stack.
4519 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4520
4521 SDValue Size = Op.getOperand(1);
4522 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4523 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4524
4525 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4526 assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
4527 "Stack grows upwards for AMDGPU");
4528
4529 Chain = BaseAddr.getValue(1);
4530 Align StackAlign = TFL->getStackAlign();
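// The requested alignment also has to be scaled by the wave size: each
// lane's slice of the swizzled stack is only suitably aligned if the
// wave-level base address is aligned to alignment * wave-size.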
4531 if (Alignment > StackAlign) {
4532 uint64_t ScaledAlignment = Alignment.value()
4533 << Subtarget->getWavefrontSizeLog2();
4534 uint64_t StackAlignMask = ScaledAlignment - 1;
4535 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4536 DAG.getConstant(StackAlignMask, dl, VT));
4537 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4538 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4539 }
4540
4541 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4542 SDValue NewSP;
4543 if (isa<ConstantSDNode>(Size)) {
4544 // For constant sized alloca, scale alloca size by wave-size
4545 SDValue ScaledSize = DAG.getNode(
4546 ISD::SHL, dl, VT, Size,
4547 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4548 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4549 } else {
4550 // For dynamic sized alloca, perform wave-wide reduction to get max of
4551 // alloca size(divergent) and then scale it by wave-size
4552 SDValue WaveReduction =
4553 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4554 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4555 Size, DAG.getConstant(0, dl, MVT::i32));
4556 SDValue ScaledSize = DAG.getNode(
4557 ISD::SHL, dl, VT, Size,
4558 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4559 NewSP =
4560 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4561 SDValue ReadFirstLaneID =
4562 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4563 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4564 NewSP);
4565 }
4566
4567 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4568 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4569
4570 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4571}
4572
4573 SDValue SITargetLowering::lowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
4574 if (Op.getValueType() != MVT::i32)
4575 return Op; // Defer to cannot select error.
4576
4577 Register SP = getStackPointerRegisterToSaveRestore();
4578 SDLoc SL(Op);
4579
4580 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4581
4582 // Convert from wave uniform to swizzled vector address. This should protect
4583 // from any edge cases where the stacksave result isn't directly used with
4584 // stackrestore.
4585 SDValue VectorAddress =
4586 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4587 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4588}
4589
4590 SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
4591 SelectionDAG &DAG) const {
4592 SDLoc SL(Op);
4593 assert(Op.getValueType() == MVT::i32);
4594
4595 uint32_t BothRoundHwReg =
4596 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4597 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4598
4599 SDValue IntrinID =
4600 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4601 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4602 Op.getOperand(0), IntrinID, GetRoundBothImm);
4603
4604 // There are two rounding modes, one for f32 and one for f64/f16. We only
4605 // report in the standard value range if both are the same.
4606 //
4607 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4608 // ties away from zero is not supported, and the other values are rotated by
4609 // 1.
4610 //
4611 // If the two rounding modes are not the same, report a target defined value.
4612
4613 // Mode register rounding mode fields:
4614 //
4615 // [1:0] Single-precision round mode.
4616 // [3:2] Double/Half-precision round mode.
4617 //
4618 // 0 = nearest even; 1 = +infinity; 2 = -infinity; 3 = toward zero.
4619 //
4620 // Hardware Spec
4621 // Toward-0 3 0
4622 // Nearest Even 0 1
4623 // +Inf 1 2
4624 // -Inf 2 3
4625 // NearestAway0 N/A 4
4626 //
4627 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4628 // table we can index by the raw hardware mode.
4629 //
4630 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
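// For example, a raw mode of 0 (both fields round-to-nearest-even) selects
// the low 4 bits of the table, which hold 1, the standard FLT_ROUNDS
// 'to nearest' value.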
4631
4632 SDValue BitTable =
4633 DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64);
4634
4635 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4636 SDValue RoundModeTimesNumBits =
4637 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4638
4639 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4640 // knew only one mode was demanded.
4641 SDValue TableValue =
4642 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4643 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4644
4645 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4646 SDValue TableEntry =
4647 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4648
4649 // There's a gap in the 4-bit encoded table and actual enum values, so offset
4650 // if it's an extended value.
4651 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4652 SDValue IsStandardValue =
4653 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4654 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4655 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4656 TableEntry, EnumOffset);
4657
4658 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4659}
4660
4661 SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
4662 SelectionDAG &DAG) const {
4663 SDLoc SL(Op);
4664
4665 SDValue NewMode = Op.getOperand(1);
4666 assert(NewMode.getValueType() == MVT::i32);
4667
4668 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4669 // hardware MODE.fp_round values.
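// For example, FLT_ROUNDS 1 (to nearest) maps to hardware value 0
// (round-to-nearest-even) in both the f32 and f64/f16 fields.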
4670 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4671 uint32_t ClampedVal = std::min(
4672 static_cast<uint32_t>(ConstMode->getZExtValue()),
4673 static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64));
4674 NewMode = DAG.getConstant(
4675 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4676 } else {
4677 // If we know the input can only be one of the supported standard modes in
4678 // the range 0-3, we can use a simplified mapping to hardware values.
4679 KnownBits KB = DAG.computeKnownBits(NewMode);
4680 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4681 // The supported standard values are 0-3. The extended values start at 8. We
4682 // need to offset by 4 if the value is in the extended range.
4683
4684 if (UseReducedTable) {
4685 // Truncate to the low 32-bits.
4686 SDValue BitTable = DAG.getConstant(
4687 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4688
4689 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4690 SDValue RoundModeTimesNumBits =
4691 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4692
4693 NewMode =
4694 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4695
4696 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4697 // the table extracted bits into inline immediates.
4698 } else {
4699 // table_index = umin(value, value - 4)
4700 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
4701 SDValue BitTable =
4702 DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
4703
4704 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4705 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4706 SDValue IndexVal =
4707 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4708
4709 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4710 SDValue RoundModeTimesNumBits =
4711 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4712
4713 SDValue TableValue =
4714 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4715 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4716
4717 // No need to mask out the high bits since the setreg will ignore them
4718 // anyway.
4719 NewMode = TruncTable;
4720 }
4721
4722 // Insert a readfirstlane in case the value is a VGPR. We could do this
4723 // earlier and keep more operations scalar, but that interferes with
4724 // combining the source.
4725 SDValue ReadFirstLaneID =
4726 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4727 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4728 ReadFirstLaneID, NewMode);
4729 }
4730
4731 // N.B. The setreg will be later folded into s_round_mode on supported
4732 // targets.
4733 SDValue IntrinID =
4734 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4735 uint32_t BothRoundHwReg =
4736 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4737 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4738
4739 SDValue SetReg =
4740 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4741 IntrinID, RoundBothImm, NewMode);
4742
4743 return SetReg;
4744}
4745
4747 if (Op->isDivergent() &&
4748 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4749 // Cannot do I$ prefetch with divergent pointer.
4750 return SDValue();
4751
4752 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4756 break;
4758 if (Subtarget->hasSafeSmemPrefetch())
4759 break;
4760 [[fallthrough]];
4761 default:
4762 return SDValue();
4763 }
4764
4765 // I$ prefetch
4766 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4767 return SDValue();
4768
4769 return Op;
4770}
4771
4772// Work around DAG legality rules only based on the result type.
4774 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4775 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4776 EVT SrcVT = Src.getValueType();
4777
4778 if (SrcVT.getScalarType() != MVT::bf16)
4779 return Op;
4780
4781 SDLoc SL(Op);
4782 SDValue BitCast =
4783 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4784
4785 EVT DstVT = Op.getValueType();
4786 if (IsStrict)
4787 llvm_unreachable("Need STRICT_BF16_TO_FP");
4788
4789 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4790}
4791
4792 SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4793 SDLoc SL(Op);
4794 if (Op.getValueType() != MVT::i64)
4795 return Op;
4796
4797 uint32_t ModeHwReg =
4799 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4800 uint32_t TrapHwReg =
4802 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4803
4804 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4805 SDValue IntrinID =
4806 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4807 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4808 Op.getOperand(0), IntrinID, ModeHwRegImm);
4809 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4810 Op.getOperand(0), IntrinID, TrapHwRegImm);
4811 SDValue TokenReg =
4812 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4813 GetTrapReg.getValue(1));
4814
4815 SDValue CvtPtr =
4816 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4817 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4818
4819 return DAG.getMergeValues({Result, TokenReg}, SL);
4820}
4821
4822 SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4823 SDLoc SL(Op);
4824 if (Op.getOperand(1).getValueType() != MVT::i64)
4825 return Op;
4826
4827 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4828 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4829 DAG.getConstant(0, SL, MVT::i32));
4830 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4831 DAG.getConstant(1, SL, MVT::i32));
4832
4833 SDValue ReadFirstLaneID =
4834 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4835 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4836 ReadFirstLaneID, NewModeReg);
4837 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4838 ReadFirstLaneID, NewTrapReg);
4839
4840 unsigned ModeHwReg =
4842 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4843 unsigned TrapHwReg =
4845 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4846
4847 SDValue IntrinID =
4848 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4849 SDValue SetModeReg =
4850 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4851 IntrinID, ModeHwRegImm, NewModeReg);
4852 SDValue SetTrapReg =
4853 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4854 IntrinID, TrapHwRegImm, NewTrapReg);
4855 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4856}
4857
4858 Register SITargetLowering::getRegisterByName(const char *RegName, LLT VT,
4859 const MachineFunction &MF) const {
4860 const Function &Fn = MF.getFunction();
4861
4862 Register Reg = StringSwitch<Register>(RegName)
4863 .Case("m0", AMDGPU::M0)
4864 .Case("exec", AMDGPU::EXEC)
4865 .Case("exec_lo", AMDGPU::EXEC_LO)
4866 .Case("exec_hi", AMDGPU::EXEC_HI)
4867 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4868 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4869 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4870 .Default(Register());
4871 if (!Reg)
4872 return Reg;
4873
4874 if (!Subtarget->hasFlatScrRegister() &&
4875 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4876 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
4877 "\" for subtarget."));
4878 }
4879
4880 switch (Reg) {
4881 case AMDGPU::M0:
4882 case AMDGPU::EXEC_LO:
4883 case AMDGPU::EXEC_HI:
4884 case AMDGPU::FLAT_SCR_LO:
4885 case AMDGPU::FLAT_SCR_HI:
4886 if (VT.getSizeInBits() == 32)
4887 return Reg;
4888 break;
4889 case AMDGPU::EXEC:
4890 case AMDGPU::FLAT_SCR:
4891 if (VT.getSizeInBits() == 64)
4892 return Reg;
4893 break;
4894 default:
4895 llvm_unreachable("missing register type checking");
4896 }
4897
4898 report_fatal_error(
4899 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4900}
4901
4902// If kill is not the last instruction, split the block so kill is always a
4903// proper terminator.
4906 MachineBasicBlock *BB) const {
4907 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
4909 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4910 return SplitBB;
4911}
4912
4913 // Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4914// \p MI will be the only instruction in the loop body block. Otherwise, it will
4915// be the first instruction in the remainder block.
4916//
4917/// \returns { LoopBody, Remainder }
4918static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4920 MachineFunction *MF = MBB.getParent();
4921 MachineBasicBlock::iterator I(&MI);
4922
4923 // To insert the loop we need to split the block. Move everything after this
4924 // point to a new block, and insert a new empty block between the two.
4925 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
4926 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4927 MachineFunction::iterator MBBI(MBB);
4928 ++MBBI;
4929
4930 MF->insert(MBBI, LoopBB);
4931 MF->insert(MBBI, RemainderBB);
4932
4933 LoopBB->addSuccessor(LoopBB);
4934 LoopBB->addSuccessor(RemainderBB);
4935
4936 // Move the rest of the block into a new block.
4937 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4938
4939 if (InstInLoop) {
4940 auto Next = std::next(I);
4941
4942 // Move instruction to loop body.
4943 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4944
4945 // Move the rest of the block.
4946 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4947 } else {
4948 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4949 }
4950
4951 MBB.addSuccessor(LoopBB);
4952
4953 return std::pair(LoopBB, RemainderBB);
4954}
4955
4956/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4958 MachineBasicBlock *MBB = MI.getParent();
4960 auto I = MI.getIterator();
4961 auto E = std::next(I);
4962
4963 // clang-format off
4964 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4965 .addImm(0);
4966 // clang-format on
4967
4968 MIBundleBuilder Bundler(*MBB, I, E);
4969 finalizeBundle(*MBB, Bundler.begin());
4970}
4971
4974 MachineBasicBlock *BB) const {
4975 const DebugLoc &DL = MI.getDebugLoc();
4976
4978
4980
4981 // Apparently kill flags are only valid if the def is in the same block?
4982 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4983 Src->setIsKill(false);
4984
4985 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
4986
4987 MachineBasicBlock::iterator I = LoopBB->end();
4988
4989 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4990 AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
4991
4992 // Clear TRAP_STS.MEM_VIOL
4993 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4994 .addImm(0)
4995 .addImm(EncodedReg);
4996
4998
4999 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5000
5001 // Load and check TRAP_STS.MEM_VIOL
5002 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
5003 .addImm(EncodedReg);
5004
5005 // FIXME: Do we need to use an isel pseudo that may clobber scc?
5006 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5007 .addReg(Reg, RegState::Kill)
5008 .addImm(0);
5009 // clang-format off
5010 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5011 .addMBB(LoopBB);
5012 // clang-format on
5013
5014 return RemainderBB;
5015}
5016
5017 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p Idx in the
5018// wavefront. If the value is uniform and just happens to be in a VGPR, this
5019// will only do one iteration. In the worst case, this will loop 64 times.
5020//
5021// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
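// Roughly, the loop built below looks like this (wave64 flavor; register
// names are illustrative, and there are PHIs for the result and the saved
// EXEC mask at the top of the block):
//   LoopBB:
//     v_readfirstlane_b32 s_idx, v_idx           ; take one lane's index
//     v_cmp_eq_u32_e64    s[cond], s_idx, v_idx  ; all lanes with that index
//     s_and_saveexec_b64  s[save], s[cond]       ; EXEC &= cond, old EXEC -> s[save]
//     s_mov_b32 m0, s_idx                        ; or s_add_i32 for an offset /
//                                                ; GPR-index mode
//     ...indexed access inserted here by the caller...
//     s_xor_b64 exec, exec, s[save]              ; keep only lanes still to do
//     s_cbranch_execnz LoopBB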
5024 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
5025 const DebugLoc &DL, const MachineOperand &Idx,
5026 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
5027 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
5028 Register &SGPRIdxReg) {
5029
5030 MachineFunction *MF = OrigBB.getParent();
5031 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5032 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5035
5036 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5037 Register PhiExec = MRI.createVirtualRegister(BoolRC);
5038 Register NewExec = MRI.createVirtualRegister(BoolRC);
5039 Register CurrentIdxReg =
5040 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5041 Register CondReg = MRI.createVirtualRegister(BoolRC);
5042
5043 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
5044 .addReg(InitReg)
5045 .addMBB(&OrigBB)
5046 .addReg(ResultReg)
5047 .addMBB(&LoopBB);
5048
5049 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
5050 .addReg(InitSaveExecReg)
5051 .addMBB(&OrigBB)
5052 .addReg(NewExec)
5053 .addMBB(&LoopBB);
5054
5055 // Read the next variant; this is also the loop target.
5056 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5057 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
5058
5059 // Compare the value just read (the prospective M0 value) against Idx in all lanes.
5060 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5061 .addReg(CurrentIdxReg)
5062 .addReg(Idx.getReg(), 0, Idx.getSubReg());
5063
5064 // Update EXEC, saving the prior EXEC value into NewExec.
5065 BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec)
5066 .addReg(CondReg, RegState::Kill);
5067
5068 MRI.setSimpleHint(NewExec, CondReg);
5069
5070 if (UseGPRIdxMode) {
5071 if (Offset == 0) {
5072 SGPRIdxReg = CurrentIdxReg;
5073 } else {
5074 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5075 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5076 .addReg(CurrentIdxReg, RegState::Kill)
5077 .addImm(Offset);
5078 }
5079 } else {
5080 // Move index from VCC into M0
5081 if (Offset == 0) {
5082 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5083 .addReg(CurrentIdxReg, RegState::Kill);
5084 } else {
5085 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5086 .addReg(CurrentIdxReg, RegState::Kill)
5087 .addImm(Offset);
5088 }
5089 }
5090
5091 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
5092 MachineInstr *InsertPt =
5093 BuildMI(LoopBB, I, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg)
5094 .addReg(LMC.ExecReg)
5095 .addReg(NewExec);
5096
5097 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
5098 // s_cbranch_scc0?
5099
5100 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
5101 // clang-format off
5102 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5103 .addMBB(&LoopBB);
5104 // clang-format on
5105
5106 return InsertPt->getIterator();
5107}
5108
5109 // This has slightly sub-optimal regalloc when the source vector is killed by
5110 // the read. The register allocator does not understand that the kill is
5111 // per-workitem, so the source is kept live for the whole loop. As a result we
5112 // do not reuse a subregister from it and use one more VGPR than necessary.
5113 // This extra VGPR was saved when this was expanded after register allocation.
5116 unsigned InitResultReg, unsigned PhiReg, int Offset,
5117 bool UseGPRIdxMode, Register &SGPRIdxReg) {
5118 MachineFunction *MF = MBB.getParent();
5119 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5120 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5122 const DebugLoc &DL = MI.getDebugLoc();
5124
5125 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
5126 Register DstReg = MI.getOperand(0).getReg();
5127 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
5128 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
5130
5131 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
5132
5133 // Save the EXEC mask
5134 // clang-format off
5135 BuildMI(MBB, I, DL, TII->get(LMC.MovOpc), SaveExec)
5136 .addReg(LMC.ExecReg);
5137 // clang-format on
5138
5139 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
5140
5141 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5142
5143 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
5144 InitResultReg, DstReg, PhiReg, TmpExec,
5145 Offset, UseGPRIdxMode, SGPRIdxReg);
5146
5147 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
5149 ++MBBI;
5150 MF->insert(MBBI, LandingPad);
5151 LoopBB->removeSuccessor(RemainderBB);
5152 LandingPad->addSuccessor(RemainderBB);
5153 LoopBB->addSuccessor(LandingPad);
5154 MachineBasicBlock::iterator First = LandingPad->begin();
5155 // clang-format off
5156 BuildMI(*LandingPad, First, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
5157 .addReg(SaveExec);
5158 // clang-format on
5159
5160 return InsPt;
5161}
5162
5163// Returns subreg index, offset
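// For example (illustrative), with a 128-bit (4 x 32-bit) vector register
// class:
//   Offset 2 -> { sub2, 0 }  (statically in range, folded into the subregister)
//   Offset 5 -> { sub0, 5 }  (out of bounds, keep the dynamic offset as-is)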
5164static std::pair<unsigned, int>
5166 const TargetRegisterClass *SuperRC, unsigned VecReg,
5167 int Offset) {
5168 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
5169
5170 // Skip out of bounds offsets, or else we would end up using an undefined
5171 // register.
5172 if (Offset >= NumElts || Offset < 0)
5173 return std::pair(AMDGPU::sub0, Offset);
5174
5175 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
5176}
5177
5180 int Offset) {
5181 MachineBasicBlock *MBB = MI.getParent();
5182 const DebugLoc &DL = MI.getDebugLoc();
5184
5185 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5186
5187 assert(Idx->getReg() != AMDGPU::NoRegister);
5188
5189 if (Offset == 0) {
5190 // clang-format off
5191 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5192 .add(*Idx);
5193 // clang-format on
5194 } else {
5195 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5196 .add(*Idx)
5197 .addImm(Offset);
5198 }
5199}
5200
5203 int Offset) {
5204 MachineBasicBlock *MBB = MI.getParent();
5205 const DebugLoc &DL = MI.getDebugLoc();
5207
5208 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5209
5210 if (Offset == 0)
5211 return Idx->getReg();
5212
5213 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5214 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
5215 .add(*Idx)
5216 .addImm(Offset);
5217 return Tmp;
5218}
5219
5222 const GCNSubtarget &ST) {
5223 const SIInstrInfo *TII = ST.getInstrInfo();
5224 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5225 MachineFunction *MF = MBB.getParent();
5227
5228 Register Dst = MI.getOperand(0).getReg();
5229 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5230 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5231 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5232
5233 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
5234 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5235
5236 unsigned SubReg;
5237 std::tie(SubReg, Offset) =
5238 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
5239
5240 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5241
5242 // Check for an SGPR index.
5243 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5245 const DebugLoc &DL = MI.getDebugLoc();
5246
5247 if (UseGPRIdxMode) {
5248 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5249 // to avoid interfering with other uses, so probably requires a new
5250 // optimization pass.
5252
5253 const MCInstrDesc &GPRIDXDesc =
5254 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5255 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5256 .addReg(SrcReg)
5257 .addReg(Idx)
5258 .addImm(SubReg);
5259 } else {
5261
5262 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5263 .addReg(SrcReg, 0, SubReg)
5264 .addReg(SrcReg, RegState::Implicit);
5265 }
5266
5267 MI.eraseFromParent();
5268
5269 return &MBB;
5270 }
5271
5272 // Control flow needs to be inserted if indexing with a VGPR.
5273 const DebugLoc &DL = MI.getDebugLoc();
5275
5276 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5277 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5278
5279 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
5280
5281 Register SGPRIdxReg;
5282 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
5283 UseGPRIdxMode, SGPRIdxReg);
5284
5285 MachineBasicBlock *LoopBB = InsPt->getParent();
5286
5287 if (UseGPRIdxMode) {
5288 const MCInstrDesc &GPRIDXDesc =
5289 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5290
5291 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5292 .addReg(SrcReg)
5293 .addReg(SGPRIdxReg)
5294 .addImm(SubReg);
5295 } else {
5296 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5297 .addReg(SrcReg, 0, SubReg)
5298 .addReg(SrcReg, RegState::Implicit);
5299 }
5300
5301 MI.eraseFromParent();
5302
5303 return LoopBB;
5304}
5305
5308 const GCNSubtarget &ST) {
5309 const SIInstrInfo *TII = ST.getInstrInfo();
5310 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5311 MachineFunction *MF = MBB.getParent();
5313
5314 Register Dst = MI.getOperand(0).getReg();
5315 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
5316 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5317 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
5318 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5319 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
5320 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5321
5322 // This can be an immediate, but will be folded later.
5323 assert(Val->getReg());
5324
5325 unsigned SubReg;
5326 std::tie(SubReg, Offset) =
5327 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
5328 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5329
5330 if (Idx->getReg() == AMDGPU::NoRegister) {
5332 const DebugLoc &DL = MI.getDebugLoc();
5333
5334 assert(Offset == 0);
5335
5336 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
5337 .add(*SrcVec)
5338 .add(*Val)
5339 .addImm(SubReg);
5340
5341 MI.eraseFromParent();
5342 return &MBB;
5343 }
5344
5345 // Check for an SGPR index.
5346 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5348 const DebugLoc &DL = MI.getDebugLoc();
5349
5350 if (UseGPRIdxMode) {
5352
5353 const MCInstrDesc &GPRIDXDesc =
5354 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5355 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5356 .addReg(SrcVec->getReg())
5357 .add(*Val)
5358 .addReg(Idx)
5359 .addImm(SubReg);
5360 } else {
5362
5363 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5364 TRI.getRegSizeInBits(*VecRC), 32, false);
5365 BuildMI(MBB, I, DL, MovRelDesc, Dst)
5366 .addReg(SrcVec->getReg())
5367 .add(*Val)
5368 .addImm(SubReg);
5369 }
5370 MI.eraseFromParent();
5371 return &MBB;
5372 }
5373
5374 // Control flow needs to be inserted if indexing with a VGPR.
5375 if (Val->isReg())
5376 MRI.clearKillFlags(Val->getReg());
5377
5378 const DebugLoc &DL = MI.getDebugLoc();
5379
5380 Register PhiReg = MRI.createVirtualRegister(VecRC);
5381
5382 Register SGPRIdxReg;
5383 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5384 UseGPRIdxMode, SGPRIdxReg);
5385 MachineBasicBlock *LoopBB = InsPt->getParent();
5386
5387 if (UseGPRIdxMode) {
5388 const MCInstrDesc &GPRIDXDesc =
5389 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5390
5391 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5392 .addReg(PhiReg)
5393 .add(*Val)
5394 .addReg(SGPRIdxReg)
5395 .addImm(SubReg);
5396 } else {
5397 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5398 TRI.getRegSizeInBits(*VecRC), 32, false);
5399 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5400 .addReg(PhiReg)
5401 .add(*Val)
5402 .addImm(SubReg);
5403 }
5404
5405 MI.eraseFromParent();
5406 return LoopBB;
5407}
5408
5410 MachineBasicBlock *BB) {
5411 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5412 // For GFX12, we emit s_add_u64 and s_sub_u64.
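// Roughly, the pre-GFX12 expansion is (add shown; sub uses s_sub_u32 /
// s_subb_u32):
//   s_add_u32  dst.lo, src0.lo, src1.lo   ; SCC <- carry out
//   s_addc_u32 dst.hi, src0.hi, src1.hi   ; carry in from SCC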
5413 MachineFunction *MF = BB->getParent();
5414 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5415 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5417 const DebugLoc &DL = MI.getDebugLoc();
5418 MachineOperand &Dest = MI.getOperand(0);
5419 MachineOperand &Src0 = MI.getOperand(1);
5420 MachineOperand &Src1 = MI.getOperand(2);
5421 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5422 if (ST.hasScalarAddSub64()) {
5423 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5424 // clang-format off
5425 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5426 .add(Src0)
5427 .add(Src1);
5428 // clang-format on
5429 } else {
5430 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5431 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5432
5433 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5434 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5435
5436 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5437 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5438 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5439 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5440
5441 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5442 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5443 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5444 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5445
5446 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5447 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5448 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
5449 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
5450 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5451 .addReg(DestSub0)
5452 .addImm(AMDGPU::sub0)
5453 .addReg(DestSub1)
5454 .addImm(AMDGPU::sub1);
5455 }
5456 MI.eraseFromParent();
5457 return BB;
5458}
5459
5461 switch (Opc) {
5462 case AMDGPU::S_MIN_U32:
5463 return std::numeric_limits<uint32_t>::max();
5464 case AMDGPU::S_MIN_I32:
5465 return std::numeric_limits<int32_t>::max();
5466 case AMDGPU::S_MAX_U32:
5467 return std::numeric_limits<uint32_t>::min();
5468 case AMDGPU::S_MAX_I32:
5469 return std::numeric_limits<int32_t>::min();
5470 case AMDGPU::S_ADD_I32:
5471 case AMDGPU::S_SUB_I32:
5472 case AMDGPU::S_OR_B32:
5473 case AMDGPU::S_XOR_B32:
5474 return std::numeric_limits<uint32_t>::min();
5475 case AMDGPU::S_AND_B32:
5476 return std::numeric_limits<uint32_t>::max();
5477 default:
5479 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5480 }
5481}
5482
5484 switch (Opc) {
5485 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5486 return std::numeric_limits<uint64_t>::max();
5487 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5488 return std::numeric_limits<int64_t>::max();
5489 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5490 return std::numeric_limits<uint64_t>::min();
5491 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5492 return std::numeric_limits<int64_t>::min();
5493 case AMDGPU::S_ADD_U64_PSEUDO:
5494 case AMDGPU::S_SUB_U64_PSEUDO:
5495 case AMDGPU::S_OR_B64:
5496 case AMDGPU::S_XOR_B64:
5497 return std::numeric_limits<uint64_t>::min();
5498 case AMDGPU::S_AND_B64:
5499 return std::numeric_limits<uint64_t>::max();
5500 default:
5502 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5503 }
5504}
5505
5506static bool is32bitWaveReduceOperation(unsigned Opc) {
5507 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5508 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5509 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5510 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5511 Opc == AMDGPU::S_XOR_B32;
5512}
5513
5516 const GCNSubtarget &ST,
5517 unsigned Opc) {
5519 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5520 const DebugLoc &DL = MI.getDebugLoc();
5521 const SIInstrInfo *TII = ST.getInstrInfo();
5522
5523 // Reduction operations depend on whether the input operand is an SGPR or a VGPR.
5524 Register SrcReg = MI.getOperand(1).getReg();
5525 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5526 Register DstReg = MI.getOperand(0).getReg();
5527 MachineBasicBlock *RetBB = nullptr;
5528 if (isSGPR) {
5529 switch (Opc) {
5530 case AMDGPU::S_MIN_U32:
5531 case AMDGPU::S_MIN_I32:
5532 case AMDGPU::S_MAX_U32:
5533 case AMDGPU::S_MAX_I32:
5534 case AMDGPU::S_AND_B32:
5535 case AMDGPU::S_OR_B32: {
5536 // Idempotent operations.
5537 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5538 RetBB = &BB;
5539 break;
5540 }
5541 case AMDGPU::V_CMP_LT_U64_e64: // umin
5542 case AMDGPU::V_CMP_LT_I64_e64: // min
5543 case AMDGPU::V_CMP_GT_U64_e64: // umax
5544 case AMDGPU::V_CMP_GT_I64_e64: // max
5545 case AMDGPU::S_AND_B64:
5546 case AMDGPU::S_OR_B64: {
5547 // Idempotent operations.
5548 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
5549 RetBB = &BB;
5550 break;
5551 }
5552 case AMDGPU::S_XOR_B32:
5553 case AMDGPU::S_XOR_B64:
5554 case AMDGPU::S_ADD_I32:
5555 case AMDGPU::S_ADD_U64_PSEUDO:
5556 case AMDGPU::S_SUB_I32:
5557 case AMDGPU::S_SUB_U64_PSEUDO: {
5558 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5559 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5560 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5561 Register NumActiveLanes =
5562 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5563
5564 bool IsWave32 = ST.isWave32();
5565 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5566 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5567 unsigned BitCountOpc =
5568 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5569
5570 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5571
5572 auto NewAccumulator =
5573 BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
5574 .addReg(ExecMask);
5575
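// For a uniform (SGPR) input the reduction has a closed form in terms of the
// active-lane count computed above: add reduces to src * popcount(exec), sub
// to -src * popcount(exec), and xor reduces to src for an odd number of
// active lanes and to 0 for an even number. The cases below compute that.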
5576 switch (Opc) {
5577 case AMDGPU::S_XOR_B32:
5578 case AMDGPU::S_XOR_B64: {
5579 // Performing an XOR operation on a uniform value
5580 // depends on the parity of the number of active lanes.
5581 // For even parity, the result will be 0, for odd
5582 // parity the result will be the same as the input value.
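// For example, with a uniform value V and five active lanes the wave-wide
// XOR is V^V^V^V^V = V, while with four active lanes the terms cancel to 0.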
5583 Register ParityRegister =
5584 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5585
5586 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5587 .addReg(NewAccumulator->getOperand(0).getReg())
5588 .addImm(1)
5589 .setOperandDead(3); // Dead scc
5590 if (Opc == AMDGPU::S_XOR_B32) {
5591 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5592 .addReg(SrcReg)
5593 .addReg(ParityRegister);
5594 } else {
5595 Register DestSub0 =
5596 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5597 Register DestSub1 =
5598 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5599
5600 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5601 const TargetRegisterClass *SrcSubRC =
5602 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5603
5604 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5605 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5606 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5607 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5608
5609 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5610 .add(Op1L)
5611 .addReg(ParityRegister);
5612
5613 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5614 .add(Op1H)
5615 .addReg(ParityRegister);
5616
5617 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5618 .addReg(DestSub0)
5619 .addImm(AMDGPU::sub0)
5620 .addReg(DestSub1)
5621 .addImm(AMDGPU::sub1);
5622 }
5623 break;
5624 }
5625 case AMDGPU::S_SUB_I32: {
5626 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5627
5628 // Take the negation of the source operand.
5629 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5630 .addImm(0)
5631 .addReg(SrcReg);
5632 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5633 .addReg(NegatedVal)
5634 .addReg(NewAccumulator->getOperand(0).getReg());
5635 break;
5636 }
5637 case AMDGPU::S_ADD_I32: {
5638 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5639 .addReg(SrcReg)
5640 .addReg(NewAccumulator->getOperand(0).getReg());
5641 break;
5642 }
5643 case AMDGPU::S_ADD_U64_PSEUDO:
5644 case AMDGPU::S_SUB_U64_PSEUDO: {
5645 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5646 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5647 Register Op1H_Op0L_Reg =
5648 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5649 Register Op1L_Op0H_Reg =
5650 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5651 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5652 Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5653 Register NegatedValLo =
5654 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5655 Register NegatedValHi =
5656 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5657
5658 const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
5659 const TargetRegisterClass *Src1SubRC =
5660 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5661
5662 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5663 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5664 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5665 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5666
5667 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5668 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5669 .addImm(0)
5670 .addReg(NewAccumulator->getOperand(0).getReg())
5671 .setOperandDead(3); // Dead scc
5672 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5673 .addReg(NegatedValLo)
5674 .addImm(31)
5675 .setOperandDead(3); // Dead scc
5676 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5677 .add(Op1L)
5678 .addReg(NegatedValHi);
5679 }
5680 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5681 ? NegatedValLo
5682 : NewAccumulator->getOperand(0).getReg();
5683 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5684 .add(Op1L)
5685 .addReg(LowOpcode);
5686 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5687 .add(Op1L)
5688 .addReg(LowOpcode);
5689 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5690 .add(Op1H)
5691 .addReg(LowOpcode);
5692
5693 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5694 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5695 .addReg(CarryReg)
5696 .addReg(Op1H_Op0L_Reg)
5697 .setOperandDead(3); // Dead scc
5698
5699 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5700 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5701 .addReg(HiVal)
5702 .addReg(Op1L_Op0H_Reg)
5703 .setOperandDead(3); // Dead scc
5704 }
5705 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5706 .addReg(DestSub0)
5707 .addImm(AMDGPU::sub0)
5708 .addReg(DestSub1)
5709 .addImm(AMDGPU::sub1);
5710 break;
5711 }
5712 }
5713 RetBB = &BB;
5714 }
5715 }
5716 } else {
5717 // TODO: Implement the DPP strategy and switch based on the immediate strategy
5718 // operand. For now, for all the cases (Default, Iterative and DPP) we use
5719 // the iterative approach.
5720
5721 // To reduce the VGPR with the iterative approach, we need to iterate
5722 // over all the active lanes. Lowering consists of ComputeLoop,
5723 // which iterates over only the active lanes. We use a copy of the EXEC
5724 // register as the induction variable, and each iteration clears the bit of
5725 // the lane just processed (bitset0) to obtain the next active lane.
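// Roughly, for a 32-bit reduction the loop looks like (wave64 shown, wave32
// uses the *_b32 forms; register names are illustrative):
//   ComputeLoop:
//     s_ff1_i32_b64  s_lane, s_active      ; index of the next active lane
//     v_readlane_b32 s_val, v_src, s_lane  ; fetch that lane's value
//     <op>           s_acc, s_acc, s_val   ; fold it into the accumulator
//     s_bitset0_b64  s_active, s_lane      ; mark the lane as processed
//     s_cmp_lg_u64   s_active, 0
//     s_cbranch_scc1 ComputeLoop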
5727 Register SrcReg = MI.getOperand(1).getReg();
5728 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5729
5730 // Create control flow for the loop:
5731 // split MI's machine basic block into the loop body and the remainder.
5732 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
5733
5734 // Create virtual registers required for lowering.
5735 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5736 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5737 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5738 Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
5739 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5740 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5741 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5742 Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5743 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
5744
5745 bool IsWave32 = ST.isWave32();
5746 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5747 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5748
5749 // Create the initial values of the induction variable (from EXEC) and the
5750 // accumulator, and insert a branch to the newly created ComputeLoop block.
5751 BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
5752 if (is32BitOpc) {
5754 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5755 .addImm(IdentityValue);
5756 } else {
5758 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
5759 .addImm(IdentityValue);
5760 }
5761 // clang-format off
5762 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5763 .addMBB(ComputeLoop);
5764 // clang-format on
5765
5766 // Start constructing ComputeLoop
5767 I = ComputeLoop->begin();
5768 auto Accumulator =
5769 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5770 .addReg(IdentityValReg)
5771 .addMBB(&BB);
5772 auto ActiveBits =
5773 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5774 .addReg(LoopIterator)
5775 .addMBB(&BB);
5776
5777 I = ComputeLoop->end();
5778 MachineInstr *NewAccumulator;
5779 // Perform the computations
5780 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5781 BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5782 .addReg(ActiveBitsReg);
5783 if (is32BitOpc) {
5784 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5785 LaneValueReg)
5786 .addReg(SrcReg)
5787 .addReg(FF1Reg);
5788 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5789 .addReg(Accumulator->getOperand(0).getReg())
5790 .addReg(LaneValueReg);
5791 } else {
5792 Register LaneValueLoReg =
5793 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5794 Register LaneValueHiReg =
5795 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5796 Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5797 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5798 const TargetRegisterClass *SrcSubRC =
5799 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5800 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5801 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5802 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5803 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5804 // lane value input should be in an sgpr
5805 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5806 LaneValueLoReg)
5807 .add(Op1L)
5808 .addReg(FF1Reg);
5809 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5810 LaneValueHiReg)
5811 .add(Op1H)
5812 .addReg(FF1Reg);
5813 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5814 TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
5815 .addReg(LaneValueLoReg)
5816 .addImm(AMDGPU::sub0)
5817 .addReg(LaneValueHiReg)
5818 .addImm(AMDGPU::sub1);
5819 switch (Opc) {
5820 case AMDGPU::S_OR_B64:
5821 case AMDGPU::S_AND_B64:
5822 case AMDGPU::S_XOR_B64: {
5823 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5824 .addReg(Accumulator->getOperand(0).getReg())
5825 .addReg(LaneValue->getOperand(0).getReg())
5826 .setOperandDead(3); // Dead scc
5827 break;
5828 }
5829 case AMDGPU::V_CMP_GT_I64_e64:
5830 case AMDGPU::V_CMP_GT_U64_e64:
5831 case AMDGPU::V_CMP_LT_I64_e64:
5832 case AMDGPU::V_CMP_LT_U64_e64: {
5833 Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
5834 Register ComparisonResultReg =
5835 MRI.createVirtualRegister(WaveMaskRegClass);
5836 const TargetRegisterClass *VregClass = TRI->getVGPR64Class();
5837 const TargetRegisterClass *VSubRegClass =
5838 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
5839 Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
5840 MachineOperand SrcReg0Sub0 =
5841 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5842 VregClass, AMDGPU::sub0, VSubRegClass);
5843 MachineOperand SrcReg0Sub1 =
5844 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5845 VregClass, AMDGPU::sub1, VSubRegClass);
5846 BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
5847 AccumulatorVReg)
5848 .add(SrcReg0Sub0)
5849 .addImm(AMDGPU::sub0)
5850 .add(SrcReg0Sub1)
5851 .addImm(AMDGPU::sub1);
5852 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
5853 .addReg(LaneValue->getOperand(0).getReg())
5854 .addReg(AccumulatorVReg);
5855
5856 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5857 BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
5858 .addReg(LaneMaskReg)
5859 .addReg(ActiveBitsReg);
5860
5861 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5862 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
5863 .addReg(LaneValue->getOperand(0).getReg())
5864 .addReg(Accumulator->getOperand(0).getReg());
5865 break;
5866 }
5867 case AMDGPU::S_ADD_U64_PSEUDO:
5868 case AMDGPU::S_SUB_U64_PSEUDO: {
5869 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5870 .addReg(Accumulator->getOperand(0).getReg())
5871 .addReg(LaneValue->getOperand(0).getReg());
5872 ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
5873 break;
5874 }
5875 }
5876 }
5877 // Manipulate the iterator to get the next active lane
5878 unsigned BITSETOpc =
5879 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5880 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5881 .addReg(FF1Reg)
5882 .addReg(ActiveBitsReg);
5883
5884 // Add phi nodes
5885 Accumulator.addReg(DstReg).addMBB(ComputeLoop);
5886 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
5887
5888 // Create the branch back to the loop header.
5889 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5890 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5891 .addReg(NewActiveBitsReg)
5892 .addImm(0);
5893 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5894 .addMBB(ComputeLoop);
5895
5896 RetBB = ComputeEnd;
5897 }
5898 MI.eraseFromParent();
5899 return RetBB;
5900}
5901
5904 MachineBasicBlock *BB) const {
5905
5907 MachineFunction *MF = BB->getParent();
5909
5910 switch (MI.getOpcode()) {
5911 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5912 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5913 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
5914 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
5915 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5916 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
5917 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
5918 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
5919 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5920 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5921 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
5922 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
5923 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5924 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
5925 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
5926 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
5927 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5928 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5929 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
5930 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
5931 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5932 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5933 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
5934 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
5935 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5936 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
5937 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
5938 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
5939 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5940 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
5941 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
5942 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
5943 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5944 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
5945 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
5946 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
5947 case AMDGPU::S_UADDO_PSEUDO:
5948 case AMDGPU::S_USUBO_PSEUDO: {
5949 const DebugLoc &DL = MI.getDebugLoc();
5950 MachineOperand &Dest0 = MI.getOperand(0);
5951 MachineOperand &Dest1 = MI.getOperand(1);
5952 MachineOperand &Src0 = MI.getOperand(2);
5953 MachineOperand &Src1 = MI.getOperand(3);
5954
5955 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5956 ? AMDGPU::S_ADD_I32
5957 : AMDGPU::S_SUB_I32;
5958 // clang-format off
5959 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
5960 .add(Src0)
5961 .add(Src1);
5962 // clang-format on
5963
5964 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
5965 .addImm(1)
5966 .addImm(0);
5967
5968 MI.eraseFromParent();
5969 return BB;
5970 }
5971 case AMDGPU::S_ADD_U64_PSEUDO:
5972 case AMDGPU::S_SUB_U64_PSEUDO: {
5973 return Expand64BitScalarArithmetic(MI, BB);
5974 }
5975 case AMDGPU::V_ADD_U64_PSEUDO:
5976 case AMDGPU::V_SUB_U64_PSEUDO: {
5978 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5979 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5980 const DebugLoc &DL = MI.getDebugLoc();
5981
5982 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5983
5984 MachineOperand &Dest = MI.getOperand(0);
5985 MachineOperand &Src0 = MI.getOperand(1);
5986 MachineOperand &Src1 = MI.getOperand(2);
5987
5988 if (ST.hasAddSubU64Insts()) {
5989 auto I = BuildMI(*BB, MI, DL,
5990 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
5991 : AMDGPU::V_SUB_U64_e64),
5992 Dest.getReg())
5993 .add(Src0)
5994 .add(Src1)
5995 .addImm(0); // clamp
5996 TII->legalizeOperands(*I);
5997 MI.eraseFromParent();
5998 return BB;
5999 }
6000
6001 if (IsAdd && ST.hasLshlAddU64Inst()) {
6002 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
6003 Dest.getReg())
6004 .add(Src0)
6005 .addImm(0)
6006 .add(Src1);
6007 TII->legalizeOperands(*Add);
6008 MI.eraseFromParent();
6009 return BB;
6010 }
6011
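// Otherwise expand into a pair of 32-bit operations that thread the carry
// through a lane-mask register: V_ADD_CO_U32_e64 produces dst.lo and the
// carry-out, and V_ADDC_U32_e64 consumes it to produce dst.hi
// (V_SUB_CO_U32_e64 / V_SUBB_U32_e64 for the subtraction case).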
6012 const auto *CarryRC = TRI->getWaveMaskRegClass();
6013
6014 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6015 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6016
6017 Register CarryReg = MRI.createVirtualRegister(CarryRC);
6018 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6019
6020 const TargetRegisterClass *Src0RC = Src0.isReg()
6021 ? MRI.getRegClass(Src0.getReg())
6022 : &AMDGPU::VReg_64RegClass;
6023 const TargetRegisterClass *Src1RC = Src1.isReg()
6024 ? MRI.getRegClass(Src1.getReg())
6025 : &AMDGPU::VReg_64RegClass;
6026
6027 const TargetRegisterClass *Src0SubRC =
6028 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6029 const TargetRegisterClass *Src1SubRC =
6030 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6031
6032 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
6033 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6034 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
6035 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6036
6037 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
6038 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6039 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
6040 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6041
6042 unsigned LoOpc =
6043 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6044 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
6045 .addReg(CarryReg, RegState::Define)
6046 .add(SrcReg0Sub0)
6047 .add(SrcReg1Sub0)
6048 .addImm(0); // clamp bit
6049
6050 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6051 MachineInstr *HiHalf =
6052 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
6053 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
6054 .add(SrcReg0Sub1)
6055 .add(SrcReg1Sub1)
6056 .addReg(CarryReg, RegState::Kill)
6057 .addImm(0); // clamp bit
6058
6059 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
6060 .addReg(DestSub0)
6061 .addImm(AMDGPU::sub0)
6062 .addReg(DestSub1)
6063 .addImm(AMDGPU::sub1);
6064 TII->legalizeOperands(*LoHalf);
6065 TII->legalizeOperands(*HiHalf);
6066 MI.eraseFromParent();
6067 return BB;
6068 }
6069 case AMDGPU::S_ADD_CO_PSEUDO:
6070 case AMDGPU::S_SUB_CO_PSEUDO: {
6071 // This pseudo can only be selected
6072 // from a uniform add/subcarry node. All the VGPR operands are
6073 // therefore assumed to hold uniform (splat) values.
6075 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
6076 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6078 const DebugLoc &DL = MI.getDebugLoc();
6079 MachineOperand &Dest = MI.getOperand(0);
6080 MachineOperand &CarryDest = MI.getOperand(1);
6081 MachineOperand &Src0 = MI.getOperand(2);
6082 MachineOperand &Src1 = MI.getOperand(3);
6083 MachineOperand &Src2 = MI.getOperand(4);
6084 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
6085 ? AMDGPU::S_ADDC_U32
6086 : AMDGPU::S_SUBB_U32;
6087 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
6088 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6089 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6090 .addReg(Src0.getReg());
6091 Src0.setReg(RegOp0);
6092 }
6093 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
6094 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6095 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6096 .addReg(Src1.getReg());
6097 Src1.setReg(RegOp1);
6098 }
6099 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6100 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
6101 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6102 .addReg(Src2.getReg());
6103 Src2.setReg(RegOp2);
6104 }
6105
6106 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
6107 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
6108 assert(WaveSize == 64 || WaveSize == 32);
6109
6110 if (WaveSize == 64) {
6111 if (ST.hasScalarCompareEq64()) {
6112 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
6113 .addReg(Src2.getReg())
6114 .addImm(0);
6115 } else {
6116 const TargetRegisterClass *SubRC =
6117 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6118 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
6119 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6120 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
6121 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6122 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6123
6124 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
6125 .add(Src2Sub0)
6126 .add(Src2Sub1);
6127
6128 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6129 .addReg(Src2_32, RegState::Kill)
6130 .addImm(0);
6131 }
6132 } else {
6133 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6134 .addReg(Src2.getReg())
6135 .addImm(0);
6136 }
6137
6138 // clang-format off
6139 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg())
6140 .add(Src0)
6141 .add(Src1);
6142 // clang-format on
6143
6144 unsigned SelOpc =
6145 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6146
6147 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
6148 .addImm(-1)
6149 .addImm(0);
6150
6151 MI.eraseFromParent();
6152 return BB;
6153 }
6154 case AMDGPU::SI_INIT_M0: {
6155 MachineOperand &M0Init = MI.getOperand(0);
6156 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6157 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6158 AMDGPU::M0)
6159 .add(M0Init);
6160 MI.eraseFromParent();
6161 return BB;
6162 }
6163 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6164 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6165 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6166 TII->get(AMDGPU::S_CMP_EQ_U32))
6167 .addImm(0)
6168 .addImm(0);
6169 return BB;
6170 }
6171 case AMDGPU::GET_GROUPSTATICSIZE: {
6172 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
6173 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
6174 DebugLoc DL = MI.getDebugLoc();
6175 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
6176 .add(MI.getOperand(0))
6177 .addImm(MFI->getLDSSize());
6178 MI.eraseFromParent();
6179 return BB;
6180 }
6181 case AMDGPU::GET_SHADERCYCLESHILO: {
6184 const DebugLoc &DL = MI.getDebugLoc();
6185 // The algorithm is:
6186 //
6187 // hi1 = getreg(SHADER_CYCLES_HI)
6188 // lo1 = getreg(SHADER_CYCLES_LO)
6189 // hi2 = getreg(SHADER_CYCLES_HI)
6190 //
6191 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
6192 // Otherwise there was overflow and the result is hi2:0. In both cases the
6193 // result should represent the actual time at some point during the sequence
6194 // of three getregs.
6195 using namespace AMDGPU::Hwreg;
6196 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6197 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
6198 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6199 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6200 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
6201 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6202 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6203 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
6204 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6205 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
6206 .addReg(RegHi1)
6207 .addReg(RegHi2);
6208 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6209 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
6210 .addReg(RegLo1)
6211 .addImm(0);
6212 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
6213 .add(MI.getOperand(0))
6214 .addReg(RegLo)
6215 .addImm(AMDGPU::sub0)
6216 .addReg(RegHi2)
6217 .addImm(AMDGPU::sub1);
6218 MI.eraseFromParent();
6219 return BB;
6220 }
6221 case AMDGPU::SI_INDIRECT_SRC_V1:
6222 case AMDGPU::SI_INDIRECT_SRC_V2:
6223 case AMDGPU::SI_INDIRECT_SRC_V4:
6224 case AMDGPU::SI_INDIRECT_SRC_V8:
6225 case AMDGPU::SI_INDIRECT_SRC_V9:
6226 case AMDGPU::SI_INDIRECT_SRC_V10:
6227 case AMDGPU::SI_INDIRECT_SRC_V11:
6228 case AMDGPU::SI_INDIRECT_SRC_V12:
6229 case AMDGPU::SI_INDIRECT_SRC_V16:
6230 case AMDGPU::SI_INDIRECT_SRC_V32:
6231 return emitIndirectSrc(MI, *BB, *getSubtarget());
6232 case AMDGPU::SI_INDIRECT_DST_V1:
6233 case AMDGPU::SI_INDIRECT_DST_V2:
6234 case AMDGPU::SI_INDIRECT_DST_V4:
6235 case AMDGPU::SI_INDIRECT_DST_V8:
6236 case AMDGPU::SI_INDIRECT_DST_V9:
6237 case AMDGPU::SI_INDIRECT_DST_V10:
6238 case AMDGPU::SI_INDIRECT_DST_V11:
6239 case AMDGPU::SI_INDIRECT_DST_V12:
6240 case AMDGPU::SI_INDIRECT_DST_V16:
6241 case AMDGPU::SI_INDIRECT_DST_V32:
6242 return emitIndirectDst(MI, *BB, *getSubtarget());
6243 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6244 case AMDGPU::SI_KILL_I1_PSEUDO:
6245 return splitKillBlock(MI, BB);
6246 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6248 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
6249 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6250
6251 Register Dst = MI.getOperand(0).getReg();
6252 const MachineOperand &Src0 = MI.getOperand(1);
6253 const MachineOperand &Src1 = MI.getOperand(2);
6254 const DebugLoc &DL = MI.getDebugLoc();
6255 Register SrcCond = MI.getOperand(3).getReg();
6256
6257 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6258 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6259 const auto *CondRC = TRI->getWaveMaskRegClass();
6260 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
6261
6262 const TargetRegisterClass *Src0RC = Src0.isReg()
6263 ? MRI.getRegClass(Src0.getReg())
6264 : &AMDGPU::VReg_64RegClass;
6265 const TargetRegisterClass *Src1RC = Src1.isReg()
6266 ? MRI.getRegClass(Src1.getReg())
6267 : &AMDGPU::VReg_64RegClass;
6268
6269 const TargetRegisterClass *Src0SubRC =
6270 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6271 const TargetRegisterClass *Src1SubRC =
6272 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6273
6274 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
6275 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6276 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
6277 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6278
6279 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
6280 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6281 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
6282 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6283
6284 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
6285 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
6286 .addImm(0)
6287 .add(Src0Sub0)
6288 .addImm(0)
6289 .add(Src1Sub0)
6290 .addReg(SrcCondCopy);
6291 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
6292 .addImm(0)
6293 .add(Src0Sub1)
6294 .addImm(0)
6295 .add(Src1Sub1)
6296 .addReg(SrcCondCopy);
6297
6298 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
6299 .addReg(DstLo)
6300 .addImm(AMDGPU::sub0)
6301 .addReg(DstHi)
6302 .addImm(AMDGPU::sub1);
6303 MI.eraseFromParent();
6304 return BB;
6305 }
6306 case AMDGPU::SI_BR_UNDEF: {
6308 const DebugLoc &DL = MI.getDebugLoc();
6309 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
6310 .add(MI.getOperand(0));
6311 Br->getOperand(1).setIsUndef(); // read undef SCC
6312 MI.eraseFromParent();
6313 return BB;
6314 }
6315 case AMDGPU::ADJCALLSTACKUP:
6316 case AMDGPU::ADJCALLSTACKDOWN: {
6318 MachineInstrBuilder MIB(*MF, &MI);
6319 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
6320 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
6321 return BB;
6322 }
6323 case AMDGPU::SI_CALL_ISEL: {
6325 const DebugLoc &DL = MI.getDebugLoc();
6326
6327 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
6328
6330 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6331
6332 for (const MachineOperand &MO : MI.operands())
6333 MIB.add(MO);
6334
6335 MIB.cloneMemRefs(MI);
6336 MI.eraseFromParent();
6337 return BB;
6338 }
6339 case AMDGPU::V_ADD_CO_U32_e32:
6340 case AMDGPU::V_SUB_CO_U32_e32:
6341 case AMDGPU::V_SUBREV_CO_U32_e32: {
6342 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
6343 const DebugLoc &DL = MI.getDebugLoc();
6344 unsigned Opc = MI.getOpcode();
6345
6346 bool NeedClampOperand = false;
6347 if (TII->pseudoToMCOpcode(Opc) == -1) {
6349 NeedClampOperand = true;
6350 }
6351
6352 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
6353 if (TII->isVOP3(*I)) {
6354 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
6355 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6356 I.addReg(TRI->getVCC(), RegState::Define);
6357 }
6358 I.add(MI.getOperand(1)).add(MI.getOperand(2));
6359 if (NeedClampOperand)
6360 I.addImm(0); // clamp bit for e64 encoding
6361
6362 TII->legalizeOperands(*I);
6363
6364 MI.eraseFromParent();
6365 return BB;
6366 }
6367 case AMDGPU::V_ADDC_U32_e32:
6368 case AMDGPU::V_SUBB_U32_e32:
6369 case AMDGPU::V_SUBBREV_U32_e32:
6370 // These instructions have an implicit use of vcc which counts towards the
6371 // constant bus limit.
6372 TII->legalizeOperands(MI);
6373 return BB;
6374 case AMDGPU::DS_GWS_INIT:
6375 case AMDGPU::DS_GWS_SEMA_BR:
6376 case AMDGPU::DS_GWS_BARRIER:
6377 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
6378 [[fallthrough]];
6379 case AMDGPU::DS_GWS_SEMA_V:
6380 case AMDGPU::DS_GWS_SEMA_P:
6381 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6382 // An s_waitcnt 0 is required to be the instruction immediately following.
6383 if (getSubtarget()->hasGWSAutoReplay()) {
6385 return BB;
6386 }
6387
6388 return emitGWSMemViolTestLoop(MI, BB);
6389 case AMDGPU::S_SETREG_B32: {
6390 // Try to optimize cases that only set the denormal mode or rounding mode.
6391 //
6392 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
6393 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
6394 // instead.
6395 //
6396 // FIXME: This could be predicated on the immediate, but tablegen doesn't
6397 // allow you to have a no-side-effect instruction in the output of a
6398 // side-effecting pattern.
6399 auto [ID, Offset, Width] =
6400 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
6402 return BB;
6403
6404 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
6405 const unsigned SetMask = WidthMask << Offset;
6406
6407 if (getSubtarget()->hasDenormModeInst()) {
6408 unsigned SetDenormOp = 0;
6409 unsigned SetRoundOp = 0;
6410
6411 // The dedicated instructions can only set the whole denorm or round mode
6412 // at once, not a subset of bits in either.
6413 if (SetMask ==
6415 // If this fully sets both the round and denorm mode, emit the two
6416 // dedicated instructions for these.
6417 SetRoundOp = AMDGPU::S_ROUND_MODE;
6418 SetDenormOp = AMDGPU::S_DENORM_MODE;
6419 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
6420 SetRoundOp = AMDGPU::S_ROUND_MODE;
6421 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
6422 SetDenormOp = AMDGPU::S_DENORM_MODE;
6423 }
6424
6425 if (SetRoundOp || SetDenormOp) {
6427 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
6428 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6429 unsigned ImmVal = Def->getOperand(1).getImm();
6430 if (SetRoundOp) {
6431 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
6432 .addImm(ImmVal & 0xf);
6433
6434 // If we also have the denorm mode, get just the denorm mode bits.
6435 ImmVal >>= 4;
6436 }
6437
6438 if (SetDenormOp) {
6439 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
6440 .addImm(ImmVal & 0xf);
6441 }
6442
6443 MI.eraseFromParent();
6444 return BB;
6445 }
6446 }
6447 }
6448
6449 // If only FP bits are touched, use the no-side-effects pseudo.
6450 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
6451 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
6452 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
6453
6454 return BB;
6455 }
6456 case AMDGPU::S_INVERSE_BALLOT_U32:
6457 case AMDGPU::S_INVERSE_BALLOT_U64:
6458 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
6459 // necessary. After that they are equivalent to a COPY.
6460 MI.setDesc(TII->get(AMDGPU::COPY));
6461 return BB;
6462 case AMDGPU::ENDPGM_TRAP: {
6463 const DebugLoc &DL = MI.getDebugLoc();
6464 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
6465 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
6466 MI.addOperand(MachineOperand::CreateImm(0));
6467 return BB;
6468 }
6469
6470 // We need a block split to make the real endpgm a terminator. We also don't
6471 // want to break phis in successor blocks, so we can't just delete to the
6472 // end of the block.
6473
6474 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
6476 MF->push_back(TrapBB);
6477 // clang-format off
6478 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
6479 .addImm(0);
6480 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
6481 .addMBB(TrapBB);
6482 // clang-format on
6483
6484 BB->addSuccessor(TrapBB);
6485 MI.eraseFromParent();
6486 return SplitBB;
6487 }
6488 case AMDGPU::SIMULATED_TRAP: {
6489 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6491 MachineBasicBlock *SplitBB =
6492 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6493 MI.eraseFromParent();
6494 return SplitBB;
6495 }
6496 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6497 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6499
6500 // During ISel, it's difficult to propagate the original EXEC mask to use as
6501 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
6502 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
6503 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6504 Register OriginalExec = Setup->getOperand(0).getReg();
6505 MF->getRegInfo().clearKillFlags(OriginalExec);
6506 MI.getOperand(0).setReg(OriginalExec);
6507 return BB;
6508 }
6509 default:
6510 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
6511 if (!MI.mayStore())
6513 return BB;
6514 }
6516 }
6517}
6518
6520 // This currently forces unfolding various combinations of fsub into fma with
6521 // free fneg'd operands. As long as we have fast FMA (controlled by
6522 // isFMAFasterThanFMulAndFAdd), we should perform these.
6523
6524 // When fma is quarter rate (as for f64, where add / sub are at best half rate),
6525 // most of these combines appear to be cycle neutral but save on instruction
6526 // count / code size.
6527 return true;
6528}
6529
6531
6533 EVT VT) const {
6534 if (!VT.isVector()) {
6535 return MVT::i1;
6536 }
6537 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
6538}
6539
6541 // TODO: Should i16 always be used if legal? For now it would force VALU
6542 // shifts.
6543 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6544}
6545
6547 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6548 ? Ty.changeElementSize(16)
6549 : Ty.changeElementSize(32);
6550}
6551
6552 // Answering this is somewhat tricky and depends on the specific device, since
6553 // devices have different rates for fma and for f64 operations in general.
6554//
6555// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
6556// regardless of which device (although the number of cycles differs between
6557// devices), so it is always profitable for f64.
6558//
6559// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
6560// only on full rate devices. Normally, we should prefer selecting v_mad_f32
6561// which we can always do even without fused FP ops since it returns the same
6562// result as the separate operations and since it is always full
6563// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
6564// however does not support denormals, so we do report fma as faster if we have
6565// a fast fma device and require denormals.
6566//
6568 EVT VT) const {
6569 VT = VT.getScalarType();
6570
6571 switch (VT.getSimpleVT().SimpleTy) {
6572 case MVT::f32: {
6573 // If mad is not available this depends only on if f32 fma is full rate.
6574 if (!Subtarget->hasMadMacF32Insts())
6575 return Subtarget->hasFastFMAF32();
6576
6577 // Otherwise f32 mad is always full rate and returns the same result as
6578 // the separate operations, so it should be preferred over fma.
6579 // However, it does not support denormals.
6581 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6582
6583 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
6584 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6585 }
6586 case MVT::f64:
6587 return true;
6588 case MVT::f16:
6589 case MVT::bf16:
6590 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
6591 default:
6592 break;
6593 }
6594
6595 return false;
6596}
6597
6599 LLT Ty) const {
6600 switch (Ty.getScalarSizeInBits()) {
6601 case 16:
6602 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
6603 case 32:
6604 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
6605 case 64:
6606 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
6607 default:
6608 break;
6609 }
6610
6611 return false;
6612}
6613
6615 if (!Ty.isScalar())
6616 return false;
6617
6618 if (Ty.getScalarSizeInBits() == 16)
6619 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
6620 if (Ty.getScalarSizeInBits() == 32)
6621 return Subtarget->hasMadMacF32Insts() &&
6622 denormalModeIsFlushAllF32(*MI.getMF());
6623
6624 return false;
6625}
6626
6628 const SDNode *N) const {
6629 // TODO: Check future ftz flag
6630 // v_mad_f32/v_mac_f32 do not support denormals.
6631 EVT VT = N->getValueType(0);
6632 if (VT == MVT::f32)
6633 return Subtarget->hasMadMacF32Insts() &&
6635 if (VT == MVT::f16) {
6636 return Subtarget->hasMadF16() &&
6638 }
6639
6640 return false;
6641}
6642
6643//===----------------------------------------------------------------------===//
6644// Custom DAG Lowering Operations
6645//===----------------------------------------------------------------------===//
6646
6647// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6648// wider vector type is legal.
6650 SelectionDAG &DAG) const {
6651 unsigned Opc = Op.getOpcode();
6652 EVT VT = Op.getValueType();
6653 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
6654 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
6655 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6656 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
6657
6658 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
6659
6660 SDLoc SL(Op);
6661 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
6662 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
6663
6664 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6665}
6666
6667// Enable lowering of ROTR for vxi32 types. This is a workaround for a
6668// regression whereby extra unnecessary instructions were added to codegen
 6669 // for rotr operations, caused by legalizing v2i32 or. This resulted in extra
6670// instructions to extract the result from the vector.
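// Roughly: a v2i32 rotr is unrolled into two scalar i32 rotr nodes and rebuilt
// as a vector, avoiding the extra extract instructions mentioned above.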
6672 [[maybe_unused]] EVT VT = Op.getValueType();
6673
6674 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6675 VT == MVT::v16i32) &&
6676 "Unexpected ValueType.");
6677
6678 return DAG.UnrollVectorOp(Op.getNode());
6679}
6680
6681// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6682// wider vector type is legal.
6684 SelectionDAG &DAG) const {
6685 unsigned Opc = Op.getOpcode();
6686 EVT VT = Op.getValueType();
6687 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6688 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6689 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6690 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6691 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6692 VT == MVT::v32bf16);
6693
6694 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
6695 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6696
6697 SDLoc SL(Op);
6698
6699 SDValue OpLo =
6700 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6701 SDValue OpHi =
6702 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6703
6704 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6705}
6706
6708 SelectionDAG &DAG) const {
6709 unsigned Opc = Op.getOpcode();
6710 EVT VT = Op.getValueType();
6711 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6712 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6713 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6714 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6715 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6716 VT == MVT::v32bf16);
6717
6718 SDValue Op0 = Op.getOperand(0);
6719 auto [Lo0, Hi0] = Op0.getValueType().isVector()
6720 ? DAG.SplitVectorOperand(Op.getNode(), 0)
6721 : std::pair(Op0, Op0);
6722
6723 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6724 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
6725
6726 SDLoc SL(Op);
6727 auto ResVT = DAG.GetSplitDestVTs(VT);
6728
6729 SDValue OpLo =
6730 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
6731 SDValue OpHi =
6732 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
6733
6734 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6735}
6736
6738 switch (Op.getOpcode()) {
6739 default:
6741 case ISD::BRCOND:
6742 return LowerBRCOND(Op, DAG);
6743 case ISD::RETURNADDR:
6744 return LowerRETURNADDR(Op, DAG);
6745 case ISD::LOAD: {
6746 SDValue Result = LowerLOAD(Op, DAG);
6747 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6748 "Load should return a value and a chain");
6749 return Result;
6750 }
6751 case ISD::FSQRT: {
6752 EVT VT = Op.getValueType();
6753 if (VT == MVT::f32)
6754 return lowerFSQRTF32(Op, DAG);
6755 if (VT == MVT::f64)
6756 return lowerFSQRTF64(Op, DAG);
6757 return SDValue();
6758 }
6759 case ISD::FSIN:
6760 case ISD::FCOS:
6761 return LowerTrig(Op, DAG);
6762 case ISD::SELECT:
6763 return LowerSELECT(Op, DAG);
6764 case ISD::FDIV:
6765 return LowerFDIV(Op, DAG);
6766 case ISD::FFREXP:
6767 return LowerFFREXP(Op, DAG);
6768 case ISD::ATOMIC_CMP_SWAP:
6769 return LowerATOMIC_CMP_SWAP(Op, DAG);
6770 case ISD::STORE:
6771 return LowerSTORE(Op, DAG);
6772 case ISD::GlobalAddress: {
6775 return LowerGlobalAddress(MFI, Op, DAG);
6776 }
6778 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6780 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6782 return LowerINTRINSIC_VOID(Op, DAG);
6783 case ISD::ADDRSPACECAST:
6784 return lowerADDRSPACECAST(Op, DAG);
6786 return lowerINSERT_SUBVECTOR(Op, DAG);
6788 return lowerINSERT_VECTOR_ELT(Op, DAG);
6790 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
6792 return lowerVECTOR_SHUFFLE(Op, DAG);
6794 return lowerSCALAR_TO_VECTOR(Op, DAG);
6795 case ISD::BUILD_VECTOR:
6796 return lowerBUILD_VECTOR(Op, DAG);
6797 case ISD::FP_ROUND:
6799 return lowerFP_ROUND(Op, DAG);
6800 case ISD::TRAP:
6801 return lowerTRAP(Op, DAG);
6802 case ISD::DEBUGTRAP:
6803 return lowerDEBUGTRAP(Op, DAG);
6804 case ISD::ABS:
6805 case ISD::FABS:
6806 case ISD::FNEG:
6807 case ISD::FCANONICALIZE:
6808 case ISD::BSWAP:
6809 return splitUnaryVectorOp(Op, DAG);
6810 case ISD::FMINNUM:
6811 case ISD::FMAXNUM:
6812 return lowerFMINNUM_FMAXNUM(Op, DAG);
6813 case ISD::FMINIMUMNUM:
6814 case ISD::FMAXIMUMNUM:
6815 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
6816 case ISD::FMINIMUM:
6817 case ISD::FMAXIMUM:
6818 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
6819 case ISD::FLDEXP:
6820 case ISD::STRICT_FLDEXP:
6821 return lowerFLDEXP(Op, DAG);
6822 case ISD::FMA:
6823 return splitTernaryVectorOp(Op, DAG);
6824 case ISD::FP_TO_SINT:
6825 case ISD::FP_TO_UINT:
6826 return LowerFP_TO_INT(Op, DAG);
6827 case ISD::SHL:
6828 case ISD::SRA:
6829 case ISD::SRL:
6830 case ISD::ADD:
6831 case ISD::SUB:
6832 case ISD::SMIN:
6833 case ISD::SMAX:
6834 case ISD::UMIN:
6835 case ISD::UMAX:
6836 case ISD::FADD:
6837 case ISD::FMUL:
6838 case ISD::FMINNUM_IEEE:
6839 case ISD::FMAXNUM_IEEE:
6840 case ISD::UADDSAT:
6841 case ISD::USUBSAT:
6842 case ISD::SADDSAT:
6843 case ISD::SSUBSAT:
6844 return splitBinaryVectorOp(Op, DAG);
6845 case ISD::FCOPYSIGN:
6846 return lowerFCOPYSIGN(Op, DAG);
6847 case ISD::MUL:
6848 return lowerMUL(Op, DAG);
6849 case ISD::SMULO:
6850 case ISD::UMULO:
6851 return lowerXMULO(Op, DAG);
6852 case ISD::SMUL_LOHI:
6853 case ISD::UMUL_LOHI:
6854 return lowerXMUL_LOHI(Op, DAG);
6855 case ISD::DYNAMIC_STACKALLOC:
6856 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6857 case ISD::STACKSAVE:
6858 return LowerSTACKSAVE(Op, DAG);
6859 case ISD::GET_ROUNDING:
6860 return lowerGET_ROUNDING(Op, DAG);
6861 case ISD::SET_ROUNDING:
6862 return lowerSET_ROUNDING(Op, DAG);
6863 case ISD::PREFETCH:
6864 return lowerPREFETCH(Op, DAG);
6865 case ISD::FP_EXTEND:
6867 return lowerFP_EXTEND(Op, DAG);
6868 case ISD::GET_FPENV:
6869 return lowerGET_FPENV(Op, DAG);
6870 case ISD::SET_FPENV:
6871 return lowerSET_FPENV(Op, DAG);
6872 case ISD::ROTR:
6873 return lowerROTR(Op, DAG);
6874 }
6875 return SDValue();
6876}
6877
6878// Used for D16: Casts the result of an instruction into the right vector,
6879// packs values if loads return unpacked values.
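// For example, with unpacked D16 a v4f16 load is returned as v4i32 with one
// 16-bit value per 32-bit element; each element is truncated to i16, rebuilt
// as v4i16, and bitcast back to v4f16.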
6881 const SDLoc &DL, SelectionDAG &DAG,
6882 bool Unpacked) {
6883 if (!LoadVT.isVector())
6884 return Result;
6885
6886 // Cast back to the original packed type or to a larger type that is a
 6887 // multiple of 32 bits for D16. Widening the return type is required for
6888 // legalization.
6889 EVT FittingLoadVT = LoadVT;
6890 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6891 FittingLoadVT =
6893 LoadVT.getVectorNumElements() + 1);
6894 }
6895
6896 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6897 // Truncate to v2i16/v4i16.
6898 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6899
 6900 // Work around the legalizer neither scalarizing the truncate after vector op
 6901 // legalization nor creating an intermediate vector trunc; do it by hand here.
6903 DAG.ExtractVectorElements(Result, Elts);
6904 for (SDValue &Elt : Elts)
6905 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6906
 6907 // Pad illegal v1i16/v3f16 to v4i16
6908 if ((LoadVT.getVectorNumElements() % 2) == 1)
6909 Elts.push_back(DAG.getPOISON(MVT::i16));
6910
6911 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6912
6913 // Bitcast to original type (v2f16/v4f16).
6914 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6915 }
6916
6917 // Cast back to the original packed type.
6918 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6919}
6920
6921SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6922 SelectionDAG &DAG,
6924 bool IsIntrinsic) const {
6925 SDLoc DL(M);
6926
6927 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6928 EVT LoadVT = M->getValueType(0);
6929
6930 EVT EquivLoadVT = LoadVT;
6931 if (LoadVT.isVector()) {
6932 if (Unpacked) {
6933 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6934 LoadVT.getVectorNumElements());
6935 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
6936 // Widen v3f16 to legal type
6937 EquivLoadVT =
6939 LoadVT.getVectorNumElements() + 1);
6940 }
6941 }
6942
6943 // Change from v4f16/v2f16 to EquivLoadVT.
6944 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
6945
6947 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
6948 M->getMemoryVT(), M->getMemOperand());
6949
6950 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
6951
6952 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
6953}
6954
6955SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
6956 SelectionDAG &DAG,
6957 ArrayRef<SDValue> Ops) const {
6958 SDLoc DL(M);
6959 EVT LoadVT = M->getValueType(0);
6960 EVT EltType = LoadVT.getScalarType();
6961 EVT IntVT = LoadVT.changeTypeToInteger();
6962
6963 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
6964
6965 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6966 bool IsTFE = M->getNumValues() == 3;
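  // With TFE the operation also returns a status value, which is why the node
  // has a third result in addition to the data and the chain.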
6967
6968 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
6970 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
6971 : AMDGPUISD::BUFFER_LOAD;
6972
6973 if (IsD16) {
6974 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
6975 }
6976
6977 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6978 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
6979 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6980 IsTFE);
6981
6982 if (isTypeLegal(LoadVT)) {
6983 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6984 M->getMemOperand(), DAG);
6985 }
6986
6987 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
6988 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6989 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
6990 M->getMemOperand(), DAG);
6991 return DAG.getMergeValues(
6992 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6993 DL);
6994}
6995
6997 SelectionDAG &DAG) {
6998 EVT VT = N->getValueType(0);
6999 unsigned CondCode = N->getConstantOperandVal(3);
7000 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
7001 return DAG.getPOISON(VT);
7002
7003 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
7004
7005 SDValue LHS = N->getOperand(1);
7006 SDValue RHS = N->getOperand(2);
7007
7008 SDLoc DL(N);
7009
7010 EVT CmpVT = LHS.getValueType();
7011 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
7012 unsigned PromoteOp =
7014 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
7015 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
7016 }
7017
7018 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
7019
7020 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7021 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7022
7023 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
7024 DAG.getCondCode(CCOpcode));
7025 if (VT.bitsEq(CCVT))
7026 return SetCC;
7027 return DAG.getZExtOrTrunc(SetCC, DL, VT);
7028}
7029
7031 SelectionDAG &DAG) {
7032 EVT VT = N->getValueType(0);
7033
7034 unsigned CondCode = N->getConstantOperandVal(3);
7035 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
7036 return DAG.getPOISON(VT);
7037
7038 SDValue Src0 = N->getOperand(1);
7039 SDValue Src1 = N->getOperand(2);
7040 EVT CmpVT = Src0.getValueType();
7041 SDLoc SL(N);
7042
7043 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
7044 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7045 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7046 }
7047
7048 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
7049 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
7050 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7051 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7052 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7053 DAG.getCondCode(CCOpcode));
7054 if (VT.bitsEq(CCVT))
7055 return SetCC;
7056 return DAG.getZExtOrTrunc(SetCC, SL, VT);
7057}
7058
7060 SelectionDAG &DAG) {
7061 EVT VT = N->getValueType(0);
7062 SDValue Src = N->getOperand(1);
7063 SDLoc SL(N);
7064
7065 if (Src.getOpcode() == ISD::SETCC) {
7066 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
7067 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
7068 Src.getOperand(1), Src.getOperand(2));
7069 }
7070 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
7071 // (ballot 0) -> 0
7072 if (Arg->isZero())
7073 return DAG.getConstant(0, SL, VT);
7074
7075 // (ballot 1) -> EXEC/EXEC_LO
7076 if (Arg->isOne()) {
7077 Register Exec;
7078 if (VT.getScalarSizeInBits() == 32)
7079 Exec = AMDGPU::EXEC_LO;
7080 else if (VT.getScalarSizeInBits() == 64)
7081 Exec = AMDGPU::EXEC;
7082 else
7083 return SDValue();
7084
7085 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
7086 }
7087 }
7088
7089 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
7090 // ISD::SETNE)
7091 return DAG.getNode(
7092 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
7093 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
7094}
7095
7097 SelectionDAG &DAG) {
7098 EVT VT = N->getValueType(0);
7099 unsigned ValSize = VT.getSizeInBits();
7100 unsigned IID = N->getConstantOperandVal(0);
7101 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7102 IID == Intrinsic::amdgcn_permlanex16;
7103 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7104 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7105 SDLoc SL(N);
7106 MVT IntVT = MVT::getIntegerVT(ValSize);
7107 const GCNSubtarget *ST = TLI.getSubtarget();
7108 unsigned SplitSize = 32;
7109 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7110 ST->hasDPALU_DPP() &&
7111 AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
7112 SplitSize = 64;
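  // The strategy below: sources narrower than 32 bits are any-extended to i32
  // and the result truncated back; wider types whose size is a multiple of
  // SplitSize are bitcast to a vector of SplitSize-bit pieces and the lane op
  // is unrolled per piece before being bitcast back to the original type.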
7113
7114 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
7115 SDValue Src2, MVT ValT) -> SDValue {
7117 switch (IID) {
7118 case Intrinsic::amdgcn_permlane16:
7119 case Intrinsic::amdgcn_permlanex16:
7120 case Intrinsic::amdgcn_update_dpp:
7121 Operands.push_back(N->getOperand(6));
7122 Operands.push_back(N->getOperand(5));
7123 Operands.push_back(N->getOperand(4));
7124 [[fallthrough]];
7125 case Intrinsic::amdgcn_writelane:
7126 Operands.push_back(Src2);
7127 [[fallthrough]];
7128 case Intrinsic::amdgcn_readlane:
7129 case Intrinsic::amdgcn_set_inactive:
7130 case Intrinsic::amdgcn_set_inactive_chain_arg:
7131 case Intrinsic::amdgcn_mov_dpp8:
7132 Operands.push_back(Src1);
7133 [[fallthrough]];
7134 case Intrinsic::amdgcn_readfirstlane:
7135 case Intrinsic::amdgcn_permlane64:
7136 Operands.push_back(Src0);
7137 break;
7138 default:
7139 llvm_unreachable("unhandled lane op");
7140 }
7141
7142 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
7143 std::reverse(Operands.begin(), Operands.end());
7144
7145 if (SDNode *GL = N->getGluedNode()) {
7146 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7147 GL = GL->getOperand(0).getNode();
7148 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7149 SDValue(GL, 0)));
7150 }
7151
7152 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
7153 };
7154
7155 SDValue Src0 = N->getOperand(1);
7156 SDValue Src1, Src2;
7157 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7158 IID == Intrinsic::amdgcn_mov_dpp8 ||
7159 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7160 Src1 = N->getOperand(2);
7161 if (IID == Intrinsic::amdgcn_writelane ||
7162 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7163 Src2 = N->getOperand(3);
7164 }
7165
7166 if (ValSize == SplitSize) {
7167 // Already legal
7168 return SDValue();
7169 }
7170
7171 if (ValSize < 32) {
7172 bool IsFloat = VT.isFloatingPoint();
7173 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
7174 SL, MVT::i32);
7175
7176 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7177 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
7178 SL, MVT::i32);
7179 }
7180
7181 if (IID == Intrinsic::amdgcn_writelane) {
7182 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
7183 SL, MVT::i32);
7184 }
7185
7186 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7187 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
7188 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
7189 }
7190
7191 if (ValSize % SplitSize != 0)
7192 return SDValue();
7193
7194 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
7195 EVT VT = N->getValueType(0);
7196 unsigned NE = VT.getVectorNumElements();
7197 EVT EltVT = VT.getVectorElementType();
7199 unsigned NumOperands = N->getNumOperands();
7200 SmallVector<SDValue, 4> Operands(NumOperands);
7201 SDNode *GL = N->getGluedNode();
7202
7203 // only handle convergencectrl_glue
7204 assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7205
7206 for (unsigned i = 0; i != NE; ++i) {
7207 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7208 ++j) {
7209 SDValue Operand = N->getOperand(j);
7210 EVT OperandVT = Operand.getValueType();
7211 if (OperandVT.isVector()) {
7212 // A vector operand; extract a single element.
7213 EVT OperandEltVT = OperandVT.getVectorElementType();
7214 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
7215 Operand, DAG.getVectorIdxConstant(i, SL));
7216 } else {
7217 // A scalar operand; just use it as is.
7218 Operands[j] = Operand;
7219 }
7220 }
7221
7222 if (GL)
7223 Operands[NumOperands - 1] =
7224 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7225 SDValue(GL->getOperand(0).getNode(), 0));
7226
7227 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
7228 }
7229
7230 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
7231 return DAG.getBuildVector(VecVT, SL, Scalars);
7232 };
7233
7234 if (VT.isVector()) {
7235 switch (MVT::SimpleValueType EltTy =
7237 case MVT::i32:
7238 case MVT::f32:
7239 if (SplitSize == 32) {
7240 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
7241 return unrollLaneOp(LaneOp.getNode());
7242 }
7243 [[fallthrough]];
7244 case MVT::i16:
7245 case MVT::f16:
7246 case MVT::bf16: {
7247 unsigned SubVecNumElt =
7248 SplitSize / VT.getVectorElementType().getSizeInBits();
7249 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
7251 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7252 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7253 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
7254 DAG.getConstant(EltIdx, SL, MVT::i32));
7255
7256 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7257 IsPermLane16)
7258 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
7259 DAG.getConstant(EltIdx, SL, MVT::i32));
7260
7261 if (IID == Intrinsic::amdgcn_writelane)
7262 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
7263 DAG.getConstant(EltIdx, SL, MVT::i32));
7264
7265 Pieces.push_back(
7266 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7267 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7268 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7269 EltIdx += SubVecNumElt;
7270 }
7271 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
7272 }
7273 default:
7274 // Handle all other cases by bitcasting to i32 vectors
7275 break;
7276 }
7277 }
7278
7279 MVT VecVT =
7280 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
7281 Src0 = DAG.getBitcast(VecVT, Src0);
7282
7283 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7284 Src1 = DAG.getBitcast(VecVT, Src1);
7285
7286 if (IID == Intrinsic::amdgcn_writelane)
7287 Src2 = DAG.getBitcast(VecVT, Src2);
7288
7289 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7290 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
7291 return DAG.getBitcast(VT, UnrolledLaneOp);
7292}
7293
7296 SelectionDAG &DAG) const {
7297 switch (N->getOpcode()) {
7299 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
7300 Results.push_back(Res);
7301 return;
7302 }
7304 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
7305 Results.push_back(Res);
7306 return;
7307 }
7309 unsigned IID = N->getConstantOperandVal(0);
7310 switch (IID) {
7311 case Intrinsic::amdgcn_make_buffer_rsrc:
7312 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
7313 return;
7314 case Intrinsic::amdgcn_cvt_pkrtz: {
7315 SDValue Src0 = N->getOperand(1);
7316 SDValue Src1 = N->getOperand(2);
7317 SDLoc SL(N);
7318 SDValue Cvt =
7319 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
7320 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
7321 return;
7322 }
7323 case Intrinsic::amdgcn_cvt_pknorm_i16:
7324 case Intrinsic::amdgcn_cvt_pknorm_u16:
7325 case Intrinsic::amdgcn_cvt_pk_i16:
7326 case Intrinsic::amdgcn_cvt_pk_u16: {
7327 SDValue Src0 = N->getOperand(1);
7328 SDValue Src1 = N->getOperand(2);
7329 SDLoc SL(N);
7330 unsigned Opcode;
7331
7332 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7334 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7336 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7338 else
7340
7341 EVT VT = N->getValueType(0);
7342 if (isTypeLegal(VT))
7343 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
7344 else {
7345 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
7346 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
7347 }
7348 return;
7349 }
7350 case Intrinsic::amdgcn_s_buffer_load: {
7351 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
 7352 // s_buffer_load_u8 for signed and unsigned load instructions. Next, the DAG
7353 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
7354 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
7355 // s_buffer_load_i8.
7356 if (!Subtarget->hasScalarSubwordLoads())
7357 return;
7358 SDValue Op = SDValue(N, 0);
7359 SDValue Rsrc = Op.getOperand(1);
7360 SDValue Offset = Op.getOperand(2);
7361 SDValue CachePolicy = Op.getOperand(3);
7362 EVT VT = Op.getValueType();
7363 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
7364 SDLoc DL(Op);
7366 const DataLayout &DataLayout = DAG.getDataLayout();
7367 Align Alignment =
7373 VT.getStoreSize(), Alignment);
7374 SDValue LoadVal;
7375 if (!Offset->isDivergent()) {
7376 SDValue Ops[] = {Rsrc, // source register
7377 Offset, CachePolicy};
7378 SDValue BufferLoad =
7380 DAG.getVTList(MVT::i32), Ops, VT, MMO);
7381 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
7382 } else {
7383 SDValue Ops[] = {
7384 DAG.getEntryNode(), // Chain
7385 Rsrc, // rsrc
7386 DAG.getConstant(0, DL, MVT::i32), // vindex
7387 {}, // voffset
7388 {}, // soffset
7389 {}, // offset
7390 CachePolicy, // cachepolicy
7391 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7392 };
7393 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
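      // setBufferOffsets splits the divergent offset into the voffset /
      // soffset / immediate offset operands at Ops[3..5] above.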
7394 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
7395 }
7396 Results.push_back(LoadVal);
7397 return;
7398 }
7399 case Intrinsic::amdgcn_dead: {
7400 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
7401 Results.push_back(DAG.getPOISON(N->getValueType(I)));
7402 return;
7403 }
7404 }
7405 break;
7406 }
7408 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
7409 if (Res.getOpcode() == ISD::MERGE_VALUES) {
7410 // FIXME: Hacky
7411 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
7412 Results.push_back(Res.getOperand(I));
7413 }
7414 } else {
7415 Results.push_back(Res);
7416 Results.push_back(Res.getValue(1));
7417 }
7418 return;
7419 }
7420
7421 break;
7422 }
7423 case ISD::SELECT: {
7424 SDLoc SL(N);
7425 EVT VT = N->getValueType(0);
7426 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
7427 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
7428 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
7429
7430 EVT SelectVT = NewVT;
7431 if (NewVT.bitsLT(MVT::i32)) {
7432 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
7433 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
7434 SelectVT = MVT::i32;
7435 }
7436
7437 SDValue NewSelect =
7438 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
7439
7440 if (NewVT != SelectVT)
7441 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
7442 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
7443 return;
7444 }
7445 case ISD::FNEG: {
7446 if (N->getValueType(0) != MVT::v2f16)
7447 break;
7448
7449 SDLoc SL(N);
7450 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7451
7452 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
7453 DAG.getConstant(0x80008000, SL, MVT::i32));
7454 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7455 return;
7456 }
7457 case ISD::FABS: {
7458 if (N->getValueType(0) != MVT::v2f16)
7459 break;
7460
7461 SDLoc SL(N);
7462 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7463
7464 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
7465 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
7466 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7467 return;
7468 }
7469 case ISD::FSQRT: {
7470 if (N->getValueType(0) != MVT::f16)
7471 break;
7472 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
7473 break;
7474 }
7475 default:
7477 break;
7478 }
7479}
7480
7481/// Helper function for LowerBRCOND
7482static SDNode *findUser(SDValue Value, unsigned Opcode) {
7483
7484 for (SDUse &U : Value->uses()) {
7485 if (U.get() != Value)
7486 continue;
7487
7488 if (U.getUser()->getOpcode() == Opcode)
7489 return U.getUser();
7490 }
7491 return nullptr;
7492}
7493
7494unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7495 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
7496 switch (Intr->getConstantOperandVal(1)) {
7497 case Intrinsic::amdgcn_if:
7498 return AMDGPUISD::IF;
7499 case Intrinsic::amdgcn_else:
7500 return AMDGPUISD::ELSE;
7501 case Intrinsic::amdgcn_loop:
7502 return AMDGPUISD::LOOP;
7503 case Intrinsic::amdgcn_end_cf:
7504 llvm_unreachable("should not occur");
7505 default:
7506 return 0;
7507 }
7508 }
7509
7510 // break, if_break, else_break are all only used as inputs to loop, not
7511 // directly as branch conditions.
7512 return 0;
7513}
7514
7521
7523 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7524 return false;
7525
7526 // FIXME: Either avoid relying on address space here or change the default
7527 // address space for functions to avoid the explicit check.
7528 return (GV->getValueType()->isFunctionTy() ||
7531}
7532
7534 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
7535}
7536
7538 if (!GV->hasExternalLinkage())
7539 return true;
7540
7541 const auto OS = getTargetMachine().getTargetTriple().getOS();
7542 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
7543}
7544
7545/// This transforms the control flow intrinsics to get the branch destination as
 7546/// the last parameter; it also switches the branch target with BR if the need arises.
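/// For example, a brcond whose condition comes from llvm.amdgcn.if/else/loop
/// becomes the corresponding AMDGPUISD::IF/ELSE/LOOP node with the branch
/// target appended as the last operand.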
7547SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
7548 SDLoc DL(BRCOND);
7549
7550 SDNode *Intr = BRCOND.getOperand(1).getNode();
7551 SDValue Target = BRCOND.getOperand(2);
7552 SDNode *BR = nullptr;
7553 SDNode *SetCC = nullptr;
7554
7555 if (Intr->getOpcode() == ISD::SETCC) {
7556 // As long as we negate the condition everything is fine
7557 SetCC = Intr;
7558 Intr = SetCC->getOperand(0).getNode();
7559
7560 } else {
7561 // Get the target from BR if we don't negate the condition
7562 BR = findUser(BRCOND, ISD::BR);
7563 assert(BR && "brcond missing unconditional branch user");
7564 Target = BR->getOperand(1);
7565 }
7566
7567 unsigned CFNode = isCFIntrinsic(Intr);
7568 if (CFNode == 0) {
7569 // This is a uniform branch so we don't need to legalize.
7570 return BRCOND;
7571 }
7572
7573 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
7575
7576 assert(!SetCC ||
7577 (SetCC->getConstantOperandVal(1) == 1 &&
7578 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
7579 ISD::SETNE));
7580
7581 // operands of the new intrinsic call
7583 if (HaveChain)
7584 Ops.push_back(BRCOND.getOperand(0));
7585
7586 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
7587 Ops.push_back(Target);
7588
7589 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
7590
7591 // build the new intrinsic call
7592 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
7593
7594 if (!HaveChain) {
7595 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
7596
7598 }
7599
7600 if (BR) {
7601 // Give the branch instruction our target
7602 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
7603 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
7604 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
7605 }
7606
7607 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
7608
7609 // Copy the intrinsic results to registers
7610 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
7611 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
7612 if (!CopyToReg)
7613 continue;
7614
7615 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
7616 SDValue(Result, i - 1), SDValue());
7617
7618 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
7619 }
7620
7621 // Remove the old intrinsic from the chain
7622 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
7623 Intr->getOperand(0));
7624
7625 return Chain;
7626}
7627
7628SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
7629 MVT VT = Op.getSimpleValueType();
7630 SDLoc DL(Op);
 7631 // Check the depth: only depth 0 is supported; deeper frames return 0.
7632 if (Op.getConstantOperandVal(0) != 0)
7633 return DAG.getConstant(0, DL, VT);
7634
7635 MachineFunction &MF = DAG.getMachineFunction();
7636 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7637 // Check for kernel and shader functions
7638 if (Info->isEntryFunction())
7639 return DAG.getConstant(0, DL, VT);
7640
7641 MachineFrameInfo &MFI = MF.getFrameInfo();
7642 // There is a call to @llvm.returnaddress in this function
7643 MFI.setReturnAddressIsTaken(true);
7644
7645 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
7646 // Get the return address reg and mark it as an implicit live-in
7647 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
7648 getRegClassFor(VT, Op.getNode()->isDivergent()));
7649
7650 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
7651}
7652
7653SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
7654 const SDLoc &DL, EVT VT) const {
7655 return Op.getValueType().bitsLE(VT)
7656 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
7657 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
7658 DAG.getTargetConstant(0, DL, MVT::i32));
7659}
7660
7661SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
7662 SelectionDAG &DAG) const {
7663 EVT DstVT = Op.getValueType();
7664 unsigned NumElts = DstVT.getVectorNumElements();
7665 assert(NumElts > 2 && isPowerOf2_32(NumElts));
7666
7667 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
7668
7669 SDLoc DL(Op);
7670 unsigned Opc = Op.getOpcode();
7671 SDValue Flags = Op.getOperand(1);
7672 EVT HalfDstVT =
7673 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
7674 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
7675 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
7676
7677 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
7678}
7679
7680SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
7681 SDValue Src = Op.getOperand(0);
7682 EVT SrcVT = Src.getValueType();
7683 EVT DstVT = Op.getValueType();
7684
7685 if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
7686 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
7687 if (SrcVT.getScalarType() != MVT::f32)
7688 return SDValue();
7689 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
7690 }
7691
7692 if (SrcVT.getScalarType() != MVT::f64)
7693 return Op;
7694
7695 SDLoc DL(Op);
7696 if (DstVT == MVT::f16) {
7697 // TODO: Handle strictfp
7698 if (Op.getOpcode() != ISD::FP_ROUND)
7699 return Op;
7700
7701 if (!Subtarget->has16BitInsts()) {
7702 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
7703 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7704 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7705 }
7706 if (Op->getFlags().hasApproximateFuncs()) {
7707 SDValue Flags = Op.getOperand(1);
7708 SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
7709 return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
7710 }
7711 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
7712 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7713 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7714 }
7715
7716 assert(DstVT.getScalarType() == MVT::bf16 &&
7717 "custom lower FP_ROUND for f16 or bf16");
7718 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
7719
7720 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
7721 // hardware f32 -> bf16 instruction.
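  // Rounding f64 -> f32 -> bf16 in two ordinary steps could double-round;
  // round-to-odd on the intermediate f32 keeps the final bf16 result the same
  // as a single direct rounding.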
7722 EVT F32VT = SrcVT.isVector() ? SrcVT.changeVectorElementType(MVT::f32) :
7723 MVT::f32;
7724 SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
7725 return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
7726 DAG.getTargetConstant(0, DL, MVT::i32));
7727}
7728
7729SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
7730 SelectionDAG &DAG) const {
7731 EVT VT = Op.getValueType();
7732 const MachineFunction &MF = DAG.getMachineFunction();
7733 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7734 bool IsIEEEMode = Info->getMode().IEEE;
7735
7736 // FIXME: Assert during selection that this is only selected for
7737 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
7738 // mode functions, but this happens to be OK since it's only done in cases
7739 // where there is known no sNaN.
7740 if (IsIEEEMode)
7741 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
7742
7743 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7744 VT == MVT::v16bf16)
7745 return splitBinaryVectorOp(Op, DAG);
7746 return Op;
7747}
7748
7749SDValue
7750SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
7751 SelectionDAG &DAG) const {
7752 EVT VT = Op.getValueType();
7753 const MachineFunction &MF = DAG.getMachineFunction();
7754 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7755 bool IsIEEEMode = Info->getMode().IEEE;
7756
7757 if (IsIEEEMode)
7758 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
7759
7760 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7761 VT == MVT::v16bf16)
7762 return splitBinaryVectorOp(Op, DAG);
7763 return Op;
7764}
7765
7766SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
7767 SelectionDAG &DAG) const {
7768 EVT VT = Op.getValueType();
7769 if (VT.isVector())
7770 return splitBinaryVectorOp(Op, DAG);
7771
7772 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7773 !Subtarget->hasMinimum3Maximum3F16() &&
7774 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7775 "should not need to widen f16 minimum/maximum to v2f16");
7776
7777 // Widen f16 operation to v2f16
7778
7779 // fminimum f16:x, f16:y ->
7780 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
7781 // (v2f16 (scalar_to_vector y))), 0
7782 SDLoc SL(Op);
7783 SDValue WideSrc0 =
7784 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
7785 SDValue WideSrc1 =
7786 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
7787
7788 SDValue Widened =
7789 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7790
7791 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
7792 DAG.getConstant(0, SL, MVT::i32));
7793}
7794
7795SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
7796 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
7797 EVT VT = Op.getValueType();
7798 assert(VT == MVT::f16);
7799
7800 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
7801 EVT ExpVT = Exp.getValueType();
7802 if (ExpVT == MVT::i16)
7803 return Op;
7804
7805 SDLoc DL(Op);
7806
7807 // Correct the exponent type for f16 to i16.
7808 // Clamp the range of the exponent to the instruction's range.
7809
7810 // TODO: This should be a generic narrowing legalization, and can easily be
 7811 // TODO: This should be a generic narrowing legalization, and could easily be
 7812 // done the same way for GlobalISel.
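  // For example, fldexp(f16 %x, i32 %e) clamps %e to [-32768, 32767],
  // truncates it to i16, and emits fldexp with the i16 exponent.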
7813 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
7814 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
7815
7816 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
7817 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
7818
7819 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
7820
7821 if (IsStrict) {
7822 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
7823 {Op.getOperand(0), Op.getOperand(1), TruncExp});
7824 }
7825
7826 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
7827}
7828
7830 switch (Op->getOpcode()) {
7831 case ISD::SRA:
7832 case ISD::SMIN:
7833 case ISD::SMAX:
7834 return ISD::SIGN_EXTEND;
7835 case ISD::SRL:
7836 case ISD::UMIN:
7837 case ISD::UMAX:
7838 return ISD::ZERO_EXTEND;
7839 case ISD::ADD:
7840 case ISD::SUB:
7841 case ISD::AND:
7842 case ISD::OR:
7843 case ISD::XOR:
7844 case ISD::SHL:
7845 case ISD::SELECT:
7846 case ISD::MUL:
7847 // operation result won't be influenced by garbage high bits.
7848 // TODO: are all of those cases correct, and are there more?
7849 return ISD::ANY_EXTEND;
7850 case ISD::SETCC: {
7851 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7853 }
7854 default:
7855 llvm_unreachable("unexpected opcode!");
7856 }
7857}
7858
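// Promote a uniform operation on a narrow type to i32. For example, a uniform
// (i16 add x, y) becomes (trunc (add (any_extend x), (any_extend y))), with
// the extension kind chosen by getExtOpcodeForPromotedOp above.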
7859SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
7860 DAGCombinerInfo &DCI) const {
7861 const unsigned Opc = Op.getOpcode();
7862 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
7863 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
7864 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
7865 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
7866 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
7867
7868 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
7869 : Op->getOperand(0).getValueType();
7870 auto ExtTy = OpTy.changeElementType(MVT::i32);
7871
7872 if (DCI.isBeforeLegalizeOps() ||
7873 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
7874 return SDValue();
7875
7876 auto &DAG = DCI.DAG;
7877
7878 SDLoc DL(Op);
7879 SDValue LHS;
7880 SDValue RHS;
7881 if (Opc == ISD::SELECT) {
7882 LHS = Op->getOperand(1);
7883 RHS = Op->getOperand(2);
7884 } else {
7885 LHS = Op->getOperand(0);
7886 RHS = Op->getOperand(1);
7887 }
7888
7889 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
7890 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
7891
7892 // Special case: for shifts, the RHS always needs a zext.
7893 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
7894 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
7895 else
7896 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
7897
 7898 // setcc always returns i1 or an i1 vector, so there is no need to truncate after.
7899 if (Opc == ISD::SETCC) {
7900 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7901 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
7902 }
7903
7904 // For other ops, we extend the operation's return type as well so we need to
7905 // truncate back to the original type.
7906 SDValue NewVal;
7907 if (Opc == ISD::SELECT)
7908 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
7909 else
7910 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
7911
7912 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
7913}
7914
7915SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
7916 SDValue Mag = Op.getOperand(0);
7917 EVT MagVT = Mag.getValueType();
7918
7919 if (MagVT.getVectorNumElements() > 2)
7920 return splitBinaryVectorOp(Op, DAG);
7921
7922 SDValue Sign = Op.getOperand(1);
7923 EVT SignVT = Sign.getValueType();
7924
7925 if (MagVT == SignVT)
7926 return Op;
7927
7928 // fcopysign v2f16:mag, v2f32:sign ->
7929 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
7930
7931 SDLoc SL(Op);
7932 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
7933 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
7934
7935 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
7936
7937 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
7938}
7939
7940// Custom lowering for vector multiplications and s_mul_u64.
7941SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
7942 EVT VT = Op.getValueType();
7943
7944 // Split vector operands.
7945 if (VT.isVector())
7946 return splitBinaryVectorOp(Op, DAG);
7947
 7948 assert(VT == MVT::i64 && "The following code is specialized for s_mul_u64");
7949
7950 // There are four ways to lower s_mul_u64:
7951 //
7952 // 1. If all the operands are uniform, then we lower it as it is.
7953 //
7954 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
 7955 // multiplications because there is no vector equivalent of s_mul_u64.
7956 //
7957 // 3. If the cost model decides that it is more efficient to use vector
7958 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
 7959 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp.
7960 //
7961 // 4. If the cost model decides to use vector registers and both of the
7962 // operands are zero-extended/sign-extended from 32-bits, then we split the
 7963 // s_mul_u64 into two 32-bit multiplications. The problem is that it is not
7964 // possible to check if the operands are zero-extended or sign-extended in
7965 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
7966 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
7967 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
7968 // If the cost model decides that we have to use vector registers, then
 7969 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32/
7970 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
7971 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
7972 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
 7973 // SIInstrInfo.cpp.
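  // For example (case 4): if both operands are known zero-extended from 32
  // bits, the code below emits S_MUL_U64_U32_PSEUDO; if both have at least 33
  // sign bits it emits S_MUL_I64_I32_PSEUDO; otherwise the uniform s_mul_u64
  // is kept as is.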
7974
7975 if (Op->isDivergent())
7976 return SDValue();
7977
7978 SDValue Op0 = Op.getOperand(0);
7979 SDValue Op1 = Op.getOperand(1);
 7980 // If all the operands are zero-extended to 32 bits, then we replace s_mul_u64
7981 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
7982 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
7983 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
7984 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
7985 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
7986 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
7987 SDLoc SL(Op);
7988 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7989 return SDValue(
7990 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
7991 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
7992 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
7993 if (Op0SignBits >= 33 && Op1SignBits >= 33)
7994 return SDValue(
7995 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
7996 // If all the operands are uniform, then we lower s_mul_u64 as it is.
7997 return Op;
7998}
7999
8000SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
8001 EVT VT = Op.getValueType();
8002 SDLoc SL(Op);
8003 SDValue LHS = Op.getOperand(0);
8004 SDValue RHS = Op.getOperand(1);
8005 bool isSigned = Op.getOpcode() == ISD::SMULO;
8006
8007 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
8008 const APInt &C = RHSC->getAPIntValue();
8009 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
8010 if (C.isPowerOf2()) {
8011 // smulo(x, signed_min) is same as umulo(x, signed_min).
8012 bool UseArithShift = isSigned && !C.isMinSignedValue();
8013 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
8014 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
8015 SDValue Overflow =
8016 DAG.getSetCC(SL, MVT::i1,
8017 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
8018 Result, ShiftAmt),
8019 LHS, ISD::SETNE);
8020 return DAG.getMergeValues({Result, Overflow}, SL);
8021 }
8022 }
8023
8024 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
8025 SDValue Top =
8026 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
8027
8028 SDValue Sign = isSigned
8029 ? DAG.getNode(ISD::SRA, SL, VT, Result,
8030 DAG.getConstant(VT.getScalarSizeInBits() - 1,
8031 SL, MVT::i32))
8032 : DAG.getConstant(0, SL, VT);
8033 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
8034
8035 return DAG.getMergeValues({Result, Overflow}, SL);
8036}
8037
8038SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
8039 if (Op->isDivergent()) {
8040 // Select to V_MAD_[IU]64_[IU]32.
8041 return Op;
8042 }
8043 if (Subtarget->hasSMulHi()) {
8044 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
8045 return SDValue();
8046 }
8047 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
8048 // calculate the high part, so we might as well do the whole thing with
8049 // V_MAD_[IU]64_[IU]32.
8050 return Op;
8051}
8052
8053SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
8054 if (!Subtarget->isTrapHandlerEnabled() ||
8055 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
8056 return lowerTrapEndpgm(Op, DAG);
8057
8058 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
8059 : lowerTrapHsaQueuePtr(Op, DAG);
8060}
8061
8062SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
8063 SDLoc SL(Op);
8064 SDValue Chain = Op.getOperand(0);
8065 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8066}
8067
8068SDValue
8069SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
8070 const SDLoc &DL, Align Alignment,
8071 ImplicitParameter Param) const {
8072 MachineFunction &MF = DAG.getMachineFunction();
8073 uint64_t Offset = getImplicitParameterOffset(MF, Param);
8074 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
8075 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
8076 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
8079}
8080
8081SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
8082 SelectionDAG &DAG) const {
8083 SDLoc SL(Op);
8084 SDValue Chain = Op.getOperand(0);
8085
8086 SDValue QueuePtr;
8087 // For code object version 5, QueuePtr is passed through implicit kernarg.
8088 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8090 QueuePtr =
8091 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
8092 } else {
8093 MachineFunction &MF = DAG.getMachineFunction();
8094 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8095 Register UserSGPR = Info->getQueuePtrUserSGPR();
8096
8097 if (UserSGPR == AMDGPU::NoRegister) {
8098 // We probably are in a function incorrectly marked with
8099 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
8100 // trap, so just use a null pointer.
8101 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
8102 } else {
8103 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
8104 MVT::i64);
8105 }
8106 }
8107
8108 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
8109 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
8110
8111 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8112 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
8113 ToReg.getValue(1)};
8114 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8115}
8116
8117SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
8118 SDLoc SL(Op);
8119 SDValue Chain = Op.getOperand(0);
8120
8121 // We need to simulate the 's_trap 2' instruction on targets that run in
8122 // PRIV=1 (where it is treated as a nop).
8123 if (Subtarget->hasPrivEnabledTrap2NopBug())
8124 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
8125
8126 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8127 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8128 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8129}
8130
8131SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
8132 SDLoc SL(Op);
8133 SDValue Chain = Op.getOperand(0);
8134 MachineFunction &MF = DAG.getMachineFunction();
8135
8136 if (!Subtarget->isTrapHandlerEnabled() ||
8137 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
8138 LLVMContext &Ctx = MF.getFunction().getContext();
8139 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
8140 "debugtrap handler not supported",
8141 Op.getDebugLoc(), DS_Warning));
8142 return Chain;
8143 }
8144
8145 uint64_t TrapID =
8146 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
8147 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8148 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8149}
8150
8151SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
8152 SelectionDAG &DAG) const {
8153 if (Subtarget->hasApertureRegs()) {
8154 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
8155 ? AMDGPU::SRC_SHARED_BASE
8156 : AMDGPU::SRC_PRIVATE_BASE;
8157 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8158 !Subtarget->hasGloballyAddressableScratch()) &&
8159 "Cannot use src_private_base with globally addressable scratch!");
8160 // Note: this feature (register) is broken. When used as a 32-bit operand,
8161 // it returns a wrong value (all zeroes?). The real value is in the upper 32
8162 // bits.
8163 //
8164 // To work around the issue, emit a 64 bit copy from this register
8165 // then extract the high bits. Note that this shouldn't even result in a
8166 // shift being emitted and simply become a pair of registers (e.g.):
8167 // s_mov_b64 s[6:7], src_shared_base
8168 // v_mov_b32_e32 v1, s7
8169 SDValue Copy =
8170 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ApertureRegNo, MVT::v2i32);
8171 return DAG.getExtractVectorElt(DL, MVT::i32, Copy, 1);
8172 }
8173
8174 // For code object version 5, private_base and shared_base are passed through
8175 // implicit kernargs.
8176 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8180 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
8181 }
8182
8183 MachineFunction &MF = DAG.getMachineFunction();
8184 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8185 Register UserSGPR = Info->getQueuePtrUserSGPR();
8186 if (UserSGPR == AMDGPU::NoRegister) {
8187 // We probably are in a function incorrectly marked with
8188 // amdgpu-no-queue-ptr. This is undefined.
8189 return DAG.getPOISON(MVT::i32);
8190 }
8191
8192 SDValue QueuePtr =
8193 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
8194
8195 // Offset into amd_queue_t for group_segment_aperture_base_hi /
8196 // private_segment_aperture_base_hi.
8197 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
8198
8199 SDValue Ptr =
8200 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
8201
8202 // TODO: Use custom target PseudoSourceValue.
8203 // TODO: We should use the value from the IR intrinsic call, but it might not
 8204 // be available, and it is not obvious how to get it.
8205 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
8206 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
8207 commonAlignment(Align(64), StructOffset),
8210}
8211
8212/// Return true if the value is a known valid address, such that a null check is
8213/// not necessary.
8215 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
8217 return true;
8218
8219 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
8220 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
8221
8222 // TODO: Search through arithmetic, handle arguments and loads
8223 // marked nonnull.
8224 return false;
8225}
8226
8227SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
8228 SelectionDAG &DAG) const {
8229 SDLoc SL(Op);
8230
8231 const AMDGPUTargetMachine &TM =
8232 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
8233
8234 unsigned DestAS, SrcAS;
8235 SDValue Src;
8236 bool IsNonNull = false;
8237 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
8238 SrcAS = ASC->getSrcAddressSpace();
8239 Src = ASC->getOperand(0);
8240 DestAS = ASC->getDestAddressSpace();
8241 } else {
8242 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
8243 Op.getConstantOperandVal(0) ==
8244 Intrinsic::amdgcn_addrspacecast_nonnull);
8245 Src = Op->getOperand(1);
8246 SrcAS = Op->getConstantOperandVal(2);
8247 DestAS = Op->getConstantOperandVal(3);
8248 IsNonNull = true;
8249 }
8250
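  // In short: flat -> local/private truncates the pointer to 32 bits, with a
  // select against the segment null value unless the source is known non-null;
  // local/private -> flat rebuilds a 64-bit pointer from the 32-bit offset and
  // the aperture (or the flat scratch base) in the high half.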
8251 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
8252
8253 // flat -> local/private
8254 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
8255 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
8256 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
8257 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8258
8259 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
8260 Subtarget->hasGloballyAddressableScratch()) {
8261 // flat -> private with globally addressable scratch: subtract
8262 // src_flat_scratch_base_lo.
8263 SDValue FlatScratchBaseLo(
8264 DAG.getMachineNode(
8265 AMDGPU::S_MOV_B32, SL, MVT::i32,
8266 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8267 0);
8268 Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
8269 }
8270
8271 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8272 return Ptr;
8273
8274 unsigned NullVal = TM.getNullPointerValue(DestAS);
8275 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8276 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
8277
8278 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
8279 SegmentNullPtr);
8280 }
8281 }
8282
8283 // local/private -> flat
8284 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
8285 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
8286 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
8287 SDValue CvtPtr;
8288 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
8289 Subtarget->hasGloballyAddressableScratch()) {
8290 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
8291 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
8292 SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
8293 SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
8294 ThreadID = DAG.getNode(
8295 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8296 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
8297 AllOnes, ThreadID);
8298 if (Subtarget->isWave64())
8299 ThreadID = DAG.getNode(
8300 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8301 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
8302 AllOnes, ThreadID);
8303 SDValue ShAmt = DAG.getShiftAmountConstant(
8304 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8305 SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
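// Example (illustrative, not from the original source): on wave64,
// getWavefrontSizeLog2() == 6, so the shift amount is 57 - 32 - 6 = 19.
// Since SrcHi supplies bits [63:32] of the flat address, the thread ID lands
// at bit 19 + 32 = 51, matching the wave64 formula above; on wave32 the
// shift is 20 and the ID lands at bit 52.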
8306 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
8307 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8308 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
8309 // 64-bit hi:lo value.
8310 SDValue FlatScratchBase = {
8311 DAG.getMachineNode(
8312 AMDGPU::S_MOV_B64, SL, MVT::i64,
8313 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8314 0};
8315 CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8316 } else {
8317 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8318 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
8319 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8320 }
8321
8322 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8323 return CvtPtr;
8324
8325 unsigned NullVal = TM.getNullPointerValue(SrcAS);
8326 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8327
8328 SDValue NonNull =
8329 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
8330
8331 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
8332 FlatNullPtr);
8333 }
8334 }
8335
8336 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8337 Op.getValueType() == MVT::i64) {
8338 const SIMachineFunctionInfo *Info =
8339 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
8340 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
8341 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
8342 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
8343 }
8344
8345 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8346 Src.getValueType() == MVT::i64)
8347 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8348
8349 // global <-> flat are no-ops and never emitted.
8350
8351 // Invalid casts are poison.
8352 return DAG.getPOISON(Op->getValueType(0));
8353}
8354
8355// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
8356// the small vector and inserting them into the big vector. That is better than
8357// the default expansion of doing it via a stack slot. Even though the use of
8358// the stack slot would be optimized away afterwards, the stack slot itself
8359// remains.
8360SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
8361 SelectionDAG &DAG) const {
8362 SDValue Vec = Op.getOperand(0);
8363 SDValue Ins = Op.getOperand(1);
8364 SDValue Idx = Op.getOperand(2);
8365 EVT VecVT = Vec.getValueType();
8366 EVT InsVT = Ins.getValueType();
8367 EVT EltVT = VecVT.getVectorElementType();
8368 unsigned InsNumElts = InsVT.getVectorNumElements();
8369 unsigned IdxVal = Idx->getAsZExtVal();
8370 SDLoc SL(Op);
8371
8372 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
8373 // Insert 32-bit registers at a time.
8374 assert(InsNumElts % 2 == 0 && "expect legal vector types");
8375
8376 unsigned VecNumElts = VecVT.getVectorNumElements();
8377 EVT NewVecVT =
8378 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
8379 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8380 : EVT::getVectorVT(*DAG.getContext(),
8381 MVT::i32, InsNumElts / 2);
8382
8383 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
8384 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
8385
8386 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
8387 SDValue Elt;
8388 if (InsNumElts == 2) {
8389 Elt = Ins;
8390 } else {
8391 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
8392 DAG.getConstant(I, SL, MVT::i32));
8393 }
8394 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
8395 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
8396 }
8397
8398 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
8399 }
8400
8401 for (unsigned I = 0; I != InsNumElts; ++I) {
8402 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
8403 DAG.getConstant(I, SL, MVT::i32));
8404 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
8405 DAG.getConstant(IdxVal + I, SL, MVT::i32));
8406 }
8407 return Vec;
8408}
8409
8410SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
8411 SelectionDAG &DAG) const {
8412 SDValue Vec = Op.getOperand(0);
8413 SDValue InsVal = Op.getOperand(1);
8414 SDValue Idx = Op.getOperand(2);
8415 EVT VecVT = Vec.getValueType();
8416 EVT EltVT = VecVT.getVectorElementType();
8417 unsigned VecSize = VecVT.getSizeInBits();
8418 unsigned EltSize = EltVT.getSizeInBits();
8419 SDLoc SL(Op);
8420
8421 // Specially handle the case of v4i16 with static indexing.
8422 unsigned NumElts = VecVT.getVectorNumElements();
8423 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
8424 if (NumElts == 4 && EltSize == 16 && KIdx) {
8425 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
8426
8427 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8428 DAG.getConstant(0, SL, MVT::i32));
8429 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8430 DAG.getConstant(1, SL, MVT::i32));
8431
8432 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
8433 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
8434
8435 unsigned Idx = KIdx->getZExtValue();
8436 bool InsertLo = Idx < 2;
8437 SDValue InsHalf = DAG.getNode(
8438 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
8439 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
8440 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8441
8442 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
8443
8444 SDValue Concat =
8445 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
8446 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8447
8448 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
8449 }
8450
8451 // Static indexing does not lower to stack access, and hence there is no need
8452 // for special custom lowering to avoid stack access.
8453 if (isa<ConstantSDNode>(Idx))
8454 return SDValue();
8455
8456 // Avoid stack access for dynamic indexing by custom lowering to
8457 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
8458
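// Worked example (illustrative): inserting into a v4i16 at dynamic index 2
// scales the index to 2 << 4 = 32, so BFM = 0xFFFF << 32. The splatted value
// is ANDed with that mask, the original vector is ANDed with ~BFM, and the
// disjoint OR below places the new element in bits [47:32] without touching
// the other lanes or a stack slot.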
8459 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
8460
8461 MVT IntVT = MVT::getIntegerVT(VecSize);
8462
8463 // Convert vector index to bit-index and get the required bit mask.
8464 assert(isPowerOf2_32(EltSize));
8465 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
8466 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8467 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8468 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
8469 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
8470
8471 // 1. Create a congruent vector with the target value in each element.
8472 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
8473 DAG.getSplatBuildVector(VecVT, SL, InsVal));
8474
8475 // 2. Mask off all other indices except the required index within (1).
8476 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
8477
8478 // 3. Mask off the required index within the target vector.
8479 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8480 SDValue RHS =
8481 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
8482
8483 // 4. Get (2) and (3) ORed into the target vector.
8484 SDValue BFI =
8485 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
8486
8487 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
8488}
8489
8490SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
8491 SelectionDAG &DAG) const {
8492 SDLoc SL(Op);
8493
8494 EVT ResultVT = Op.getValueType();
8495 SDValue Vec = Op.getOperand(0);
8496 SDValue Idx = Op.getOperand(1);
8497 EVT VecVT = Vec.getValueType();
8498 unsigned VecSize = VecVT.getSizeInBits();
8499 EVT EltVT = VecVT.getVectorElementType();
8500
8501 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
8502
8503 // Make sure we do any optimizations that will make it easier to fold
8504 // source modifiers before obscuring it with bit operations.
8505
8506 // XXX - Why doesn't this get called when vector_shuffle is expanded?
8507 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
8508 return Combined;
8509
8510 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8511 SDValue Lo, Hi;
8512 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
8513
8514 if (VecSize == 128) {
8515 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
8516 Lo = DAG.getBitcast(LoVT,
8517 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8518 DAG.getConstant(0, SL, MVT::i32)));
8519 Hi = DAG.getBitcast(HiVT,
8520 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8521 DAG.getConstant(1, SL, MVT::i32)));
8522 } else if (VecSize == 256) {
8523 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
8524 SDValue Parts[4];
8525 for (unsigned P = 0; P < 4; ++P) {
8526 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8527 DAG.getConstant(P, SL, MVT::i32));
8528 }
8529
8530 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8531 Parts[0], Parts[1]));
8532 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8533 Parts[2], Parts[3]));
8534 } else {
8535 assert(VecSize == 512);
8536
8537 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
8538 SDValue Parts[8];
8539 for (unsigned P = 0; P < 8; ++P) {
8540 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8541 DAG.getConstant(P, SL, MVT::i32));
8542 }
8543
8544 Lo = DAG.getBitcast(LoVT,
8545 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8546 Parts[0], Parts[1], Parts[2], Parts[3]));
8547 Hi = DAG.getBitcast(HiVT,
8548 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8549 Parts[4], Parts[5], Parts[6], Parts[7]));
8550 }
8551
8552 EVT IdxVT = Idx.getValueType();
8553 unsigned NElem = VecVT.getVectorNumElements();
8554 assert(isPowerOf2_32(NElem));
8555 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
8556 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
8557 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
8558 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
8559 }
8560
8561 assert(VecSize <= 64);
8562
8563 MVT IntVT = MVT::getIntegerVT(VecSize);
8564
8565 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
8566 SDValue VecBC = peekThroughBitcasts(Vec);
8567 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8568 SDValue Src = VecBC.getOperand(0);
8569 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8570 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
8571 }
8572
8573 unsigned EltSize = EltVT.getSizeInBits();
8574 assert(isPowerOf2_32(EltSize));
8575
8576 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8577
8578 // Convert vector index to bit-index (* EltSize)
8579 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8580
8581 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8582 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
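// Example (illustrative): extracting element 3 of a v4i16 bitcast to i64
// gives ScaledIdx = 3 << 4 = 48, so the shift above leaves the requested
// element in the low 16 bits, and the truncate/bitcast below produces the
// final scalar.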
8583
8584 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8585 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
8586 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
8587 }
8588
8589 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
8590}
8591
8592static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
8593 assert(Elt % 2 == 0);
8594 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8595}
8596
8597static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
8598 assert(Elt % 2 == 0);
8599 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8600 !(Mask[Elt + 1] & 1);
8601}
8602
8603SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
8604 SelectionDAG &DAG) const {
8605 SDLoc SL(Op);
8606 EVT ResultVT = Op.getValueType();
8607 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
8608 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
8609 const int NewSrcNumElts = 2;
8610 MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
8611 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
8612
8613 // Break up the shuffle into registers sized pieces.
8614 //
8615 // We're trying to form sub-shuffles that the register allocation pipeline
8616 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
8617 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
8618 // pair of copies into a consecutive register copy, so use the ordinary
8619 // extract_vector_elt lowering unless we can use the shuffle.
8620 //
8621 // TODO: This is a bit of a hack, and we should probably always use
8622 // extract_subvector for the largest possible subvector we can (or at least
8623 // use it for PackVT aligned pieces). However, we have worse support for
8624 // combines on them and don't directly treat extract_subvector /
8625 // insert_subvector as legal. The DAG scheduler also ends up doing a worse
8626 // job with the extract_subvectors.
8627 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
8628
8629 // vector_shuffle <0,1,6,7> lhs, rhs
8630 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
8631 //
8632 // vector_shuffle <6,7,2,3> lhs, rhs
8633 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
8634 //
8635 // vector_shuffle <6,7,0,1> lhs, rhs
8636 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
8637
8638 // Avoid scalarizing when both halves are reading from consecutive elements.
8639
8640 // If we're treating 2 element shuffles as legal, also create odd-to-even
8641 // shuffles of neighboring pairs.
8642 //
8643 // vector_shuffle <3,2,7,6> lhs, rhs
8644 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
8645 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
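// Worked example (illustrative): for vector_shuffle <3,2,7,6> with 4-element
// sources, the first pair reads elements 3 and 2 of lhs, so we extract the
// even-aligned 2-element piece at offset 2 and shuffle it with mask <1,0>;
// the second pair does the same with rhs.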
8646
8647 SmallVector<SDValue, 16> Pieces;
8648 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
8649 if (ShouldUseConsecutiveExtract &&
8650 elementPairIsContiguous(SVN->getMask(), I)) {
8651 const int Idx = SVN->getMaskElt(I);
8652 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8653 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8654 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
8655 SVN->getOperand(VecIdx),
8656 DAG.getConstant(EltIdx, SL, MVT::i32));
8657 Pieces.push_back(SubVec);
8658 } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
8659 isOperationLegal(ISD::VECTOR_SHUFFLE, PackVT)) {
8660 int Idx0 = SVN->getMaskElt(I);
8661 int Idx1 = SVN->getMaskElt(I + 1);
8662
8663 SDValue SrcOp0 = SVN->getOperand(0);
8664 SDValue SrcOp1 = SrcOp0;
8665 if (Idx0 >= SrcNumElts) {
8666 SrcOp0 = SVN->getOperand(1);
8667 Idx0 -= SrcNumElts;
8668 }
8669
8670 if (Idx1 >= SrcNumElts) {
8671 SrcOp1 = SVN->getOperand(1);
8672 Idx1 -= SrcNumElts;
8673 }
8674
8675 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8676 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8677
8678 // Extract nearest even aligned piece.
8679 SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
8680 DAG.getConstant(AlignedIdx0, SL, MVT::i32));
8681 SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
8682 DAG.getConstant(AlignedIdx1, SL, MVT::i32));
8683
8684 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8685 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8686
8687 SDValue Result0 = SubVec0;
8688 SDValue Result1 = SubVec0;
8689
8690 if (SubVec0 != SubVec1) {
8691 NewMaskIdx1 += NewSrcNumElts;
8692 Result1 = SubVec1;
8693 } else {
8694 Result1 = DAG.getPOISON(PackVT);
8695 }
8696
8697 SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
8698 {NewMaskIdx0, NewMaskIdx1});
8699 Pieces.push_back(Shuf);
8700 } else {
8701 const int Idx0 = SVN->getMaskElt(I);
8702 const int Idx1 = SVN->getMaskElt(I + 1);
8703 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8704 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8705 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8706 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8707
8708 SDValue Vec0 = SVN->getOperand(VecIdx0);
8709 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
8710 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
8711
8712 SDValue Vec1 = SVN->getOperand(VecIdx1);
8713 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
8714 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
8715 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
8716 }
8717 }
8718
8719 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
8720}
8721
8722SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
8723 SelectionDAG &DAG) const {
8724 SDValue SVal = Op.getOperand(0);
8725 EVT ResultVT = Op.getValueType();
8726 EVT SValVT = SVal.getValueType();
8727 SDValue UndefVal = DAG.getPOISON(SValVT);
8728 SDLoc SL(Op);
8729
8731 VElts.push_back(SVal);
8732 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
8733 VElts.push_back(UndefVal);
8734
8735 return DAG.getBuildVector(ResultVT, SL, VElts);
8736}
8737
8738SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
8739 SelectionDAG &DAG) const {
8740 SDLoc SL(Op);
8741 EVT VT = Op.getValueType();
8742
8743 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8744 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
8745
8746 SDValue Lo = Op.getOperand(0);
8747 SDValue Hi = Op.getOperand(1);
8748
8749 // Avoid adding defined bits with the zero_extend.
8750 if (Hi.isUndef()) {
8751 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8752 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
8753 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
8754 }
8755
8756 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
8757 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
8758
8759 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
8760 DAG.getConstant(16, SL, MVT::i32));
8761 if (Lo.isUndef())
8762 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
8763
8764 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8765 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
8766
8767 SDValue Or =
8768 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
8769 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
8770 }
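// In short (illustrative): with both operands defined, the packed 2 x 16-bit
// build above becomes (zext(hi) << 16) | zext(lo) on an i32 followed by a
// bitcast; an undef half simply drops its zero-extend and the OR.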
8771
8772 // Split into 2-element chunks.
8773 const unsigned NumParts = VT.getVectorNumElements() / 2;
8774 EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
8775 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
8776
8777 SmallVector<SDValue, 8> Casts;
8778 for (unsigned P = 0; P < NumParts; ++P) {
8779 SDValue Vec = DAG.getBuildVector(
8780 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
8781 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
8782 }
8783
8784 SDValue Blend =
8785 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
8786 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
8787}
8788
8789bool SITargetLowering::isOffsetFoldingLegal(
8790 const GlobalAddressSDNode *GA) const {
8791 // OSes that use ELF REL relocations (instead of RELA) can only store a
8792 // 32-bit addend in the instruction, so it is not safe to allow offset folding
8793 // which can create arbitrary 64-bit addends. (This is only a problem for
8794 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
8795 // the high 32 bits of the addend.)
8796 //
8797 // This should be kept in sync with how HasRelocationAddend is initialized in
8798 // the constructor of ELFAMDGPUAsmBackend.
8799 if (!Subtarget->isAmdHsaOS())
8800 return false;
8801
8802 // We can fold offsets for anything that doesn't require a GOT relocation.
8803 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
8807}
8808
8809static SDValue
8811 const SDLoc &DL, int64_t Offset, EVT PtrVT,
8812 unsigned GAFlags = SIInstrInfo::MO_NONE) {
8813 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
8814 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
8815 // lowered to the following code sequence:
8816 //
8817 // For constant address space:
8818 // s_getpc_b64 s[0:1]
8819 // s_add_u32 s0, s0, $symbol
8820 // s_addc_u32 s1, s1, 0
8821 //
8822 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8823 // a fixup or relocation is emitted to replace $symbol with a literal
8824 // constant, which is a pc-relative offset from the encoding of the $symbol
8825 // operand to the global variable.
8826 //
8827 // For global address space:
8828 // s_getpc_b64 s[0:1]
8829 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
8830 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
8831 //
8832 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8833 // fixups or relocations are emitted to replace $symbol@*@lo and
8834 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
8835 // which is a 64-bit pc-relative offset from the encoding of the $symbol
8836 // operand to the global variable.
8837 if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
8838 assert(GAFlags != SIInstrInfo::MO_NONE);
8839
8840 SDValue Ptr =
8841 DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
8842 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
8843 }
8844
8845 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
8846 SDValue PtrHi;
8847 if (GAFlags == SIInstrInfo::MO_NONE)
8848 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
8849 else
8850 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
8851 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
8852}
8853
8854SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
8855 SDValue Op,
8856 SelectionDAG &DAG) const {
8857 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
8858 SDLoc DL(GSD);
8859 EVT PtrVT = Op.getValueType();
8860
8861 const GlobalValue *GV = GSD->getGlobal();
8867 GV->hasExternalLinkage()) {
8868 Type *Ty = GV->getValueType();
8869 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
8870 // zero-sized type in other languages to declare dynamic shared
8871 // memory whose size is not known at compile time. Such arrays are
8872 // allocated by the runtime and placed directly after the statically
8873 // allocated ones. They all share the same offset.
8874 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
8875 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
8876 // Adjust alignment for that dynamic shared memory array.
8879 MFI->setUsesDynamicLDS(true);
8880 return SDValue(
8881 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
8882 }
8883 }
8885 }
8886
8888 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
8890 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
8891 }
8892
8893 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
8894 if (Subtarget->has64BitLiterals()) {
8896 GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
8897 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
8898 0);
8899 }
8900
8901 SDValue AddrLo = DAG.getTargetGlobalAddress(
8902 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
8903 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
8904
8905 SDValue AddrHi = DAG.getTargetGlobalAddress(
8906 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
8907 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
8908
8909 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
8910 }
8911
8912 if (shouldEmitFixup(GV))
8913 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
8914
8915 if (shouldEmitPCReloc(GV))
8916 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
8917 SIInstrInfo::MO_REL32);
8918
8919 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
8920 SIInstrInfo::MO_GOTPCREL32);
8921 PointerType *PtrTy =
8922 PointerType::get(*DAG.getContext(), AMDGPUAS::CONSTANT_ADDRESS);
8923 const DataLayout &DataLayout = DAG.getDataLayout();
8924 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
8925 MachinePointerInfo PtrInfo =
8926 MachinePointerInfo::getGOT(DAG.getMachineFunction());
8927
8928 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
8929 MachineMemOperand::MODereferenceable |
8930 MachineMemOperand::MOInvariant);
8931}
8932
8934 const SDLoc &DL, SDValue V) const {
8935 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
8936 // the destination register.
8937 //
8938 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
8939 // so we will end up with redundant moves to m0.
8940 //
8941 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
8942
8943 // A Null SDValue creates a glue result.
8944 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
8945 V, Chain);
8946 return SDValue(M0, 0);
8947}
8948
8949SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
8950 MVT VT,
8951 unsigned Offset) const {
8952 SDLoc SL(Op);
8953 SDValue Param = lowerKernargMemParameter(
8954 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
8955 // The local size values will have the hi 16-bits as zero.
8956 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
8957 DAG.getValueType(VT));
8958}
8959
8961 EVT VT) {
8964 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
8965 return DAG.getPOISON(VT);
8966}
8967
8969 EVT VT) {
8972 "intrinsic not supported on subtarget", DL.getDebugLoc()));
8973 return DAG.getPOISON(VT);
8974}
8975
8977 ArrayRef<SDValue> Elts) {
8978 assert(!Elts.empty());
8979 MVT Type;
8980 unsigned NumElts = Elts.size();
8981
8982 if (NumElts <= 12) {
8983 Type = MVT::getVectorVT(MVT::f32, NumElts);
8984 } else {
8985 assert(Elts.size() <= 16);
8986 Type = MVT::v16f32;
8987 NumElts = 16;
8988 }
8989
8990 SmallVector<SDValue, 16> VecElts(NumElts);
8991 for (unsigned i = 0; i < Elts.size(); ++i) {
8992 SDValue Elt = Elts[i];
8993 if (Elt.getValueType() != MVT::f32)
8994 Elt = DAG.getBitcast(MVT::f32, Elt);
8995 VecElts[i] = Elt;
8996 }
8997 for (unsigned i = Elts.size(); i < NumElts; ++i)
8998 VecElts[i] = DAG.getPOISON(MVT::f32);
8999
9000 if (NumElts == 1)
9001 return VecElts[0];
9002 return DAG.getBuildVector(Type, DL, VecElts);
9003}
9004
9005static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
9006 SDValue Src, int ExtraElts) {
9007 EVT SrcVT = Src.getValueType();
9008
9009 SmallVector<SDValue, 8> Elts;
9010
9011 if (SrcVT.isVector())
9012 DAG.ExtractVectorElements(Src, Elts);
9013 else
9014 Elts.push_back(Src);
9015
9016 SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
9017 while (ExtraElts--)
9018 Elts.push_back(Undef);
9019
9020 return DAG.getBuildVector(CastVT, DL, Elts);
9021}
9022
9023// Re-construct the required return value for a image load intrinsic.
9024 // This is more complicated due to the optional use of TexFailCtrl, which
9025 // means the required return type is an aggregate.
9026static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
9027 ArrayRef<EVT> ResultTypes, bool IsTexFail,
9028 bool Unpacked, bool IsD16, int DMaskPop,
9029 int NumVDataDwords, bool IsAtomicPacked16Bit,
9030 const SDLoc &DL) {
9031 // Determine the required return type. This is the same regardless of
9032 // IsTexFail flag
9033 EVT ReqRetVT = ResultTypes[0];
9034 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
9035 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9036 ? (ReqRetNumElts + 1) / 2
9037 : ReqRetNumElts;
9038
9039 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9040
9041 MVT DataDwordVT =
9042 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
9043
9044 MVT MaskPopVT =
9045 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
9046
9047 SDValue Data(Result, 0);
9048 SDValue TexFail;
9049
9050 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
9051 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
9052 if (MaskPopVT.isVector()) {
9053 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
9054 SDValue(Result, 0), ZeroIdx);
9055 } else {
9056 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
9057 SDValue(Result, 0), ZeroIdx);
9058 }
9059 }
9060
9061 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
9062 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
9063 NumDataDwords - MaskPopDwords);
9064
9065 if (IsD16)
9066 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
9067
9068 EVT LegalReqRetVT = ReqRetVT;
9069 if (!ReqRetVT.isVector()) {
9070 if (!Data.getValueType().isInteger())
9071 Data = DAG.getNode(ISD::BITCAST, DL,
9072 Data.getValueType().changeTypeToInteger(), Data);
9073 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
9074 } else {
9075 // We need to widen the return vector to a legal type
9076 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
9077 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
9078 LegalReqRetVT =
9079 EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
9080 ReqRetVT.getVectorNumElements() + 1);
9081 }
9082 }
9083 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
9084
9085 if (IsTexFail) {
9086 TexFail =
9087 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
9088 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
9089
9090 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
9091 }
9092
9093 if (Result->getNumValues() == 1)
9094 return Data;
9095
9096 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
9097}
9098
9099static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
9100 SDValue *LWE, bool &IsTexFail) {
9101 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
9102
9103 uint64_t Value = TexFailCtrlConst->getZExtValue();
9104 if (Value) {
9105 IsTexFail = true;
9106 }
9107
9108 SDLoc DL(TexFailCtrlConst);
9109 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
9110 Value &= ~(uint64_t)0x1;
9111 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
9112 Value &= ~(uint64_t)0x2;
9113
9114 return Value == 0;
9115}
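// Example (illustrative): a TexFailCtrl immediate of 3 sets both TFE and LWE
// and parses successfully; any bit outside the low two makes this return
// false, and the caller then gives up on custom lowering of the intrinsic.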
9116
9117static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
9118 MVT PackVectorVT,
9119 SmallVectorImpl<SDValue> &PackedAddrs,
9120 unsigned DimIdx, unsigned EndIdx,
9121 unsigned NumGradients) {
9122 SDLoc DL(Op);
9123 for (unsigned I = DimIdx; I < EndIdx; I++) {
9124 SDValue Addr = Op.getOperand(I);
9125
9126 // Gradients are packed with undef for each coordinate.
9127 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
9128 // 1D: undef,dx/dh; undef,dx/dv
9129 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
9130 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
9131 if (((I + 1) >= EndIdx) ||
9132 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
9133 I == DimIdx + NumGradients - 1))) {
9134 if (Addr.getValueType() != MVT::i16)
9135 Addr = DAG.getBitcast(MVT::i16, Addr);
9136 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
9137 } else {
9138 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
9139 I++;
9140 }
9141 Addr = DAG.getBitcast(MVT::f32, Addr);
9142 PackedAddrs.push_back(Addr);
9143 }
9144}
9145
9146SDValue SITargetLowering::lowerImage(SDValue Op,
9147 const AMDGPU::ImageDimIntrinsicInfo *Intr,
9148 SelectionDAG &DAG, bool WithChain) const {
9149 SDLoc DL(Op);
9150 MachineFunction &MF = DAG.getMachineFunction();
9151 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
9152 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9153 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
9154 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
9155 unsigned IntrOpcode = Intr->BaseOpcode;
9156 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
9157 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9158 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9159
9160 SmallVector<EVT, 3> ResultTypes(Op->values());
9161 SmallVector<EVT, 3> OrigResultTypes(Op->values());
9162 bool IsD16 = false;
9163 bool IsG16 = false;
9164 bool IsA16 = false;
9165 SDValue VData;
9166 int NumVDataDwords = 0;
9167 bool AdjustRetType = false;
9168 bool IsAtomicPacked16Bit = false;
9169
9170 // Offset of intrinsic arguments
9171 const unsigned ArgOffset = WithChain ? 2 : 1;
9172
9173 unsigned DMask;
9174 unsigned DMaskLanes = 0;
9175
9176 if (BaseOpcode->Atomic) {
9177 VData = Op.getOperand(2);
9178
9179 IsAtomicPacked16Bit =
9180 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9181 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
9182
9183 bool Is64Bit = VData.getValueSizeInBits() == 64;
9184 if (BaseOpcode->AtomicX2) {
9185 SDValue VData2 = Op.getOperand(3);
9186 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
9187 {VData, VData2});
9188 if (Is64Bit)
9189 VData = DAG.getBitcast(MVT::v4i32, VData);
9190
9191 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9192 DMask = Is64Bit ? 0xf : 0x3;
9193 NumVDataDwords = Is64Bit ? 4 : 2;
9194 } else {
9195 DMask = Is64Bit ? 0x3 : 0x1;
9196 NumVDataDwords = Is64Bit ? 2 : 1;
9197 }
9198 } else {
9199 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
9200 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
9201
9202 if (BaseOpcode->Store) {
9203 VData = Op.getOperand(2);
9204
9205 MVT StoreVT = VData.getSimpleValueType();
9206 if (StoreVT.getScalarType() == MVT::f16) {
9207 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9208 return Op; // D16 is unsupported for this instruction
9209
9210 IsD16 = true;
9211 VData = handleD16VData(VData, DAG, true);
9212 }
9213
9214 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
9215 } else if (!BaseOpcode->NoReturn) {
9216 // Work out the num dwords based on the dmask popcount and underlying type
9217 // and whether packing is supported.
9218 MVT LoadVT = ResultTypes[0].getSimpleVT();
9219 if (LoadVT.getScalarType() == MVT::f16) {
9220 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9221 return Op; // D16 is unsupported for this instruction
9222
9223 IsD16 = true;
9224 }
9225
9226 // Confirm that the return type is large enough for the dmask specified
9227 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
9228 (!LoadVT.isVector() && DMaskLanes > 1))
9229 return Op;
9230
9231 // The sq block of gfx8 and gfx9 do not estimate register use correctly
9232 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
9233 // instructions.
9234 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9235 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9236 NumVDataDwords = (DMaskLanes + 1) / 2;
9237 else
9238 NumVDataDwords = DMaskLanes;
9239
9240 AdjustRetType = true;
9241 }
9242 }
9243
9244 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
9245 SmallVector<SDValue, 4> VAddrs;
9246
9247 // Check for 16 bit addresses or derivatives and pack if true.
9248 MVT VAddrVT =
9249 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
9250 MVT VAddrScalarVT = VAddrVT.getScalarType();
9251 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9252 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9253
9254 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
9255 VAddrScalarVT = VAddrVT.getScalarType();
9256 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9257 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9258
9259 // Push back extra arguments.
9260 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
9261 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
9262 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
9263 // Special handling of bias when A16 is on. Bias is of type half but
9264 // occupies full 32-bit.
9265 SDValue Bias = DAG.getBuildVector(
9266 MVT::v2f16, DL,
9267 {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
9268 VAddrs.push_back(Bias);
9269 } else {
9270 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
9271 "Bias needs to be converted to 16 bit in A16 mode");
9272 VAddrs.push_back(Op.getOperand(ArgOffset + I));
9273 }
9274 }
9275
9276 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
9277 // 16 bit gradients are supported, but are tied to the A16 control
9278 // so both gradients and addresses must be 16 bit
9279 LLVM_DEBUG(
9280 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
9281 "require 16 bit args for both gradients and addresses");
9282 return Op;
9283 }
9284
9285 if (IsA16) {
9286 if (!ST->hasA16()) {
9287 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
9288 "support 16 bit addresses\n");
9289 return Op;
9290 }
9291 }
9292
9293 // We've dealt with incorrect input so we know that if IsA16, IsG16
9294 // are set then we have to compress/pack operands (either address,
9295 // gradient or both)
9296 // In the case where a16 and gradients are tied (no G16 support) then we
9297 // have already verified that both IsA16 and IsG16 are true
9298 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
9299 // Activate g16
9300 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9301 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
9302 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
9303 }
9304
9305 // Add gradients (packed or unpacked)
9306 if (IsG16) {
9307 // Pack the gradients
9308 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
9309 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
9310 ArgOffset + Intr->GradientStart,
9311 ArgOffset + Intr->CoordStart, Intr->NumGradients);
9312 } else {
9313 for (unsigned I = ArgOffset + Intr->GradientStart;
9314 I < ArgOffset + Intr->CoordStart; I++)
9315 VAddrs.push_back(Op.getOperand(I));
9316 }
9317
9318 // Add addresses (packed or unpacked)
9319 if (IsA16) {
9320 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
9321 ArgOffset + Intr->CoordStart, VAddrEnd,
9322 0 /* No gradients */);
9323 } else {
9324 // Add uncompressed address
9325 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
9326 VAddrs.push_back(Op.getOperand(I));
9327 }
9328
9329 // If the register allocator cannot place the address registers contiguously
9330 // without introducing moves, then using the non-sequential address encoding
9331 // is always preferable, since it saves VALU instructions and is usually a
9332 // wash in terms of code size or even better.
9333 //
9334 // However, we currently have no way of hinting to the register allocator that
9335 // MIMG addresses should be placed contiguously when it is possible to do so,
9336 // so force non-NSA for the common 2-address case as a heuristic.
9337 //
9338 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
9339 // allocation when possible.
9340 //
9341 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
9342 // set of the remaining addresses.
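// Example (illustrative, assuming the NSA threshold is met): with
// NSAMaxSize = 5, partial NSA support, and 7 address dwords, UseNSA and
// UsePartialNSA both hold, so the first 4 dwords stay as separate NSA
// operands and the remaining 3 are packed into one contiguous vector operand
// below.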
9343 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
9344 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
9345 const bool UseNSA = ST->hasNSAEncoding() &&
9346 VAddrs.size() >= ST->getNSAThreshold(MF) &&
9347 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
9348 const bool UsePartialNSA =
9349 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
9350
9351 SDValue VAddr;
9352 if (UsePartialNSA) {
9353 VAddr = getBuildDwordsVector(DAG, DL,
9354 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9355 } else if (!UseNSA) {
9356 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
9357 }
9358
9359 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
9360 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
9361 SDValue Unorm;
9362 if (!BaseOpcode->Sampler) {
9363 Unorm = True;
9364 } else {
9365 uint64_t UnormConst =
9366 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
9367
9368 Unorm = UnormConst ? True : False;
9369 }
9370
9371 SDValue TFE;
9372 SDValue LWE;
9373 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
9374 bool IsTexFail = false;
9375 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9376 return Op;
9377
9378 if (IsTexFail) {
9379 if (!DMaskLanes) {
9380 // Expecting to get an error flag since TFC is on - and dmask is 0
9381 // Force dmask to be at least 1 otherwise the instruction will fail
9382 DMask = 0x1;
9383 DMaskLanes = 1;
9384 NumVDataDwords = 1;
9385 }
9386 NumVDataDwords += 1;
9387 AdjustRetType = true;
9388 }
9389
9390 // Check whether something earlier tagged that the return type needs
9391 // adjusting. This happens if the instruction is a load or has set
9392 // TexFailCtrl flags.
9392 if (AdjustRetType) {
9393 // NumVDataDwords reflects the true number of dwords required in the return
9394 // type
9395 if (DMaskLanes == 0 && !BaseOpcode->Store) {
9396 // This is a no-op load. This can be eliminated
9397 SDValue Undef = DAG.getPOISON(Op.getValueType());
9398 if (isa<MemSDNode>(Op))
9399 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
9400 return Undef;
9401 }
9402
9403 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
9404 MVT::i32, NumVDataDwords)
9405 : MVT::i32;
9406
9407 ResultTypes[0] = NewVT;
9408 if (ResultTypes.size() == 3) {
9409 // Original result was aggregate type used for TexFailCtrl results
9410 // The actual instruction returns as a vector type which has now been
9411 // created. Remove the aggregate result.
9412 ResultTypes.erase(&ResultTypes[1]);
9413 }
9414 }
9415
9416 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
9417 if (BaseOpcode->Atomic)
9418 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
9419 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
9421 return Op;
9422
9423 SmallVector<SDValue, 26> Ops;
9424 if (BaseOpcode->Store || BaseOpcode->Atomic)
9425 Ops.push_back(VData); // vdata
9426 if (UsePartialNSA) {
9427 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
9428 Ops.push_back(VAddr);
9429 } else if (UseNSA)
9430 append_range(Ops, VAddrs);
9431 else
9432 Ops.push_back(VAddr);
9433 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
9434 EVT RsrcVT = Rsrc.getValueType();
9435 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9436 return Op;
9437 Ops.push_back(Rsrc);
9438 if (BaseOpcode->Sampler) {
9439 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
9440 if (Samp.getValueType() != MVT::v4i32)
9441 return Op;
9442 Ops.push_back(Samp);
9443 }
9444 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
9445 if (IsGFX10Plus)
9446 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
9447 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9448 Ops.push_back(Unorm);
9449 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
9450 Ops.push_back(IsA16 && // r128, a16 for gfx9
9451 ST->hasFeature(AMDGPU::FeatureR128A16)
9452 ? True
9453 : False);
9454 if (IsGFX10Plus)
9455 Ops.push_back(IsA16 ? True : False);
9456
9457 if (!Subtarget->hasGFX90AInsts())
9458 Ops.push_back(TFE); // tfe
9459 else if (TFE->getAsZExtVal()) {
9460 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9462 "TFE is not supported on this GPU", DL.getDebugLoc()));
9463 }
9464
9465 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9466 Ops.push_back(LWE); // lwe
9467 if (!IsGFX10Plus)
9468 Ops.push_back(DimInfo->DA ? True : False);
9469 if (BaseOpcode->HasD16)
9470 Ops.push_back(IsD16 ? True : False);
9471 if (isa<MemSDNode>(Op))
9472 Ops.push_back(Op.getOperand(0)); // chain
9473
9474 int NumVAddrDwords =
9475 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
9476 int Opcode = -1;
9477
9478 if (IsGFX12Plus) {
9479 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
9480 NumVDataDwords, NumVAddrDwords);
9481 } else if (IsGFX11Plus) {
9482 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9483 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9484 : AMDGPU::MIMGEncGfx11Default,
9485 NumVDataDwords, NumVAddrDwords);
9486 } else if (IsGFX10Plus) {
9487 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9488 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9489 : AMDGPU::MIMGEncGfx10Default,
9490 NumVDataDwords, NumVAddrDwords);
9491 } else {
9492 if (Subtarget->hasGFX90AInsts()) {
9493 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
9494 NumVDataDwords, NumVAddrDwords);
9495 if (Opcode == -1) {
9496 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9498 "requested image instruction is not supported on this GPU",
9499 DL.getDebugLoc()));
9500
9501 unsigned Idx = 0;
9502 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
9503 for (EVT VT : OrigResultTypes) {
9504 if (VT == MVT::Other)
9505 RetValues[Idx++] = Op.getOperand(0); // Chain
9506 else
9507 RetValues[Idx++] = DAG.getPOISON(VT);
9508 }
9509
9510 return DAG.getMergeValues(RetValues, DL);
9511 }
9512 }
9513 if (Opcode == -1 &&
9514 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9515 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
9516 NumVDataDwords, NumVAddrDwords);
9517 if (Opcode == -1)
9518 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
9519 NumVDataDwords, NumVAddrDwords);
9520 }
9521 if (Opcode == -1)
9522 return Op;
9523
9524 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
9525 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
9526 MachineMemOperand *MemRef = MemOp->getMemOperand();
9527 DAG.setNodeMemRefs(NewNode, {MemRef});
9528 }
9529
9530 if (BaseOpcode->AtomicX2) {
9531 SmallVector<SDValue, 1> Elt;
9532 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
9533 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
9534 }
9535 if (BaseOpcode->NoReturn)
9536 return SDValue(NewNode, 0);
9537 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
9538 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9539 NumVDataDwords, IsAtomicPacked16Bit, DL);
9540}
9541
9542SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
9543 SDValue Offset, SDValue CachePolicy,
9544 SelectionDAG &DAG) const {
9545 MachineFunction &MF = DAG.getMachineFunction();
9546
9547 const DataLayout &DataLayout = DAG.getDataLayout();
9548 Align Alignment =
9549 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
9550
9551 MachineMemOperand *MMO = MF.getMachineMemOperand(
9552 MachinePointerInfo(),
9555 VT.getStoreSize(), Alignment);
9556
9557 if (!Offset->isDivergent()) {
9558 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
9559
9560 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
9561 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
9562 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
9563 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
9564 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9565 SDValue BufferLoad =
9566 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
9567 DAG.getVTList(MVT::i32), Ops, VT, MMO);
9568 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
9569 }
9570
9571 // Widen vec3 load to vec4.
9572 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
9573 !Subtarget->hasScalarDwordx3Loads()) {
9574 EVT WidenedVT =
9575 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
9576 auto WidenedOp = DAG.getMemIntrinsicNode(
9577 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
9578 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
9579 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
9580 DAG.getVectorIdxConstant(0, DL));
9581 return Subvector;
9582 }
9583
9584 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
9585 DAG.getVTList(VT), Ops, VT, MMO);
9586 }
9587
9588 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
9589 // assume that the buffer is unswizzled.
9590 SDValue Ops[] = {
9591 DAG.getEntryNode(), // Chain
9592 Rsrc, // rsrc
9593 DAG.getConstant(0, DL, MVT::i32), // vindex
9594 {}, // voffset
9595 {}, // soffset
9596 {}, // offset
9597 CachePolicy, // cachepolicy
9598 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9599 };
9600 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9601 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
9602 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
9603 }
9604
9605 SmallVector<SDValue, 4> Loads;
9606 unsigned NumLoads = 1;
9607 MVT LoadVT = VT.getSimpleVT();
9608 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
9609 assert((LoadVT.getScalarType() == MVT::i32 ||
9610 LoadVT.getScalarType() == MVT::f32));
9611
9612 if (NumElts == 8 || NumElts == 16) {
9613 NumLoads = NumElts / 4;
9614 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
9615 }
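// Example (illustrative): a divergent <8 x i32> s.buffer.load is split into
// two dwordx4 buffer loads at offsets +0 and +16 by the loop below and then
// re-concatenated into the original type.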
9616
9617 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
9618
9619 // Use the alignment to ensure that the required offsets will fit into the
9620 // immediate offsets.
9621 setBufferOffsets(Offset, DAG, &Ops[3],
9622 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
9623
9624 uint64_t InstOffset = Ops[5]->getAsZExtVal();
9625 for (unsigned i = 0; i < NumLoads; ++i) {
9626 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
9627 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
9628 LoadVT, MMO, DAG));
9629 }
9630
9631 if (NumElts == 8 || NumElts == 16)
9632 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
9633
9634 return Loads[0];
9635}
9636
9637SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
9638 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
9639 if (!Subtarget->hasArchitectedSGPRs())
9640 return {};
9641 SDLoc SL(Op);
9642 MVT VT = MVT::i32;
9643 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
9644 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
9645 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
9646}
9647
9648SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
9649 AMDGPU::Hwreg::Id HwReg,
9650 unsigned LowBit,
9651 unsigned Width) const {
9652 SDLoc SL(Op);
9653 using namespace AMDGPU::Hwreg;
9654 return {DAG.getMachineNode(
9655 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
9656 DAG.getTargetConstant(HwregEncoding::encode(HwReg, LowBit, Width),
9657 SL, MVT::i32)),
9658 0};
9659}
9660
9661SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
9662 unsigned Dim,
9663 const ArgDescriptor &Arg) const {
9664 SDLoc SL(Op);
9665 MachineFunction &MF = DAG.getMachineFunction();
9666 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
9667 if (MaxID == 0)
9668 return DAG.getConstant(0, SL, MVT::i32);
9669
9670 // It's undefined behavior if a function marked with the amdgpu-no-*
9671 // attributes uses the corresponding intrinsic.
9672 if (!Arg)
9673 return DAG.getPOISON(Op->getValueType(0));
9674
9675 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
9676 SDLoc(DAG.getEntryNode()), Arg);
9677
9678 // Don't bother inserting AssertZext for packed IDs since we're emitting the
9679 // masking operations anyway.
9680 //
9681 // TODO: We could assert the top bit is 0 for the source copy.
9682 if (Arg.isMasked())
9683 return Val;
9684
9685 // Preserve the known bits after expansion to a copy.
9686 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
9687 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
9688 DAG.getValueType(SmallVT));
9689}
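// Example (illustrative): with a known maximum workitem ID of 63 in this
// dimension, llvm::bit_width(63) == 6, so the copy is wrapped in an
// AssertZext with an i6 value type to preserve the known-zero high bits.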
9690
9691SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
9692 SelectionDAG &DAG) const {
9693 MachineFunction &MF = DAG.getMachineFunction();
9694 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
9695
9696 EVT VT = Op.getValueType();
9697 SDLoc DL(Op);
9698 unsigned IntrinsicID = Op.getConstantOperandVal(0);
9699
9700 // TODO: Should this propagate fast-math-flags?
9701
9702 switch (IntrinsicID) {
9703 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9704 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
9705 return emitNonHSAIntrinsicError(DAG, DL, VT);
9706 return getPreloadedValue(DAG, *MFI, VT,
9707 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
9708 }
9709 case Intrinsic::amdgcn_dispatch_ptr:
9710 case Intrinsic::amdgcn_queue_ptr: {
9711 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
9712 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9713 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
9714 DL.getDebugLoc()));
9715 return DAG.getPOISON(VT);
9716 }
9717
9718 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9719 ? AMDGPUFunctionArgInfo::DISPATCH_PTR
9720 : AMDGPUFunctionArgInfo::QUEUE_PTR;
9721 return getPreloadedValue(DAG, *MFI, VT, RegID);
9722 }
9723 case Intrinsic::amdgcn_implicitarg_ptr: {
9724 if (MFI->isEntryFunction())
9725 return getImplicitArgPtr(DAG, DL);
9726 return getPreloadedValue(DAG, *MFI, VT,
9727 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
9728 }
9729 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9730 if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
9731 // This only makes sense to call in a kernel, so just lower to null.
9732 return DAG.getConstant(0, DL, VT);
9733 }
9734
9735 return getPreloadedValue(DAG, *MFI, VT,
9736 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
9737 }
9738 case Intrinsic::amdgcn_dispatch_id: {
9739 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
9740 }
9741 case Intrinsic::amdgcn_rcp:
9742 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
9743 case Intrinsic::amdgcn_rsq:
9744 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9745 case Intrinsic::amdgcn_rsq_legacy:
9746 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9747 return emitRemovedIntrinsicError(DAG, DL, VT);
9748 return SDValue();
9749 case Intrinsic::amdgcn_rcp_legacy:
9750 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9751 return emitRemovedIntrinsicError(DAG, DL, VT);
9752 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
9753 case Intrinsic::amdgcn_rsq_clamp: {
9754 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9755 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
9756
9757 Type *Type = VT.getTypeForEVT(*DAG.getContext());
9758 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
9759 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
9760
9761 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9762 SDValue Tmp =
9763 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
9764 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
9765 DAG.getConstantFP(Min, DL, VT));
9766 }
9767 case Intrinsic::r600_read_ngroups_x:
9768 if (Subtarget->isAmdHsaOS())
9769 return emitNonHSAIntrinsicError(DAG, DL, VT);
9770
9771 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9773 false);
9774 case Intrinsic::r600_read_ngroups_y:
9775 if (Subtarget->isAmdHsaOS())
9776 return emitNonHSAIntrinsicError(DAG, DL, VT);
9777
9778 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9780 false);
9781 case Intrinsic::r600_read_ngroups_z:
9782 if (Subtarget->isAmdHsaOS())
9783 return emitNonHSAIntrinsicError(DAG, DL, VT);
9784
9785 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9787 false);
9788 case Intrinsic::r600_read_local_size_x:
9789 if (Subtarget->isAmdHsaOS())
9790 return emitNonHSAIntrinsicError(DAG, DL, VT);
9791
9792 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9794 case Intrinsic::r600_read_local_size_y:
9795 if (Subtarget->isAmdHsaOS())
9796 return emitNonHSAIntrinsicError(DAG, DL, VT);
9797
9798 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9800 case Intrinsic::r600_read_local_size_z:
9801 if (Subtarget->isAmdHsaOS())
9802 return emitNonHSAIntrinsicError(DAG, DL, VT);
9803
9804 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9806 case Intrinsic::amdgcn_workgroup_id_x:
9807 return lowerWorkGroupId(DAG, *MFI, VT,
9811 case Intrinsic::amdgcn_workgroup_id_y:
9812 return lowerWorkGroupId(DAG, *MFI, VT,
9816 case Intrinsic::amdgcn_workgroup_id_z:
9817 return lowerWorkGroupId(DAG, *MFI, VT,
9821 case Intrinsic::amdgcn_cluster_id_x:
9822 return Subtarget->hasClusters()
9823 ? getPreloadedValue(DAG, *MFI, VT,
9824 AMDGPUFunctionArgInfo::CLUSTER_ID_X)
9825 : DAG.getPOISON(VT);
9826 case Intrinsic::amdgcn_cluster_id_y:
9827 return Subtarget->hasClusters()
9828 ? getPreloadedValue(DAG, *MFI, VT,
9829 AMDGPUFunctionArgInfo::CLUSTER_ID_Y)
9830 : DAG.getPOISON(VT);
9831 case Intrinsic::amdgcn_cluster_id_z:
9832 return Subtarget->hasClusters()
9833 ? getPreloadedValue(DAG, *MFI, VT,
9834 AMDGPUFunctionArgInfo::CLUSTER_ID_Z)
9835 : DAG.getPOISON(VT);
9836 case Intrinsic::amdgcn_cluster_workgroup_id_x:
9837 return Subtarget->hasClusters()
9838 ? getPreloadedValue(
9839 DAG, *MFI, VT,
9840 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X)
9841 : DAG.getPOISON(VT);
9842 case Intrinsic::amdgcn_cluster_workgroup_id_y:
9843 return Subtarget->hasClusters()
9844 ? getPreloadedValue(
9845 DAG, *MFI, VT,
9846 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y)
9847 : DAG.getPOISON(VT);
9848 case Intrinsic::amdgcn_cluster_workgroup_id_z:
9849 return Subtarget->hasClusters()
9850 ? getPreloadedValue(
9851 DAG, *MFI, VT,
9852 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z)
9853 : DAG.getPOISON(VT);
9854 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
9855 return Subtarget->hasClusters()
9856 ? lowerConstHwRegRead(DAG, Op, AMDGPU::Hwreg::ID_IB_STS2, 21, 4)
9857 : SDValue();
9858 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
9859 return Subtarget->hasClusters()
9860 ? getPreloadedValue(
9861 DAG, *MFI, VT,
9862 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X)
9863 : DAG.getPOISON(VT);
9864 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
9865 return Subtarget->hasClusters()
9866 ? getPreloadedValue(
9867 DAG, *MFI, VT,
9868 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y)
9869 : DAG.getPOISON(VT);
9870 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
9871 return Subtarget->hasClusters()
9872 ? getPreloadedValue(
9873 DAG, *MFI, VT,
9874 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z)
9875 : DAG.getPOISON(VT);
9876 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
9877 return Subtarget->hasClusters()
9878 ? getPreloadedValue(
9879 DAG, *MFI, VT,
9880 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID)
9881 : DAG.getPOISON(VT);
9882 case Intrinsic::amdgcn_wave_id:
9883 return lowerWaveID(DAG, Op);
9884 case Intrinsic::amdgcn_lds_kernel_id: {
9885 if (MFI->isEntryFunction())
9886 return getLDSKernelId(DAG, DL);
9887 return getPreloadedValue(DAG, *MFI, VT,
9888 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
9889 }
9890 case Intrinsic::amdgcn_workitem_id_x:
9891 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
9892 case Intrinsic::amdgcn_workitem_id_y:
9893 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
9894 case Intrinsic::amdgcn_workitem_id_z:
9895 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
9896 case Intrinsic::amdgcn_wavefrontsize:
9897 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
9898 SDLoc(Op), MVT::i32);
9899 case Intrinsic::amdgcn_s_buffer_load: {
9900 unsigned CPol = Op.getConstantOperandVal(3);
9901 // s_buffer_load, because of how it's optimized, can't be volatile,
9902 // so reject ones with the volatile bit set.
9903 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
9904 ? AMDGPU::CPol::ALL
9905 : AMDGPU::CPol::ALL_pregfx12))
9906 return Op;
9907 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
9908 Op.getOperand(3), DAG);
9909 }
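// Note (illustrative): the rejected bits come from the cache-policy
// immediate, the intrinsic's third argument. Anything outside the set the
// target understands (e.g. glc/slc/dlc before gfx12) marks a request such as
// a volatile load that s_buffer_load cannot honour, so the node is returned
// unexpanded instead of being lowered to an SBUFFER load.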
9910 case Intrinsic::amdgcn_fdiv_fast:
9911 return lowerFDIV_FAST(Op, DAG);
9912 case Intrinsic::amdgcn_sin:
9913 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
9914
9915 case Intrinsic::amdgcn_cos:
9916 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
9917
9918 case Intrinsic::amdgcn_mul_u24:
9919 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
9920 Op.getOperand(2));
9921 case Intrinsic::amdgcn_mul_i24:
9922 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
9923 Op.getOperand(2));
9924
9925 case Intrinsic::amdgcn_log_clamp: {
9926 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9927 return SDValue();
9928
9929 return emitRemovedIntrinsicError(DAG, DL, VT);
9930 }
9931 case Intrinsic::amdgcn_fract:
9932 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
9933
9934 case Intrinsic::amdgcn_class:
9935 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
9936 Op.getOperand(2));
9937 case Intrinsic::amdgcn_div_fmas:
9938 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
9939 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9940
9941 case Intrinsic::amdgcn_div_fixup:
9942 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
9943 Op.getOperand(2), Op.getOperand(3));
9944
9945 case Intrinsic::amdgcn_div_scale: {
9946 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
9947
9948 // Translate to the operands expected by the machine instruction. The
9949 // first operand must match one of the other two, selected by the constant flag.
9950 SDValue Numerator = Op.getOperand(1);
9951 SDValue Denominator = Op.getOperand(2);
9952
9953 // Note this order is the opposite of the machine instruction's operands,
9954 // which are s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
9955 // intrinsic has the numerator as the first operand to match a normal
9956 // division operation.
9957
9958 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
9959
9960 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
9961 Denominator, Numerator);
9962 }
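// Example (illustrative): for
//   @llvm.amdgcn.div.scale.f32(float %num, float %den, i1 true)
// the flag selects the numerator, so the node becomes
//   DIV_SCALE %num, %den, %num
// while an i1 false flag selects the denominator:
//   DIV_SCALE %den, %den, %num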
9963 case Intrinsic::amdgcn_icmp: {
9964 // There is a Pat that handles this variant, so return it as-is.
9965 if (Op.getOperand(1).getValueType() == MVT::i1 &&
9966 Op.getConstantOperandVal(2) == 0 &&
9967 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
9968 return Op;
9969 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
9970 }
9971 case Intrinsic::amdgcn_fcmp: {
9972 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
9973 }
9974 case Intrinsic::amdgcn_ballot:
9975 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
9976 case Intrinsic::amdgcn_fmed3:
9977 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
9978 Op.getOperand(2), Op.getOperand(3));
9979 case Intrinsic::amdgcn_fdot2:
9980 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
9981 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9982 case Intrinsic::amdgcn_fmul_legacy:
9983 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
9984 Op.getOperand(2));
9985 case Intrinsic::amdgcn_sffbh:
9986 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
9987 case Intrinsic::amdgcn_sbfe:
9988 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
9989 Op.getOperand(2), Op.getOperand(3));
9990 case Intrinsic::amdgcn_ubfe:
9991 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
9992 Op.getOperand(2), Op.getOperand(3));
9993 case Intrinsic::amdgcn_cvt_pkrtz:
9994 case Intrinsic::amdgcn_cvt_pknorm_i16:
9995 case Intrinsic::amdgcn_cvt_pknorm_u16:
9996 case Intrinsic::amdgcn_cvt_pk_i16:
9997 case Intrinsic::amdgcn_cvt_pk_u16: {
9998 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
9999 EVT VT = Op.getValueType();
10000 unsigned Opcode;
10001
10002 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10003 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
10004 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10005 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
10006 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10007 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
10008 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10009 Opcode = AMDGPUISD::CVT_PK_I16_I32;
10010 else
10011 Opcode = AMDGPUISD::CVT_PK_U16_U32;
10012
10013 if (isTypeLegal(VT))
10014 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
10015
10016 SDValue Node =
10017 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
10018 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
10019 }
10020 case Intrinsic::amdgcn_fmad_ftz:
10021 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
10022 Op.getOperand(2), Op.getOperand(3));
10023
10024 case Intrinsic::amdgcn_if_break:
10025 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
10026 Op->getOperand(1), Op->getOperand(2)),
10027 0);
10028
10029 case Intrinsic::amdgcn_groupstaticsize: {
10030 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
10031 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
10032 return Op;
10033
10034 const Module *M = MF.getFunction().getParent();
10035 const GlobalValue *GV =
10036 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
10037 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
10038 SIInstrInfo::MO_ABS32_LO);
10039 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10040 }
10041 case Intrinsic::amdgcn_is_shared:
10042 case Intrinsic::amdgcn_is_private: {
10043 SDLoc SL(Op);
10044 SDValue SrcVec =
10045 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10046 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
10047 DAG.getConstant(1, SL, MVT::i32));
10048
10049 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10050 ? AMDGPUAS::LOCAL_ADDRESS
10051 : AMDGPUAS::PRIVATE_ADDRESS;
10052 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
10053 Subtarget->hasGloballyAddressableScratch()) {
10054 SDValue FlatScratchBaseHi(
10055 DAG.getMachineNode(
10056 AMDGPU::S_MOV_B32, DL, MVT::i32,
10057 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10058 0);
10059 // Test bits 63..58 against the aperture address.
10060 return DAG.getSetCC(
10061 SL, MVT::i1,
10062 DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
10063 DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
10064 }
10065
10066 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10067 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
10068 }
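// Sketch (illustrative): both queries inspect only the high 32 bits of the
// flat pointer. In the common path @llvm.amdgcn.is.shared(ptr %p) becomes
//   %hi = extractelement (bitcast %p to <2 x i32>), 1
//   icmp eq %hi, <aperture base>
// with the aperture taken from getSegmentAperture(); the globally
// addressable scratch path instead range-checks bits 63..58 against the
// flat scratch base, as the comment above notes.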
10069 case Intrinsic::amdgcn_perm:
10070 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
10071 Op.getOperand(2), Op.getOperand(3));
10072 case Intrinsic::amdgcn_reloc_constant: {
10073 Module *M = MF.getFunction().getParent();
10074 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
10075 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
10076 auto *RelocSymbol = cast<GlobalVariable>(
10077 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
10078 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
10079 SIInstrInfo::MO_ABS32_LO);
10080 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10081 }
10082 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10083 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10084 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10085 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10086 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10087 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10088 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10089 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10090 if (Op.getOperand(4).getValueType() == MVT::i32)
10091 return SDValue();
10092
10093 SDLoc SL(Op);
10094 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
10095 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10096 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10097 Op.getOperand(3), IndexKeyi32);
10098 }
10099 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10100 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10101 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10102 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10103 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10104 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10105 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10106 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10107 if (Op.getOperand(4).getValueType() == MVT::i64)
10108 return SDValue();
10109
10110 SDLoc SL(Op);
10111 auto IndexKeyi64 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
10112 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10113 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10114 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10115 Op.getOperand(6)});
10116 }
10117 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10118 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10119 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10120 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10121 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10122 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10123 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10124 ? MVT::i64
10125 : MVT::i32;
10126 if (Op.getOperand(6).getValueType() == IndexKeyTy)
10127 return SDValue();
10128
10129 SDLoc SL(Op);
10130 auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
10131 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10132 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10133 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10134 IndexKey, Op.getOperand(7),
10135 Op.getOperand(8)}); // No clamp operand
10136 }
10137 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10138 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10139 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10140 if (Op.getOperand(6).getValueType() == MVT::i32)
10141 return SDValue();
10142
10143 SDLoc SL(Op);
10144 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
10145 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10146 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10147 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10148 IndexKeyi32, Op.getOperand(7)});
10149 }
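// Note (illustrative): the three swmmac groups above only canonicalize the
// sparsity index-key operand. A narrower key is any-extended to the width
// the selection patterns expect (i32 or i64) and the intrinsic node is
// rebuilt with the widened operand; if the key already has that type, the
// node is left untouched for normal selection.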
10150 case Intrinsic::amdgcn_addrspacecast_nonnull:
10151 return lowerADDRSPACECAST(Op, DAG);
10152 case Intrinsic::amdgcn_readlane:
10153 case Intrinsic::amdgcn_readfirstlane:
10154 case Intrinsic::amdgcn_writelane:
10155 case Intrinsic::amdgcn_permlane16:
10156 case Intrinsic::amdgcn_permlanex16:
10157 case Intrinsic::amdgcn_permlane64:
10158 case Intrinsic::amdgcn_set_inactive:
10159 case Intrinsic::amdgcn_set_inactive_chain_arg:
10160 case Intrinsic::amdgcn_mov_dpp8:
10161 case Intrinsic::amdgcn_update_dpp:
10162 return lowerLaneOp(*this, Op.getNode(), DAG);
10163 case Intrinsic::amdgcn_dead: {
10165 for (const EVT ValTy : Op.getNode()->values())
10166 Poisons.push_back(DAG.getPOISON(ValTy));
10167 return DAG.getMergeValues(Poisons, SDLoc(Op));
10168 }
10169 default:
10170 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10171 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
10172 return lowerImage(Op, ImageDimIntr, DAG, false);
10173
10174 return Op;
10175 }
10176}
10177
10178 // On targets that do not support a constant in the soffset field, turn a
10179 // zero soffset into SGPR_NULL to avoid generating an extra s_mov with zero.
10180 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
10181 const GCNSubtarget *Subtarget) {
10182 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
10183 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10184 return SOffset;
10185}
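// Usage sketch (illustrative): the buffer intrinsic lowerings below route
// their soffset operand through this helper, e.g.
//   auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
// so a constant-zero soffset becomes the SGPR_NULL register on subtargets
// with a restricted soffset field.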
10186
10187SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
10188 SelectionDAG &DAG,
10189 unsigned NewOpcode) const {
10190 SDLoc DL(Op);
10191
10192 SDValue VData = Op.getOperand(2);
10193 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10194 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10195 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10196 SDValue Ops[] = {
10197 Op.getOperand(0), // Chain
10198 VData, // vdata
10199 Rsrc, // rsrc
10200 DAG.getConstant(0, DL, MVT::i32), // vindex
10201 VOffset, // voffset
10202 SOffset, // soffset
10203 Offset, // offset
10204 Op.getOperand(6), // cachepolicy
10205 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10206 };
10207
10208 auto *M = cast<MemSDNode>(Op);
10209
10210 EVT MemVT = VData.getValueType();
10211 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10212 M->getMemOperand());
10213}
10214
10215SDValue
10216SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
10217 unsigned NewOpcode) const {
10218 SDLoc DL(Op);
10219
10220 SDValue VData = Op.getOperand(2);
10221 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10222 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10223 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10224 SDValue Ops[] = {
10225 Op.getOperand(0), // Chain
10226 VData, // vdata
10227 Rsrc, // rsrc
10228 Op.getOperand(4), // vindex
10229 VOffset, // voffset
10230 SOffset, // soffset
10231 Offset, // offset
10232 Op.getOperand(7), // cachepolicy
10233 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10234 };
10235
10236 auto *M = cast<MemSDNode>(Op);
10237
10238 EVT MemVT = VData.getValueType();
10239 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10240 M->getMemOperand());
10241}
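// Note (illustrative): the raw and struct helpers above build the same
// operand list; they differ only in the vindex operand (a constant 0 for
// raw, the intrinsic's vindex for struct) and the idxen immediate (0 vs. 1),
// which together select the buffer addressing mode.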
10242
10243SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
10244 SelectionDAG &DAG) const {
10245 unsigned IntrID = Op.getConstantOperandVal(1);
10246 SDLoc DL(Op);
10247
10248 switch (IntrID) {
10249 case Intrinsic::amdgcn_ds_ordered_add:
10250 case Intrinsic::amdgcn_ds_ordered_swap: {
10251 MemSDNode *M = cast<MemSDNode>(Op);
10252 SDValue Chain = M->getOperand(0);
10253 SDValue M0 = M->getOperand(2);
10254 SDValue Value = M->getOperand(3);
10255 unsigned IndexOperand = M->getConstantOperandVal(7);
10256 unsigned WaveRelease = M->getConstantOperandVal(8);
10257 unsigned WaveDone = M->getConstantOperandVal(9);
10258
10259 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10260 IndexOperand &= ~0x3f;
10261 unsigned CountDw = 0;
10262
10263 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
10264 CountDw = (IndexOperand >> 24) & 0xf;
10265 IndexOperand &= ~(0xf << 24);
10266
10267 if (CountDw < 1 || CountDw > 4) {
10268 const Function &Fn = DAG.getMachineFunction().getFunction();
10269 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10270 Fn, "ds_ordered_count: dword count must be between 1 and 4",
10271 DL.getDebugLoc()));
10272 CountDw = 1;
10273 }
10274 }
10275
10276 if (IndexOperand) {
10277 const Function &Fn = DAG.getMachineFunction().getFunction();
10278 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10279 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
10280 }
10281
10282 if (WaveDone && !WaveRelease) {
10283 // TODO: Move this to IR verifier
10284 const Function &Fn = DAG.getMachineFunction().getFunction();
10285 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10286 Fn, "ds_ordered_count: wave_done requires wave_release",
10287 DL.getDebugLoc()));
10288 }
10289
10290 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10291 unsigned ShaderType =
10292 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
10293 unsigned Offset0 = OrderedCountIndex << 2;
10294 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10295
10296 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
10297 Offset1 |= (CountDw - 1) << 6;
10298
10299 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
10300 Offset1 |= ShaderType << 2;
10301
10302 unsigned Offset = Offset0 | (Offset1 << 8);
10303
10304 SDValue Ops[] = {
10305 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
10306 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
10307 };
10308 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
10309 M->getVTList(), Ops, M->getMemoryVT(),
10310 M->getMemOperand());
10311 }
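// Encoding sketch (illustrative): the 16-bit offset immediate packs
//   offset0[7:0] = ordered-count index << 2
//   offset1[0]   = wave_release,  offset1[1] = wave_done
//   offset1[3:2] = shader type (pre-gfx11 only)
//   offset1[4]   = 0 for ordered_add, 1 for ordered_swap
//   offset1[7:6] = dword count - 1 (gfx10+)
// and the final immediate is offset0 | (offset1 << 8).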
10312 case Intrinsic::amdgcn_raw_buffer_load:
10313 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10314 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10315 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10316 case Intrinsic::amdgcn_raw_buffer_load_format:
10317 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10318 const bool IsFormat =
10319 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10320 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10321
10322 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10323 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10324 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10325 SDValue Ops[] = {
10326 Op.getOperand(0), // Chain
10327 Rsrc, // rsrc
10328 DAG.getConstant(0, DL, MVT::i32), // vindex
10329 VOffset, // voffset
10330 SOffset, // soffset
10331 Offset, // offset
10332 Op.getOperand(5), // cachepolicy, swizzled buffer
10333 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10334 };
10335
10336 auto *M = cast<MemSDNode>(Op);
10337 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
10338 }
10339 case Intrinsic::amdgcn_struct_buffer_load:
10340 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10341 case Intrinsic::amdgcn_struct_buffer_load_format:
10342 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10343 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10344 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10345 const bool IsFormat =
10346 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10347 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10348
10349 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10350 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10351 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10352 SDValue Ops[] = {
10353 Op.getOperand(0), // Chain
10354 Rsrc, // rsrc
10355 Op.getOperand(3), // vindex
10356 VOffset, // voffset
10357 SOffset, // soffset
10358 Offset, // offset
10359 Op.getOperand(6), // cachepolicy, swizzled buffer
10360 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10361 };
10362
10363 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
10364 }
10365 case Intrinsic::amdgcn_raw_tbuffer_load:
10366 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10367 MemSDNode *M = cast<MemSDNode>(Op);
10368 EVT LoadVT = Op.getValueType();
10369 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10370 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10371 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10372
10373 SDValue Ops[] = {
10374 Op.getOperand(0), // Chain
10375 Rsrc, // rsrc
10376 DAG.getConstant(0, DL, MVT::i32), // vindex
10377 VOffset, // voffset
10378 SOffset, // soffset
10379 Offset, // offset
10380 Op.getOperand(5), // format
10381 Op.getOperand(6), // cachepolicy, swizzled buffer
10382 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10383 };
10384
10385 if (LoadVT.getScalarType() == MVT::f16)
10386 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10387 Ops);
10388 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10389 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10390 DAG);
10391 }
10392 case Intrinsic::amdgcn_struct_tbuffer_load:
10393 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10394 MemSDNode *M = cast<MemSDNode>(Op);
10395 EVT LoadVT = Op.getValueType();
10396 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10397 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10398 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10399
10400 SDValue Ops[] = {
10401 Op.getOperand(0), // Chain
10402 Rsrc, // rsrc
10403 Op.getOperand(3), // vindex
10404 VOffset, // voffset
10405 SOffset, // soffset
10406 Offset, // offset
10407 Op.getOperand(6), // format
10408 Op.getOperand(7), // cachepolicy, swizzled buffer
10409 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10410 };
10411
10412 if (LoadVT.getScalarType() == MVT::f16)
10413 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10414 Ops);
10415 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10416 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10417 DAG);
10418 }
10419 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10420 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10421 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
10422 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10423 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10424 return lowerStructBufferAtomicIntrin(Op, DAG,
10425 AMDGPUISD::BUFFER_ATOMIC_FADD);
10426 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10427 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10428 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
10429 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10430 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10431 return lowerStructBufferAtomicIntrin(Op, DAG,
10432 AMDGPUISD::BUFFER_ATOMIC_FMIN);
10433 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10434 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10435 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
10436 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10437 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10438 return lowerStructBufferAtomicIntrin(Op, DAG,
10439 AMDGPUISD::BUFFER_ATOMIC_FMAX);
10440 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10441 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10442 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
10443 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10444 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10445 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10446 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10447 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10448 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10449 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10450 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10451 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
10452 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10453 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10454 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
10455 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10456 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10457 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
10458 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10459 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10460 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
10461 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10462 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10463 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10464 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10465 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10466 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10467 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10468 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10469 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10470 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10471 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10472 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10473 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10474 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10475 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10476 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10477 return lowerRawBufferAtomicIntrin(Op, DAG,
10478 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10479 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10480 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10481 return lowerStructBufferAtomicIntrin(Op, DAG,
10482 AMDGPUISD::BUFFER_ATOMIC_SWAP);
10483 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10484 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10485 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10486 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10487 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10488 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10489 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10490 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10491 return lowerStructBufferAtomicIntrin(Op, DAG,
10492 AMDGPUISD::BUFFER_ATOMIC_SMIN);
10493 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10494 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10495 return lowerStructBufferAtomicIntrin(Op, DAG,
10496 AMDGPUISD::BUFFER_ATOMIC_UMIN);
10497 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10498 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10499 return lowerStructBufferAtomicIntrin(Op, DAG,
10500 AMDGPUISD::BUFFER_ATOMIC_SMAX);
10501 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10502 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10503 return lowerStructBufferAtomicIntrin(Op, DAG,
10504 AMDGPUISD::BUFFER_ATOMIC_UMAX);
10505 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10506 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10507 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10508 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10509 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10510 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10511 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10512 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10513 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10514 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10515 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10516 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10517 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10518 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10519 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10520 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10521 return lowerStructBufferAtomicIntrin(Op, DAG,
10522 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10523
10524 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10525 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10526 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
10527 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10528 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10529 SDValue Ops[] = {
10530 Op.getOperand(0), // Chain
10531 Op.getOperand(2), // src
10532 Op.getOperand(3), // cmp
10533 Rsrc, // rsrc
10534 DAG.getConstant(0, DL, MVT::i32), // vindex
10535 VOffset, // voffset
10536 SOffset, // soffset
10537 Offset, // offset
10538 Op.getOperand(7), // cachepolicy
10539 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10540 };
10541 EVT VT = Op.getValueType();
10542 auto *M = cast<MemSDNode>(Op);
10543
10544 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10545 Op->getVTList(), Ops, VT,
10546 M->getMemOperand());
10547 }
10548 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10549 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10550 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
10551 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
10552 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
10553 SDValue Ops[] = {
10554 Op.getOperand(0), // Chain
10555 Op.getOperand(2), // src
10556 Op.getOperand(3), // cmp
10557 Rsrc, // rsrc
10558 Op.getOperand(5), // vindex
10559 VOffset, // voffset
10560 SOffset, // soffset
10561 Offset, // offset
10562 Op.getOperand(8), // cachepolicy
10563 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10564 };
10565 EVT VT = Op.getValueType();
10566 auto *M = cast<MemSDNode>(Op);
10567
10568 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10569 Op->getVTList(), Ops, VT,
10570 M->getMemOperand());
10571 }
10572 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10573 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10574 MemSDNode *M = cast<MemSDNode>(Op);
10575 SDValue NodePtr = M->getOperand(2);
10576 SDValue RayExtent = M->getOperand(3);
10577 SDValue InstanceMask = M->getOperand(4);
10578 SDValue RayOrigin = M->getOperand(5);
10579 SDValue RayDir = M->getOperand(6);
10580 SDValue Offsets = M->getOperand(7);
10581 SDValue TDescr = M->getOperand(8);
10582
10583 assert(NodePtr.getValueType() == MVT::i64);
10584 assert(RayDir.getValueType() == MVT::v3f32);
10585
10586 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10587 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10588 return SDValue();
10589 }
10590
10591 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10592 const unsigned NumVDataDwords = 10;
10593 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10594 int Opcode = AMDGPU::getMIMGOpcode(
10595 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10596 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10597 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10598 assert(Opcode != -1);
10599
10601 Ops.push_back(NodePtr);
10602 Ops.push_back(DAG.getBuildVector(
10603 MVT::v2i32, DL,
10604 {DAG.getBitcast(MVT::i32, RayExtent),
10605 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10606 Ops.push_back(RayOrigin);
10607 Ops.push_back(RayDir);
10608 Ops.push_back(Offsets);
10609 Ops.push_back(TDescr);
10610 Ops.push_back(M->getChain());
10611
10612 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10613 MachineMemOperand *MemRef = M->getMemOperand();
10614 DAG.setNodeMemRefs(NewNode, {MemRef});
10615 return SDValue(NewNode, 0);
10616 }
10617 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10618 MemSDNode *M = cast<MemSDNode>(Op);
10619 SDValue NodePtr = M->getOperand(2);
10620 SDValue RayExtent = M->getOperand(3);
10621 SDValue RayOrigin = M->getOperand(4);
10622 SDValue RayDir = M->getOperand(5);
10623 SDValue RayInvDir = M->getOperand(6);
10624 SDValue TDescr = M->getOperand(7);
10625
10626 assert(NodePtr.getValueType() == MVT::i32 ||
10627 NodePtr.getValueType() == MVT::i64);
10628 assert(RayDir.getValueType() == MVT::v3f16 ||
10629 RayDir.getValueType() == MVT::v3f32);
10630
10631 if (!Subtarget->hasGFX10_AEncoding()) {
10632 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10633 return SDValue();
10634 }
10635
10636 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
10637 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
10638 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10639 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
10640 const bool Is64 = NodePtr.getValueType() == MVT::i64;
10641 const unsigned NumVDataDwords = 4;
10642 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10643 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10644 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10645 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
10646 IsGFX12Plus;
10647 const unsigned BaseOpcodes[2][2] = {
10648 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10649 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10650 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10651 int Opcode;
10652 if (UseNSA) {
10653 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10654 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10655 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10656 : AMDGPU::MIMGEncGfx10NSA,
10657 NumVDataDwords, NumVAddrDwords);
10658 } else {
10659 assert(!IsGFX12Plus);
10660 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10661 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10662 : AMDGPU::MIMGEncGfx10Default,
10663 NumVDataDwords, NumVAddrDwords);
10664 }
10665 assert(Opcode != -1);
10666
10668
10669 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
10671 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
10672 if (Lanes[0].getValueSizeInBits() == 32) {
10673 for (unsigned I = 0; I < 3; ++I)
10674 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
10675 } else {
10676 if (IsAligned) {
10677 Ops.push_back(DAG.getBitcast(
10678 MVT::i32,
10679 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
10680 Ops.push_back(Lanes[2]);
10681 } else {
10682 SDValue Elt0 = Ops.pop_back_val();
10683 Ops.push_back(DAG.getBitcast(
10684 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
10685 Ops.push_back(DAG.getBitcast(
10686 MVT::i32,
10687 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
10688 }
10689 }
10690 };
10691
10692 if (UseNSA && IsGFX11Plus) {
10693 Ops.push_back(NodePtr);
10694 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10695 Ops.push_back(RayOrigin);
10696 if (IsA16) {
10697 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
10698 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
10699 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
10700 for (unsigned I = 0; I < 3; ++I) {
10701 MergedLanes.push_back(DAG.getBitcast(
10702 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
10703 {DirLanes[I], InvDirLanes[I]})));
10704 }
10705 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
10706 } else {
10707 Ops.push_back(RayDir);
10708 Ops.push_back(RayInvDir);
10709 }
10710 } else {
10711 if (Is64)
10712 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
10713 2);
10714 else
10715 Ops.push_back(NodePtr);
10716
10717 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10718 packLanes(RayOrigin, true);
10719 packLanes(RayDir, true);
10720 packLanes(RayInvDir, false);
10721 }
10722
10723 if (!UseNSA) {
10724 // Build a single vector containing all the operands so far prepared.
10725 if (NumVAddrDwords > 12) {
10726 SDValue Undef = DAG.getPOISON(MVT::i32);
10727 Ops.append(16 - Ops.size(), Undef);
10728 }
10729 assert(Ops.size() >= 8 && Ops.size() <= 12);
10730 SDValue MergedOps =
10731 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
10732 Ops.clear();
10733 Ops.push_back(MergedOps);
10734 }
10735
10736 Ops.push_back(TDescr);
10737 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
10738 Ops.push_back(M->getChain());
10739
10740 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10741 MachineMemOperand *MemRef = M->getMemOperand();
10742 DAG.setNodeMemRefs(NewNode, {MemRef});
10743 return SDValue(NewNode, 0);
10744 }
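// Note (illustrative): with an NSA encoding each address component above
// stays a separate VGPR operand, while the non-NSA fallback gathers all
// prepared dwords into one contiguous vector (padded with poison if needed)
// before building the image opcode; packLanes additionally folds f16 ray
// components into packed v2f16 dwords.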
10745 case Intrinsic::amdgcn_global_atomic_fmin_num:
10746 case Intrinsic::amdgcn_global_atomic_fmax_num:
10747 case Intrinsic::amdgcn_flat_atomic_fmin_num:
10748 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10749 MemSDNode *M = cast<MemSDNode>(Op);
10750 SDValue Ops[] = {
10751 M->getOperand(0), // Chain
10752 M->getOperand(2), // Ptr
10753 M->getOperand(3) // Value
10754 };
10755 unsigned Opcode = 0;
10756 switch (IntrID) {
10757 case Intrinsic::amdgcn_global_atomic_fmin_num:
10758 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10759 Opcode = ISD::ATOMIC_LOAD_FMIN;
10760 break;
10761 }
10762 case Intrinsic::amdgcn_global_atomic_fmax_num:
10763 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10764 Opcode = ISD::ATOMIC_LOAD_FMAX;
10765 break;
10766 }
10767 default:
10768 llvm_unreachable("unhandled atomic opcode");
10769 }
10770 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
10771 Ops, M->getMemOperand());
10772 }
10773 case Intrinsic::amdgcn_s_get_barrier_state:
10774 case Intrinsic::amdgcn_s_get_named_barrier_state: {
10775 SDValue Chain = Op->getOperand(0);
10777 unsigned Opc;
10778
10779 if (isa<ConstantSDNode>(Op->getOperand(2))) {
10780 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
10781 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10782 BarID = (BarID >> 4) & 0x3F;
10783 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10784 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10785 Ops.push_back(K);
10786 Ops.push_back(Chain);
10787 } else {
10788 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10789 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
10790 SDValue M0Val;
10791 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
10792 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10793 M0Val = SDValue(
10794 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10795 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10796 0);
10797 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10798 } else
10799 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
10800 }
10801
10802 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10803 return SDValue(NewMI, 0);
10804 }
10805 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
10806 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
10807 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
10808 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
10809 SDValue Chain = Op->getOperand(0);
10810 SDValue Ptr = Op->getOperand(2);
10811 EVT VT = Op->getValueType(0);
10812 return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
10813 Chain, Ptr, MII->getMemOperand());
10814 }
10815 default:
10816
10817 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10818 AMDGPU::getImageDimIntrinsicInfo(IntrID))
10819 return lowerImage(Op, ImageDimIntr, DAG, true);
10820
10821 return SDValue();
10822 }
10823}
10824
10825 // Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type
10826 // to dwordx4 on subtargets without dwordx3 load/stores, and handle TFE loads.
10827SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
10828 SDVTList VTList,
10829 ArrayRef<SDValue> Ops, EVT MemVT,
10830 MachineMemOperand *MMO,
10831 SelectionDAG &DAG) const {
10832 LLVMContext &C = *DAG.getContext();
10833 MachineFunction &MF = DAG.getMachineFunction();
10834 EVT VT = VTList.VTs[0];
10835
10836 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
10837 bool IsTFE = VTList.NumVTs == 3;
10838 if (IsTFE) {
10839 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
10840 unsigned NumOpDWords = NumValueDWords + 1;
10841 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
10842 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
10843 MachineMemOperand *OpDWordsMMO =
10844 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
10845 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
10846 OpDWordsVT, OpDWordsMMO, DAG);
10847 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10848 DAG.getVectorIdxConstant(NumValueDWords, DL));
10849 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
10850 SDValue ValueDWords =
10851 NumValueDWords == 1
10852 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
10853 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
10854 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
10855 ZeroIdx);
10856 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
10857 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10858 }
10859
10860 if (!Subtarget->hasDwordx3LoadStores() &&
10861 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10862 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
10863 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
10864 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
10865 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
10866 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
10867 WidenedMemVT, WidenedMMO);
10868 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
10869 DAG.getVectorIdxConstant(0, DL));
10870 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
10871 }
10872
10873 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
10874}
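// Example (illustrative): a TFE load returning v2f32 needs two value dwords
// plus a status dword, so the node is re-issued as a v3i32 load; dwords 0-1
// are bitcast back to v2f32, dword 2 becomes the i32 status result, and the
// pieces are recombined with getMergeValues together with the chain.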
10875
10876SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
10877 bool ImageStore) const {
10878 EVT StoreVT = VData.getValueType();
10879
10880 // No change for f16 and legal vector D16 types.
10881 if (!StoreVT.isVector())
10882 return VData;
10883
10884 SDLoc DL(VData);
10885 unsigned NumElements = StoreVT.getVectorNumElements();
10886
10887 if (Subtarget->hasUnpackedD16VMem()) {
10888 // We need to unpack the packed data to store.
10889 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10890 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10891
10892 EVT EquivStoreVT =
10893 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
10894 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
10895 return DAG.UnrollVectorOp(ZExt.getNode());
10896 }
10897
10898 // The sq block of gfx8.1 does not estimate register use correctly for d16
10899 // image store instructions. The data operand is computed as if it were not a
10900 // d16 image instruction.
10901 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
10902 // Bitcast to i16
10903 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10904 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10905
10906 // Decompose into scalars
10908 DAG.ExtractVectorElements(IntVData, Elts);
10909
10910 // Group pairs of i16 into v2i16 and bitcast to i32
10911 SmallVector<SDValue, 4> PackedElts;
10912 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
10913 SDValue Pair =
10914 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
10915 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10916 PackedElts.push_back(IntPair);
10917 }
10918 if ((NumElements % 2) == 1) {
10919 // Handle v3i16
10920 unsigned I = Elts.size() / 2;
10921 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
10922 {Elts[I * 2], DAG.getPOISON(MVT::i16)});
10923 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10924 PackedElts.push_back(IntPair);
10925 }
10926
10927 // Pad with poison values
10928 PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));
10929
10930 // Build final vector
10931 EVT VecVT =
10932 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
10933 return DAG.getBuildVector(VecVT, DL, PackedElts);
10934 }
10935
10936 if (NumElements == 3) {
10937 EVT IntStoreVT =
10938 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
10939 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10940
10941 EVT WidenedStoreVT = EVT::getVectorVT(
10942 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
10943 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
10944 WidenedStoreVT.getStoreSizeInBits());
10945 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
10946 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
10947 }
10948
10949 assert(isTypeLegal(StoreVT));
10950 return VData;
10951}
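// Example (illustrative): with unpacked d16 VMEM, a v4f16 store value is
// zero-extended element-wise to v4i32 (one dword per half); on the gfx8.1
// image-store workaround path the halves are instead repacked into v2i16
// pairs, bitcast to i32, and padded with poison dwords so the register
// count matches what the hardware expects.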
10952
10953SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
10954 SelectionDAG &DAG) const {
10955 SDLoc DL(Op);
10956 SDValue Chain = Op.getOperand(0);
10957 unsigned IntrinsicID = Op.getConstantOperandVal(1);
10958 MachineFunction &MF = DAG.getMachineFunction();
10959
10960 switch (IntrinsicID) {
10961 case Intrinsic::amdgcn_exp_compr: {
10962 if (!Subtarget->hasCompressedExport()) {
10963 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10965 "intrinsic not supported on subtarget", DL.getDebugLoc()));
10966 }
10967 SDValue Src0 = Op.getOperand(4);
10968 SDValue Src1 = Op.getOperand(5);
10969 // Hack around illegal type on SI by directly selecting it.
10970 if (isTypeLegal(Src0.getValueType()))
10971 return SDValue();
10972
10973 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
10974 SDValue Undef = DAG.getPOISON(MVT::f32);
10975 const SDValue Ops[] = {
10976 Op.getOperand(2), // tgt
10977 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
10978 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
10979 Undef, // src2
10980 Undef, // src3
10981 Op.getOperand(7), // vm
10982 DAG.getTargetConstant(1, DL, MVT::i1), // compr
10983 Op.getOperand(3), // en
10984 Op.getOperand(0) // Chain
10985 };
10986
10987 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
10988 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
10989 }
10990
10991 case Intrinsic::amdgcn_struct_tbuffer_store:
10992 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
10993 SDValue VData = Op.getOperand(2);
10994 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
10995 if (IsD16)
10996 VData = handleD16VData(VData, DAG);
10997 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10998 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10999 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11000 SDValue Ops[] = {
11001 Chain,
11002 VData, // vdata
11003 Rsrc, // rsrc
11004 Op.getOperand(4), // vindex
11005 VOffset, // voffset
11006 SOffset, // soffset
11007 Offset, // offset
11008 Op.getOperand(7), // format
11009 Op.getOperand(8), // cachepolicy, swizzled buffer
11010 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11011 };
11012 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11013 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11014 MemSDNode *M = cast<MemSDNode>(Op);
11015 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11016 M->getMemoryVT(), M->getMemOperand());
11017 }
11018
11019 case Intrinsic::amdgcn_raw_tbuffer_store:
11020 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11021 SDValue VData = Op.getOperand(2);
11022 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11023 if (IsD16)
11024 VData = handleD16VData(VData, DAG);
11025 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11026 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11027 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11028 SDValue Ops[] = {
11029 Chain,
11030 VData, // vdata
11031 Rsrc, // rsrc
11032 DAG.getConstant(0, DL, MVT::i32), // vindex
11033 VOffset, // voffset
11034 SOffset, // soffset
11035 Offset, // offset
11036 Op.getOperand(6), // format
11037 Op.getOperand(7), // cachepolicy, swizzled buffer
11038 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11039 };
11040 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11041 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11042 MemSDNode *M = cast<MemSDNode>(Op);
11043 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11044 M->getMemoryVT(), M->getMemOperand());
11045 }
11046
11047 case Intrinsic::amdgcn_raw_buffer_store:
11048 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11049 case Intrinsic::amdgcn_raw_buffer_store_format:
11050 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11051 const bool IsFormat =
11052 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11053 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11054
11055 SDValue VData = Op.getOperand(2);
11056 EVT VDataVT = VData.getValueType();
11057 EVT EltType = VDataVT.getScalarType();
11058 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11059 if (IsD16) {
11060 VData = handleD16VData(VData, DAG);
11061 VDataVT = VData.getValueType();
11062 }
11063
11064 if (!isTypeLegal(VDataVT)) {
11065 VData =
11066 DAG.getNode(ISD::BITCAST, DL,
11067 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11068 }
11069
11070 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11071 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11072 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11073 SDValue Ops[] = {
11074 Chain,
11075 VData,
11076 Rsrc,
11077 DAG.getConstant(0, DL, MVT::i32), // vindex
11078 VOffset, // voffset
11079 SOffset, // soffset
11080 Offset, // offset
11081 Op.getOperand(6), // cachepolicy, swizzled buffer
11082 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11083 };
11084 unsigned Opc =
11085 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT
11086 : AMDGPUISD::BUFFER_STORE;
11087 MemSDNode *M = cast<MemSDNode>(Op);
11088
11089 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11090 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11091 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
11092
11093 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11094 M->getMemoryVT(), M->getMemOperand());
11095 }
11096
11097 case Intrinsic::amdgcn_struct_buffer_store:
11098 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11099 case Intrinsic::amdgcn_struct_buffer_store_format:
11100 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11101 const bool IsFormat =
11102 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11103 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11104
11105 SDValue VData = Op.getOperand(2);
11106 EVT VDataVT = VData.getValueType();
11107 EVT EltType = VDataVT.getScalarType();
11108 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11109
11110 if (IsD16) {
11111 VData = handleD16VData(VData, DAG);
11112 VDataVT = VData.getValueType();
11113 }
11114
11115 if (!isTypeLegal(VDataVT)) {
11116 VData =
11117 DAG.getNode(ISD::BITCAST, DL,
11118 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11119 }
11120
11121 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11122 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11123 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11124 SDValue Ops[] = {
11125 Chain,
11126 VData,
11127 Rsrc,
11128 Op.getOperand(4), // vindex
11129 VOffset, // voffset
11130 SOffset, // soffset
11131 Offset, // offset
11132 Op.getOperand(7), // cachepolicy, swizzled buffer
11133 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11134 };
11135 unsigned Opc =
11136 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT
11137 : AMDGPUISD::BUFFER_STORE;
11138 MemSDNode *M = cast<MemSDNode>(Op);
11139
11140 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11141 EVT VDataType = VData.getValueType().getScalarType();
11142 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11143 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
11144
11145 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11146 M->getMemoryVT(), M->getMemOperand());
11147 }
11148 case Intrinsic::amdgcn_raw_buffer_load_lds:
11149 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11150 case Intrinsic::amdgcn_struct_buffer_load_lds:
11151 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
11152 if (!Subtarget->hasVMemToLDSLoad())
11153 return SDValue();
11154 unsigned Opc;
11155 bool HasVIndex =
11156 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11157 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
11158 unsigned OpOffset = HasVIndex ? 1 : 0;
11159 SDValue VOffset = Op.getOperand(5 + OpOffset);
11160 bool HasVOffset = !isNullConstant(VOffset);
11161 unsigned Size = Op->getConstantOperandVal(4);
11162
11163 switch (Size) {
11164 default:
11165 return SDValue();
11166 case 1:
11167 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11168 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11169 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11170 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11171 break;
11172 case 2:
11173 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11174 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11175 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11176 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11177 break;
11178 case 4:
11179 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11180 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11181 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11182 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11183 break;
11184 case 12:
11185 if (!Subtarget->hasLDSLoadB96_B128())
11186 return SDValue();
11187 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11188 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11189 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11190 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11191 break;
11192 case 16:
11193 if (!Subtarget->hasLDSLoadB96_B128())
11194 return SDValue();
11195 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11196 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11197 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11198 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
11199 break;
11200 }
11201
11202 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11203
11205
11206 if (HasVIndex && HasVOffset)
11207 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
11208 {Op.getOperand(5), // VIndex
11209 VOffset}));
11210 else if (HasVIndex)
11211 Ops.push_back(Op.getOperand(5));
11212 else if (HasVOffset)
11213 Ops.push_back(VOffset);
11214
11215 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11216 Ops.push_back(Rsrc);
11217 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
11218 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
11219 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
11220 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
11221 Ops.push_back(DAG.getTargetConstant(
11222 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
11223 DL, MVT::i8)); // cpol
11224 Ops.push_back(DAG.getTargetConstant(
11225 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
11226 ? 1
11227 : 0,
11228 DL, MVT::i8)); // swz
11229 Ops.push_back(M0Val.getValue(0)); // Chain
11230 Ops.push_back(M0Val.getValue(1)); // Glue
11231
11232 auto *M = cast<MemSDNode>(Op);
11233 MachineMemOperand *LoadMMO = M->getMemOperand();
11234 // Don't set the offset value here because the pointer points to the base of
11235 // the buffer.
11236 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11237
11238 MachinePointerInfo StorePtrI = LoadPtrI;
11239 LoadPtrI.V = PoisonValue::get(
11243
11244 auto F = LoadMMO->getFlags() &
11246 LoadMMO =
11248 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11249
11250 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11251 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
11252 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11253
11254 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
11255 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11256
11257 return SDValue(Load, 0);
11258 }
11259 // Buffers are handled by LowerBufferFatPointers, and we're going to go
11260 // for "trust me" that the remaining cases are global pointers until
11261 // such time as we can put two mem operands on an intrinsic.
11262 case Intrinsic::amdgcn_load_to_lds:
11263 case Intrinsic::amdgcn_global_load_lds: {
11264 if (!Subtarget->hasVMemToLDSLoad())
11265 return SDValue();
11266
11267 unsigned Opc;
11268 unsigned Size = Op->getConstantOperandVal(4);
11269 switch (Size) {
11270 default:
11271 return SDValue();
11272 case 1:
11273 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11274 break;
11275 case 2:
11276 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11277 break;
11278 case 4:
11279 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11280 break;
11281 case 12:
11282 if (!Subtarget->hasLDSLoadB96_B128())
11283 return SDValue();
11284 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11285 break;
11286 case 16:
11287 if (!Subtarget->hasLDSLoadB96_B128())
11288 return SDValue();
11289 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11290 break;
11291 }
11292
11293 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11294
11296
11297 SDValue Addr = Op.getOperand(2); // Global ptr
11298 SDValue VOffset;
11299 // Try to split SAddr and VOffset. Global and LDS pointers share the same
11300 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
11301 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
11302 SDValue LHS = Addr.getOperand(0);
11303 SDValue RHS = Addr.getOperand(1);
11304
11305 if (LHS->isDivergent())
11306 std::swap(LHS, RHS);
11307
11308 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
11309 RHS.getOperand(0).getValueType() == MVT::i32) {
11310 // add (i64 sgpr), (zero_extend (i32 vgpr))
11311 Addr = LHS;
11312 VOffset = RHS.getOperand(0);
11313 }
11314 }
11315
11316 Ops.push_back(Addr);
11317 if (!Addr->isDivergent()) {
11319 if (!VOffset)
11320 VOffset =
11321 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
11322 DAG.getTargetConstant(0, DL, MVT::i32)),
11323 0);
11324 Ops.push_back(VOffset);
11325 }
11326
11327 Ops.push_back(Op.getOperand(5)); // Offset
11328 Ops.push_back(Op.getOperand(6)); // CPol
11329 Ops.push_back(M0Val.getValue(0)); // Chain
11330 Ops.push_back(M0Val.getValue(1)); // Glue
11331
11332 auto *M = cast<MemSDNode>(Op);
11333 MachineMemOperand *LoadMMO = M->getMemOperand();
11334 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11335 LoadPtrI.Offset = Op->getConstantOperandVal(5);
11336 MachinePointerInfo StorePtrI = LoadPtrI;
11337 LoadPtrI.V = PoisonValue::get(
11341 auto F = LoadMMO->getFlags() &
11343 LoadMMO =
11345 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11346 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11347 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
11348 LoadMMO->getAAInfo());
11349
11350 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11351 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11352
11353 return SDValue(Load, 0);
11354 }
11355 case Intrinsic::amdgcn_end_cf:
11356 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
11357 Op->getOperand(2), Chain),
11358 0);
11359 case Intrinsic::amdgcn_s_barrier_init:
11360 case Intrinsic::amdgcn_s_barrier_signal_var: {
11361    // These two intrinsics take two operands: the barrier pointer and the member count.
11362 SDValue Chain = Op->getOperand(0);
11364 SDValue BarOp = Op->getOperand(2);
11365 SDValue CntOp = Op->getOperand(3);
11366 SDValue M0Val;
11367 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11368 ? AMDGPU::S_BARRIER_INIT_M0
11369 : AMDGPU::S_BARRIER_SIGNAL_M0;
11370 // extract the BarrierID from bits 4-9 of BarOp
11371 SDValue BarID;
11372 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11373 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11374 BarID =
11375 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
11376 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11377 0);
11378    // The member count goes into M0[ShAmt + 5 : ShAmt] (6 bits, with ShAmt == 16);
11379    // the barrier ID goes into M0[5:0].
11380 M0Val =
11381 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
11382 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11383 0);
11384 constexpr unsigned ShAmt = 16;
11385 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
11386 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
11387
11388 M0Val = SDValue(
11389 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
11390
11391 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11392
11393 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11394 return SDValue(NewMI, 0);
11395 }
11396 case Intrinsic::amdgcn_s_barrier_join: {
11397    // This intrinsic takes a single operand: the barrier pointer.
11398 SDValue Chain = Op->getOperand(0);
11400 SDValue BarOp = Op->getOperand(2);
11401 unsigned Opc;
11402
11403 if (isa<ConstantSDNode>(BarOp)) {
11404 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
11405 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11406
11407 // extract the BarrierID from bits 4-9 of the immediate
11408 unsigned BarID = (BarVal >> 4) & 0x3F;
11409 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
11410 Ops.push_back(K);
11411 Ops.push_back(Chain);
11412 } else {
11413 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11414
11415 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
11416 SDValue M0Val;
11417 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11418 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11419 M0Val =
11420 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
11421 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11422 0);
11423 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11424 }
11425
11426 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11427 return SDValue(NewMI, 0);
11428 }
11429 case Intrinsic::amdgcn_s_prefetch_data: {
11430    // For a non-global address space, preserve the chain and remove the call.
11432 return Op.getOperand(0);
11433 return Op;
11434 }
11435 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11436 SDValue Ops[] = {
11437 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
11438 Op.getOperand(3), // offset
11439 Op.getOperand(4), // length
11440 };
11441
11442 MemSDNode *M = cast<MemSDNode>(Op);
11444 Op->getVTList(), Ops, M->getMemoryVT(),
11445 M->getMemOperand());
11446 }
11447 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11448 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11449 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11450 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11451 SDValue Chain = Op->getOperand(0);
11452 SDValue Ptr = Op->getOperand(2);
11453 SDValue Val = Op->getOperand(3);
11454 return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
11455 Ptr, MII->getMemOperand());
11456 }
11457 default: {
11458 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11460 return lowerImage(Op, ImageDimIntr, DAG, true);
11461
11462 return Op;
11463 }
11464 }
11465}
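// Illustrative sketch only, not part of the lowering above: the M0 layout
// built for the s_barrier_init / s_barrier_signal_var case. The barrier ID is
// taken from bits [9:4] of the barrier operand and lands in M0[5:0]; the
// member count is shifted into M0[21:16]. The helper name is hypothetical.
static inline unsigned packBarrierM0Sketch(unsigned BarOp, unsigned MemberCount) {
  unsigned BarID = (BarOp >> 4) & 0x3F; // barrier ID -> M0[5:0]
  return (MemberCount << 16) | BarID;   // member count -> M0[21:16]
}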
11466
11467 // Return whether the operation has the NoUnsignedWrap property.
11468static bool isNoUnsignedWrap(SDValue Addr) {
11469 return (Addr.getOpcode() == ISD::ADD &&
11470 Addr->getFlags().hasNoUnsignedWrap()) ||
11471 Addr->getOpcode() == ISD::OR;
11472}
11473
11475 EVT PtrVT) const {
11476 return UseSelectionDAGPTRADD && PtrVT == MVT::i64;
11477}
11478
11479// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
11480// offset (the offset that is included in bounds checking and swizzling, to be
11481// split between the instruction's voffset and immoffset fields) and soffset
11482// (the offset that is excluded from bounds checking and swizzling, to go in
11483// the instruction's soffset field). This function takes the first kind of
11484// offset and figures out how to split it between voffset and immoffset.
11485std::pair<SDValue, SDValue>
11486SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
11487 SDLoc DL(Offset);
11488 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
11489 SDValue N0 = Offset;
11490 ConstantSDNode *C1 = nullptr;
11491
11492 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
11493 N0 = SDValue();
11494 else if (DAG.isBaseWithConstantOffset(N0)) {
11495 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
11496 // being added, so we can only safely match a 32-bit addition with no
11497 // unsigned overflow.
11498 bool CheckNUW = AMDGPU::isGFX1250(*Subtarget);
11499 if (!CheckNUW || isNoUnsignedWrap(N0)) {
11500 C1 = cast<ConstantSDNode>(N0.getOperand(1));
11501 N0 = N0.getOperand(0);
11502 }
11503 }
11504
11505 if (C1) {
11506 unsigned ImmOffset = C1->getZExtValue();
11507     // If the immediate value is too big for the immoffset field, keep only the
11508     // bits that fit in the immoffset field. The remaining value, which is
11509     // copied/added into the voffset field, is then a multiple of a large power
11510     // of 2 and stands more chance of being CSEd with the copy/add for another
11511     // similar load/store.
11512     // However, do not do that split if the remaining (voffset) part would be
11513     // negative, as it appears to be illegal to have a negative offset in the
11514     // VGPR, even if adding the immediate offset would make it positive.
11515 unsigned Overflow = ImmOffset & ~MaxImm;
11516 ImmOffset -= Overflow;
11517 if ((int32_t)Overflow < 0) {
11518 Overflow += ImmOffset;
11519 ImmOffset = 0;
11520 }
11521 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
11522 if (Overflow) {
11523 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
11524 if (!N0)
11525 N0 = OverflowVal;
11526 else {
11527 SDValue Ops[] = {N0, OverflowVal};
11528 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
11529 }
11530 }
11531 }
11532 if (!N0)
11533 N0 = DAG.getConstant(0, DL, MVT::i32);
11534 if (!C1)
11535 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
11536 return {N0, SDValue(C1, 0)};
11537}
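// Illustrative sketch only: the constant-offset arithmetic splitBufferOffsets
// performs, with MaxImm standing in for SIInstrInfo::getMaxMUBUFImmOffset().
// The helper name and the example values below are made up for illustration.
#include <utility>
static inline std::pair<unsigned, unsigned>
splitConstantBufferOffsetSketch(unsigned Offset, unsigned MaxImm) {
  unsigned ImmOffset = Offset & MaxImm; // bits that fit the immoffset field
  unsigned Overflow = Offset & ~MaxImm; // remainder to be added to voffset
  if ((int)Overflow < 0) {              // never leave a negative voffset value
    Overflow += ImmOffset;
    ImmOffset = 0;
  }
  return {Overflow, ImmOffset};
}
// E.g. with MaxImm == 0xFFF, an offset of 0x1234 splits into voffset 0x1000 and
// immoffset 0x234; the round 0x1000 add is more likely to be CSEd across loads.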
11538
11539// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
11540// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
11541// pointed to by Offsets.
11542void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
11543 SelectionDAG &DAG, SDValue *Offsets,
11544 Align Alignment) const {
11545 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11546 SDLoc DL(CombinedOffset);
11547 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
11548 uint32_t Imm = C->getZExtValue();
11549 uint32_t SOffset, ImmOffset;
11550 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11551 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
11552 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11553 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11554 return;
11555 }
11556 }
11557 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
11558 SDValue N0 = CombinedOffset.getOperand(0);
11559 SDValue N1 = CombinedOffset.getOperand(1);
11560 uint32_t SOffset, ImmOffset;
11561 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
11562 if (Offset >= 0 &&
11563 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
11564 Offsets[0] = N0;
11565 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11566 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11567 return;
11568 }
11569 }
11570
11571 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11572 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
11573 : DAG.getConstant(0, DL, MVT::i32);
11574
11575 Offsets[0] = CombinedOffset;
11576 Offsets[1] = SOffsetZero;
11577 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
11578}
11579
11580SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11581 SelectionDAG &DAG) const {
11582 if (!MaybePointer.getValueType().isScalarInteger())
11583 return MaybePointer;
11584
11585 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
11586 return Rsrc;
11587}
11588
11589// Wrap a global or flat pointer into a buffer intrinsic using the flags
11590// specified in the intrinsic.
11591SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
11592 SelectionDAG &DAG) const {
11593 SDLoc Loc(Op);
11594
11595 SDValue Pointer = Op->getOperand(1);
11596 SDValue Stride = Op->getOperand(2);
11597 SDValue NumRecords = Op->getOperand(3);
11598 SDValue Flags = Op->getOperand(4);
11599
11600 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11601 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
11602 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
11603 std::optional<uint32_t> ConstStride = std::nullopt;
11604 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
11605 ConstStride = ConstNode->getZExtValue();
11606
11607 SDValue NewHighHalf = Masked;
11608 if (!ConstStride || *ConstStride != 0) {
11609 SDValue ShiftedStride;
11610 if (ConstStride) {
11611 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
11612 } else {
11613 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
11614 ShiftedStride =
11615 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11616 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
11617 }
11618 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
11619 }
11620
11621 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
11622 NewHighHalf, NumRecords, Flags);
11623 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
11624 return RsrcPtr;
11625}
11626
11627 // Handle 8-bit and 16-bit buffer loads.
11628SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
11629 EVT LoadVT, SDLoc DL,
11631 MachineMemOperand *MMO,
11632 bool IsTFE) const {
11633 EVT IntVT = LoadVT.changeTypeToInteger();
11634
11635 if (IsTFE) {
11636 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
11639 MachineFunction &MF = DAG.getMachineFunction();
11640 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
11641 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
11642 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
11643 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11644 DAG.getConstant(1, DL, MVT::i32));
11645 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11646 DAG.getConstant(0, DL, MVT::i32));
11647 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
11648 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
11649 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
11650 }
11651
11652 unsigned Opc = LoadVT.getScalarType() == MVT::i8
11655
11656 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
11657 SDValue BufferLoad =
11658 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
11659 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
11660 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
11661
11662 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
11663}
11664
11665 // Handle 8-bit and 16-bit buffer stores.
11666SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
11667 EVT VDataType, SDLoc DL,
11668 SDValue Ops[],
11669 MemSDNode *M) const {
11670 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11671 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
11672
11673 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
11674 Ops[1] = BufferStoreExt;
11675 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
11676 : AMDGPUISD::BUFFER_STORE_SHORT;
11677 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
11678 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
11679 M->getMemOperand());
11680}
11681
11683 SDValue Op, const SDLoc &SL, EVT VT) {
11684 if (VT.bitsLT(Op.getValueType()))
11685 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
11686
11687 switch (ExtType) {
11688 case ISD::SEXTLOAD:
11689 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
11690 case ISD::ZEXTLOAD:
11691 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
11692 case ISD::EXTLOAD:
11693 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
11694 case ISD::NON_EXTLOAD:
11695 return Op;
11696 }
11697
11698 llvm_unreachable("invalid ext type");
11699}
11700
11701 // Try to turn 8- and 16-bit scalar loads into SMEM-eligible 32-bit loads.
11702 // TODO: Skip this on GFX12, which does have scalar sub-dword loads.
11703SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
11704 DAGCombinerInfo &DCI) const {
11705 SelectionDAG &DAG = DCI.DAG;
11706 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
11707 return SDValue();
11708
11709 // FIXME: Constant loads should all be marked invariant.
11710 unsigned AS = Ld->getAddressSpace();
11711 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
11713 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
11714 return SDValue();
11715
11716 // Don't do this early, since it may interfere with adjacent load merging for
11717 // illegal types. We can avoid losing alignment information for exotic types
11718 // pre-legalize.
11719 EVT MemVT = Ld->getMemoryVT();
11720 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
11721 MemVT.getSizeInBits() >= 32)
11722 return SDValue();
11723
11724 SDLoc SL(Ld);
11725
11726 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
11727 "unexpected vector extload");
11728
11729   // TODO: Drop only the high part of the range metadata.
11730 SDValue Ptr = Ld->getBasePtr();
11731 SDValue NewLoad = DAG.getLoad(
11732 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
11733 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
11734 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
11735 nullptr); // Drop ranges
11736
11737 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
11738 if (MemVT.isFloatingPoint()) {
11740 "unexpected fp extload");
11741 TruncVT = MemVT.changeTypeToInteger();
11742 }
11743
11744 SDValue Cvt = NewLoad;
11745 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
11746 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
11747 DAG.getValueType(TruncVT));
11748 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
11750 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
11751 } else {
11753 }
11754
11755 EVT VT = Ld->getValueType(0);
11756 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
11757
11758 DCI.AddToWorklist(Cvt.getNode());
11759
11760 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
11761 // the appropriate extension from the 32-bit load.
11762 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
11763 DCI.AddToWorklist(Cvt.getNode());
11764
11765 // Handle conversion back to floating point if necessary.
11766 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
11767
11768 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
11769}
11770
11772 const SIMachineFunctionInfo &Info) {
11773   // TODO: Should check whether the address definitely cannot access the stack.
11774 if (Info.isEntryFunction())
11775 return Info.getUserSGPRInfo().hasFlatScratchInit();
11776 return true;
11777}
11778
11779SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
11780 SDLoc DL(Op);
11781 LoadSDNode *Load = cast<LoadSDNode>(Op);
11782 ISD::LoadExtType ExtType = Load->getExtensionType();
11783 EVT MemVT = Load->getMemoryVT();
11784 MachineMemOperand *MMO = Load->getMemOperand();
11785
11786 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
11787 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
11788 return SDValue();
11789
11790     // FIXME: Copied from PPC
11791     // First, load into 32 bits, then truncate down to the memory type.
11792
11793 SDValue Chain = Load->getChain();
11794 SDValue BasePtr = Load->getBasePtr();
11795
11796 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
11797
11798 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
11799 RealMemVT, MMO);
11800
11801 if (!MemVT.isVector()) {
11802 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
11803 NewLD.getValue(1)};
11804
11805 return DAG.getMergeValues(Ops, DL);
11806 }
11807
11809 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
11810 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
11811 DAG.getConstant(I, DL, MVT::i32));
11812
11813 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
11814 }
11815
11816 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
11817
11818 return DAG.getMergeValues(Ops, DL);
11819 }
11820
11821 if (!MemVT.isVector())
11822 return SDValue();
11823
11824 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
11825 "Custom lowering for non-i32 vectors hasn't been implemented.");
11826
11827 Align Alignment = Load->getAlign();
11828 unsigned AS = Load->getAddressSpace();
11829 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
11830 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
11831 return SplitVectorLoad(Op, DAG);
11832 }
11833
11834 MachineFunction &MF = DAG.getMachineFunction();
11835 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11836   // If there is a possibility that a flat instruction accesses scratch memory,
11837   // then we need to use the same legalization rules we use for private.
11838 if (AS == AMDGPUAS::FLAT_ADDRESS &&
11839 !Subtarget->hasMultiDwordFlatScratchAddressing())
11840 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
11843
11844 unsigned NumElements = MemVT.getVectorNumElements();
11845
11846 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11848 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
11849 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
11851 if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
11852 Alignment >= Align(4) && NumElements < 32) {
11853 if (MemVT.isPow2VectorType() ||
11854 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
11855 return SDValue();
11856 return WidenOrSplitVectorLoad(Op, DAG);
11857 }
11858 // Non-uniform loads will be selected to MUBUF instructions, so they
11859 // have the same legalization requirements as global and private
11860 // loads.
11861 //
11862 }
11863 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11866 if (NumElements > 4)
11867 return SplitVectorLoad(Op, DAG);
11868 // v3 loads not supported on SI.
11869 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11870 return WidenOrSplitVectorLoad(Op, DAG);
11871
11872 // v3 and v4 loads are supported for private and global memory.
11873 return SDValue();
11874 }
11875 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11876 // Depending on the setting of the private_element_size field in the
11877 // resource descriptor, we can only make private accesses up to a certain
11878 // size.
11879 switch (Subtarget->getMaxPrivateElementSize()) {
11880 case 4: {
11881 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
11882 return DAG.getMergeValues({Op0, Op1}, DL);
11883 }
11884 case 8:
11885 if (NumElements > 2)
11886 return SplitVectorLoad(Op, DAG);
11887 return SDValue();
11888 case 16:
11889 // Same as global/flat
11890 if (NumElements > 4)
11891 return SplitVectorLoad(Op, DAG);
11892 // v3 loads not supported on SI.
11893 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11894 return WidenOrSplitVectorLoad(Op, DAG);
11895
11896 return SDValue();
11897 default:
11898 llvm_unreachable("unsupported private_element_size");
11899 }
11900 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11901 unsigned Fast = 0;
11902 auto Flags = Load->getMemOperand()->getFlags();
11904 Load->getAlign(), Flags, &Fast) &&
11905 Fast > 1)
11906 return SDValue();
11907
11908 if (MemVT.isVector())
11909 return SplitVectorLoad(Op, DAG);
11910 }
11911
11913 MemVT, *Load->getMemOperand())) {
11914 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
11915 return DAG.getMergeValues({Op0, Op1}, DL);
11916 }
11917
11918 return SDValue();
11919}
11920
11921SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
11922 EVT VT = Op.getValueType();
11923 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
11924 VT.getSizeInBits() == 512)
11925 return splitTernaryVectorOp(Op, DAG);
11926
11927 assert(VT.getSizeInBits() == 64);
11928
11929 SDLoc DL(Op);
11930 SDValue Cond = DAG.getFreeze(Op.getOperand(0));
11931
11932 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
11933 SDValue One = DAG.getConstant(1, DL, MVT::i32);
11934
11935 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
11936 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
11937
11938 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
11939 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
11940
11941 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
11942
11943 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
11944 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
11945
11946 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
11947
11948 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
11949 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
11950}
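// Illustrative sketch only: what LowerSELECT does for 64-bit values -- split
// into 32-bit halves, select each half (one v_cndmask_b32 per half), and
// reassemble. Plain C++ equivalent of the DAG built above; the helper name is
// hypothetical.
#include <cstdint>
static inline uint64_t select64Sketch(bool Cond, uint64_t TrueVal, uint64_t FalseVal) {
  uint32_t Lo = Cond ? uint32_t(TrueVal) : uint32_t(FalseVal);             // low half
  uint32_t Hi = Cond ? uint32_t(TrueVal >> 32) : uint32_t(FalseVal >> 32); // high half
  return (uint64_t(Hi) << 32) | Lo;
}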
11951
11952// Catch division cases where we can use shortcuts with rcp and rsq
11953// instructions.
11954SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
11955 SelectionDAG &DAG) const {
11956 SDLoc SL(Op);
11957 SDValue LHS = Op.getOperand(0);
11958 SDValue RHS = Op.getOperand(1);
11959 EVT VT = Op.getValueType();
11960 const SDNodeFlags Flags = Op->getFlags();
11961
11962 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
11963
11964 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
11965 // Without !fpmath accuracy information, we can't do more because we don't
11966 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
11967 // f16 is always accurate enough
11968 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
11969 return SDValue();
11970
11971 if (CLHS->isExactlyValue(1.0)) {
11972     // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
11973     // the CI documentation they have a worst-case error of 1 ulp.
11974     // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
11975     // use it as long as we aren't trying to use denormals.
11976     //
11977     // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
11978
11979 // 1.0 / sqrt(x) -> rsq(x)
11980
11981 // XXX - Is afn sufficient to do this for f64? The maximum ULP
11982 // error seems really high at 2^29 ULP.
11983 // 1.0 / x -> rcp(x)
11984 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
11985 }
11986
11987 // Same as for 1.0, but expand the sign out of the constant.
11988 if (CLHS->isExactlyValue(-1.0)) {
11989 // -1.0 / x -> rcp (fneg x)
11990 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
11991 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
11992 }
11993 }
11994
11995 // For f16 and bf16 require afn or arcp.
11996 // For f32 require afn.
11997 if (!AllowInaccurateRcp &&
11998 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
11999 return SDValue();
12000
12001 // Turn into multiply by the reciprocal.
12002 // x / y -> x * (1.0 / y)
12003 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12004 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
12005}
12006
12007SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
12008 SelectionDAG &DAG) const {
12009 SDLoc SL(Op);
12010 SDValue X = Op.getOperand(0);
12011 SDValue Y = Op.getOperand(1);
12012 EVT VT = Op.getValueType();
12013 const SDNodeFlags Flags = Op->getFlags();
12014
12015 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
12016 if (!AllowInaccurateDiv)
12017 return SDValue();
12018
12019 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
12020 SDValue One = DAG.getConstantFP(1.0, SL, VT);
12021
12022 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
12023 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12024
12025 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
12026 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12027 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
12028 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
12029 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
12030 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
12031}
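// Illustrative sketch only: the refinement sequence lowerFastUnsafeFDIV64
// emits, written with std::fma. The initial 1.0 / Y stands in for the hardware
// rcp estimate; each fma(-Y, R, 1.0) measures the residual error of R as an
// approximation of 1/Y, and the final pair of fmas refines the quotient itself.
// The helper name is hypothetical.
#include <cmath>
static inline double fastFDiv64Sketch(double X, double Y) {
  double R = 1.0 / Y;                        // placeholder for the rcp estimate
  R = std::fma(std::fma(-Y, R, 1.0), R, R);  // first Newton step on 1/Y
  R = std::fma(std::fma(-Y, R, 1.0), R, R);  // second Newton step on 1/Y
  double Q = X * R;                          // initial quotient
  return std::fma(std::fma(-Y, Q, X), R, Q); // one refinement of the quotient
}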
12032
12033static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12034 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
12035 SDNodeFlags Flags) {
12036 if (GlueChain->getNumValues() <= 1) {
12037 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
12038 }
12039
12040 assert(GlueChain->getNumValues() == 3);
12041
12042 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12043 switch (Opcode) {
12044 default:
12045 llvm_unreachable("no chain equivalent for opcode");
12046 case ISD::FMUL:
12047 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12048 break;
12049 }
12050
12051 return DAG.getNode(Opcode, SL, VTList,
12052 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
12053 Flags);
12054}
12055
12056static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12057 EVT VT, SDValue A, SDValue B, SDValue C,
12058 SDValue GlueChain, SDNodeFlags Flags) {
12059 if (GlueChain->getNumValues() <= 1) {
12060 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
12061 }
12062
12063 assert(GlueChain->getNumValues() == 3);
12064
12065 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12066 switch (Opcode) {
12067 default:
12068 llvm_unreachable("no chain equivalent for opcode");
12069 case ISD::FMA:
12070 Opcode = AMDGPUISD::FMA_W_CHAIN;
12071 break;
12072 }
12073
12074 return DAG.getNode(Opcode, SL, VTList,
12075 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
12076 Flags);
12077}
12078
12079SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
12080 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12081 return FastLowered;
12082
12083 SDLoc SL(Op);
12084 EVT VT = Op.getValueType();
12085 SDValue LHS = Op.getOperand(0);
12086 SDValue RHS = Op.getOperand(1);
12087
12088 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
12089 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
12090
12091 if (VT == MVT::bf16) {
12092 SDValue ExtDiv =
12093 DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
12094 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
12095 DAG.getTargetConstant(0, SL, MVT::i32));
12096 }
12097
12098 assert(VT == MVT::f16);
12099
12100 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
12101 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
12102 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
12103 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
12104 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12105 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
12106 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12107 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
12108 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
12109 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
12110 // q16.u = opx(V_CVT_F16_F32, q32.u);
12111 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
12112
12113 // We will use ISD::FMA on targets that don't support ISD::FMAD.
12114 unsigned FMADOpCode =
12116 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
12117 SDValue Rcp =
12118 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
12119 SDValue Quot =
12120 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
12121 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12122 Op->getFlags());
12123 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
12124 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12125 Op->getFlags());
12126 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
12127 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
12128 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
12129 DAG.getConstant(0xff800000, SL, MVT::i32));
12130 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
12131 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
12132 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
12133 DAG.getTargetConstant(0, SL, MVT::i32));
12134 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
12135 Op->getFlags());
12136}
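// Illustrative sketch only: the f32 refinement that LowerFDIV16 emits for f16
// division (the final V_DIV_FIXUP_F16 special-case handling and the round to
// f16 are omitted). The 0xff800000 mask keeps only the sign and exponent bits
// of the last correction term, as in the pseudo-code comment above. The helper
// name is hypothetical.
#include <cmath>
#include <cstring>
static inline float fdiv16CoreSketch(float A, float B) {
  float Rcp = 1.0f / B;              // placeholder for V_RCP_F32
  float Quot = A * Rcp;              // q = n * rcp
  float Err = std::fma(-B, Quot, A); // err = -d * q + n
  Quot = std::fma(Err, Rcp, Quot);   // refine q
  Err = std::fma(-B, Quot, A);       // recompute the error
  float Tmp = Err * Rcp;
  unsigned Bits;
  std::memcpy(&Bits, &Tmp, sizeof(Bits));
  Bits &= 0xff800000u;               // keep sign + exponent only
  std::memcpy(&Tmp, &Bits, sizeof(Bits));
  return Tmp + Quot;                 // rounded to f16 by the real lowering
}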
12137
12138// Faster 2.5 ULP division that does not support denormals.
12139SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
12140 SDNodeFlags Flags = Op->getFlags();
12141 SDLoc SL(Op);
12142 SDValue LHS = Op.getOperand(1);
12143 SDValue RHS = Op.getOperand(2);
12144
12145 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
12146
12147 const APFloat K0Val(0x1p+96f);
12148 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
12149
12150 const APFloat K1Val(0x1p-32f);
12151 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
12152
12153 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12154
12155 EVT SetCCVT =
12156 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
12157
12158 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
12159
12160 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
12161
12162 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
12163
12164 // rcp does not support denormals.
12165 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
12166
12167 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
12168
12169 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
12170}
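// Illustrative sketch only: the ~2.5 ulp fast-division sequence above in plain
// C++. Denominators with |rhs| > 0x1p+96 are pre-scaled by 0x1p-32 so the
// reciprocal of a huge denominator is not flushed to zero (v_rcp_f32 flushes
// denormal results), and the same factor multiplies the result to cancel the
// scaling. 1.0f / x stands in for V_RCP_F32; the helper name is hypothetical.
#include <cmath>
static inline float fdivFastSketch(float LHS, float RHS) {
  float Scale = (std::fabs(RHS) > 0x1p+96f) ? 0x1p-32f : 1.0f;
  float Rcp = 1.0f / (RHS * Scale); // placeholder for the hardware reciprocal
  return Scale * (LHS * Rcp);
}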
12171
12172 // Returns the immediate value for setting the F32 denorm mode when using the
12173 // S_DENORM_MODE instruction.
12176 const GCNSubtarget *ST) {
12177 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
12178 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
12179 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
12180 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
12181}
12182
12183SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
12184 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12185 return FastLowered;
12186
12188   // The selection matcher assumes anything with a chain selects to a
12189   // mayRaiseFPException machine instruction. Since we're introducing a chain
12190   // here, we need to explicitly report nofpexcept for the regular fdiv
12191   // lowering.
12191 SDNodeFlags Flags = Op->getFlags();
12192 Flags.setNoFPExcept(true);
12193
12194 SDLoc SL(Op);
12195 SDValue LHS = Op.getOperand(0);
12196 SDValue RHS = Op.getOperand(1);
12197
12198 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12199
12200 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
12201
12202 SDValue DenominatorScaled =
12203 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
12204 SDValue NumeratorScaled =
12205 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
12206
12207 // Denominator is scaled to not be denormal, so using rcp is ok.
12208 SDValue ApproxRcp =
12209 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
12210 SDValue NegDivScale0 =
12211 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
12212
12213 using namespace AMDGPU::Hwreg;
12214 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
12215 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
12216
12217 const MachineFunction &MF = DAG.getMachineFunction();
12218 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12219 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
12220
12221 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
12222 const bool HasDynamicDenormals =
12223 (DenormMode.Input == DenormalMode::Dynamic) ||
12224 (DenormMode.Output == DenormalMode::Dynamic);
12225
12226 SDValue SavedDenormMode;
12227
12228 if (!PreservesDenormals) {
12229 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
12230 // lowering. The chain dependence is insufficient, and we need glue. We do
12231 // not need the glue variants in a strictfp function.
12232
12233 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12234
12235 SDValue Glue = DAG.getEntryNode();
12236 if (HasDynamicDenormals) {
12237 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
12238 DAG.getVTList(MVT::i32, MVT::Glue),
12239 {BitField, Glue});
12240 SavedDenormMode = SDValue(GetReg, 0);
12241
12242 Glue = DAG.getMergeValues(
12243 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
12244 }
12245
12246 SDNode *EnableDenorm;
12247 if (Subtarget->hasDenormModeInst()) {
12248 const SDValue EnableDenormValue =
12250
12251 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
12252 EnableDenormValue)
12253 .getNode();
12254 } else {
12255 const SDValue EnableDenormValue =
12256 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
12257 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12258 {EnableDenormValue, BitField, Glue});
12259 }
12260
12261 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
12262 SDValue(EnableDenorm, 1)};
12263
12264 NegDivScale0 = DAG.getMergeValues(Ops, SL);
12265 }
12266
12267 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
12268 ApproxRcp, One, NegDivScale0, Flags);
12269
12270 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
12271 ApproxRcp, Fma0, Flags);
12272
12273 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
12274 Fma1, Flags);
12275
12276 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
12277 NumeratorScaled, Mul, Flags);
12278
12279 SDValue Fma3 =
12280 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
12281
12282 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
12283 NumeratorScaled, Fma3, Flags);
12284
12285 if (!PreservesDenormals) {
12286 SDNode *DisableDenorm;
12287 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12288 const SDValue DisableDenormValue = getSPDenormModeValue(
12289 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
12290
12291 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12292 DisableDenorm =
12293 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
12294 Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
12295 .getNode();
12296 } else {
12297 assert(HasDynamicDenormals == (bool)SavedDenormMode);
12298 const SDValue DisableDenormValue =
12299 HasDynamicDenormals
12300 ? SavedDenormMode
12301 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
12302
12303 DisableDenorm = DAG.getMachineNode(
12304 AMDGPU::S_SETREG_B32, SL, MVT::Other,
12305 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
12306 }
12307
12308 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
12309 SDValue(DisableDenorm, 0), DAG.getRoot());
12310 DAG.setRoot(OutputChain);
12311 }
12312
12313 SDValue Scale = NumeratorScaled.getValue(1);
12314 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
12315 {Fma4, Fma1, Fma3, Scale}, Flags);
12316
12317 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
12318}
12319
12320SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
12321 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
12322 return FastLowered;
12323
12324 SDLoc SL(Op);
12325 SDValue X = Op.getOperand(0);
12326 SDValue Y = Op.getOperand(1);
12327
12328 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
12329
12330 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
12331
12332 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
12333
12334 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
12335
12336 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
12337
12338 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
12339
12340 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
12341
12342 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
12343
12344 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
12345
12346 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
12347 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
12348
12349 SDValue Fma4 =
12350 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
12351
12352 SDValue Scale;
12353
12354 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12355     // Work around a hardware bug on SI where the condition output from div_scale
12356     // is not usable.
12357
12358 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
12359
12360     // Figure out which scale to use for div_fmas.
12361 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
12362 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
12363 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
12364 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
12365
12366 SDValue NumHi =
12367 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
12368 SDValue DenHi =
12369 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
12370
12371 SDValue Scale0Hi =
12372 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
12373 SDValue Scale1Hi =
12374 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
12375
12376 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
12377 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
12378 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
12379 } else {
12380 Scale = DivScale1.getValue(1);
12381 }
12382
12383 SDValue Fmas =
12384 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
12385
12386 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
12387}
12388
12389SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
12390 EVT VT = Op.getValueType();
12391
12392 if (VT == MVT::f32)
12393 return LowerFDIV32(Op, DAG);
12394
12395 if (VT == MVT::f64)
12396 return LowerFDIV64(Op, DAG);
12397
12398 if (VT == MVT::f16 || VT == MVT::bf16)
12399 return LowerFDIV16(Op, DAG);
12400
12401 llvm_unreachable("Unexpected type for fdiv");
12402}
12403
12404SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
12405 SDLoc dl(Op);
12406 SDValue Val = Op.getOperand(0);
12407 EVT VT = Val.getValueType();
12408 EVT ResultExpVT = Op->getValueType(1);
12409 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12410
12411 SDValue Mant = DAG.getNode(
12413 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
12414
12415 SDValue Exp = DAG.getNode(
12416 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
12417 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
12418
12419 if (Subtarget->hasFractBug()) {
12420 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
12421 SDValue Inf =
12423
12424 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
12425 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
12426 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
12427 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
12428 }
12429
12430 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
12431 return DAG.getMergeValues({Mant, CastExp}, dl);
12432}
12433
12434SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
12435 SDLoc DL(Op);
12436 StoreSDNode *Store = cast<StoreSDNode>(Op);
12437 EVT VT = Store->getMemoryVT();
12438
12439 if (VT == MVT::i1) {
12440 return DAG.getTruncStore(
12441 Store->getChain(), DL,
12442 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
12443 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
12444 }
12445
12446 assert(VT.isVector() &&
12447 Store->getValue().getValueType().getScalarType() == MVT::i32);
12448
12449 unsigned AS = Store->getAddressSpace();
12450 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
12451 Store->getAlign().value() < VT.getStoreSize() &&
12452 VT.getSizeInBits() > 32) {
12453 return SplitVectorStore(Op, DAG);
12454 }
12455
12456 MachineFunction &MF = DAG.getMachineFunction();
12457 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12458   // If there is a possibility that a flat instruction accesses scratch memory,
12459   // then we need to use the same legalization rules we use for private.
12460 if (AS == AMDGPUAS::FLAT_ADDRESS &&
12461 !Subtarget->hasMultiDwordFlatScratchAddressing())
12462 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
12465
12466 unsigned NumElements = VT.getVectorNumElements();
12468 if (NumElements > 4)
12469 return SplitVectorStore(Op, DAG);
12470 // v3 stores not supported on SI.
12471 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12472 return SplitVectorStore(Op, DAG);
12473
12475 VT, *Store->getMemOperand()))
12476 return expandUnalignedStore(Store, DAG);
12477
12478 return SDValue();
12479 }
12480 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
12481 switch (Subtarget->getMaxPrivateElementSize()) {
12482 case 4:
12483 return scalarizeVectorStore(Store, DAG);
12484 case 8:
12485 if (NumElements > 2)
12486 return SplitVectorStore(Op, DAG);
12487 return SDValue();
12488 case 16:
12489 if (NumElements > 4 ||
12490 (NumElements == 3 && !Subtarget->enableFlatScratch()))
12491 return SplitVectorStore(Op, DAG);
12492 return SDValue();
12493 default:
12494 llvm_unreachable("unsupported private_element_size");
12495 }
12496 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
12497 unsigned Fast = 0;
12498 auto Flags = Store->getMemOperand()->getFlags();
12500 Store->getAlign(), Flags, &Fast) &&
12501 Fast > 1)
12502 return SDValue();
12503
12504 if (VT.isVector())
12505 return SplitVectorStore(Op, DAG);
12506
12507 return expandUnalignedStore(Store, DAG);
12508 }
12509
12510 // Probably an invalid store. If so we'll end up emitting a selection error.
12511 return SDValue();
12512}
12513
12514// Avoid the full correct expansion for f32 sqrt when promoting from f16.
12515SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
12516 SDLoc SL(Op);
12517 assert(!Subtarget->has16BitInsts());
12518 SDNodeFlags Flags = Op->getFlags();
12519 SDValue Ext =
12520 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
12521
12522 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
12523 SDValue Sqrt =
12524 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
12525
12526 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
12527 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
12528}
12529
12530SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
12531 SDLoc DL(Op);
12532 SDNodeFlags Flags = Op->getFlags();
12533 MVT VT = Op.getValueType().getSimpleVT();
12534 const SDValue X = Op.getOperand(0);
12535
12536 if (allowApproxFunc(DAG, Flags)) {
12537     // The instruction has a 1 ulp error but ignores denormals.
12538 return DAG.getNode(
12540 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
12541 }
12542
12543 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
12544 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
12545
12546 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
12547
12548 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
12549
12550 SDValue SqrtX =
12551 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
12552
12553 SDValue SqrtS;
12554 if (needsDenormHandlingF32(DAG, X, Flags)) {
12555 SDValue SqrtID =
12556 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
12557 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
12558
12559 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
12560 SDValue SqrtSNextDownInt =
12561 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12562 DAG.getAllOnesConstant(DL, MVT::i32));
12563 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
12564
12565 SDValue NegSqrtSNextDown =
12566 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
12567
12568 SDValue SqrtVP =
12569 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
12570
12571 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12572 DAG.getConstant(1, DL, MVT::i32));
12573 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
12574
12575 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
12576 SDValue SqrtVS =
12577 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
12578
12579 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
12580 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
12581
12582 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
12583 Flags);
12584
12585 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
12586 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
12587 Flags);
12588 } else {
12589 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
12590
12591 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
12592
12593 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
12594 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
12595 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
12596
12597 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
12598 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
12599 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
12600
12601 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
12602 SDValue SqrtD =
12603 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
12604 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
12605 }
12606
12607 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
12608
12609 SDValue ScaledDown =
12610 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
12611
12612 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
12613 SDValue IsZeroOrInf =
12614 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12615 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12616
12617 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
12618}
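// Illustrative sketch only: the input/output scaling that lowerFSQRTF32 wraps
// around the core expansion above. Inputs below 0x1p-96 are multiplied by
// 0x1p+32 so the core never sees a denormal, and the result is compensated by
// 0x1p-16, since sqrt(x * 2^32) == sqrt(x) * 2^16. The zero/infinity fixup at
// the end is kept out of this sketch; the helper name is hypothetical.
#include <cmath>
static inline float scaledSqrtF32Sketch(float X) {
  bool NeedScale = X < 0x1p-96f;
  float SqrtX = NeedScale ? X * 0x1p+32f : X;
  float S = std::sqrt(SqrtX); // placeholder for the refined sqrt expansion
  return NeedScale ? S * 0x1p-16f : S;
}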
12619
12620SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
12621   // For the double type, the SQRT and RSQ instructions don't have the required
12622   // precision, so we apply Goldschmidt's algorithm to improve the result:
12623 //
12624 // y0 = rsq(x)
12625 // g0 = x * y0
12626 // h0 = 0.5 * y0
12627 //
12628 // r0 = 0.5 - h0 * g0
12629 // g1 = g0 * r0 + g0
12630 // h1 = h0 * r0 + h0
12631 //
12632 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
12633 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
12634 // h2 = h1 * r1 + h1
12635 //
12636 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
12637 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
12638 //
12639 // sqrt(x) = g3
12640
12641 SDNodeFlags Flags = Op->getFlags();
12642
12643 SDLoc DL(Op);
12644
12645 SDValue X = Op.getOperand(0);
12646 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
12647
12648 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
12649
12650 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
12651
12652 // Scale up input if it is too small.
12653 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
12654 SDValue ScaleUp =
12655 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
12656 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
12657
12658 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
12659
12660 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
12661
12662 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
12663 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
12664
12665 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
12666 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
12667
12668 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
12669
12670 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
12671
12672 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
12673 SDValue SqrtD0 =
12674 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
12675
12676 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
12677
12678 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
12679 SDValue SqrtD1 =
12680 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
12681
12682 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
12683
12684 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
12685 SDValue ScaleDown =
12686 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
12687 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
12688
12689 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
12690 // with finite only or nsz because rsq(+/-0) = +/-inf
12691
12692 // TODO: Check for DAZ and expand to subnormals
12693 SDValue IsZeroOrInf =
12694 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12695 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12696
12697 // If x is +INF, +0, or -0, use its original value
12698 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
12699 Flags);
12700}
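// Illustrative sketch only: the Goldschmidt iteration described in the comment
// block above, written with std::fma (the scaling for tiny inputs and the
// zero/infinity fixup are omitted). 1.0 / std::sqrt(x) stands in for the
// hardware V_RSQ_F64 estimate; the helper name is hypothetical.
#include <cmath>
static inline double goldschmidtSqrtSketch(double X) {
  double Y0 = 1.0 / std::sqrt(X);   // placeholder for the hardware rsq estimate
  double G = X * Y0;                // g0 ~= sqrt(x)
  double H = 0.5 * Y0;              // h0 = 0.5 * y0
  double R = std::fma(-H, G, 0.5);  // r0 = 0.5 - h0 * g0
  double H1 = std::fma(H, R, H);    // h1 = h0 * r0 + h0
  double G1 = std::fma(G, R, G);    // g1 = g0 * r0 + g0
  double D0 = std::fma(-G1, G1, X); // d0 = x - g1 * g1
  double G2 = std::fma(D0, H1, G1); // g2 = d0 * h1 + g1
  double D1 = std::fma(-G2, G2, X); // d1 = x - g2 * g2
  return std::fma(D1, H1, G2);      // g3 = d1 * h1 + g2
}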
12701
12702SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
12703 SDLoc DL(Op);
12704 EVT VT = Op.getValueType();
12705 SDValue Arg = Op.getOperand(0);
12706 SDValue TrigVal;
12707
12708 // Propagate fast-math flags so that the multiply we introduce can be folded
12709 // if Arg is already the result of a multiply by constant.
12710 auto Flags = Op->getFlags();
12711
12712 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
12713
12714 if (Subtarget->hasTrigReducedRange()) {
12715 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12716 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
12717 } else {
12718 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12719 }
12720
12721 switch (Op.getOpcode()) {
12722 case ISD::FCOS:
12723 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
12724 case ISD::FSIN:
12725 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
12726 default:
12727 llvm_unreachable("Wrong trig opcode");
12728 }
12729}
12730
12731SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
12732 SelectionDAG &DAG) const {
12733 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
12734 assert(AtomicNode->isCompareAndSwap());
12735 unsigned AS = AtomicNode->getAddressSpace();
12736
12737 // No custom lowering required for local address space
12739 return Op;
12740
12741   // Non-local address spaces require custom lowering for atomic compare and
12742   // swap; the cmp and swap values are packed into a v2i32 (or v2i64 for _X2).
12743 SDLoc DL(Op);
12744 SDValue ChainIn = Op.getOperand(0);
12745 SDValue Addr = Op.getOperand(1);
12746 SDValue Old = Op.getOperand(2);
12747 SDValue New = Op.getOperand(3);
12748 EVT VT = Op.getValueType();
12749 MVT SimpleVT = VT.getSimpleVT();
12750 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
12751
12752 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
12753 SDValue Ops[] = {ChainIn, Addr, NewOld};
12754
12756 Op->getVTList(), Ops, VT,
12757 AtomicNode->getMemOperand());
12758}
12759
12760//===----------------------------------------------------------------------===//
12761// Custom DAG optimizations
12762//===----------------------------------------------------------------------===//
12763
12764SDValue
12765SITargetLowering::performUCharToFloatCombine(SDNode *N,
12766 DAGCombinerInfo &DCI) const {
12767 EVT VT = N->getValueType(0);
12768 EVT ScalarVT = VT.getScalarType();
12769 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
12770 return SDValue();
12771
12772 SelectionDAG &DAG = DCI.DAG;
12773 SDLoc DL(N);
12774
12775 SDValue Src = N->getOperand(0);
12776 EVT SrcVT = Src.getValueType();
12777
12778 // TODO: We could try to match extracting the higher bytes, which would be
12779 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
12780 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
12781 // about in practice.
12782 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
12783 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
12784 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
12785 DCI.AddToWorklist(Cvt.getNode());
12786
12787 // For the f16 case, fold to a cast to f32 and then cast back to f16.
12788 if (ScalarVT != MVT::f32) {
12789 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
12790 DAG.getTargetConstant(0, DL, MVT::i32));
12791 }
12792 return Cvt;
12793 }
12794 }
12795
12796 return SDValue();
12797}
12798
12799SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
12800 DAGCombinerInfo &DCI) const {
12801 SDValue MagnitudeOp = N->getOperand(0);
12802 SDValue SignOp = N->getOperand(1);
12803
12804 // The generic combine for fcopysign + fp cast is too conservative with
12805 // vectors, and also gets confused by the splitting we will perform here, so
12806 // peek through FP casts.
12807 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
12808 SignOp.getOpcode() == ISD::FP_ROUND)
12809 SignOp = SignOp.getOperand(0);
12810
12811 SelectionDAG &DAG = DCI.DAG;
12812 SDLoc DL(N);
12813 EVT SignVT = SignOp.getValueType();
12814
12815 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
12816 // lower half with a copy.
12817 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
12818 EVT MagVT = MagnitudeOp.getValueType();
12819
12820 unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
12821
12822 if (MagVT.getScalarType() == MVT::f64) {
12823 EVT F32VT = MagVT.isVector()
12824 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12825 : MVT::v2f32;
12826
12827 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
12828
12830 for (unsigned I = 0; I != NumElts; ++I) {
12831 SDValue MagLo =
12832 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
12833 DAG.getConstant(2 * I, DL, MVT::i32));
12834 SDValue MagHi =
12835 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
12836 DAG.getConstant(2 * I + 1, DL, MVT::i32));
12837
12838 SDValue SignOpElt =
12839 MagVT.isVector()
12840             ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(),
12841                           SignOp, DAG.getConstant(I, DL, MVT::i32))
12842 : SignOp;
12843
12844 SDValue HiOp =
12845 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
12846
12847 SDValue Vector =
12848 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
12849
12850 SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
12851 NewElts.push_back(NewElt);
12852 }
12853
12854 if (NewElts.size() == 1)
12855 return NewElts[0];
12856
12857 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
12858 }
12859
12860 if (SignVT.getScalarType() != MVT::f64)
12861 return SDValue();
12862
12863 // Reduce width of sign operand, we only need the highest bit.
12864 //
12865 // fcopysign f64:x, f64:y ->
12866 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
12867 // TODO: In some cases it might make sense to go all the way to f16.
12868
12869 EVT F32VT = MagVT.isVector()
12870 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12871 : MVT::v2f32;
12872
12873 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
12874
12875 SmallVector<SDValue, 8> F32Signs;
12876 for (unsigned I = 0; I != NumElts; ++I) {
12877 // Take sign from odd elements of cast vector
12878 SDValue SignAsF32 =
12879 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
12880 DAG.getConstant(2 * I + 1, DL, MVT::i32));
12881 F32Signs.push_back(SignAsF32);
12882 }
12883
12884 SDValue NewSign =
12885 NumElts == 1
12886 ? F32Signs.back()
12887         : DAG.getNode(ISD::BUILD_VECTOR, DL,
12888                       EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
12889 F32Signs);
12890
12891 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
12892 NewSign);
12893}
12894
12895// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
12896// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
12897// bits
12898
12899// This is a variant of
12900// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
12901//
12902// The normal DAG combiner will do this, but only if the add has one use,
12903// since otherwise it would increase the number of instructions.
12904//
12905// This prevents us from seeing a constant offset that can be folded into a
12906// memory instruction's addressing mode. If we know the resulting add offset of
12907// a pointer can be folded into an addressing offset, we can replace the pointer
12908// operand with the add of new constant offset. This eliminates one of the uses,
12909// and may allow the remaining use to also be simplified.
12910//
12911SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
12912 EVT MemVT,
12913 DAGCombinerInfo &DCI) const {
12914 SDValue N0 = N->getOperand(0);
12915 SDValue N1 = N->getOperand(1);
12916
12917 // We only do this to handle cases where it's profitable when there are
12918 // multiple uses of the add, so defer to the standard combine.
12919 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
12920 N0->hasOneUse())
12921 return SDValue();
12922
12923 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
12924 if (!CN1)
12925 return SDValue();
12926
12927 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
12928 if (!CAdd)
12929 return SDValue();
12930
12931 SelectionDAG &DAG = DCI.DAG;
12932
12933 if (N0->getOpcode() == ISD::OR &&
12934 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
12935 return SDValue();
12936
12937 // If the resulting offset is too large, we can't fold it into the
12938 // addressing mode offset.
12939 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
12940 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
12941
12942 AddrMode AM;
12943 AM.HasBaseReg = true;
12944 AM.BaseOffs = Offset.getSExtValue();
12945 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
12946 return SDValue();
12947
12948 SDLoc SL(N);
12949 EVT VT = N->getValueType(0);
12950
12951 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
12952 SDValue COffset = DAG.getConstant(Offset, SL, VT);
12953
12954 SDNodeFlags Flags;
12955 Flags.setNoUnsignedWrap(
12956 N->getFlags().hasNoUnsignedWrap() &&
12957 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
12958
12959 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
12960}
12961
12962/// MemSDNode::getBasePtr() does not work for intrinsics, which need to be
12963/// offset by the chain and intrinsic ID. Theoretically we would also need to
12964/// check the specific intrinsic, but they all place the pointer operand first.
12965static unsigned getBasePtrIndex(const MemSDNode *N) {
12966 switch (N->getOpcode()) {
12967 case ISD::STORE:
12968 case ISD::INTRINSIC_W_CHAIN:
12969 case ISD::INTRINSIC_VOID:
12970   return 2;
12971 default:
12972 return 1;
12973 }
12974}
12975
12976SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
12977 DAGCombinerInfo &DCI) const {
12978 SelectionDAG &DAG = DCI.DAG;
12979
12980 unsigned PtrIdx = getBasePtrIndex(N);
12981 SDValue Ptr = N->getOperand(PtrIdx);
12982
12983 // TODO: We could also do this for multiplies.
12984 if (Ptr.getOpcode() == ISD::SHL) {
12985 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
12986 N->getMemoryVT(), DCI);
12987 if (NewPtr) {
12988 SmallVector<SDValue, 8> NewOps(N->ops());
12989
12990 NewOps[PtrIdx] = NewPtr;
12991 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
12992 }
12993 }
12994
12995 return SDValue();
12996}
12997
12998static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
12999 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13000 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13001 (Opc == ISD::XOR && Val == 0);
13002}
13003
13004// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
13005// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
13006// integer combine opportunities since most 64-bit operations are decomposed
13007// this way. TODO: We won't want this for SALU especially if it is an inline
13008// immediate.
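// Illustrative example: (and i64:x, 0xFFFF00000000FFFF) splits into two i32
// ops, and(lo(x), 0x0000FFFF) and and(hi(x), 0xFFFF0000), whose results are
// typically packed back into an i64 via a v2i32 build_vector and bitcast; the
// same splitting applies to or/xor.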
13009SDValue SITargetLowering::splitBinaryBitConstantOp(
13010 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
13011 const ConstantSDNode *CRHS) const {
13012 uint64_t Val = CRHS->getZExtValue();
13013 uint32_t ValLo = Lo_32(Val);
13014 uint32_t ValHi = Hi_32(Val);
13015 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13016
13017 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
13018      bitOpWithConstantIsReducible(Opc, ValHi)) ||
13019     (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
13020 // We have 64-bit scalar and/or/xor, but do not have vector forms.
13021 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
13022 !CRHS->user_begin()->isDivergent())
13023 return SDValue();
13024
13025 // If we need to materialize a 64-bit immediate, it will be split up later
13026 // anyway. Avoid creating the harder to understand 64-bit immediate
13027 // materialization.
13028 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
13029 }
13030
13031 return SDValue();
13032}
13033
13034static bool isBoolSGPR(SDValue V) {
13035  if (V.getValueType() != MVT::i1)
13036 return false;
13037 switch (V.getOpcode()) {
13038 default:
13039 break;
13040 case ISD::SETCC:
13041 case ISD::IS_FPCLASS:
13042 case AMDGPUISD::FP_CLASS:
13043   return true;
13044 case ISD::AND:
13045 case ISD::OR:
13046 case ISD::XOR:
13047 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
13048 case ISD::SADDO:
13049 case ISD::UADDO:
13050 case ISD::SSUBO:
13051 case ISD::USUBO:
13052 case ISD::SMULO:
13053 case ISD::UMULO:
13054 return V.getResNo() == 1;
13055 case ISD::INTRINSIC_WO_CHAIN: {
13056   unsigned IntrinsicID = V.getConstantOperandVal(0);
13057 switch (IntrinsicID) {
13058 case Intrinsic::amdgcn_is_shared:
13059 case Intrinsic::amdgcn_is_private:
13060 return true;
13061 default:
13062 return false;
13063 }
13064
13065 return false;
13066 }
13067 }
13068 return false;
13069}
13070
13071// If a constant has all zeroes or all ones within each byte return it.
13072// Otherwise return 0.
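// Illustrative example: 0x00ff00ff qualifies (every byte is 0x00 or 0xff) and
// is returned as-is, while 0x00ff0012 selects only part of its low byte and
// yields 0.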
13073static uint32_t getConstantPermuteMask(uint32_t C) {
13074  // 0xff for any zero byte in the mask
13075 uint32_t ZeroByteMask = 0;
13076 if (!(C & 0x000000ff))
13077 ZeroByteMask |= 0x000000ff;
13078 if (!(C & 0x0000ff00))
13079 ZeroByteMask |= 0x0000ff00;
13080 if (!(C & 0x00ff0000))
13081 ZeroByteMask |= 0x00ff0000;
13082 if (!(C & 0xff000000))
13083 ZeroByteMask |= 0xff000000;
13084 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
13085 if ((NonZeroByteMask & C) != NonZeroByteMask)
13086 return 0; // Partial bytes selected.
13087 return C;
13088}
13089
13090// Check if a node selects whole bytes from its operand 0 starting at a byte
13091// boundary while masking the rest. Returns the select mask as used by
13092// v_perm_b32, or ~0u if the node does not match.
13093// Note byte select encoding:
13094// value 0-3 selects corresponding source byte;
13095// value 0xc selects zero;
13096// value 0xff selects 0xff.
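// Illustrative examples: (and x, 0x0000ffff) yields the mask 0x0c0c0100
// (upper result bytes zeroed, lower bytes taken from x), and (shl x, 8)
// yields 0x0201000c (bytes shifted up by one, low byte zero).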
13097static uint32_t getPermuteMask(SDValue V) {
13098  assert(V.getValueSizeInBits() == 32);
13099
13100 if (V.getNumOperands() != 2)
13101 return ~0;
13102
13103 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
13104 if (!N1)
13105 return ~0;
13106
13107 uint32_t C = N1->getZExtValue();
13108
13109 switch (V.getOpcode()) {
13110 default:
13111 break;
13112 case ISD::AND:
13113 if (uint32_t ConstMask = getConstantPermuteMask(C))
13114 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13115 break;
13116
13117 case ISD::OR:
13118 if (uint32_t ConstMask = getConstantPermuteMask(C))
13119 return (0x03020100 & ~ConstMask) | ConstMask;
13120 break;
13121
13122 case ISD::SHL:
13123 if (C % 8)
13124 return ~0;
13125
13126 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
13127
13128 case ISD::SRL:
13129 if (C % 8)
13130 return ~0;
13131
13132 return uint32_t(0x0c0c0c0c03020100ull >> C);
13133 }
13134
13135 return ~0;
13136}
13137
13138SDValue SITargetLowering::performAndCombine(SDNode *N,
13139 DAGCombinerInfo &DCI) const {
13140 if (DCI.isBeforeLegalize())
13141 return SDValue();
13142
13143 SelectionDAG &DAG = DCI.DAG;
13144 EVT VT = N->getValueType(0);
13145 SDValue LHS = N->getOperand(0);
13146 SDValue RHS = N->getOperand(1);
13147
13148 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
13149 if (VT == MVT::i64 && CRHS) {
13150 if (SDValue Split =
13151 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
13152 return Split;
13153 }
13154
13155 if (CRHS && VT == MVT::i32) {
13156 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
13157 // nb = number of trailing zeroes in mask
13158 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
13159 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
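    // Illustrative example: (and (srl x, 8), 0xff00) has Mask = 0xff00
    // (Bits = 8, NB = 8) and Shift = 8, so Offset = 16 is byte aligned and the
    // node becomes (shl (AssertZext (bfe_u32 x, 16, 8), i8), 8).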
13160 uint64_t Mask = CRHS->getZExtValue();
13161 unsigned Bits = llvm::popcount(Mask);
13162 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
13163 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
13164 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
13165 unsigned Shift = CShift->getZExtValue();
13166 unsigned NB = CRHS->getAPIntValue().countr_zero();
13167 unsigned Offset = NB + Shift;
13168 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
13169 SDLoc SL(N);
13170 SDValue BFE =
13171 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
13172 DAG.getConstant(Offset, SL, MVT::i32),
13173 DAG.getConstant(Bits, SL, MVT::i32));
13174 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
13175 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
13176 DAG.getValueType(NarrowVT));
13177 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
13178 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
13179 return Shl;
13180 }
13181 }
13182 }
13183
13184 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13185 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
13186 isa<ConstantSDNode>(LHS.getOperand(2))) {
13187 uint32_t Sel = getConstantPermuteMask(Mask);
13188 if (!Sel)
13189 return SDValue();
13190
13191 // Select 0xc for all zero bytes
13192 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13193 SDLoc DL(N);
13194 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13195 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13196 }
13197 }
13198
13199 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
13200 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
13201 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
13202 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13203 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
13204
13205 SDValue X = LHS.getOperand(0);
13206 SDValue Y = RHS.getOperand(0);
13207 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
13208 !isTypeLegal(X.getValueType()))
13209 return SDValue();
13210
13211 if (LCC == ISD::SETO) {
13212 if (X != LHS.getOperand(1))
13213 return SDValue();
13214
13215 if (RCC == ISD::SETUNE) {
13216 const ConstantFPSDNode *C1 =
13217 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
13218 if (!C1 || !C1->isInfinity() || C1->isNegative())
13219 return SDValue();
13220
13221       const uint32_t Mask = SIInstrFlags::N_NORMAL |
13222                             SIInstrFlags::N_SUBNORMAL | SIInstrFlags::N_ZERO |
13223                             SIInstrFlags::P_ZERO | SIInstrFlags::P_SUBNORMAL |
13224                             SIInstrFlags::P_NORMAL;
13225
13226       static_assert(
13227           ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
13228               SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
13229            0x3ff) == Mask,
13230           "mask not equal");
13231
13232 SDLoc DL(N);
13233 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
13234 DAG.getConstant(Mask, DL, MVT::i32));
13235 }
13236 }
13237 }
13238
13239 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
13240 std::swap(LHS, RHS);
13241
13242 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13243 RHS.hasOneUse()) {
13244 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13245    // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(s_nan | q_nan)
13246    // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (s_nan | q_nan)
13247    // OrdMask below covers both NaN classes.
13248 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13249 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
13250 (RHS.getOperand(0) == LHS.getOperand(0) &&
13251 LHS.getOperand(0) == LHS.getOperand(1))) {
13252 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
13253 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
13254 : Mask->getZExtValue() & OrdMask;
13255
13256 SDLoc DL(N);
13257 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
13258 DAG.getConstant(NewMask, DL, MVT::i32));
13259 }
13260 }
13261
13262 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
13263 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
13264 // and x, (sext cc from i1) => select cc, x, 0
13265 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
13266 std::swap(LHS, RHS);
13267 if (isBoolSGPR(RHS.getOperand(0)))
13268 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
13269 DAG.getConstant(0, SDLoc(N), MVT::i32));
13270 }
13271
13272 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13273 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13274 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13275 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13276 uint32_t LHSMask = getPermuteMask(LHS);
13277 uint32_t RHSMask = getPermuteMask(RHS);
13278 if (LHSMask != ~0u && RHSMask != ~0u) {
13279 // Canonicalize the expression in an attempt to have fewer unique masks
13280 // and therefore fewer registers used to hold the masks.
13281 if (LHSMask > RHSMask) {
13282 std::swap(LHSMask, RHSMask);
13283 std::swap(LHS, RHS);
13284 }
13285
13286      // Select 0xc for each lane used from the source operand. Zero has the
13287      // 0xc mask set, 0xff has 0xff in the mask, actual lanes are in the 0-3 range.
13288 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13289 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13290
13291      // Check if we need to combine values from two sources within a byte.
13292 if (!(LHSUsedLanes & RHSUsedLanes) &&
13293 // If we select high and lower word keep it for SDWA.
13294 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
13295 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13296        // Each byte of each mask is either a selector value 0-3, or has higher
13297        // bits set: 0xff for an all-ones byte or 0x0c for a zero byte. If 0x0c
13298        // appears in either mask, the result byte must be 0x0c. Otherwise the
13299        // mask byte that is not 0xff wins. ANDing both masks gives the correct
13300        // result, except that 0x0c has to be re-forced to exactly 0x0c below.
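        // Illustrative example: and (or a, 0xffffff00), (or b, 0x000000ff) has
        // per-operand masks 0xffffff00 and 0x030201ff with disjoint used
        // lanes, and combines to v_perm_b32 b, a, 0x07060500, i.e. result
        // bytes {b[3], b[2], b[1], a[0]}.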
13301 uint32_t Mask = LHSMask & RHSMask;
13302 for (unsigned I = 0; I < 32; I += 8) {
13303 uint32_t ByteSel = 0xff << I;
13304 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13305 Mask &= (0x0c << I) & 0xffffffff;
13306 }
13307
13308 // Add 4 to each active LHS lane. It will not affect any existing 0xff
13309 // or 0x0c.
13310 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
13311 SDLoc DL(N);
13312
13313 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13314 RHS.getOperand(0),
13315 DAG.getConstant(Sel, DL, MVT::i32));
13316 }
13317 }
13318 }
13319
13320 return SDValue();
13321}
13322
13323// A key component of v_perm is a mapping between byte position of the src
13324// operands, and the byte position of the dest. To provide such, we need: 1. the
13325// node that provides x byte of the dest of the OR, and 2. the byte of the node
13326// used to provide that x byte. calculateByteProvider finds which node provides
13327// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
13328// and finds an ultimate src and byte position. For example, the supported
13329// LoadCombine pattern for vector loads is as follows:
13330// t1
13331// or
13332// / \
13333// t2 t3
13334// zext shl
13335// | | \
13336// t4 t5 16
13337// or anyext
13338// / \ |
13339// t6 t7 t8
13340// srl shl or
13341// / | / \ / \
13342// t9 t10 t11 t12 t13 t14
13343// trunc* 8 trunc* 8 and and
13344// | | / | | \
13345// t15 t16 t17 t18 t19 t20
13346// trunc* 255 srl -256
13347// | / \
13348// t15 t15 16
13349//
13350// *In this example, the truncs are from i32->i16
13351//
13352// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
13353// respectively. calculateSrcByte would find (given node) -> ultimate src &
13354// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
13355// After finding the mapping, we can combine the tree into vperm t15, t16,
13356// 0x05000407
13357
13358// Find the source and byte position from a node.
13359// \p DestByte is the byte position of the dest of the or that the src
13360// ultimately provides. \p SrcIndex is the byte of the src that maps to this
13361// dest of the or byte. \p Depth tracks how many recursive iterations we have
13362// performed.
13363static const std::optional<ByteProvider<SDValue>>
13364calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
13365 unsigned Depth = 0) {
13366 // We may need to recursively traverse a series of SRLs
13367 if (Depth >= 6)
13368 return std::nullopt;
13369
13370 if (Op.getValueSizeInBits() < 8)
13371 return std::nullopt;
13372
13373 if (Op.getValueType().isVector())
13374 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13375
13376 switch (Op->getOpcode()) {
13377 case ISD::TRUNCATE: {
13378 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13379 }
13380
13381 case ISD::SIGN_EXTEND:
13382 case ISD::ZERO_EXTEND:
13383  case ISD::SIGN_EXTEND_INREG: {
13384    SDValue NarrowOp = Op->getOperand(0);
13385 auto NarrowVT = NarrowOp.getValueType();
13386 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
13387 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13388 NarrowVT = VTSign->getVT();
13389 }
13390 if (!NarrowVT.isByteSized())
13391 return std::nullopt;
13392 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
13393
13394 if (SrcIndex >= NarrowByteWidth)
13395 return std::nullopt;
13396 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13397 }
13398
13399 case ISD::SRA:
13400 case ISD::SRL: {
13401 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13402 if (!ShiftOp)
13403 return std::nullopt;
13404
13405 uint64_t BitShift = ShiftOp->getZExtValue();
13406
13407 if (BitShift % 8 != 0)
13408 return std::nullopt;
13409
13410 SrcIndex += BitShift / 8;
13411
13412 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13413 }
13414
13415 default: {
13416 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13417 }
13418 }
13419 llvm_unreachable("fully handled switch");
13420}
13421
13422// For a byte position in the result of an Or, traverse the tree and find the
13423// node (and the byte of the node) which ultimately provides this {Or,
13424// BytePosition}. \p Op is the operand we are currently examining. \p Index is
13425// the byte position of the Op that corresponds with the originally requested
13426// byte of the Or. \p Depth tracks how many recursive iterations we have
13427// performed. \p StartingIndex is the originally requested byte of the Or
13428static const std::optional<ByteProvider<SDValue>>
13429calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
13430 unsigned StartingIndex = 0) {
13431 // Finding Src tree of RHS of or typically requires at least 1 additional
13432 // depth
13433 if (Depth > 6)
13434 return std::nullopt;
13435
13436 unsigned BitWidth = Op.getScalarValueSizeInBits();
13437 if (BitWidth % 8 != 0)
13438 return std::nullopt;
13439 if (Index > BitWidth / 8 - 1)
13440 return std::nullopt;
13441
13442 bool IsVec = Op.getValueType().isVector();
13443 switch (Op.getOpcode()) {
13444 case ISD::OR: {
13445 if (IsVec)
13446 return std::nullopt;
13447
13448 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
13449 StartingIndex);
13450 if (!RHS)
13451 return std::nullopt;
13452 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13453 StartingIndex);
13454 if (!LHS)
13455 return std::nullopt;
13456 // A well formed Or will have two ByteProviders for each byte, one of which
13457 // is constant zero
13458 if (!LHS->isConstantZero() && !RHS->isConstantZero())
13459 return std::nullopt;
13460 if (!LHS || LHS->isConstantZero())
13461 return RHS;
13462 if (!RHS || RHS->isConstantZero())
13463 return LHS;
13464 return std::nullopt;
13465 }
13466
13467 case ISD::AND: {
13468 if (IsVec)
13469 return std::nullopt;
13470
13471 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13472 if (!BitMaskOp)
13473 return std::nullopt;
13474
13475 uint32_t BitMask = BitMaskOp->getZExtValue();
13476 // Bits we expect for our StartingIndex
13477 uint32_t IndexMask = 0xFF << (Index * 8);
13478
13479 if ((IndexMask & BitMask) != IndexMask) {
13480 // If the result of the and partially provides the byte, then it
13481 // is not well formatted
13482 if (IndexMask & BitMask)
13483 return std::nullopt;
13484      return ByteProvider<SDValue>::getConstantZero();
13485    }
13486
13487 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
13488 }
13489
13490 case ISD::FSHR: {
13491 if (IsVec)
13492 return std::nullopt;
13493
13494 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
13495 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13496 if (!ShiftOp || Op.getValueType().isVector())
13497 return std::nullopt;
13498
13499 uint64_t BitsProvided = Op.getValueSizeInBits();
13500 if (BitsProvided % 8 != 0)
13501 return std::nullopt;
13502
13503 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13504 if (BitShift % 8)
13505 return std::nullopt;
13506
13507 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13508 uint64_t ByteShift = BitShift / 8;
13509
13510 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13511 uint64_t BytesProvided = BitsProvided / 8;
13512 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13513 NewIndex %= BytesProvided;
13514 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
13515 }
13516
13517 case ISD::SRA:
13518 case ISD::SRL: {
13519 if (IsVec)
13520 return std::nullopt;
13521
13522 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13523 if (!ShiftOp)
13524 return std::nullopt;
13525
13526 uint64_t BitShift = ShiftOp->getZExtValue();
13527 if (BitShift % 8)
13528 return std::nullopt;
13529
13530 auto BitsProvided = Op.getScalarValueSizeInBits();
13531 if (BitsProvided % 8 != 0)
13532 return std::nullopt;
13533
13534 uint64_t BytesProvided = BitsProvided / 8;
13535 uint64_t ByteShift = BitShift / 8;
13536 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
13537 // If the byte we are trying to provide (as tracked by index) falls in this
13538 // range, then the SRL provides the byte. The byte of interest of the src of
13539 // the SRL is Index + ByteShift
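    // Illustrative example: for (srl i32:x, 16) with BytesProvided = 4 and
    // ByteShift = 2, result bytes 0-1 come from bytes 2-3 of x and result
    // bytes 2-3 are known zero.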
13540 return BytesProvided - ByteShift > Index
13541 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
13542 Index + ByteShift)
13543               : ByteProvider<SDValue>::getConstantZero();
13544  }
13545
13546 case ISD::SHL: {
13547 if (IsVec)
13548 return std::nullopt;
13549
13550 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13551 if (!ShiftOp)
13552 return std::nullopt;
13553
13554 uint64_t BitShift = ShiftOp->getZExtValue();
13555 if (BitShift % 8 != 0)
13556 return std::nullopt;
13557 uint64_t ByteShift = BitShift / 8;
13558
13559 // If we are shifting by an amount greater than (or equal to)
13560 // the index we are trying to provide, then it provides 0s. If not,
13561    // then these bytes are not definitively 0s, and the corresponding byte
13562 // of interest is Index - ByteShift of the src
13563 return Index < ByteShift
13564               ? ByteProvider<SDValue>::getConstantZero()
13565               : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
13566 Depth + 1, StartingIndex);
13567 }
13568 case ISD::ANY_EXTEND:
13569 case ISD::SIGN_EXTEND:
13570 case ISD::ZERO_EXTEND:
13571  case ISD::SIGN_EXTEND_INREG:
13572  case ISD::AssertZext:
13573 case ISD::AssertSext: {
13574 if (IsVec)
13575 return std::nullopt;
13576
13577 SDValue NarrowOp = Op->getOperand(0);
13578 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
13579 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
13580 Op->getOpcode() == ISD::AssertZext ||
13581 Op->getOpcode() == ISD::AssertSext) {
13582 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13583 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13584 }
13585 if (NarrowBitWidth % 8 != 0)
13586 return std::nullopt;
13587 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13588
13589 if (Index >= NarrowByteWidth)
13590 return Op.getOpcode() == ISD::ZERO_EXTEND
13591 ? std::optional<ByteProvider<SDValue>>(
13592                     ByteProvider<SDValue>::getConstantZero())
13593               : std::nullopt;
13594 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
13595 }
13596
13597 case ISD::TRUNCATE: {
13598 if (IsVec)
13599 return std::nullopt;
13600
13601 uint64_t NarrowByteWidth = BitWidth / 8;
13602
13603 if (NarrowByteWidth >= Index) {
13604 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13605 StartingIndex);
13606 }
13607
13608 return std::nullopt;
13609 }
13610
13611 case ISD::CopyFromReg: {
13612 if (BitWidth / 8 > Index)
13613 return calculateSrcByte(Op, StartingIndex, Index);
13614
13615 return std::nullopt;
13616 }
13617
13618 case ISD::LOAD: {
13619 auto *L = cast<LoadSDNode>(Op.getNode());
13620
13621 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13622 if (NarrowBitWidth % 8 != 0)
13623 return std::nullopt;
13624 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13625
13626    // If the width of the load does not reach the byte we are trying to
13627    // provide and it is not a ZEXTLOAD, then the load does not provide the
13628    // byte in question.
13629 if (Index >= NarrowByteWidth) {
13630 return L->getExtensionType() == ISD::ZEXTLOAD
13631 ? std::optional<ByteProvider<SDValue>>(
13632                     ByteProvider<SDValue>::getConstantZero())
13633               : std::nullopt;
13634 }
13635
13636 if (NarrowByteWidth > Index) {
13637 return calculateSrcByte(Op, StartingIndex, Index);
13638 }
13639
13640 return std::nullopt;
13641 }
13642
13643 case ISD::BSWAP: {
13644 if (IsVec)
13645 return std::nullopt;
13646
13647 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
13648 Depth + 1, StartingIndex);
13649 }
13650
13651  case ISD::EXTRACT_VECTOR_ELT: {
13652    auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13653 if (!IdxOp)
13654 return std::nullopt;
13655 auto VecIdx = IdxOp->getZExtValue();
13656 auto ScalarSize = Op.getScalarValueSizeInBits();
13657 if (ScalarSize < 32)
13658 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13659 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
13660 StartingIndex, Index);
13661 }
13662
13663 case AMDGPUISD::PERM: {
13664 if (IsVec)
13665 return std::nullopt;
13666
13667 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13668 if (!PermMask)
13669 return std::nullopt;
13670
13671 auto IdxMask =
13672 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13673 if (IdxMask > 0x07 && IdxMask != 0x0c)
13674 return std::nullopt;
13675
13676 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13677 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13678
13679 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
13680                           : ByteProvider<SDValue>(
13681                                 ByteProvider<SDValue>::getConstantZero());
13682  }
13683
13684 default: {
13685 return std::nullopt;
13686 }
13687 }
13688
13689 llvm_unreachable("fully handled switch");
13690}
13691
13692// Returns true if the Operand is a scalar and is 16 bits
13693static bool isExtendedFrom16Bits(SDValue &Operand) {
13694
13695 switch (Operand.getOpcode()) {
13696 case ISD::ANY_EXTEND:
13697 case ISD::SIGN_EXTEND:
13698 case ISD::ZERO_EXTEND: {
13699 auto OpVT = Operand.getOperand(0).getValueType();
13700 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
13701 }
13702 case ISD::LOAD: {
13703 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
13704 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
13705 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
13706 ExtType == ISD::EXTLOAD) {
13707 auto MemVT = L->getMemoryVT();
13708 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
13709 }
13710 return L->getMemoryVT().getSizeInBits() == 16;
13711 }
13712 default:
13713 return false;
13714 }
13715}
13716
13717// Returns true if the mask matches consecutive bytes, and the first byte
13718// begins at an even byte offset (i.e. it is 16-bit aligned).
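// Illustrative example: a 16-bit half-mask of 0x0504 (bytes 4 and 5 of the
// concatenated sources) is accepted, while 0x0201 is consecutive but starts
// at an odd byte, so it is rejected.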
13719static bool addresses16Bits(int Mask) {
13720 int Low8 = Mask & 0xff;
13721 int Hi8 = (Mask & 0xff00) >> 8;
13722
13723 assert(Low8 < 8 && Hi8 < 8);
13724 // Are the bytes contiguous in the order of increasing addresses.
13725 bool IsConsecutive = (Hi8 - Low8 == 1);
13726 // Is the first byte at location that is aligned for 16 bit instructions.
13727 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
13728 // In this case, we still need code to extract the 16 bit operand, so it
13729 // is better to use i8 v_perm
13730 bool Is16Aligned = !(Low8 % 2);
13731
13732 return IsConsecutive && Is16Aligned;
13733}
13734
13735// Do not lower into v_perm if the operands are actually 16 bit
13736// and the selected bits (based on PermMask) correspond with two
13737// easily addressable 16 bit operands.
13738static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
13739                                SDValue &OtherOp) {
13740 int Low16 = PermMask & 0xffff;
13741 int Hi16 = (PermMask & 0xffff0000) >> 16;
13742
13743 auto TempOp = peekThroughBitcasts(Op);
13744 auto TempOtherOp = peekThroughBitcasts(OtherOp);
13745
13746 auto OpIs16Bit =
13747 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
13748 if (!OpIs16Bit)
13749 return true;
13750
13751 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13752 isExtendedFrom16Bits(TempOtherOp);
13753 if (!OtherOpIs16Bit)
13754 return true;
13755
13756 // Do we cleanly address both
13757 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
13758}
13759
13760static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
13761                                  unsigned DWordOffset) {
13762 SDValue Ret;
13763
13764 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
13765 // ByteProvider must be at least 8 bits
13766 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
13767
13768 if (TypeSize <= 32)
13769 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
13770
13771 if (Src.getValueType().isVector()) {
13772 auto ScalarTySize = Src.getScalarValueSizeInBits();
13773 auto ScalarTy = Src.getValueType().getScalarType();
13774 if (ScalarTySize == 32) {
13775 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
13776 DAG.getConstant(DWordOffset, SL, MVT::i32));
13777 }
13778 if (ScalarTySize > 32) {
13779 Ret = DAG.getNode(
13780 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
13781 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
13782 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
13783 if (ShiftVal)
13784 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
13785 DAG.getConstant(ShiftVal, SL, MVT::i32));
13786 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13787 }
13788
13789 assert(ScalarTySize < 32);
13790 auto NumElements = TypeSize / ScalarTySize;
13791 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
13792 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
13793 auto NumElementsIn32 = 32 / ScalarTySize;
13794 auto NumAvailElements = DWordOffset < Trunc32Elements
13795 ? NumElementsIn32
13796 : NumElements - NormalizedTrunc;
13797
13798    SmallVector<SDValue, 4> VecSrcs;
13799    DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
13800 NumAvailElements);
13801
13802 Ret = DAG.getBuildVector(
13803 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
13804 VecSrcs);
13805 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13806 }
13807
13808 /// Scalar Type
13809 auto ShiftVal = 32 * DWordOffset;
13810 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
13811 DAG.getConstant(ShiftVal, SL, MVT::i32));
13812 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13813}
13814
13815static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
13816  SelectionDAG &DAG = DCI.DAG;
13817 [[maybe_unused]] EVT VT = N->getValueType(0);
13818  SmallVector<ByteProvider<SDValue>, 8> PermNodes;
13819
13820 // VT is known to be MVT::i32, so we need to provide 4 bytes.
13821 assert(VT == MVT::i32);
13822 for (int i = 0; i < 4; i++) {
13823 // Find the ByteProvider that provides the ith byte of the result of OR
13824 std::optional<ByteProvider<SDValue>> P =
13825 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
13826 // TODO support constantZero
13827 if (!P || P->isConstantZero())
13828 return SDValue();
13829
13830 PermNodes.push_back(*P);
13831 }
13832 if (PermNodes.size() != 4)
13833 return SDValue();
13834
13835 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
13836 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
13837 uint64_t PermMask = 0x00000000;
13838 for (size_t i = 0; i < PermNodes.size(); i++) {
13839 auto PermOp = PermNodes[i];
13840 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
13841 // by sizeof(Src2) = 4
13842 int SrcByteAdjust = 4;
13843
13844    // If the Src uses a byte from a different DWORD, then it corresponds
13845    // with a different source.
13846 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
13847 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
13848 if (SecondSrc)
13849 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
13850 ((PermOp.SrcOffset / 4) != SecondSrc->second))
13851 return SDValue();
13852
13853 // Set the index of the second distinct Src node
13854 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
13855 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
13856 SrcByteAdjust = 0;
13857 }
13858 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
13859    assert(!DAG.getDataLayout().isBigEndian());
13860    PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
13861 }
13862 SDLoc DL(N);
13863 SDValue Op = *PermNodes[FirstSrc.first].Src;
13864 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
13865 assert(Op.getValueSizeInBits() == 32);
13866
13867 // Check that we are not just extracting the bytes in order from an op
13868 if (!SecondSrc) {
13869 int Low16 = PermMask & 0xffff;
13870 int Hi16 = (PermMask & 0xffff0000) >> 16;
13871
13872 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
13873 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
13874
13875 // The perm op would really just produce Op. So combine into Op
13876 if (WellFormedLow && WellFormedHi)
13877 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
13878 }
13879
13880 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
13881
13882 if (SecondSrc) {
13883 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
13884 assert(OtherOp.getValueSizeInBits() == 32);
13885 }
13886
13887 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
13888
13889 assert(Op.getValueType().isByteSized() &&
13890 OtherOp.getValueType().isByteSized());
13891
13892 // If the ultimate src is less than 32 bits, then we will only be
13893 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
13894 // CalculateByteProvider would not have returned Op as source if we
13895 // used a byte that is outside its ValueType. Thus, we are free to
13896 // ANY_EXTEND as the extended bits are dont-cares.
13897 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
13898 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
13899
13900 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
13901 DAG.getConstant(PermMask, DL, MVT::i32));
13902 }
13903 return SDValue();
13904}
13905
13906SDValue SITargetLowering::performOrCombine(SDNode *N,
13907 DAGCombinerInfo &DCI) const {
13908 SelectionDAG &DAG = DCI.DAG;
13909 SDValue LHS = N->getOperand(0);
13910 SDValue RHS = N->getOperand(1);
13911
13912 EVT VT = N->getValueType(0);
13913 if (VT == MVT::i1) {
13914 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
13915 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13916 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
13917 SDValue Src = LHS.getOperand(0);
13918 if (Src != RHS.getOperand(0))
13919 return SDValue();
13920
13921 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
13922 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13923 if (!CLHS || !CRHS)
13924 return SDValue();
13925
13926 // Only 10 bits are used.
13927 static const uint32_t MaxMask = 0x3ff;
13928
13929 uint32_t NewMask =
13930 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
13931 SDLoc DL(N);
13932 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
13933 DAG.getConstant(NewMask, DL, MVT::i32));
13934 }
13935
13936 return SDValue();
13937 }
13938
13939 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13940  if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
13941      LHS.getOpcode() == AMDGPUISD::PERM &&
13942 isa<ConstantSDNode>(LHS.getOperand(2))) {
13943 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
13944 if (!Sel)
13945 return SDValue();
13946
13947 Sel |= LHS.getConstantOperandVal(2);
13948 SDLoc DL(N);
13949 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13950 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13951 }
13952
13953 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
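  // Illustrative example: or (and x, 0xff), (shl y, 24) has per-operand masks
  // 0x0c0c0c00 and 0x000c0c0c; the used lanes are disjoint, so it lowers to
  // v_perm_b32 y, x, 0x040c0c00, producing {y[0], 0, 0, x[0]}.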
13954 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13955 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13956 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13957
13958 // If all the uses of an or need to extract the individual elements, do not
13959 // attempt to lower into v_perm
13960 auto usesCombinedOperand = [](SDNode *OrUse) {
13961 // If we have any non-vectorized use, then it is a candidate for v_perm
13962 if (OrUse->getOpcode() != ISD::BITCAST ||
13963 !OrUse->getValueType(0).isVector())
13964 return true;
13965
13966 // If we have any non-vectorized use, then it is a candidate for v_perm
13967 for (auto *VUser : OrUse->users()) {
13968 if (!VUser->getValueType(0).isVector())
13969 return true;
13970
13971 // If the use of a vector is a store, then combining via a v_perm
13972 // is beneficial.
13973 // TODO -- whitelist more uses
13974 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
13975 if (VUser->getOpcode() == VectorwiseOp)
13976 return true;
13977 }
13978 return false;
13979 };
13980
13981 if (!any_of(N->users(), usesCombinedOperand))
13982 return SDValue();
13983
13984 uint32_t LHSMask = getPermuteMask(LHS);
13985 uint32_t RHSMask = getPermuteMask(RHS);
13986
13987 if (LHSMask != ~0u && RHSMask != ~0u) {
13988 // Canonicalize the expression in an attempt to have fewer unique masks
13989 // and therefore fewer registers used to hold the masks.
13990 if (LHSMask > RHSMask) {
13991 std::swap(LHSMask, RHSMask);
13992 std::swap(LHS, RHS);
13993 }
13994
13995      // Select 0xc for each lane used from the source operand. Zero has the
13996      // 0xc mask set, 0xff has 0xff in the mask, actual lanes are in the 0-3 range.
13997 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13998 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13999
14000      // Check if we need to combine values from two sources within a byte.
14001 if (!(LHSUsedLanes & RHSUsedLanes) &&
14002 // If we select high and lower word keep it for SDWA.
14003 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
14004 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14005 // Kill zero bytes selected by other mask. Zero value is 0xc.
14006 LHSMask &= ~RHSUsedLanes;
14007 RHSMask &= ~LHSUsedLanes;
14008 // Add 4 to each active LHS lane
14009 LHSMask |= LHSUsedLanes & 0x04040404;
14010 // Combine masks
14011 uint32_t Sel = LHSMask | RHSMask;
14012 SDLoc DL(N);
14013
14014 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14015 RHS.getOperand(0),
14016 DAG.getConstant(Sel, DL, MVT::i32));
14017 }
14018 }
14019 if (LHSMask == ~0u || RHSMask == ~0u) {
14020 if (SDValue Perm = matchPERM(N, DCI))
14021 return Perm;
14022 }
14023 }
14024
14025 // Detect identity v2i32 OR and replace with identity source node.
14026 // Specifically an Or that has operands constructed from the same source node
14027 // via extract_vector_elt and build_vector. I.E.
14028 // v2i32 or(
14029 // v2i32 build_vector(
14030 // i32 extract_elt(%IdentitySrc, 0),
14031 // i32 0
14032 // ),
14033 // v2i32 build_vector(
14034 // i32 0,
14035 // i32 extract_elt(%IdentitySrc, 1)
14036 // ) )
14037 // =>
14038 // v2i32 %IdentitySrc
14039
14040 if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
14041 RHS->getOpcode() == ISD::BUILD_VECTOR) {
14042
14043 ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
14044 ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
14045
14046 // Test for and normalise build vectors.
14047 if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
14048
14049 // Get the extract_vector_element operands.
14050 SDValue LEVE = LHS->getOperand(0);
14051 SDValue REVE = RHS->getOperand(1);
14052
14053 if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
14054          REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
14055        // Check that different elements from the same vector are
14056 // extracted.
14057 if (LEVE->getOperand(0) == REVE->getOperand(0) &&
14058 LEVE->getOperand(1) != REVE->getOperand(1)) {
14059 SDValue IdentitySrc = LEVE.getOperand(0);
14060 return IdentitySrc;
14061 }
14062 }
14063 }
14064 }
14065
14066 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14067 return SDValue();
14068
14069 // TODO: This could be a generic combine with a predicate for extracting the
14070 // high half of an integer being free.
14071
14072 // (or i64:x, (zero_extend i32:y)) ->
14073 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
14074 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
14075 RHS.getOpcode() != ISD::ZERO_EXTEND)
14076 std::swap(LHS, RHS);
14077
14078 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
14079 SDValue ExtSrc = RHS.getOperand(0);
14080 EVT SrcVT = ExtSrc.getValueType();
14081 if (SrcVT == MVT::i32) {
14082 SDLoc SL(N);
14083 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
14084 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
14085
14086 DCI.AddToWorklist(LowOr.getNode());
14087 DCI.AddToWorklist(HiBits.getNode());
14088
14089 SDValue Vec =
14090 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
14091 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
14092 }
14093 }
14094
14095 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
14096 if (CRHS) {
14097 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
14098 N->getOperand(0), CRHS))
14099 return Split;
14100 }
14101
14102 return SDValue();
14103}
14104
14105SDValue SITargetLowering::performXorCombine(SDNode *N,
14106 DAGCombinerInfo &DCI) const {
14107 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
14108 return RV;
14109
14110 SDValue LHS = N->getOperand(0);
14111 SDValue RHS = N->getOperand(1);
14112
14113 const ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
14114 SelectionDAG &DAG = DCI.DAG;
14115
14116 EVT VT = N->getValueType(0);
14117 if (CRHS && VT == MVT::i64) {
14118 if (SDValue Split =
14119 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
14120 return Split;
14121 }
14122
14123  // v2i32 (xor (vselect cc, x, y), K) ->
14124  // (v2i32 vselect cc, (xor x, K), (xor y, K)). This enables the xor to be
14125  // replaced with source modifiers when the select is lowered to CNDMASK.
14126 unsigned Opc = LHS.getOpcode();
14127 if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
14128 (Opc == ISD::SELECT && VT == MVT::i64)) &&
14129 CRHS && CRHS->getAPIntValue().isSignMask()) {
14130 SDValue CC = LHS->getOperand(0);
14131 SDValue TRUE = LHS->getOperand(1);
14132 SDValue FALSE = LHS->getOperand(2);
14133 SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS);
14134 SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS);
14135 SDValue XSelect =
14136 DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
14137 return XSelect;
14138 }
14139
14140 // Make sure to apply the 64-bit constant splitting fold before trying to fold
14141 // fneg-like xors into 64-bit select.
14142 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
14143 // This looks like an fneg, try to fold as a source modifier.
14144 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
14146 // xor (select c, a, b), 0x80000000 ->
14147 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
14148 SDLoc DL(N);
14149 SDValue CastLHS =
14150 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
14151 SDValue CastRHS =
14152 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
14153 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
14154 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
14155 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
14156 LHS->getOperand(0), FNegLHS, FNegRHS);
14157 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
14158 }
14159 }
14160
14161 return SDValue();
14162}
14163
14164SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
14165 DAGCombinerInfo &DCI) const {
14166 if (!Subtarget->has16BitInsts() ||
14167 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14168 return SDValue();
14169
14170 EVT VT = N->getValueType(0);
14171 if (VT != MVT::i32)
14172 return SDValue();
14173
14174 SDValue Src = N->getOperand(0);
14175 if (Src.getValueType() != MVT::i16)
14176 return SDValue();
14177
14178 return SDValue();
14179}
14180
14181SDValue
14182SITargetLowering::performSignExtendInRegCombine(SDNode *N,
14183 DAGCombinerInfo &DCI) const {
14184 SDValue Src = N->getOperand(0);
14185 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
14186
14187 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
14188 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
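  // Illustrative example: sign_extend_inreg (SBUFFER_LOAD_UBYTE rsrc, off,
  // cpol), i8 becomes trunc (SBUFFER_LOAD_BYTE rsrc, off, cpol), letting the
  // scalar load itself perform the sign extension.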
14189 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
14190 VTSign->getVT() == MVT::i8) ||
14191 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
14192 VTSign->getVT() == MVT::i16))) {
14193 assert(Subtarget->hasScalarSubwordLoads() &&
14194 "s_buffer_load_{u8, i8} are supported "
14195 "in GFX12 (or newer) architectures.");
14196 EVT VT = Src.getValueType();
14197 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
14198                       ? AMDGPUISD::SBUFFER_LOAD_BYTE
14199                       : AMDGPUISD::SBUFFER_LOAD_SHORT;
14200    SDLoc DL(N);
14201 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
14202 SDValue Ops[] = {
14203 Src.getOperand(0), // source register
14204 Src.getOperand(1), // offset
14205 Src.getOperand(2) // cachePolicy
14206 };
14207 auto *M = cast<MemSDNode>(Src);
14208 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
14209 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14210 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
14211 return LoadVal;
14212 }
14213 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
14214 VTSign->getVT() == MVT::i8) ||
14215 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
14216 VTSign->getVT() == MVT::i16)) &&
14217 Src.hasOneUse()) {
14218 auto *M = cast<MemSDNode>(Src);
14219 SDValue Ops[] = {Src.getOperand(0), // Chain
14220 Src.getOperand(1), // rsrc
14221 Src.getOperand(2), // vindex
14222 Src.getOperand(3), // voffset
14223 Src.getOperand(4), // soffset
14224 Src.getOperand(5), // offset
14225 Src.getOperand(6), Src.getOperand(7)};
14226 // replace with BUFFER_LOAD_BYTE/SHORT
14227 SDVTList ResList =
14228 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
14229 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
14230                       ? AMDGPUISD::BUFFER_LOAD_BYTE
14231                       : AMDGPUISD::BUFFER_LOAD_SHORT;
14232    SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
14233 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14234 return DCI.DAG.getMergeValues(
14235 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
14236 }
14237 return SDValue();
14238}
14239
14240SDValue SITargetLowering::performClassCombine(SDNode *N,
14241 DAGCombinerInfo &DCI) const {
14242 SelectionDAG &DAG = DCI.DAG;
14243 SDValue Mask = N->getOperand(1);
14244
14245 // fp_class x, 0 -> false
14246 if (isNullConstant(Mask))
14247 return DAG.getConstant(0, SDLoc(N), MVT::i1);
14248
14249 if (N->getOperand(0).isUndef())
14250 return DAG.getUNDEF(MVT::i1);
14251
14252 return SDValue();
14253}
14254
14255SDValue SITargetLowering::performRcpCombine(SDNode *N,
14256 DAGCombinerInfo &DCI) const {
14257 EVT VT = N->getValueType(0);
14258 SDValue N0 = N->getOperand(0);
14259
14260 if (N0.isUndef()) {
14261 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
14262 SDLoc(N), VT);
14263 }
14264
14265 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
14266 N0.getOpcode() == ISD::SINT_TO_FP)) {
14267 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
14268 N->getFlags());
14269 }
14270
14271 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
14272 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
14273 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
14274 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
14275 N->getFlags());
14276 }
14277
14278  return SDValue();
14279}
14280
14281bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
14282                                       unsigned MaxDepth) const {
14283 unsigned Opcode = Op.getOpcode();
14284 if (Opcode == ISD::FCANONICALIZE)
14285 return true;
14286
14287 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14288 const auto &F = CFP->getValueAPF();
14289 if (F.isNaN() && F.isSignaling())
14290 return false;
14291 if (!F.isDenormal())
14292 return true;
14293
14294 DenormalMode Mode =
14295 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
14296 return Mode == DenormalMode::getIEEE();
14297 }
14298
14299 // If source is a result of another standard FP operation it is already in
14300 // canonical form.
14301 if (MaxDepth == 0)
14302 return false;
14303
14304 switch (Opcode) {
14305 // These will flush denorms if required.
14306 case ISD::FADD:
14307 case ISD::FSUB:
14308 case ISD::FMUL:
14309 case ISD::FCEIL:
14310 case ISD::FFLOOR:
14311 case ISD::FMA:
14312 case ISD::FMAD:
14313 case ISD::FSQRT:
14314 case ISD::FDIV:
14315 case ISD::FREM:
14316 case ISD::FP_ROUND:
14317 case ISD::FP_EXTEND:
14318 case ISD::FP16_TO_FP:
14319 case ISD::FP_TO_FP16:
14320 case ISD::BF16_TO_FP:
14321 case ISD::FP_TO_BF16:
14322  case ISD::FLDEXP:
14323  case AMDGPUISD::FMUL_LEGACY:
14324  case AMDGPUISD::FMAD_FTZ:
14325  case AMDGPUISD::RCP:
14326  case AMDGPUISD::RSQ:
14327  case AMDGPUISD::RSQ_CLAMP:
14328  case AMDGPUISD::RCP_LEGACY:
14329  case AMDGPUISD::RCP_IFLAG:
14330  case AMDGPUISD::LOG:
14331  case AMDGPUISD::EXP:
14332  case AMDGPUISD::DIV_SCALE:
14333  case AMDGPUISD::DIV_FMAS:
14334  case AMDGPUISD::DIV_FIXUP:
14335  case AMDGPUISD::FRACT:
14336  case AMDGPUISD::CVT_PKRTZ_F16_F32:
14337  case AMDGPUISD::CVT_F32_UBYTE0:
14338  case AMDGPUISD::CVT_F32_UBYTE1:
14339  case AMDGPUISD::CVT_F32_UBYTE2:
14340  case AMDGPUISD::CVT_F32_UBYTE3:
14341  case AMDGPUISD::FP_TO_FP16:
14342  case AMDGPUISD::SIN_HW:
14343 case AMDGPUISD::COS_HW:
14344 return true;
14345
14346 // It can/will be lowered or combined as a bit operation.
14347 // Need to check their input recursively to handle.
14348 case ISD::FNEG:
14349 case ISD::FABS:
14350 case ISD::FCOPYSIGN:
14351 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14352
14353 case ISD::AND:
14354 if (Op.getValueType() == MVT::i32) {
14355 // Be careful as we only know it is a bitcast floating point type. It
14356 // could be f32, v2f16, we have no way of knowing. Luckily the constant
14357 // value that we optimize for, which comes up in fp32 to bf16 conversions,
14358 // is valid to optimize for all types.
14359 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
14360 if (RHS->getZExtValue() == 0xffff0000) {
14361 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14362 }
14363 }
14364 }
14365 break;
14366
14367 case ISD::FSIN:
14368 case ISD::FCOS:
14369 case ISD::FSINCOS:
14370 return Op.getValueType().getScalarType() != MVT::f16;
14371
14372 case ISD::FMINNUM:
14373 case ISD::FMAXNUM:
14374 case ISD::FMINNUM_IEEE:
14375 case ISD::FMAXNUM_IEEE:
14376 case ISD::FMINIMUM:
14377 case ISD::FMAXIMUM:
14378 case ISD::FMINIMUMNUM:
14379 case ISD::FMAXIMUMNUM:
14380 case AMDGPUISD::CLAMP:
14381 case AMDGPUISD::FMED3:
14382 case AMDGPUISD::FMAX3:
14383 case AMDGPUISD::FMIN3:
14384  case AMDGPUISD::FMAXIMUM3:
14385  case AMDGPUISD::FMINIMUM3: {
14386    // FIXME: Shouldn't treat the generic operations differently based on these.
14387    // However, we aren't really required to flush the result from
14388    // minnum/maxnum...
14389
14390 // snans will be quieted, so we only need to worry about denormals.
14391 if (Subtarget->supportsMinMaxDenormModes() ||
14392 // FIXME: denormalsEnabledForType is broken for dynamic
14393 denormalsEnabledForType(DAG, Op.getValueType()))
14394 return true;
14395
14396 // Flushing may be required.
14397 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
14398 // targets need to check their input recursively.
14399
14400 // FIXME: Does this apply with clamp? It's implemented with max.
14401 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
14402 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
14403 return false;
14404 }
14405
14406 return true;
14407 }
14408 case ISD::SELECT: {
14409 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
14410 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
14411 }
14412 case ISD::BUILD_VECTOR: {
14413 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
14414 SDValue SrcOp = Op.getOperand(i);
14415 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
14416 return false;
14417 }
14418
14419 return true;
14420 }
14421  case ISD::EXTRACT_VECTOR_ELT:
14422  case ISD::EXTRACT_SUBVECTOR: {
14423    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14424  }
14425  case ISD::INSERT_VECTOR_ELT: {
14426    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
14427 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
14428 }
14429 case ISD::UNDEF:
14430 // Could be anything.
14431 return false;
14432
14433 case ISD::BITCAST:
14434 // TODO: This is incorrect as it loses track of the operand's type. We may
14435 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
14436 // same bits that are canonicalized in one type need not be in the other.
14437 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14438 case ISD::TRUNCATE: {
14439 // Hack around the mess we make when legalizing extract_vector_elt.
14440 if (Op.getValueType() == MVT::i16) {
14441 SDValue TruncSrc = Op.getOperand(0);
14442 if (TruncSrc.getValueType() == MVT::i32 &&
14443 TruncSrc.getOpcode() == ISD::BITCAST &&
14444 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
14445 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
14446 }
14447 }
14448 return false;
14449 }
14450 case ISD::INTRINSIC_WO_CHAIN: {
14451 unsigned IntrinsicID = Op.getConstantOperandVal(0);
14452 // TODO: Handle more intrinsics
14453 switch (IntrinsicID) {
14454 case Intrinsic::amdgcn_cvt_pkrtz:
14455 case Intrinsic::amdgcn_cubeid:
14456 case Intrinsic::amdgcn_frexp_mant:
14457 case Intrinsic::amdgcn_fdot2:
14458 case Intrinsic::amdgcn_rcp:
14459 case Intrinsic::amdgcn_rsq:
14460 case Intrinsic::amdgcn_rsq_clamp:
14461 case Intrinsic::amdgcn_rcp_legacy:
14462 case Intrinsic::amdgcn_rsq_legacy:
14463 case Intrinsic::amdgcn_trig_preop:
14464 case Intrinsic::amdgcn_tanh:
14465 case Intrinsic::amdgcn_log:
14466 case Intrinsic::amdgcn_exp2:
14467 case Intrinsic::amdgcn_sqrt:
14468 return true;
14469 default:
14470 break;
14471 }
14472
14473 break;
14474 }
14475 default:
14476 break;
14477 }
14478
14479 // FIXME: denormalsEnabledForType is broken for dynamic
14480 return denormalsEnabledForType(DAG, Op.getValueType()) &&
14481 DAG.isKnownNeverSNaN(Op);
14482}
14483
14484bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
14485 unsigned MaxDepth) const {
14486 const MachineRegisterInfo &MRI = MF.getRegInfo();
14487 MachineInstr *MI = MRI.getVRegDef(Reg);
14488 unsigned Opcode = MI->getOpcode();
14489
14490 if (Opcode == AMDGPU::G_FCANONICALIZE)
14491 return true;
14492
14493 std::optional<FPValueAndVReg> FCR;
14494 // Constant splat (can be padded with undef) or scalar constant.
14496 if (FCR->Value.isSignaling())
14497 return false;
14498 if (!FCR->Value.isDenormal())
14499 return true;
14500
14501 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
14502 return Mode == DenormalMode::getIEEE();
14503 }
14504
14505 if (MaxDepth == 0)
14506 return false;
14507
14508 switch (Opcode) {
14509 case AMDGPU::G_FADD:
14510 case AMDGPU::G_FSUB:
14511 case AMDGPU::G_FMUL:
14512 case AMDGPU::G_FCEIL:
14513 case AMDGPU::G_FFLOOR:
14514 case AMDGPU::G_FRINT:
14515 case AMDGPU::G_FNEARBYINT:
14516 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14517 case AMDGPU::G_INTRINSIC_TRUNC:
14518 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14519 case AMDGPU::G_FMA:
14520 case AMDGPU::G_FMAD:
14521 case AMDGPU::G_FSQRT:
14522 case AMDGPU::G_FDIV:
14523 case AMDGPU::G_FREM:
14524 case AMDGPU::G_FPOW:
14525 case AMDGPU::G_FPEXT:
14526 case AMDGPU::G_FLOG:
14527 case AMDGPU::G_FLOG2:
14528 case AMDGPU::G_FLOG10:
14529 case AMDGPU::G_FPTRUNC:
14530 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14531 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14532 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14533 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14534 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14535 return true;
14536 case AMDGPU::G_FNEG:
14537 case AMDGPU::G_FABS:
14538 case AMDGPU::G_FCOPYSIGN:
14539 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
14540 case AMDGPU::G_FMINNUM:
14541 case AMDGPU::G_FMAXNUM:
14542 case AMDGPU::G_FMINNUM_IEEE:
14543 case AMDGPU::G_FMAXNUM_IEEE:
14544 case AMDGPU::G_FMINIMUM:
14545 case AMDGPU::G_FMAXIMUM:
14546 case AMDGPU::G_FMINIMUMNUM:
14547 case AMDGPU::G_FMAXIMUMNUM: {
14548 if (Subtarget->supportsMinMaxDenormModes() ||
14549 // FIXME: denormalsEnabledForType is broken for dynamic
14550 denormalsEnabledForType(MRI.getType(Reg), MF))
14551 return true;
14552
14553 [[fallthrough]];
14554 }
14555 case AMDGPU::G_BUILD_VECTOR:
14556 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
14557 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
14558 return false;
14559 return true;
14560 case AMDGPU::G_INTRINSIC:
14561 case AMDGPU::G_INTRINSIC_CONVERGENT:
14562 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
14563 case Intrinsic::amdgcn_fmul_legacy:
14564 case Intrinsic::amdgcn_fmad_ftz:
14565 case Intrinsic::amdgcn_sqrt:
14566 case Intrinsic::amdgcn_fmed3:
14567 case Intrinsic::amdgcn_sin:
14568 case Intrinsic::amdgcn_cos:
14569 case Intrinsic::amdgcn_log:
14570 case Intrinsic::amdgcn_exp2:
14571 case Intrinsic::amdgcn_log_clamp:
14572 case Intrinsic::amdgcn_rcp:
14573 case Intrinsic::amdgcn_rcp_legacy:
14574 case Intrinsic::amdgcn_rsq:
14575 case Intrinsic::amdgcn_rsq_clamp:
14576 case Intrinsic::amdgcn_rsq_legacy:
14577 case Intrinsic::amdgcn_div_scale:
14578 case Intrinsic::amdgcn_div_fmas:
14579 case Intrinsic::amdgcn_div_fixup:
14580 case Intrinsic::amdgcn_fract:
14581 case Intrinsic::amdgcn_cvt_pkrtz:
14582 case Intrinsic::amdgcn_cubeid:
14583 case Intrinsic::amdgcn_cubema:
14584 case Intrinsic::amdgcn_cubesc:
14585 case Intrinsic::amdgcn_cubetc:
14586 case Intrinsic::amdgcn_frexp_mant:
14587 case Intrinsic::amdgcn_fdot2:
14588 case Intrinsic::amdgcn_trig_preop:
14589 case Intrinsic::amdgcn_tanh:
14590 return true;
14591 default:
14592 break;
14593 }
14594
14595 [[fallthrough]];
14596 default:
14597 return false;
14598 }
14599
14600 llvm_unreachable("invalid operation");
14601}
14602
14603// Constant fold canonicalize.
14604SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
14605 const SDLoc &SL, EVT VT,
14606 const APFloat &C) const {
14607 // Flush denormals to 0 if not enabled.
14608 if (C.isDenormal()) {
14609 DenormalMode Mode =
14610 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
14611 if (Mode == DenormalMode::getPreserveSign()) {
14612 return DAG.getConstantFP(
14613 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
14614 }
14615
14616 if (Mode != DenormalMode::getIEEE())
14617 return SDValue();
14618 }
14619
14620 if (C.isNaN()) {
14621 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
14622 if (C.isSignaling()) {
14623 // Quiet a signaling NaN.
14624 // FIXME: Is this supposed to preserve payload bits?
14625 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14626 }
14627
14628 // Make sure it is the canonical NaN bitpattern.
14629 //
14630 // TODO: Can we use -1 as the canonical NaN value since it's an inline
14631 // immediate?
14632 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
14633 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14634 }
14635
14636 // Already canonical.
14637 return DAG.getConstantFP(C, SL, VT);
14638}
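// For illustration (values assumed, f32 semantics): with denormals set to
// PreserveSign, getCanonicalConstantFP above folds the smallest positive
// denormal to +0.0 and a negative denormal to -0.0; a signaling NaN such as
// 0x7fa00000 folds to the canonical quiet NaN 0x7fc00000, as does any qNaN
// whose payload differs from the canonical bit pattern.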
14639
14641 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
14642}
14643
14644SDValue
14645SITargetLowering::performFCanonicalizeCombine(SDNode *N,
14646 DAGCombinerInfo &DCI) const {
14647 SelectionDAG &DAG = DCI.DAG;
14648 SDValue N0 = N->getOperand(0);
14649 EVT VT = N->getValueType(0);
14650
14651 // fcanonicalize undef -> qnan
14652 if (N0.isUndef()) {
14654 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
14655 }
14656
14657 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
14658 EVT VT = N->getValueType(0);
14659 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
14660 }
14661
14662 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
14663 // (fcanonicalize k)
14664 //
14665 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
14666
14667 // TODO: This could be better with wider vectors that will be split to v2f16,
14668 // and to consider uses since there aren't that many packed operations.
14669 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
14670 isTypeLegal(MVT::v2f16)) {
14671 SDLoc SL(N);
14672 SDValue NewElts[2];
14673 SDValue Lo = N0.getOperand(0);
14674 SDValue Hi = N0.getOperand(1);
14675 EVT EltVT = Lo.getValueType();
14676
14678 for (unsigned I = 0; I != 2; ++I) {
14679 SDValue Op = N0.getOperand(I);
14680 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14681 NewElts[I] =
14682 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14683 } else if (Op.isUndef()) {
14684 // Handled below based on what the other operand is.
14685 NewElts[I] = Op;
14686 } else {
14687 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
14688 }
14689 }
14690
14691 // If one half is undef, and one is constant, prefer a splat vector rather
14692 // than the normal qNaN. If it's a register, prefer 0.0 since that's
14693 // cheaper to use and may be free with a packed operation.
14694 if (NewElts[0].isUndef()) {
14695 if (isa<ConstantFPSDNode>(NewElts[1]))
14696 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
14697 ? NewElts[1]
14698 : DAG.getConstantFP(0.0f, SL, EltVT);
14699 }
14700
14701 if (NewElts[1].isUndef()) {
14702 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
14703 ? NewElts[0]
14704 : DAG.getConstantFP(0.0f, SL, EltVT);
14705 }
14706
14707 return DAG.getBuildVector(VT, SL, NewElts);
14708 }
14709 }
14710
14711 return SDValue();
14712}
14713
14714static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
14715 switch (Opc) {
14716 case ISD::FMAXNUM:
14717 case ISD::FMAXNUM_IEEE:
14718 case ISD::FMAXIMUMNUM:
14719 return AMDGPUISD::FMAX3;
14720 case ISD::FMAXIMUM:
14721 return AMDGPUISD::FMAXIMUM3;
14722 case ISD::SMAX:
14723 return AMDGPUISD::SMAX3;
14724 case ISD::UMAX:
14725 return AMDGPUISD::UMAX3;
14726 case ISD::FMINNUM:
14727 case ISD::FMINNUM_IEEE:
14728 case ISD::FMINIMUMNUM:
14729 return AMDGPUISD::FMIN3;
14730 case ISD::FMINIMUM:
14731 return AMDGPUISD::FMINIMUM3;
14732 case ISD::SMIN:
14733 return AMDGPUISD::SMIN3;
14734 case ISD::UMIN:
14735 return AMDGPUISD::UMIN3;
14736 default:
14737 llvm_unreachable("Not a min/max opcode");
14738 }
14739}
14740
14741SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
14742 const SDLoc &SL, SDValue Src,
14743 SDValue MinVal,
14744 SDValue MaxVal,
14745 bool Signed) const {
14746
14747 // med3 comes from
14748 // min(max(x, K0), K1), K0 < K1
14749 // max(min(x, K0), K1), K1 < K0
14750 //
14751 // "MinVal" and "MaxVal" respectively refer to the rhs of the
14752 // min/max op.
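 // For example, in the signed case smin(smax(x, 2), 7) has K0 = 2 < K1 = 7 and
 // becomes smed3(x, 2, 7): the median of {x, 2, 7} clamps x into [2, 7], which
 // is exactly what the min/max pair computes. If instead K0 >= K1, the pattern
 // is rejected below, since the min/max pair would then just yield a constant.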
14753 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
14754 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
14755
14756 if (!MinK || !MaxK)
14757 return SDValue();
14758
14759 if (Signed) {
14760 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
14761 return SDValue();
14762 } else {
14763 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
14764 return SDValue();
14765 }
14766
14767 EVT VT = MinK->getValueType(0);
14768 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
14769 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
14770 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
14771
14772 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
14773 // not available, but this is unlikely to be profitable as constants
14774 // will often need to be materialized & extended, especially on
14775 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
14776 return SDValue();
14777}
14778
14779static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
14780 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
14781 return C;
14782
14783 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
14784 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
14785 return C;
14786 }
14787
14788 return nullptr;
14789}
14790
14791SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
14792 const SDLoc &SL, SDValue Op0,
14793 SDValue Op1) const {
14794 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
14795 if (!K1)
14796 return SDValue();
14797
14798 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
14799 if (!K0)
14800 return SDValue();
14801
14802 // Ordered >= (although NaN inputs should have folded away by now).
14803 if (K0->getValueAPF() > K1->getValueAPF())
14804 return SDValue();
14805
14806 // med3 with a nan input acts like
14807 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
14808 //
14809 // So with a signaling nan input, the result depends on whether the IEEE
14810 // mode bit is enabled or not.
14811 // ieee=1
14812 // s0 snan: yields s2
14813 // s1 snan: yields s2
14814 // s2 snan: qnan
14815
14816 // s0 qnan: min(s1, s2)
14817 // s1 qnan: min(s0, s2)
14818 // s2 qnan: min(s0, s1)
14819
14820 // ieee=0
14821 // s0 snan: min(s1, s2)
14822 // s1 snan: min(s0, s2)
14823 // s2 snan: qnan
14824
14825 // s0 qnan: min(s1, s2)
14826 // s1 qnan: min(s0, s2)
14827 // s2 qnan: min(s0, s1)
14828 const MachineFunction &MF = DAG.getMachineFunction();
14829 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
14830
14831 // TODO: Check whether the IEEE bit is enabled. We can form fmed3 with IEEE=0
14832 // regardless of whether the input is a signaling nan if op0 is fmaximum or
14833 // fmaximumnum. If op0 is fmaxnum_ieee, we can only form it when IEEE=1.
14834 EVT VT = Op0.getValueType();
14835 if (Info->getMode().DX10Clamp) {
14836 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
14837 // hardware fmed3 behavior converting to a min.
14838 // FIXME: Should this be allowing -0.0?
14839 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
14840 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
14841 }
14842
14843 // med3 for f16 is only available on gfx9+, and not available for v2f16.
14844 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
14845 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
14846 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
14847 // then give the other result, which is different from med3 with a NaN
14848 // input.
14849 SDValue Var = Op0.getOperand(0);
14850 if (!DAG.isKnownNeverSNaN(Var))
14851 return SDValue();
14852
14853 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14854
14855 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
14856 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
14857 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
14858 SDValue(K0, 0), SDValue(K1, 0));
14859 }
14860 }
14861
14862 return SDValue();
14863}
14864
14865/// \return true if the subtarget supports minimum3 and maximum3 with the given
14866/// base min/max opcode \p Opc for type \p VT.
14867static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
14868 EVT VT) {
14869 switch (Opc) {
14870 case ISD::FMINNUM:
14871 case ISD::FMAXNUM:
14872 case ISD::FMINNUM_IEEE:
14873 case ISD::FMAXNUM_IEEE:
14874 case ISD::FMINIMUMNUM:
14875 case ISD::FMAXIMUMNUM:
14878 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
14879 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
14880 case ISD::FMINIMUM:
14881 case ISD::FMAXIMUM:
14882 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
14883 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
14884 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
14885 case ISD::SMAX:
14886 case ISD::SMIN:
14887 case ISD::UMAX:
14888 case ISD::UMIN:
14889 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
14890 default:
14891 return false;
14892 }
14893
14894 llvm_unreachable("not a min/max opcode");
14895}
14896
14897SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
14898 DAGCombinerInfo &DCI) const {
14899 SelectionDAG &DAG = DCI.DAG;
14900
14901 EVT VT = N->getValueType(0);
14902 unsigned Opc = N->getOpcode();
14903 SDValue Op0 = N->getOperand(0);
14904 SDValue Op1 = N->getOperand(1);
14905
14906 // Only do this if the inner op has one use since this will just increase
14907 // register pressure for no benefit.
14908
14909 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
14910 // max(max(a, b), c) -> max3(a, b, c)
14911 // min(min(a, b), c) -> min3(a, b, c)
14912 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
14913 SDLoc DL(N);
14914 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14915 Op0.getOperand(0), Op0.getOperand(1), Op1);
14916 }
14917
14918 // Try commuted.
14919 // max(a, max(b, c)) -> max3(a, b, c)
14920 // min(a, min(b, c)) -> min3(a, b, c)
14921 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
14922 SDLoc DL(N);
14923 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14924 Op0, Op1.getOperand(0), Op1.getOperand(1));
14925 }
14926 }
14927
14928 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
14929 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
14930 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
14931 if (SDValue Med3 = performIntMed3ImmCombine(
14932 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
14933 return Med3;
14934 }
14935 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
14936 if (SDValue Med3 = performIntMed3ImmCombine(
14937 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
14938 return Med3;
14939 }
14940
14941 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
14942 if (SDValue Med3 = performIntMed3ImmCombine(
14943 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
14944 return Med3;
14945 }
14946 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
14947 if (SDValue Med3 = performIntMed3ImmCombine(
14948 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
14949 return Med3;
14950 }
14951
14952 // if !is_snan(x):
14953 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14954 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14955 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14956 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14957 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
14958 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
14959 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
14960 (Opc == AMDGPUISD::FMIN_LEGACY &&
14961 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
14962 (VT == MVT::f32 || VT == MVT::f64 ||
14963 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
14964 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
14965 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
14966 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
14967 Op0.hasOneUse()) {
14968 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
14969 return Res;
14970 }
14971
14972 // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
14973 // for some types, but at a higher cost since they are implemented with a
14974 // 3-operand form.
14975 const SDNodeFlags Flags = N->getFlags();
14976 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
14977 !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
14978 unsigned NewOpc =
14979 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
14980 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
14981 }
14982
14983 return SDValue();
14984}
14985
14986static bool isClampZeroToOne(SDValue A, SDValue B) {
14987 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
14988 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
14989 // FIXME: Should this be allowing -0.0?
14990 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
14991 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
14992 }
14993 }
14994
14995 return false;
14996}
14997
14998// FIXME: Should only worry about snans for version with chain.
14999SDValue SITargetLowering::performFMed3Combine(SDNode *N,
15000 DAGCombinerInfo &DCI) const {
15001 EVT VT = N->getValueType(0);
15002 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
15003 // NaNs. With a NaN input, the order of the operands may change the result.
15004
15005 SelectionDAG &DAG = DCI.DAG;
15006 SDLoc SL(N);
15007
15008 SDValue Src0 = N->getOperand(0);
15009 SDValue Src1 = N->getOperand(1);
15010 SDValue Src2 = N->getOperand(2);
15011
15012 if (isClampZeroToOne(Src0, Src1)) {
15013 // const_a, const_b, x -> clamp is safe in all cases including signaling
15014 // nans.
15015 // FIXME: Should this be allowing -0.0?
15016 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
15017 }
15018
15019 const MachineFunction &MF = DAG.getMachineFunction();
15020 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15021
15022 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
15023 // handling the no-dx10-clamp case?
15024 if (Info->getMode().DX10Clamp) {
15025 // If NaN is clamped to 0, we are free to reorder the inputs.
15026
15027 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15028 std::swap(Src0, Src1);
15029
15030 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
15031 std::swap(Src1, Src2);
15032
15033 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15034 std::swap(Src0, Src1);
15035
15036 if (isClampZeroToOne(Src1, Src2))
15037 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
15038 }
15039
15040 return SDValue();
15041}
15042
15043SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
15044 DAGCombinerInfo &DCI) const {
15045 SDValue Src0 = N->getOperand(0);
15046 SDValue Src1 = N->getOperand(1);
15047 if (Src0.isUndef() && Src1.isUndef())
15048 return DCI.DAG.getUNDEF(N->getValueType(0));
15049 return SDValue();
15050}
15051
15052// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
15053// expanded into a set of cmp/select instructions.
15055 unsigned NumElem,
15056 bool IsDivergentIdx,
15057 const GCNSubtarget *Subtarget) {
15059 return false;
15060
15061 unsigned VecSize = EltSize * NumElem;
15062
15063 // Sub-dword vectors of total size 2 dwords or less have a better implementation.
15064 if (VecSize <= 64 && EltSize < 32)
15065 return false;
15066
15067 // Always expand the remaining sub-dword vectors, otherwise they will be
15068 // lowered via memory.
15069 if (EltSize < 32)
15070 return true;
15071
15072 // Always do this if var-idx is divergent, otherwise it will become a loop.
15073 if (IsDivergentIdx)
15074 return true;
15075
15076 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
15077 unsigned NumInsts = NumElem /* Number of compares */ +
15078 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
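 // E.g. a uniform dynamic index into v8f32 (EltSize = 32, NumElem = 8) gives
 // NumInsts = 8 + 1 * 8 = 16: expanded when only VGPR indexing is available
 // (16 <= 16), but left to the movrel path when that is available (16 > 15).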
15079
15080 // On some architectures (GFX9) movrel is not available and it's better
15081 // to expand.
15082 if (Subtarget->useVGPRIndexMode())
15083 return NumInsts <= 16;
15084
15085 // If movrel is available, use it instead of expanding for vector of 8
15086 // elements.
15087 if (Subtarget->hasMovrel())
15088 return NumInsts <= 15;
15089
15090 return true;
15091}
15092
15094 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
15095 if (isa<ConstantSDNode>(Idx))
15096 return false;
15097
15098 SDValue Vec = N->getOperand(0);
15099 EVT VecVT = Vec.getValueType();
15100 EVT EltVT = VecVT.getVectorElementType();
15101 unsigned EltSize = EltVT.getSizeInBits();
15102 unsigned NumElem = VecVT.getVectorNumElements();
15103
15105 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
15106}
15107
15108SDValue
15109SITargetLowering::performExtractVectorEltCombine(SDNode *N,
15110 DAGCombinerInfo &DCI) const {
15111 SDValue Vec = N->getOperand(0);
15112 SelectionDAG &DAG = DCI.DAG;
15113
15114 EVT VecVT = Vec.getValueType();
15115 EVT VecEltVT = VecVT.getVectorElementType();
15116 EVT ResVT = N->getValueType(0);
15117
15118 unsigned VecSize = VecVT.getSizeInBits();
15119 unsigned VecEltSize = VecEltVT.getSizeInBits();
15120
15121 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
15123 SDLoc SL(N);
15124 SDValue Idx = N->getOperand(1);
15125 SDValue Elt =
15126 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
15127 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
15128 }
15129
15130 // (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
15131 // -> (and (extract_vector_element {y0, y1}, index), 0x1f)
15132 // There are optimisations to transform 64-bit shifts into 32-bit shifts
15133 // depending on the shift operand. See e.g. performSraCombine().
15134 // This combine ensures that the optimisation is compatible with v2i32
15135 // legalised AND.
15136 if (VecVT == MVT::v2i32 && Vec->getOpcode() == ISD::AND &&
15137 Vec->getOperand(1)->getOpcode() == ISD::BUILD_VECTOR) {
15138
15140 if (!C || C->getZExtValue() != 0x1f)
15141 return SDValue();
15142
15143 SDLoc SL(N);
15144 SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
15145 SDValue EVE = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
15146 Vec->getOperand(0), N->getOperand(1));
15147 SDValue A = DAG.getNode(ISD::AND, SL, MVT::i32, EVE, AndMask);
15148 DAG.ReplaceAllUsesWith(N, A.getNode());
15149 }
15150
15151 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
15152 // =>
15153 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
15154 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
15155 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
15156 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15157 SDLoc SL(N);
15158 SDValue Idx = N->getOperand(1);
15159 unsigned Opc = Vec.getOpcode();
15160
15161 switch (Opc) {
15162 default:
15163 break;
15164 // TODO: Support other binary operations.
15165 case ISD::FADD:
15166 case ISD::FSUB:
15167 case ISD::FMUL:
15168 case ISD::ADD:
15169 case ISD::UMIN:
15170 case ISD::UMAX:
15171 case ISD::SMIN:
15172 case ISD::SMAX:
15173 case ISD::FMAXNUM:
15174 case ISD::FMINNUM:
15175 case ISD::FMAXNUM_IEEE:
15176 case ISD::FMINNUM_IEEE:
15177 case ISD::FMAXIMUM:
15178 case ISD::FMINIMUM: {
15179 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15180 Vec.getOperand(0), Idx);
15181 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15182 Vec.getOperand(1), Idx);
15183
15184 DCI.AddToWorklist(Elt0.getNode());
15185 DCI.AddToWorklist(Elt1.getNode());
15186 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
15187 }
15188 }
15189 }
15190
15191 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
15193 SDLoc SL(N);
15194 SDValue Idx = N->getOperand(1);
15195 SDValue V;
15196 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15197 SDValue IC = DAG.getVectorIdxConstant(I, SL);
15198 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
15199 if (I == 0)
15200 V = Elt;
15201 else
15202 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
15203 }
15204 return V;
15205 }
15206
15207 // EXTRACT_VECTOR_ELT (v2i32 bitcast (i64/f64:k), Idx)
15208 // =>
15209 // i32:Lo(k) if Idx == 0, or
15210 // i32:Hi(k) if Idx == 1
15211 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
15212 if (Vec.getOpcode() == ISD::BITCAST && VecVT == MVT::v2i32 && Idx) {
15213 SDLoc SL(N);
15214 SDValue PeekThrough = Vec.getOperand(0);
15215 auto *KImm = dyn_cast<ConstantSDNode>(PeekThrough);
15216 if (KImm && KImm->getValueType(0).getSizeInBits() == 64) {
15217 uint64_t KImmValue = KImm->getZExtValue();
15218 return DAG.getConstant(
15219 (KImmValue >> (32 * Idx->getZExtValue())) & 0xffffffff, SL, MVT::i32);
15220 }
15221 auto *KFPImm = dyn_cast<ConstantFPSDNode>(PeekThrough);
15222 if (KFPImm && KFPImm->getValueType(0).getSizeInBits() == 64) {
15223 uint64_t KFPImmValue =
15224 KFPImm->getValueAPF().bitcastToAPInt().getZExtValue();
15225 return DAG.getConstant((KFPImmValue >> (32 * Idx->getZExtValue())) &
15226 0xffffffff,
15227 SL, MVT::i32);
15228 }
15229 }
15230
15231 if (!DCI.isBeforeLegalize())
15232 return SDValue();
15233
15234 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
15235 // elements. This exposes more load reduction opportunities by replacing
15236 // multiple small extract_vector_elements with a single 32-bit extract.
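 // E.g. extracting i8 element 5 from a loaded v8i8 becomes: bitcast the vector
 // to v2i32, extract i32 element 1 (bit index 40 / 32), shift right by the
 // leftover 8 bits, and truncate back to i8.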
15237 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
15238 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15239 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
15240
15241 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15242 unsigned EltIdx = BitIndex / 32;
15243 unsigned LeftoverBitIdx = BitIndex % 32;
15244 SDLoc SL(N);
15245
15246 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
15247 DCI.AddToWorklist(Cast.getNode());
15248
15249 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
15250 DAG.getConstant(EltIdx, SL, MVT::i32));
15251 DCI.AddToWorklist(Elt.getNode());
15252 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
15253 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
15254 DCI.AddToWorklist(Srl.getNode());
15255
15256 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
15257 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
15258 DCI.AddToWorklist(Trunc.getNode());
15259
15260 if (VecEltVT == ResVT) {
15261 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
15262 }
15263
15264 assert(ResVT.isScalarInteger());
15265 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
15266 }
15267
15268 return SDValue();
15269}
15270
15271SDValue
15272SITargetLowering::performInsertVectorEltCombine(SDNode *N,
15273 DAGCombinerInfo &DCI) const {
15274 SDValue Vec = N->getOperand(0);
15275 SDValue Idx = N->getOperand(2);
15276 EVT VecVT = Vec.getValueType();
15277 EVT EltVT = VecVT.getVectorElementType();
15278
15279 // INSERT_VECTOR_ELT (<n x e>, var-idx)
15280 // => BUILD_VECTOR n x select (e, const-idx)
15282 return SDValue();
15283
15284 SelectionDAG &DAG = DCI.DAG;
15285 SDLoc SL(N);
15286 SDValue Ins = N->getOperand(1);
15287 EVT IdxVT = Idx.getValueType();
15288
15290 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15291 SDValue IC = DAG.getConstant(I, SL, IdxVT);
15292 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
15293 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
15294 Ops.push_back(V);
15295 }
15296
15297 return DAG.getBuildVector(VecVT, SL, Ops);
15298}
15299
15300/// Return the source of an fp_extend from f16 to f32, or a converted FP
15301/// constant.
15303 if (Src.getOpcode() == ISD::FP_EXTEND &&
15304 Src.getOperand(0).getValueType() == MVT::f16) {
15305 return Src.getOperand(0);
15306 }
15307
15308 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
15309 APFloat Val = CFP->getValueAPF();
15310 bool LosesInfo = true;
15312 if (!LosesInfo)
15313 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
15314 }
15315
15316 return SDValue();
15317}
15318
15319SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
15320 DAGCombinerInfo &DCI) const {
15321 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15322 "combine only useful on gfx8");
15323
15324 SDValue TruncSrc = N->getOperand(0);
15325 EVT VT = N->getValueType(0);
15326 if (VT != MVT::f16)
15327 return SDValue();
15328
15329 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
15330 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
15331 return SDValue();
15332
15333 SelectionDAG &DAG = DCI.DAG;
15334 SDLoc SL(N);
15335
15336 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
15337 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
15338 // casting back.
15339
15340 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
15341 // fmin(fmax(a, b), fmax(fmin(a, b), c))
15342 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
15343 if (!A)
15344 return SDValue();
15345
15346 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
15347 if (!B)
15348 return SDValue();
15349
15350 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
15351 if (!C)
15352 return SDValue();
15353
15354 // This changes signaling nan behavior. If an input is a signaling nan, it
15355 // would have been quieted by the fpext originally. We don't care because
15356 // these are unconstrained ops. If we needed to insert quieting canonicalizes
15357 // we would be worse off than just doing the promotion.
15358 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
15359 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
15360 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
15361 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
15362}
15363
15364unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
15365 const SDNode *N0,
15366 const SDNode *N1) const {
15367 EVT VT = N0->getValueType(0);
15368
15369 // Only do this if we are not trying to support denormals. v_mad_f32 does not
15370 // support denormals ever.
15371 if (((VT == MVT::f32 &&
15373 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15376 return ISD::FMAD;
15377
15378 const TargetOptions &Options = DAG.getTarget().Options;
15379 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
15380 (N0->getFlags().hasAllowContract() &&
15381 N1->getFlags().hasAllowContract())) &&
15383 return ISD::FMA;
15384 }
15385
15386 return 0;
15387}
15388
15389// For a reassociatable opcode perform:
15390// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
15391SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
15392 SelectionDAG &DAG) const {
15393 EVT VT = N->getValueType(0);
15394 if (VT != MVT::i32 && VT != MVT::i64)
15395 return SDValue();
15396
15397 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
15398 return SDValue();
15399
15400 unsigned Opc = N->getOpcode();
15401 SDValue Op0 = N->getOperand(0);
15402 SDValue Op1 = N->getOperand(1);
15403
15404 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
15405 return SDValue();
15406
15407 if (Op0->isDivergent())
15408 std::swap(Op0, Op1);
15409
15410 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
15411 return SDValue();
15412
15413 SDValue Op2 = Op1.getOperand(1);
15414 Op1 = Op1.getOperand(0);
15415 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
15416 return SDValue();
15417
15418 if (Op1->isDivergent())
15419 std::swap(Op1, Op2);
15420
15421 SDLoc SL(N);
15422 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
15423 return DAG.getNode(Opc, SL, VT, Add1, Op2);
15424}
15425
15426static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
15427 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
15428 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
15429 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
15430 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
15431 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
15432}
15433
15434// Fold
15435// y = lshr i64 x, 32
15436// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
15437// with Const.hi == -1
15438// To
15439 // res = mad_u64_u32 y.lo, Const.lo, x.lo
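// Why this holds (mod 2^64): with y = x >> 32 we have (y << 32) == x - x.lo,
// so y * Const + x
//      == y * Const.lo + (y << 32) * Const.hi + x
//      == y * Const.lo - (y << 32) + x            (Const.hi == -1)
//      == y * Const.lo + x.lo,
// and y already fits in 32 bits, so a u32 mad of y, Const.lo and zext(x.lo)
// produces the full 64-bit result.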
15440static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
15441 SDValue MulLHS, SDValue MulRHS,
15442 SDValue AddRHS) {
15443 if (MulRHS.getOpcode() == ISD::SRL)
15444 std::swap(MulLHS, MulRHS);
15445
15446 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
15447 return SDValue();
15448
15449 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
15450 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
15451 MulLHS.getOperand(0) != AddRHS)
15452 return SDValue();
15453
15455 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
15456 return SDValue();
15457
15458 SDValue ConstMul =
15459 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
15460 return getMad64_32(DAG, SL, MVT::i64,
15461 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
15462 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
15463}
15464
15465// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
15466// multiplies, if any.
15467//
15468// Full 64-bit multiplies that feed into an addition are lowered here instead
15469// of using the generic expansion. The generic expansion ends up with
15470// a tree of ADD nodes that prevents us from using the "add" part of the
15471// MAD instruction. The expansion produced here results in a chain of ADDs
15472// instead of a tree.
15473SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
15474 DAGCombinerInfo &DCI) const {
15475 assert(N->isAnyAdd());
15476
15477 SelectionDAG &DAG = DCI.DAG;
15478 EVT VT = N->getValueType(0);
15479 SDLoc SL(N);
15480 SDValue LHS = N->getOperand(0);
15481 SDValue RHS = N->getOperand(1);
15482
15483 if (VT.isVector())
15484 return SDValue();
15485
15486 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
15487 // result in scalar registers for uniform values.
15488 if (!N->isDivergent() && Subtarget->hasSMulHi())
15489 return SDValue();
15490
15491 unsigned NumBits = VT.getScalarSizeInBits();
15492 if (NumBits <= 32 || NumBits > 64)
15493 return SDValue();
15494
15495 if (LHS.getOpcode() != ISD::MUL) {
15496 assert(RHS.getOpcode() == ISD::MUL);
15497 std::swap(LHS, RHS);
15498 }
15499
15500 // Avoid the fold if it would unduly increase the number of multiplies due to
15501 // multiple uses, except on hardware with full-rate multiply-add (which is
15502 // part of full-rate 64-bit ops).
15503 if (!Subtarget->hasFullRate64Ops()) {
15504 unsigned NumUsers = 0;
15505 for (SDNode *User : LHS->users()) {
15506 // There is a use that does not feed into addition, so the multiply can't
15507 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
15508 if (!User->isAnyAdd())
15509 return SDValue();
15510
15511 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
15512 // MUL + 3xADD + 3xADDC over 3xMAD.
15513 ++NumUsers;
15514 if (NumUsers >= 3)
15515 return SDValue();
15516 }
15517 }
15518
15519 SDValue MulLHS = LHS.getOperand(0);
15520 SDValue MulRHS = LHS.getOperand(1);
15521 SDValue AddRHS = RHS;
15522
15523 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
15524 return FoldedMAD;
15525
15526 // Always check whether operands are small unsigned values, since that
15527 // knowledge is useful in more cases. Check for small signed values only if
15528 // doing so can unlock a shorter code sequence.
15529 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
15530 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
15531
15532 bool MulSignedLo = false;
15533 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
15534 MulSignedLo =
15535 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
15536 }
15537
15538 // The operands and final result all have the same number of bits. If
15539 // operands need to be extended, they can be extended with garbage. The
15540 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
15541 // truncated away in the end.
15542 if (VT != MVT::i64) {
15543 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
15544 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
15545 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
15546 }
15547
15548 // The basic code generated is conceptually straightforward. Pseudo code:
15549 //
15550 // accum = mad_64_32 lhs.lo, rhs.lo, accum
15551 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
15552 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
15553 //
15554 // The second and third lines are optional, depending on whether the factors
15555 // are {sign,zero}-extended or not.
15556 //
15557 // The actual DAG is noisier than the pseudo code, but only due to
15558 // instructions that disassemble values into low and high parts, and
15559 // assemble the final result.
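 // The split follows from writing both factors in 32-bit halves:
 //   lhs * rhs + accum
 //     == (lhs.lo + 2^32 * lhs.hi) * (rhs.lo + 2^32 * rhs.hi) + accum
 //     == lhs.lo * rhs.lo + accum
 //        + 2^32 * (lhs.hi * rhs.lo + lhs.lo * rhs.hi)        (mod 2^64),
 // so the cross terms only affect the high 32 bits, and the lhs.hi * rhs.hi
 // term vanishes entirely modulo 2^64.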
15560 SDValue One = DAG.getConstant(1, SL, MVT::i32);
15561
15562 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
15563 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
15564 SDValue Accum =
15565 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
15566
15567 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
15568 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
15569
15570 if (!MulLHSUnsigned32) {
15571 auto MulLHSHi =
15572 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
15573 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
15574 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15575 }
15576
15577 if (!MulRHSUnsigned32) {
15578 auto MulRHSHi =
15579 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
15580 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
15581 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15582 }
15583
15584 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
15585 Accum = DAG.getBitcast(MVT::i64, Accum);
15586 }
15587
15588 if (VT != MVT::i64)
15589 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
15590 return Accum;
15591}
15592
15593SDValue
15594SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
15595 DAGCombinerInfo &DCI) const {
15596 SDValue RHS = N->getOperand(1);
15597 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
15598 if (!CRHS)
15599 return SDValue();
15600
15601 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
15602 // common.
15603 uint64_t Val = CRHS->getZExtValue();
15604 if (countr_zero(Val) >= 32) {
15605 SelectionDAG &DAG = DCI.DAG;
15606 SDLoc SL(N);
15607 SDValue LHS = N->getOperand(0);
15608
15609 // Avoid carry machinery if we know the low half of the add does not
15610 // contribute to the final result.
15611 //
15612 // add i64:x, K if computeTrailingZeros(K) >= 32
15613 // => build_pair (add x.hi, K.hi), x.lo
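 // E.g. add i64 %x, 0x500000000 becomes build_pair (add x.hi, 5), x.lo: the
 // low 32 bits of the constant are zero, so the low half of the sum is just
 // x.lo and no carry into the high half is possible.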
15614
15615 // Breaking the 64-bit add here with this strange constant is unlikely
15616 // to interfere with addressing mode patterns.
15617
15618 SDValue Hi = getHiHalf64(LHS, DAG);
15619 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
15620 unsigned Opcode = N->getOpcode();
15621 if (Opcode == ISD::PTRADD)
15622 Opcode = ISD::ADD;
15623 SDValue AddHi =
15624 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
15625
15626 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
15627 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
15628 }
15629
15630 return SDValue();
15631}
15632
15633 // Collect the ultimate src of each of the mul node's operands, and confirm
15634 // each operand is no wider than 8 bits.
15635static std::optional<ByteProvider<SDValue>>
15636handleMulOperand(const SDValue &MulOperand) {
15637 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
15638 if (!Byte0 || Byte0->isConstantZero()) {
15639 return std::nullopt;
15640 }
15641 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
15642 if (Byte1 && !Byte1->isConstantZero()) {
15643 return std::nullopt;
15644 }
15645 return Byte0;
15646}
15647
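// addPermMasks below merges two v_perm byte-select masks in which unused
// lanes hold the selector 0x0c (constant zero), assuming (as the callers here
// do) that at most one of the two masks selects real data in any given lane.
// For instance, combining 0x0c0c0c02 (a real selector only in lane 0) with
// 0x0c0c070c (a real selector only in lane 1) yields 0x0c0c0702: each lane
// keeps whichever mask supplied a real selector and stays 0x0c only where
// both masks were 0x0c.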
15648static unsigned addPermMasks(unsigned First, unsigned Second) {
15649 unsigned FirstCs = First & 0x0c0c0c0c;
15650 unsigned SecondCs = Second & 0x0c0c0c0c;
15651 unsigned FirstNoCs = First & ~0x0c0c0c0c;
15652 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
15653
15654 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
15655 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
15656 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
15657 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
15658
15659 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
15660}
15661
15662struct DotSrc {
15663 SDValue SrcOp;
15664 int64_t PermMask;
15665 int64_t DWordOffset;
15666};
15667
15671 SmallVectorImpl<DotSrc> &Src1s, int Step) {
15672
15673 assert(Src0.Src.has_value() && Src1.Src.has_value());
15674 // Src0s and Src1s are empty, just place arbitrarily.
15675 if (Step == 0) {
15676 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
15677 Src0.SrcOffset / 4});
15678 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
15679 Src1.SrcOffset / 4});
15680 return;
15681 }
15682
15683 for (int BPI = 0; BPI < 2; BPI++) {
15684 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
15685 if (BPI == 1) {
15686 BPP = {Src1, Src0};
15687 }
15688 unsigned ZeroMask = 0x0c0c0c0c;
15689 unsigned FMask = 0xFF << (8 * (3 - Step));
15690
15691 unsigned FirstMask =
15692 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15693 unsigned SecondMask =
15694 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15695 // Attempt to find the Src vector which contains our SDValue; if so, add our
15696 // perm mask to the existing one. If we are unable to find a match for the
15697 // first SDValue, attempt to find a match for the second.
15698 int FirstGroup = -1;
15699 for (int I = 0; I < 2; I++) {
15700 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
15701 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
15702 return IterElt.SrcOp == *BPP.first.Src &&
15703 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15704 };
15705
15706 auto *Match = llvm::find_if(Srcs, MatchesFirst);
15707 if (Match != Srcs.end()) {
15708 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
15709 FirstGroup = I;
15710 break;
15711 }
15712 }
15713 if (FirstGroup != -1) {
15714 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
15715 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
15716 return IterElt.SrcOp == *BPP.second.Src &&
15717 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15718 };
15719 auto *Match = llvm::find_if(Srcs, MatchesSecond);
15720 if (Match != Srcs.end()) {
15721 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
15722 } else
15723 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15724 return;
15725 }
15726 }
15727
15728 // If we have made it here, then we could not find a match in Src0s or Src1s
15729 // for either Src0 or Src1, so just place them arbitrarily.
15730
15731 unsigned ZeroMask = 0x0c0c0c0c;
15732 unsigned FMask = 0xFF << (8 * (3 - Step));
15733
15734 Src0s.push_back(
15735 {*Src0.Src,
15736 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15737 Src0.SrcOffset / 4});
15738 Src1s.push_back(
15739 {*Src1.Src,
15740 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15741 Src1.SrcOffset / 4});
15742}
15743
15745 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
15746 bool IsAny) {
15747
15748 // If we have just one source, permute it accordingly.
15749 if (Srcs.size() == 1) {
15750 auto *Elt = Srcs.begin();
15751 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
15752
15753 // v_perm will produce the original value
15754 if (Elt->PermMask == 0x3020100)
15755 return EltOp;
15756
15757 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15758 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
15759 }
15760
15761 auto *FirstElt = Srcs.begin();
15762 auto *SecondElt = std::next(FirstElt);
15763
15765
15766 // If we have multiple sources in the chain, combine them via perms (using
15767 // calculated perm mask) and Ors.
15768 while (true) {
15769 auto FirstMask = FirstElt->PermMask;
15770 auto SecondMask = SecondElt->PermMask;
15771
15772 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15773 unsigned FirstPlusFour = FirstMask | 0x04040404;
15774 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
15775 // original 0x0C.
15776 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
15777
15778 auto PermMask = addPermMasks(FirstMask, SecondMask);
15779 auto FirstVal =
15780 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15781 auto SecondVal =
15782 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
15783
15784 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
15785 SecondVal,
15786 DAG.getConstant(PermMask, SL, MVT::i32)));
15787
15788 FirstElt = std::next(SecondElt);
15789 if (FirstElt == Srcs.end())
15790 break;
15791
15792 SecondElt = std::next(FirstElt);
15793 // If we only have a FirstElt, then just combine that into the cumulative
15794 // source node.
15795 if (SecondElt == Srcs.end()) {
15796 auto EltOp =
15797 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15798
15799 Perms.push_back(
15800 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15801 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
15802 break;
15803 }
15804 }
15805
15806 assert(Perms.size() == 1 || Perms.size() == 2);
15807 return Perms.size() == 2
15808 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
15809 : Perms[0];
15810}
15811
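// fixMasks below repacks masks that were built assuming a chain of four
// bytes. E.g. for ChainLength == 2, a mask of 0x01020c0c (real selectors in
// the two high lanes) is shifted right to 0x00000102 and then padded with
// 0x0c0c0000, giving 0x0c0c0102 so that the two now-unused high lanes read as
// constant zero.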
15812static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
15813 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15814 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15815 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15816 EntryMask += ZeroMask;
15817 }
15818}
15819
15820static bool isMul(const SDValue Op) {
15821 auto Opcode = Op.getOpcode();
15822
15823 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
15824 Opcode == AMDGPUISD::MUL_I24);
15825}
15826
15827static std::optional<bool>
15829 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
15830 const SDValue &S1Op, const SelectionDAG &DAG) {
15831 // If both ops are i8s (pre legalize-dag), then the signedness semantics
15832 // of the dot4 are irrelevant.
15833 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
15834 return false;
15835
15836 auto Known0 = DAG.computeKnownBits(S0Op, 0);
15837 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
15838 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
15839 auto Known1 = DAG.computeKnownBits(S1Op, 0);
15840 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
15841 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
15842
15843 assert(!(S0IsUnsigned && S0IsSigned));
15844 assert(!(S1IsUnsigned && S1IsSigned));
15845
15846 // There are 9 possible permutations of
15847 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
15848
15849 // In two permutations, the sign bits are known to be the same for both Ops,
15850 // so simply return Signed / Unsigned corresponding to the MSB
15851
15852 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
15853 return S0IsSigned;
15854
15855 // In another two permutations, the sign bits are known to be opposite. In
15856 // this case return std::nullopt to indicate a bad match.
15857
15858 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
15859 return std::nullopt;
15860
15861 // In the remaining five permutations, we don't know the value of the sign
15862 // bit for at least one Op. Since we have a valid ByteProvider, we know that
15863 // the upper bits must be extension bits. Thus, the only ways for the sign
15864 // bit to be unknown are if it was sign extended from an unknown value, or if
15865 // it was any-extended. In either case, it is correct to use the signed
15866 // version of the dot4 signedness semantics.
15867
15868 // In two of these permutations, we know the sign bit is set for
15869 // one op, and the other is unknown. It is okay to use the signed version of
15870 // dot4.
15871 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
15872 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
15873 return true;
15874
15875 // In one such permutation, we don't know either of the sign bits. It is okay
15876 // to use the signed version of dot4.
15877 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
15878 return true;
15879
15880 // In two of these permutations, we know the sign bit is unset for
15881 // one op, and the other is unknown. Return std::nullopt to indicate a
15882 // bad match.
15883 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
15884 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
15885 return std::nullopt;
15886
15887 llvm_unreachable("Fully covered condition");
15888}
15889
15890SDValue SITargetLowering::performAddCombine(SDNode *N,
15891 DAGCombinerInfo &DCI) const {
15892 SelectionDAG &DAG = DCI.DAG;
15893 EVT VT = N->getValueType(0);
15894 SDLoc SL(N);
15895 SDValue LHS = N->getOperand(0);
15896 SDValue RHS = N->getOperand(1);
15897
15898 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
15899 if (Subtarget->hasMad64_32()) {
15900 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
15901 return Folded;
15902 }
15903 }
15904
15905 if (SDValue V = reassociateScalarOps(N, DAG)) {
15906 return V;
15907 }
15908
15909 if (VT == MVT::i64) {
15910 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15911 return Folded;
15912 }
15913
15914 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
15915 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
15916 SDValue TempNode(N, 0);
15917 std::optional<bool> IsSigned;
15921
15922 // Match the v_dot4 tree, while collecting src nodes.
15923 int ChainLength = 0;
15924 for (int I = 0; I < 4; I++) {
15925 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
15926 if (MulIdx == -1)
15927 break;
15928 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
15929 if (!Src0)
15930 break;
15931 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
15932 if (!Src1)
15933 break;
15934
15935 auto IterIsSigned = checkDot4MulSignedness(
15936 TempNode->getOperand(MulIdx), *Src0, *Src1,
15937 TempNode->getOperand(MulIdx)->getOperand(0),
15938 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
15939 if (!IterIsSigned)
15940 break;
15941 if (!IsSigned)
15942 IsSigned = *IterIsSigned;
15943 if (*IterIsSigned != *IsSigned)
15944 break;
15945 placeSources(*Src0, *Src1, Src0s, Src1s, I);
15946 auto AddIdx = 1 - MulIdx;
15947 // Allow the special case where add (add (mul24, 0), mul24) became
15948 // add (mul24, mul24).
15949 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
15950 Src2s.push_back(TempNode->getOperand(AddIdx));
15951 auto Src0 =
15952 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
15953 if (!Src0)
15954 break;
15955 auto Src1 =
15956 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
15957 if (!Src1)
15958 break;
15959 auto IterIsSigned = checkDot4MulSignedness(
15960 TempNode->getOperand(AddIdx), *Src0, *Src1,
15961 TempNode->getOperand(AddIdx)->getOperand(0),
15962 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
15963 if (!IterIsSigned)
15964 break;
15965 assert(IsSigned);
15966 if (*IterIsSigned != *IsSigned)
15967 break;
15968 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
15969 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
15970 ChainLength = I + 2;
15971 break;
15972 }
15973
15974 TempNode = TempNode->getOperand(AddIdx);
15975 Src2s.push_back(TempNode);
15976 ChainLength = I + 1;
15977 if (TempNode->getNumOperands() < 2)
15978 break;
15979 LHS = TempNode->getOperand(0);
15980 RHS = TempNode->getOperand(1);
15981 }
15982
15983 if (ChainLength < 2)
15984 return SDValue();
15985
15986 // Masks were constructed with the assumption that we would find a chain of
15987 // length 4. If not, then we need to 0 out the MSB bits (via perm mask of
15988 // 0x0c) so they do not affect the dot calculation.
15989 if (ChainLength < 4) {
15990 fixMasks(Src0s, ChainLength);
15991 fixMasks(Src1s, ChainLength);
15992 }
15993
15994 SDValue Src0, Src1;
15995
15996 // If we are just using a single source for both, and have permuted the
15997 // bytes consistently, we can just use the sources without permuting
15998 // (commutation).
15999 bool UseOriginalSrc = false;
16000 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
16001 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
16002 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
16003 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
16004 SmallVector<unsigned, 4> SrcBytes;
16005 auto Src0Mask = Src0s.begin()->PermMask;
16006 SrcBytes.push_back(Src0Mask & 0xFF000000);
16007 bool UniqueEntries = true;
16008 for (auto I = 1; I < 4; I++) {
16009 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
16010
16011 if (is_contained(SrcBytes, NextByte)) {
16012 UniqueEntries = false;
16013 break;
16014 }
16015 SrcBytes.push_back(NextByte);
16016 }
16017
16018 if (UniqueEntries) {
16019 UseOriginalSrc = true;
16020
16021 auto *FirstElt = Src0s.begin();
16022 auto FirstEltOp =
16023 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
16024
16025 auto *SecondElt = Src1s.begin();
16026 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
16027 SecondElt->DWordOffset);
16028
16029 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
16030 MVT::getIntegerVT(32));
16031 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
16032 MVT::getIntegerVT(32));
16033 }
16034 }
16035
16036 if (!UseOriginalSrc) {
16037 Src0 = resolveSources(DAG, SL, Src0s, false, true);
16038 Src1 = resolveSources(DAG, SL, Src1s, false, true);
16039 }
16040
16041 assert(IsSigned);
16042 SDValue Src2 =
16043 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16044
16045 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
16046 : Intrinsic::amdgcn_udot4,
16047 SL, MVT::i64);
16048
16049 assert(!VT.isVector());
16050 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
16051 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
16052
16053 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
16054 }
16055
16056 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16057 return SDValue();
16058
16059 // add x, zext (setcc) => uaddo_carry x, 0, setcc
16060 // add x, sext (setcc) => usubo_carry x, 0, setcc
16061 unsigned Opc = LHS.getOpcode();
16064 std::swap(RHS, LHS);
16065
16066 Opc = RHS.getOpcode();
16067 switch (Opc) {
16068 default:
16069 break;
16070 case ISD::ZERO_EXTEND:
16071 case ISD::SIGN_EXTEND:
16072 case ISD::ANY_EXTEND: {
16073 auto Cond = RHS.getOperand(0);
16074 // If this won't be a real VOPC output, we would still need to insert an
16075 // extra instruction anyway.
16076 if (!isBoolSGPR(Cond))
16077 break;
16078 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16079 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16080 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
16081 return DAG.getNode(Opc, SL, VTList, Args);
16082 }
16083 case ISD::UADDO_CARRY: {
16084 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
16085 if (!isNullConstant(RHS.getOperand(1)))
16086 break;
16087 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
16088 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
16089 }
16090 }
16091 return SDValue();
16092}
16093
16094SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
16095 DAGCombinerInfo &DCI) const {
16096 SelectionDAG &DAG = DCI.DAG;
16097 SDLoc DL(N);
16098 EVT VT = N->getValueType(0);
16099 SDValue N0 = N->getOperand(0);
16100 SDValue N1 = N->getOperand(1);
16101
16102 // The following folds transform PTRADDs into regular arithmetic in cases
16103 // where the PTRADD wouldn't be folded as an immediate offset into memory
16104 // instructions anyway. They are target-specific in that other targets might
16105 // prefer to not lose information about the pointer arithmetic.
16106
16107 // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
16108 // Adapted from DAGCombiner::visitADDLikeCommutative.
16109 SDValue V, K;
16110 if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
16111 SDNodeFlags ShlFlags = N1->getFlags();
16112 // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
16113 // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
16114 // preserved.
16115 SDNodeFlags NewShlFlags =
16116 ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
16117 ? SDNodeFlags::NoSignedWrap
16118 : SDNodeFlags();
16119 SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
16120 DCI.AddToWorklist(Inner.getNode());
16121 return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
16122 }
16123
16124 // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
16125 // performAddCombine.
16126 if (N1.getOpcode() == ISD::MUL) {
16127 if (Subtarget->hasMad64_32()) {
16128 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16129 return Folded;
16130 }
16131 }
16132
16133 // If the 32 low bits of the constant are all zero, there is nothing to fold
16134 // into an immediate offset, so it's better to eliminate the unnecessary
16135 // addition for the lower 32 bits than to preserve the PTRADD.
16136 // Analogous to a fold in performAddCombine.
16137 if (VT == MVT::i64) {
16138 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16139 return Folded;
16140 }
16141
16142 if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) {
16143 // Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c) v) with
16144 // global address GA and constant c, such that c can be folded into GA.
16145 SDValue GAValue = N0.getOperand(0);
16146 if (const GlobalAddressSDNode *GA =
16147 dyn_cast<GlobalAddressSDNode>(GAValue)) {
16148 if (DCI.isBeforeLegalizeOps() && isOffsetFoldingLegal(GA)) {
16149 // If both additions in the original were NUW, reassociation preserves
16150 // that.
16151 SDNodeFlags Flags =
16152 (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
16153 SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags);
16154 DCI.AddToWorklist(Inner.getNode());
16155 return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags);
16156 }
16157 }
16158 }
16159
16160 if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
16161 return SDValue();
16162
16163 // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
16164 // y is not, and (add y, z) is used only once.
16165 // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
16166 // z is not, and (add y, z) is used only once.
16167 // The goal is to move constant offsets to the outermost ptradd, to create
16168 // more opportunities to fold offsets into memory instructions.
16169 // Together with the generic combines in DAGCombiner.cpp, this also
16170 // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
16171 //
16172 // This transform is here instead of in the general DAGCombiner as it can
16173 // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
16174 // AArch64's CPA.
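// For example, (ptradd p, (add idx, 16)) becomes (ptradd (ptradd p, idx), 16),
// so the constant 16 can later be folded into the memory instruction's
// immediate offset field.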
16175 SDValue X = N0;
16176 SDValue Y = N1.getOperand(0);
16177 SDValue Z = N1.getOperand(1);
16178 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
16179 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
16180
16181 // If both additions in the original were NUW, reassociation preserves that.
16182 SDNodeFlags ReassocFlags =
16183 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
16184
16185 if (ZIsConstant != YIsConstant) {
16186 if (YIsConstant)
16187 std::swap(Y, Z);
16188 SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
16189 DCI.AddToWorklist(Inner.getNode());
16190 return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags);
16191 }
16192
16193 // If one of Y and Z is constant, they have been handled above. If both were
16194 // constant, the addition would have been folded in SelectionDAG::getNode
16195 // already. This ensures that the generic DAG combines won't undo the
16196 // following reassociation.
16197 assert(!YIsConstant && !ZIsConstant);
16198
16199 if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) {
16200 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
16201 // y are uniform and z isn't.
16202 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
16203 // z are uniform and y isn't.
16204 // The goal is to push uniform operands up in the computation, so that they
16205 // can be handled with scalar operations. We can't use reassociateScalarOps
16206 // for this since it requires two identical commutative operations to
16207 // reassociate.
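// After this reassociation the inner ptradd has only uniform operands, so it
// can be computed with scalar instructions; only the divergent addend remains
// in the vector domain.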
16208 if (Y->isDivergent())
16209 std::swap(Y, Z);
16210 SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
16211 DCI.AddToWorklist(UniformInner.getNode());
16212 return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
16213 }
16214
16215 return SDValue();
16216}
16217
16218SDValue SITargetLowering::performSubCombine(SDNode *N,
16219 DAGCombinerInfo &DCI) const {
16220 SelectionDAG &DAG = DCI.DAG;
16221 EVT VT = N->getValueType(0);
16222
16223 if (VT == MVT::i64) {
16224 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16225 return Folded;
16226 }
16227
16228 if (VT != MVT::i32)
16229 return SDValue();
16230
16231 SDLoc SL(N);
16232 SDValue LHS = N->getOperand(0);
16233 SDValue RHS = N->getOperand(1);
16234
16235 // sub x, zext (setcc) => usubo_carry x, 0, setcc
16236 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
16237 unsigned Opc = RHS.getOpcode();
16238 switch (Opc) {
16239 default:
16240 break;
16241 case ISD::ZERO_EXTEND:
16242 case ISD::SIGN_EXTEND:
16243 case ISD::ANY_EXTEND: {
16244 auto Cond = RHS.getOperand(0);
16245 // If this won't be a real VOPC output, we would still need to insert an
16246 // extra instruction anyway.
16247 if (!isBoolSGPR(Cond))
16248 break;
16249 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16250 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16251 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
16252 return DAG.getNode(Opc, SL, VTList, Args);
16253 }
16254 }
16255
16256 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
16257 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
16258 if (!isNullConstant(LHS.getOperand(1)))
16259 return SDValue();
16260 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
16261 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
16262 }
16263 return SDValue();
16264}
16265
16266SDValue
16267SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
16268 DAGCombinerInfo &DCI) const {
16269
16270 if (N->getValueType(0) != MVT::i32)
16271 return SDValue();
16272
16273 if (!isNullConstant(N->getOperand(1)))
16274 return SDValue();
16275
16276 SelectionDAG &DAG = DCI.DAG;
16277 SDValue LHS = N->getOperand(0);
16278
16279 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
16280 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
16281 unsigned LHSOpc = LHS.getOpcode();
16282 unsigned Opc = N->getOpcode();
16283 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
16284 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
16285 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
16286 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
16287 }
16288 return SDValue();
16289}
16290
16291SDValue SITargetLowering::performFAddCombine(SDNode *N,
16292 DAGCombinerInfo &DCI) const {
16293 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16294 return SDValue();
16295
16296 SelectionDAG &DAG = DCI.DAG;
16297 EVT VT = N->getValueType(0);
16298
16299 SDLoc SL(N);
16300 SDValue LHS = N->getOperand(0);
16301 SDValue RHS = N->getOperand(1);
16302
16303 // These should really be instruction patterns, but writing patterns with
16304 // source modifiers is a pain.
16305
16306 // fadd (fadd (a, a), b) -> mad 2.0, a, b
16307 if (LHS.getOpcode() == ISD::FADD) {
16308 SDValue A = LHS.getOperand(0);
16309 if (A == LHS.getOperand(1)) {
16310 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16311 if (FusedOp != 0) {
16312 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16313 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
16314 }
16315 }
16316 }
16317
16318 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
16319 if (RHS.getOpcode() == ISD::FADD) {
16320 SDValue A = RHS.getOperand(0);
16321 if (A == RHS.getOperand(1)) {
16322 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16323 if (FusedOp != 0) {
16324 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16325 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
16326 }
16327 }
16328 }
16329
16330 return SDValue();
16331}
16332
16333SDValue SITargetLowering::performFSubCombine(SDNode *N,
16334 DAGCombinerInfo &DCI) const {
16335 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16336 return SDValue();
16337
16338 SelectionDAG &DAG = DCI.DAG;
16339 SDLoc SL(N);
16340 EVT VT = N->getValueType(0);
16341 assert(!VT.isVector());
16342
16343 // Try to get the fneg to fold into the source modifier. This undoes generic
16344 // DAG combines and folds them into the mad.
16345 //
16346 // Only do this if we are not trying to support denormals. v_mad_f32 does
16347 // not support denormals ever.
16348 SDValue LHS = N->getOperand(0);
16349 SDValue RHS = N->getOperand(1);
16350 if (LHS.getOpcode() == ISD::FADD) {
16351 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
16352 SDValue A = LHS.getOperand(0);
16353 if (A == LHS.getOperand(1)) {
16354 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16355 if (FusedOp != 0) {
16356 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16357 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
16358
16359 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
16360 }
16361 }
16362 }
16363
16364 if (RHS.getOpcode() == ISD::FADD) {
16365 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
16366
16367 SDValue A = RHS.getOperand(0);
16368 if (A == RHS.getOperand(1)) {
16369 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16370 if (FusedOp != 0) {
16371 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
16372 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
16373 }
16374 }
16375 }
16376
16377 return SDValue();
16378}
16379
16380SDValue SITargetLowering::performFDivCombine(SDNode *N,
16381 DAGCombinerInfo &DCI) const {
16382 SelectionDAG &DAG = DCI.DAG;
16383 SDLoc SL(N);
16384 EVT VT = N->getValueType(0);
16385 if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts())
16386 return SDValue();
16387
16388 SDValue LHS = N->getOperand(0);
16389 SDValue RHS = N->getOperand(1);
16390
16391 SDNodeFlags Flags = N->getFlags();
16392 SDNodeFlags RHSFlags = RHS->getFlags();
16393 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
16394 !RHS->hasOneUse())
16395 return SDValue();
16396
16397 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
16398 bool IsNegative = false;
16399 if (CLHS->isExactlyValue(1.0) ||
16400 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16401 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
16402 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
16403 if (RHS.getOpcode() == ISD::FSQRT) {
16404 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
16405 SDValue Rsq =
16406 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
16407 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
16408 }
16409 }
16410 }
16411
16412 return SDValue();
16413}
16414
16415SDValue SITargetLowering::performFMulCombine(SDNode *N,
16416 DAGCombinerInfo &DCI) const {
16417 SelectionDAG &DAG = DCI.DAG;
16418 EVT VT = N->getValueType(0);
16419 EVT ScalarVT = VT.getScalarType();
16420 EVT IntVT = VT.changeElementType(MVT::i32);
16421
16422 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
16423 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16424 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
16425 return SDValue();
16426 }
16427
16428 SDValue LHS = N->getOperand(0);
16429 SDValue RHS = N->getOperand(1);
16430
16431 // It is cheaper to realize i32 inline constants than to materialize
16432 // f16 or f64 (or even non-inline f32) values; this is possible via
16433 // ldexp, as shown below:
16434 //
16435 // Given : A = 2^a & B = 2^b ; where a and b are integers.
16436 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
16437 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
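// Illustrative example: fmul x, (select c, 8.0, 0.5) becomes
// ldexp(x, (select c, 3, -1)), since 8.0 == 2^3 and 0.5 == 2^-1.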
16438 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16439 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
16440 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
16441 if (!TrueNode)
16442 return SDValue();
16443 const ConstantFPSDNode *FalseNode =
16444 isConstOrConstSplatFP(RHS.getOperand(2));
16445 if (!FalseNode)
16446 return SDValue();
16447
16448 if (TrueNode->isNegative() != FalseNode->isNegative())
16449 return SDValue();
16450
16451 // For f32, only non-inline constants should be transformed.
16452 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16453 if (ScalarVT == MVT::f32 &&
16454 TII->isInlineConstant(TrueNode->getValueAPF()) &&
16455 TII->isInlineConstant(FalseNode->getValueAPF()))
16456 return SDValue();
16457
16458 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
16459 if (TrueNodeExpVal == INT_MIN)
16460 return SDValue();
16461 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
16462 if (FalseNodeExpVal == INT_MIN)
16463 return SDValue();
16464
16465 SDLoc SL(N);
16466 SDValue SelectNode =
16467 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
16468 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
16469 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
16470
16471 LHS = TrueNode->isNegative()
16472 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
16473 : LHS;
16474
16475 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
16476 }
16477
16478 return SDValue();
16479}
16480
16481SDValue SITargetLowering::performFMACombine(SDNode *N,
16482 DAGCombinerInfo &DCI) const {
16483 SelectionDAG &DAG = DCI.DAG;
16484 EVT VT = N->getValueType(0);
16485 SDLoc SL(N);
16486
16487 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16488 return SDValue();
16489
16490 // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
16491 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z)
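// The checks below require both multiplies to read different lanes of the same
// pair of v2f16 vectors that were fp_extended to f32.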
16492 SDValue Op1 = N->getOperand(0);
16493 SDValue Op2 = N->getOperand(1);
16494 SDValue FMA = N->getOperand(2);
16495
16496 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
16497 Op2.getOpcode() != ISD::FP_EXTEND)
16498 return SDValue();
16499
16500 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
16501 // regardless of the denorm mode setting. Therefore,
16502 // fp-contract is sufficient to allow generating fdot2.
16503 const TargetOptions &Options = DAG.getTarget().Options;
16504 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
16505 (N->getFlags().hasAllowContract() &&
16506 FMA->getFlags().hasAllowContract())) {
16507 Op1 = Op1.getOperand(0);
16508 Op2 = Op2.getOperand(0);
16509 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16510 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16511 return SDValue();
16512
16513 SDValue Vec1 = Op1.getOperand(0);
16514 SDValue Idx1 = Op1.getOperand(1);
16515 SDValue Vec2 = Op2.getOperand(0);
16516
16517 SDValue FMAOp1 = FMA.getOperand(0);
16518 SDValue FMAOp2 = FMA.getOperand(1);
16519 SDValue FMAAcc = FMA.getOperand(2);
16520
16521 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
16522 FMAOp2.getOpcode() != ISD::FP_EXTEND)
16523 return SDValue();
16524
16525 FMAOp1 = FMAOp1.getOperand(0);
16526 FMAOp2 = FMAOp2.getOperand(0);
16527 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16528 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16529 return SDValue();
16530
16531 SDValue Vec3 = FMAOp1.getOperand(0);
16532 SDValue Vec4 = FMAOp2.getOperand(0);
16533 SDValue Idx2 = FMAOp1.getOperand(1);
16534
16535 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
16536 // Idx1 and Idx2 cannot be the same.
16537 Idx1 == Idx2)
16538 return SDValue();
16539
16540 if (Vec1 == Vec2 || Vec3 == Vec4)
16541 return SDValue();
16542
16543 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
16544 return SDValue();
16545
16546 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16547 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
16548 DAG.getTargetConstant(0, SL, MVT::i1));
16549 }
16550 }
16551 return SDValue();
16552}
16553
16554SDValue SITargetLowering::performSetCCCombine(SDNode *N,
16555 DAGCombinerInfo &DCI) const {
16556 SelectionDAG &DAG = DCI.DAG;
16557 SDLoc SL(N);
16558
16559 SDValue LHS = N->getOperand(0);
16560 SDValue RHS = N->getOperand(1);
16561 EVT VT = LHS.getValueType();
16562 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
16563
16564 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
16565 if (!CRHS) {
16566 CRHS = dyn_cast<ConstantSDNode>(LHS);
16567 if (CRHS) {
16568 std::swap(LHS, RHS);
16569 CC = getSetCCSwappedOperands(CC);
16570 }
16571 }
16572
16573 if (CRHS) {
16574 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
16575 isBoolSGPR(LHS.getOperand(0))) {
16576 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
16577 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
16578 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
16579 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
16580 if ((CRHS->isAllOnes() &&
16581 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
16582 (CRHS->isZero() &&
16583 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
16584 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16585 DAG.getAllOnesConstant(SL, MVT::i1));
16586 if ((CRHS->isAllOnes() &&
16587 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
16588 (CRHS->isZero() &&
16589 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
16590 return LHS.getOperand(0);
16591 }
16592
16593 const APInt &CRHSVal = CRHS->getAPIntValue();
16594 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
16595 LHS.getOpcode() == ISD::SELECT &&
16596 isa<ConstantSDNode>(LHS.getOperand(1)) &&
16597 isa<ConstantSDNode>(LHS.getOperand(2)) &&
16598 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
16599 isBoolSGPR(LHS.getOperand(0))) {
16600 // Given CT != FT:
16601 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
16602 // setcc (select cc, CT, CF), CF, ne => cc
16603 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
16604 // setcc (select cc, CT, CF), CT, eq => cc
16605 const APInt &CT = LHS.getConstantOperandAPInt(1);
16606 const APInt &CF = LHS.getConstantOperandAPInt(2);
16607
16608 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
16609 (CT == CRHSVal && CC == ISD::SETNE))
16610 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16611 DAG.getAllOnesConstant(SL, MVT::i1));
16612 if ((CF == CRHSVal && CC == ISD::SETNE) ||
16613 (CT == CRHSVal && CC == ISD::SETEQ))
16614 return LHS.getOperand(0);
16615 }
16616 }
16617
16618 if (VT != MVT::f32 && VT != MVT::f64 &&
16619 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16620 return SDValue();
16621
16622 // Match isinf/isfinite pattern
16623 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
16624 // (fcmp one (fabs x), inf) -> (fp_class x,
16625 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero))
16626 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
16627 LHS.getOpcode() == ISD::FABS) {
16628 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
16629 if (!CRHS)
16630 return SDValue();
16631
16632 const APFloat &APF = CRHS->getValueAPF();
16633 if (APF.isInfinity() && !APF.isNegative()) {
16634 const unsigned IsInfMask =
16635 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
16636 const unsigned IsFiniteMask =
16637 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
16638 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
16639 SIInstrFlags::P_SUBNORMAL;
16640 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
16641 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
16642 DAG.getConstant(Mask, SL, MVT::i32));
16643 }
16644 }
16645
16646 return SDValue();
16647}
16648
16649SDValue
16650SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
16651 DAGCombinerInfo &DCI) const {
16652 SelectionDAG &DAG = DCI.DAG;
16653 SDLoc SL(N);
16654 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
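// Offset is the byte index (0-3) selected by the CVT_F32_UBYTEn opcode.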
16655
16656 SDValue Src = N->getOperand(0);
16657 SDValue Shift = N->getOperand(0);
16658
16659 // TODO: Extend type shouldn't matter (assuming legal types).
16660 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
16661 Shift = Shift.getOperand(0);
16662
16663 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
16664 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
16665 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
16666 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
16667 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
16668 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
16669 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
16670 SDValue Shifted = DAG.getZExtOrTrunc(
16671 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
16672
16673 unsigned ShiftOffset = 8 * Offset;
16674 if (Shift.getOpcode() == ISD::SHL)
16675 ShiftOffset -= C->getZExtValue();
16676 else
16677 ShiftOffset += C->getZExtValue();
16678
16679 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16680 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
16681 MVT::f32, Shifted);
16682 }
16683 }
16684 }
16685
16686 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16687 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
16688 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
16689 // We simplified Src. If this node is not dead, visit it again so it is
16690 // folded properly.
16691 if (N->getOpcode() != ISD::DELETED_NODE)
16692 DCI.AddToWorklist(N);
16693 return SDValue(N, 0);
16694 }
16695
16696 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
16697 if (SDValue DemandedSrc =
16698 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
16699 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
16700
16701 return SDValue();
16702}
16703
16704SDValue SITargetLowering::performClampCombine(SDNode *N,
16705 DAGCombinerInfo &DCI) const {
16706 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
16707 if (!CSrc)
16708 return SDValue();
16709
16710 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16711 const APFloat &F = CSrc->getValueAPF();
16712 APFloat Zero = APFloat::getZero(F.getSemantics());
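// With DX10Clamp enabled, clamping a NaN source yields +0.0 rather than NaN.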
16713 if (F < Zero ||
16714 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16715 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
16716 }
16717
16718 APFloat One(F.getSemantics(), "1.0");
16719 if (F > One)
16720 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
16721
16722 return SDValue(CSrc, 0);
16723}
16724
16725SDValue SITargetLowering::performSelectCombine(SDNode *N,
16726 DAGCombinerInfo &DCI) const {
16727
16728 // Try to fold CMP + SELECT patterns with shared constants (both FP and
16729 // integer).
16730 // Detect when CMP and SELECT use the same constant and fold them to avoid
16731 // loading the constant twice. Specifically handles patterns like:
16732 // %cmp = icmp eq i32 %val, 4242
16733 // %sel = select i1 %cmp, i32 4242, i32 %other
16734 // It can be optimized to reuse %val instead of 4242 in select.
16735 SDValue Cond = N->getOperand(0);
16736 SDValue TrueVal = N->getOperand(1);
16737 SDValue FalseVal = N->getOperand(2);
16738
16739 // Check if condition is a comparison.
16740 if (Cond.getOpcode() != ISD::SETCC)
16741 return SDValue();
16742
16743 SDValue LHS = Cond.getOperand(0);
16744 SDValue RHS = Cond.getOperand(1);
16745 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
16746
16747 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
16748 bool isInteger = LHS.getValueType().isInteger();
16749
16750 // Handle simple floating-point and integer types only.
16751 if (!isFloatingPoint && !isInteger)
16752 return SDValue();
16753
16754 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
16755 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
16756 if (!isEquality && !isNonEquality)
16757 return SDValue();
16758
16759 SDValue ArgVal, ConstVal;
16760 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
16761 (isInteger && isa<ConstantSDNode>(RHS))) {
16762 ConstVal = RHS;
16763 ArgVal = LHS;
16764 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
16765 (isInteger && isa<ConstantSDNode>(LHS))) {
16766 ConstVal = LHS;
16767 ArgVal = RHS;
16768 } else {
16769 return SDValue();
16770 }
16771
16772 // Skip optimization for inlinable immediates.
16773 if (isFloatingPoint) {
16774 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
16775 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
16776 return SDValue();
16777 } else {
16778 if (AMDGPU::isInlinableIntLiteral(
16779 cast<ConstantSDNode>(ConstVal)->getSExtValue()))
16780 return SDValue();
16781 }
16782
16783 // For equality and non-equality comparisons, patterns:
16784 // select (setcc x, const), const, y -> select (setcc x, const), x, y
16785 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
16786 if (!(isEquality && TrueVal == ConstVal) &&
16787 !(isNonEquality && FalseVal == ConstVal))
16788 return SDValue();
16789
16790 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
16791 SDValue SelectRHS =
16792 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
16793 return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
16794 SelectLHS, SelectRHS);
16795}
16796
16797 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
16798 DAGCombinerInfo &DCI) const {
16799 switch (N->getOpcode()) {
16800 case ISD::ADD:
16801 case ISD::SUB:
16802 case ISD::SHL:
16803 case ISD::SRL:
16804 case ISD::SRA:
16805 case ISD::AND:
16806 case ISD::OR:
16807 case ISD::XOR:
16808 case ISD::MUL:
16809 case ISD::SETCC:
16810 case ISD::SELECT:
16811 case ISD::SMIN:
16812 case ISD::SMAX:
16813 case ISD::UMIN:
16814 case ISD::UMAX:
16815 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
16816 return Res;
16817 break;
16818 default:
16819 break;
16820 }
16821
16822 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
16823 return SDValue();
16824
16825 switch (N->getOpcode()) {
16826 case ISD::ADD:
16827 return performAddCombine(N, DCI);
16828 case ISD::PTRADD:
16829 return performPtrAddCombine(N, DCI);
16830 case ISD::SUB:
16831 return performSubCombine(N, DCI);
16832 case ISD::UADDO_CARRY:
16833 case ISD::USUBO_CARRY:
16834 return performAddCarrySubCarryCombine(N, DCI);
16835 case ISD::FADD:
16836 return performFAddCombine(N, DCI);
16837 case ISD::FSUB:
16838 return performFSubCombine(N, DCI);
16839 case ISD::FDIV:
16840 return performFDivCombine(N, DCI);
16841 case ISD::FMUL:
16842 return performFMulCombine(N, DCI);
16843 case ISD::SETCC:
16844 return performSetCCCombine(N, DCI);
16845 case ISD::SELECT:
16846 if (auto Res = performSelectCombine(N, DCI))
16847 return Res;
16848 break;
16849 case ISD::FMAXNUM:
16850 case ISD::FMINNUM:
16851 case ISD::FMAXNUM_IEEE:
16852 case ISD::FMINNUM_IEEE:
16853 case ISD::FMAXIMUM:
16854 case ISD::FMINIMUM:
16855 case ISD::FMAXIMUMNUM:
16856 case ISD::FMINIMUMNUM:
16857 case ISD::SMAX:
16858 case ISD::SMIN:
16859 case ISD::UMAX:
16860 case ISD::UMIN:
16861 case AMDGPUISD::FMIN_LEGACY:
16862 case AMDGPUISD::FMAX_LEGACY:
16863 return performMinMaxCombine(N, DCI);
16864 case ISD::FMA:
16865 return performFMACombine(N, DCI);
16866 case ISD::AND:
16867 return performAndCombine(N, DCI);
16868 case ISD::OR:
16869 return performOrCombine(N, DCI);
16870 case ISD::FSHR: {
16871 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16872 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
16873 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
16874 return matchPERM(N, DCI);
16875 }
16876 break;
16877 }
16878 case ISD::XOR:
16879 return performXorCombine(N, DCI);
16880 case ISD::ZERO_EXTEND:
16881 return performZeroExtendCombine(N, DCI);
16882 case ISD::SIGN_EXTEND_INREG:
16883 return performSignExtendInRegCombine(N, DCI);
16884 case AMDGPUISD::FP_CLASS:
16885 return performClassCombine(N, DCI);
16886 case ISD::FCANONICALIZE:
16887 return performFCanonicalizeCombine(N, DCI);
16888 case AMDGPUISD::RCP:
16889 return performRcpCombine(N, DCI);
16890 case ISD::FLDEXP:
16891 case AMDGPUISD::FRACT:
16892 case AMDGPUISD::RSQ:
16893 case AMDGPUISD::RCP_LEGACY:
16894 case AMDGPUISD::RCP_IFLAG:
16895 case AMDGPUISD::RSQ_CLAMP: {
16896 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
16897 SDValue Src = N->getOperand(0);
16898 if (Src.isUndef())
16899 return Src;
16900 break;
16901 }
16902 case ISD::SINT_TO_FP:
16903 case ISD::UINT_TO_FP:
16904 return performUCharToFloatCombine(N, DCI);
16905 case ISD::FCOPYSIGN:
16906 return performFCopySignCombine(N, DCI);
16907 case AMDGPUISD::CVT_F32_UBYTE0:
16908 case AMDGPUISD::CVT_F32_UBYTE1:
16909 case AMDGPUISD::CVT_F32_UBYTE2:
16910 case AMDGPUISD::CVT_F32_UBYTE3:
16911 return performCvtF32UByteNCombine(N, DCI);
16912 case AMDGPUISD::FMED3:
16913 return performFMed3Combine(N, DCI);
16914 case AMDGPUISD::CVT_PKRTZ_F16_F32:
16915 return performCvtPkRTZCombine(N, DCI);
16916 case AMDGPUISD::CLAMP:
16917 return performClampCombine(N, DCI);
16918 case ISD::SCALAR_TO_VECTOR: {
16919 SelectionDAG &DAG = DCI.DAG;
16920 EVT VT = N->getValueType(0);
16921
16922 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
16923 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
16924 SDLoc SL(N);
16925 SDValue Src = N->getOperand(0);
16926 EVT EltVT = Src.getValueType();
16927 if (EltVT != MVT::i16)
16928 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
16929
16930 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
16931 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
16932 }
16933
16934 break;
16935 }
16936 case ISD::EXTRACT_VECTOR_ELT:
16937 return performExtractVectorEltCombine(N, DCI);
16938 case ISD::INSERT_VECTOR_ELT:
16939 return performInsertVectorEltCombine(N, DCI);
16940 case ISD::FP_ROUND:
16941 return performFPRoundCombine(N, DCI);
16942 case ISD::LOAD: {
16943 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
16944 return Widened;
16945 [[fallthrough]];
16946 }
16947 default: {
16948 if (!DCI.isBeforeLegalize()) {
16949 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
16950 return performMemSDNodeCombine(MemNode, DCI);
16951 }
16952
16953 break;
16954 }
16955 }
16956
16957 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
16958 }
16959
16960/// Helper function for adjustWritemask
16961static unsigned SubIdx2Lane(unsigned Idx) {
16962 switch (Idx) {
16963 default:
16964 return ~0u;
16965 case AMDGPU::sub0:
16966 return 0;
16967 case AMDGPU::sub1:
16968 return 1;
16969 case AMDGPU::sub2:
16970 return 2;
16971 case AMDGPU::sub3:
16972 return 3;
16973 case AMDGPU::sub4:
16974 return 4; // Possible with TFE/LWE
16975 }
16976}
16977
16978/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
16979SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
16980 SelectionDAG &DAG) const {
16981 unsigned Opcode = Node->getMachineOpcode();
16982
16983 // Subtract 1 because the vdata output is not a MachineSDNode operand.
16984 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
16985 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
16986 return Node; // not implemented for D16
16987
16988 SDNode *Users[5] = {nullptr};
16989 unsigned Lane = 0;
16990 unsigned DmaskIdx =
16991 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
16992 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
16993 unsigned NewDmask = 0;
16994 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
16995 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
16996 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
16997 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
16998 unsigned TFCLane = 0;
16999 bool HasChain = Node->getNumValues() > 1;
17000
17001 if (OldDmask == 0) {
17002 // These are folded out, but on the off chance it happens, don't assert.
17003 return Node;
17004 }
17005
17006 unsigned OldBitsSet = llvm::popcount(OldDmask);
17007 // Work out which is the TFE/LWE lane if that is enabled.
17008 if (UsesTFC) {
17009 TFCLane = OldBitsSet;
17010 }
17011
17012 // Try to figure out the used register components
17013 for (SDUse &Use : Node->uses()) {
17014
17015 // Don't look at users of the chain.
17016 if (Use.getResNo() != 0)
17017 continue;
17018
17019 SDNode *User = Use.getUser();
17020
17021 // Abort if we can't understand the usage
17022 if (!User->isMachineOpcode() ||
17023 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17024 return Node;
17025
17026 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
17027 // Note that subregs are packed, i.e. Lane==0 is the first bit set
17028 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
17029 // set, etc.
17030 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
17031 if (Lane == ~0u)
17032 return Node;
17033
17034 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
17035 if (UsesTFC && Lane == TFCLane) {
17036 Users[Lane] = User;
17037 } else {
17038 // Set which texture component corresponds to the lane.
17039 unsigned Comp;
17040 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17041 Comp = llvm::countr_zero(Dmask);
17042 Dmask &= ~(1 << Comp);
17043 }
17044
17045 // Abort if we have more than one user per component.
17046 if (Users[Lane])
17047 return Node;
17048
17049 Users[Lane] = User;
17050 NewDmask |= 1 << Comp;
17051 }
17052 }
17053
17054 // Don't allow 0 dmask, as hardware assumes one channel enabled.
17055 bool NoChannels = !NewDmask;
17056 if (NoChannels) {
17057 if (!UsesTFC) {
17058 // No uses of the result and not using TFC. Then do nothing.
17059 return Node;
17060 }
17061 // If the original dmask has one channel - then nothing to do
17062 if (OldBitsSet == 1)
17063 return Node;
17064 // Use an arbitrary dmask - required for the instruction to work
17065 NewDmask = 1;
17066 }
17067 // Abort if there's no change
17068 if (NewDmask == OldDmask)
17069 return Node;
17070
17071 unsigned BitsSet = llvm::popcount(NewDmask);
17072
17073 // Check for TFE or LWE - increase the number of channels by one to account
17074 // for the extra return value
17075 // This will need adjustment for D16 if this is also included in
17076 // adjustWriteMask (this function) but at present D16 are excluded.
17077 unsigned NewChannels = BitsSet + UsesTFC;
17078
17079 int NewOpcode =
17080 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
17081 assert(NewOpcode != -1 &&
17082 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17083 "failed to find equivalent MIMG op");
17084
17085 // Adjust the writemask in the node
17086 SmallVector<SDValue, 12> Ops;
17087 llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
17088 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
17089 llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));
17090
17091 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17092
17093 MVT ResultVT = NewChannels == 1
17094 ? SVT
17095 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
17096 : NewChannels == 5 ? 8
17097 : NewChannels);
17098 SDVTList NewVTList =
17099 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
17100
17101 MachineSDNode *NewNode =
17102 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
17103
17104 if (HasChain) {
17105 // Update chain.
17106 DAG.setNodeMemRefs(NewNode, Node->memoperands());
17107 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
17108 }
17109
17110 if (NewChannels == 1) {
17111 assert(Node->hasNUsesOfValue(1, 0));
17112 SDNode *Copy =
17113 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
17114 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
17115 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
17116 return nullptr;
17117 }
17118
17119 // Update the users of the node with the new indices
17120 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17121 SDNode *User = Users[i];
17122 if (!User) {
17123 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
17124 // Users[0] is still nullptr because channel 0 doesn't really have a use.
17125 if (i || !NoChannels)
17126 continue;
17127 } else {
17128 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
17129 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
17130 if (NewUser != User) {
17131 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
17132 DAG.RemoveDeadNode(User);
17133 }
17134 }
17135
17136 switch (Idx) {
17137 default:
17138 break;
17139 case AMDGPU::sub0:
17140 Idx = AMDGPU::sub1;
17141 break;
17142 case AMDGPU::sub1:
17143 Idx = AMDGPU::sub2;
17144 break;
17145 case AMDGPU::sub2:
17146 Idx = AMDGPU::sub3;
17147 break;
17148 case AMDGPU::sub3:
17149 Idx = AMDGPU::sub4;
17150 break;
17151 }
17152 }
17153
17154 DAG.RemoveDeadNode(Node);
17155 return nullptr;
17156}
17157
17158 static bool isFrameIndexOp(SDValue Op) {
17159 if (Op.getOpcode() == ISD::AssertZext)
17160 Op = Op.getOperand(0);
17161
17162 return isa<FrameIndexSDNode>(Op);
17163}
17164
17165/// Legalize target independent instructions (e.g. INSERT_SUBREG)
17166/// with frame index operands.
17167 /// LLVM assumes that inputs to these instructions are registers.
17168SDNode *
17169 SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
17170 SelectionDAG &DAG) const {
17171 if (Node->getOpcode() == ISD::CopyToReg) {
17172 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
17173 SDValue SrcVal = Node->getOperand(2);
17174
17175 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
17176 // to try understanding copies to physical registers.
17177 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
17178 SDLoc SL(Node);
17179 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17180 SDValue VReg = DAG.getRegister(
17181 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17182
17183 SDNode *Glued = Node->getGluedNode();
17184 SDValue ToVReg = DAG.getCopyToReg(
17185 Node->getOperand(0), SL, VReg, SrcVal,
17186 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
17187 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
17188 VReg, ToVReg.getValue(1));
17189 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
17190 DAG.RemoveDeadNode(Node);
17191 return ToResultReg.getNode();
17192 }
17193 }
17194
17195 SmallVector<SDValue, 8> Ops;
17196 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17197 if (!isFrameIndexOp(Node->getOperand(i))) {
17198 Ops.push_back(Node->getOperand(i));
17199 continue;
17200 }
17201
17202 SDLoc DL(Node);
17203 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
17204 Node->getOperand(i).getValueType(),
17205 Node->getOperand(i)),
17206 0));
17207 }
17208
17209 return DAG.UpdateNodeOperands(Node, Ops);
17210}
17211
17212/// Fold the instructions after selecting them.
17213/// Returns null if users were already updated.
17214 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
17215 SelectionDAG &DAG) const {
17216 const SIInstrInfo *TII = Subtarget->getInstrInfo();
17217 unsigned Opcode = Node->getMachineOpcode();
17218
17219 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17220 !TII->isGather4(Opcode) &&
17221 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
17222 return adjustWritemask(Node, DAG);
17223 }
17224
17225 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17226 legalizeTargetIndependentNode(Node, DAG);
17227 return Node;
17228 }
17229
17230 switch (Opcode) {
17231 case AMDGPU::V_DIV_SCALE_F32_e64:
17232 case AMDGPU::V_DIV_SCALE_F64_e64: {
17233 // Satisfy the operand register constraint when one of the inputs is
17234 // undefined. Ordinarily each undef value will have its own implicit_def of
17235 // a vreg, so force these to use a single register.
17236 SDValue Src0 = Node->getOperand(1);
17237 SDValue Src1 = Node->getOperand(3);
17238 SDValue Src2 = Node->getOperand(5);
17239
17240 if ((Src0.isMachineOpcode() &&
17241 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
17242 (Src0 == Src1 || Src0 == Src2))
17243 break;
17244
17245 MVT VT = Src0.getValueType().getSimpleVT();
17246 const TargetRegisterClass *RC =
17247 getRegClassFor(VT, Src0.getNode()->isDivergent());
17248
17249 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17250 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
17251
17252 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
17253 Src0, SDValue());
17254
17255 // src0 must be the same register as src1 or src2, even if the value is
17256 // undefined, so make sure we don't violate this constraint.
17257 if (Src0.isMachineOpcode() &&
17258 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
17259 if (Src1.isMachineOpcode() &&
17260 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17261 Src0 = Src1;
17262 else if (Src2.isMachineOpcode() &&
17263 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17264 Src0 = Src2;
17265 else {
17266 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
17267 Src0 = UndefReg;
17268 Src1 = UndefReg;
17269 }
17270 } else
17271 break;
17272
17273 SmallVector<SDValue, 9> Ops(Node->ops());
17274 Ops[1] = Src0;
17275 Ops[3] = Src1;
17276 Ops[5] = Src2;
17277 Ops.push_back(ImpDef.getValue(1));
17278 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
17279 }
17280 default:
17281 break;
17282 }
17283
17284 return Node;
17285}
17286
17287// Any MIMG instructions that use tfe or lwe require an initialization of the
17288// result register that will be written in the case of a memory access failure.
17289// The required code is also added to tie this init code to the result of the
17290// img instruction.
17291 void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
17292 const SIInstrInfo *TII = Subtarget->getInstrInfo();
17293 const SIRegisterInfo &TRI = TII->getRegisterInfo();
17294 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
17295 MachineBasicBlock &MBB = *MI.getParent();
17296
17297 int DstIdx =
17298 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
17299 unsigned InitIdx = 0;
17300
17301 if (TII->isImage(MI)) {
17302 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
17303 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
17304 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
17305
17306 if (!TFE && !LWE) // intersect_ray
17307 return;
17308
17309 unsigned TFEVal = TFE ? TFE->getImm() : 0;
17310 unsigned LWEVal = LWE ? LWE->getImm() : 0;
17311 unsigned D16Val = D16 ? D16->getImm() : 0;
17312
17313 if (!TFEVal && !LWEVal)
17314 return;
17315
17316 // At least one of TFE or LWE are non-zero
17317 // We have to insert a suitable initialization of the result value and
17318 // tie this to the dest of the image instruction.
17319
17320 // Calculate which dword we have to initialize to 0.
17321 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
17322
17323 // check that dmask operand is found.
17324 assert(MO_Dmask && "Expected dmask operand in instruction");
17325
17326 unsigned dmask = MO_Dmask->getImm();
17327 // Determine the number of active lanes taking into account the
17328 // Gather4 special case
17329 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
17330
17331 bool Packed = !Subtarget->hasUnpackedD16VMem();
17332
17333 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
17334
17335 // Abandon the attempt if the dst size isn't large enough
17336 // - this is in fact an error, but it is picked up elsewhere and
17337 // reported correctly.
17338 uint32_t DstSize =
17339 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17340 if (DstSize < InitIdx)
17341 return;
17342 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
17343 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17344 } else {
17345 return;
17346 }
17347
17348 const DebugLoc &DL = MI.getDebugLoc();
17349
17350 // Create a register for the initialization value.
17351 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
17352 unsigned NewDst = 0; // Final initialized value will be in here
17353
17354 // If PRTStrictNull feature is enabled (the default) then initialize
17355 // all the result registers to 0, otherwise just the error indication
17356 // register (VGPRn+1)
17357 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17358 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
17359
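// Build the initialized value as a chain of INSERT_SUBREGs, zeroing one
// 32-bit sub-register per iteration starting at CurrIdx.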
17360 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
17361 for (; SizeLeft; SizeLeft--, CurrIdx++) {
17362 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
17363 // Initialize dword
17364 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
17365 // clang-format off
17366 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
17367 .addImm(0);
17368 // clang-format on
17369 // Insert into the super-reg
17370 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
17371 .addReg(PrevDst)
17372 .addReg(SubReg)
17373 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
17374
17375 PrevDst = NewDst;
17376 }
17377
17378 // Add as an implicit operand
17379 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
17380
17381 // Tie the just added implicit operand to the dst
17382 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
17383}
17384
17385/// Assign the register class depending on the number of
17386/// bits set in the writemask
17387 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
17388 SDNode *Node) const {
17389 const SIInstrInfo *TII = Subtarget->getInstrInfo();
17390
17391 MachineFunction *MF = MI.getParent()->getParent();
17392 MachineRegisterInfo &MRI = MF->getRegInfo();
17393 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
17394
17395 if (TII->isVOP3(MI.getOpcode())) {
17396 // Make sure constant bus requirements are respected.
17397 TII->legalizeOperandsVOP3(MRI, MI);
17398
17399 // Prefer VGPRs over AGPRs in mAI instructions where possible.
17400 // This saves a chain-copy of registers and better balance register
17401 // use between vgpr and agpr as agpr tuples tend to be big.
17402 if (!MI.getDesc().operands().empty()) {
17403 unsigned Opc = MI.getOpcode();
17404 bool HasAGPRs = Info->mayNeedAGPRs();
17405 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17406 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
17407 for (auto I :
17408 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
17409 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
17410 if (I == -1)
17411 break;
17412 if ((I == Src2Idx) && (HasAGPRs))
17413 break;
17414 MachineOperand &Op = MI.getOperand(I);
17415 if (!Op.isReg() || !Op.getReg().isVirtual())
17416 continue;
17417 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
17418 if (!TRI->hasAGPRs(RC))
17419 continue;
17420 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
17421 if (!Src || !Src->isCopy() ||
17422 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
17423 continue;
17424 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
17425 // All uses of agpr64 and agpr32 can also accept vgpr except for
17426 // v_accvgpr_read, but we do not produce agpr reads during selection,
17427 // so no use checks are needed.
17428 MRI.setRegClass(Op.getReg(), NewRC);
17429 }
17430
17431 if (TII->isMAI(MI)) {
17432 // The ordinary src0, src1, src2 were legalized above.
17433 //
17434 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
17435 // as a separate instruction.
17436 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17437 AMDGPU::OpName::scale_src0);
17438 if (Src0Idx != -1) {
17439 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17440 AMDGPU::OpName::scale_src1);
17441 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17442 TII->usesConstantBus(MRI, MI, Src1Idx))
17443 TII->legalizeOpWithMove(MI, Src1Idx);
17444 }
17445 }
17446
17447 if (!HasAGPRs)
17448 return;
17449
17450 // Resolve the rest of AV operands to AGPRs.
17451 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
17452 if (Src2->isReg() && Src2->getReg().isVirtual()) {
17453 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
17454 if (TRI->isVectorSuperClass(RC)) {
17455 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
17456 MRI.setRegClass(Src2->getReg(), NewRC);
17457 if (Src2->isTied())
17458 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
17459 }
17460 }
17461 }
17462 }
17463
17464 return;
17465 }
17466
17467 if (TII->isImage(MI))
17468 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
17469}
17470
17471 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
17472 uint64_t Val) {
17473 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
17474 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
17475}
17476
17477 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
17478 const SDLoc &DL,
17479 SDValue Ptr) const {
17480 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17481
17482 // Build the half of the subregister with the constants before building the
17483 // full 128-bit register. If we are building multiple resource descriptors,
17484 // this will allow CSEing of the 2-component register.
17485 const SDValue Ops0[] = {
17486 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
17487 buildSMovImm32(DAG, DL, 0),
17488 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17489 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
17490 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
17491
17492 SDValue SubRegHi = SDValue(
17493 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
17494
17495 // Combine the constants and the pointer.
17496 const SDValue Ops1[] = {
17497 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
17498 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
17499 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
17500
17501 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
17502}
17503
17504/// Return a resource descriptor with the 'Add TID' bit enabled
17505/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
17506/// of the resource descriptor) to create an offset, which is added to
17507/// the resource pointer.
17508 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
17509 SDValue Ptr, uint32_t RsrcDword1,
17510 uint64_t RsrcDword2And3) const {
17511 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
17512 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
17513 if (RsrcDword1) {
17514 PtrHi =
17515 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
17516 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
17517 0);
17518 }
17519
17520 SDValue DataLo =
17521 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
17522 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
17523
17524 const SDValue Ops[] = {
17525 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
17526 PtrLo,
17527 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17528 PtrHi,
17529 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
17530 DataLo,
17531 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
17532 DataHi,
17533 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
17534
17535 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
17536}
17537
17538//===----------------------------------------------------------------------===//
17539// SI Inline Assembly Support
17540//===----------------------------------------------------------------------===//
17541
17542std::pair<unsigned, const TargetRegisterClass *>
17543 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
17544 StringRef Constraint,
17545 MVT VT) const {
17546 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
17547
17548 const TargetRegisterClass *RC = nullptr;
17549 if (Constraint.size() == 1) {
17550 // Check if we cannot determine the bit size of the given value type. This
17551 // can happen, for example, when we have an empty struct
17552 // (size 0): `call void asm "", "v"({} poison)`.
17553 if (VT == MVT::Other)
17554 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17555 const unsigned BitWidth = VT.getSizeInBits();
17556 switch (Constraint[0]) {
17557 default:
17558 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17559 case 's':
17560 case 'r':
17561 switch (BitWidth) {
17562 case 16:
17563 RC = &AMDGPU::SReg_32RegClass;
17564 break;
17565 case 64:
17566 RC = &AMDGPU::SGPR_64RegClass;
17567 break;
17568 default:
17569 RC = TRI->getSGPRClassForBitWidth(BitWidth);
17570 if (!RC)
17571 return std::pair(0U, nullptr);
17572 break;
17573 }
17574 break;
17575 case 'v':
17576 switch (BitWidth) {
17577 case 16:
17578 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17579 : &AMDGPU::VGPR_32_Lo256RegClass;
17580 break;
17581 default:
17582 RC = Subtarget->has1024AddressableVGPRs()
17583 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17584 : TRI->getVGPRClassForBitWidth(BitWidth);
17585 if (!RC)
17586 return std::pair(0U, nullptr);
17587 break;
17588 }
17589 break;
17590 case 'a':
17591 if (!Subtarget->hasMAIInsts())
17592 break;
17593 switch (BitWidth) {
17594 case 16:
17595 RC = &AMDGPU::AGPR_32RegClass;
17596 break;
17597 default:
17598 RC = TRI->getAGPRClassForBitWidth(BitWidth);
17599 if (!RC)
17600 return std::pair(0U, nullptr);
17601 break;
17602 }
17603 break;
17604 }
17605 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17606 const unsigned BitWidth = VT.getSizeInBits();
17607 switch (BitWidth) {
17608 case 16:
17609 RC = &AMDGPU::AV_32RegClass;
17610 break;
17611 default:
17612 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17613 if (!RC)
17614 return std::pair(0U, nullptr);
17615 break;
17616 }
17617 }
17618
17619 // We actually support i128, i16 and f16 as inline parameters
17620 // even if they are not reported as legal
17621 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
17622 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
17623 return std::pair(0U, RC);
17624
17625 auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
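// Handles explicit physical-register constraints: Kind identifies the
// register file ('v', 's' or 'a'), Idx the first register and NumRegs the
// length of the requested range.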
17626 if (Kind != '\0') {
17627 if (Kind == 'v') {
17628 RC = &AMDGPU::VGPR_32_Lo256RegClass;
17629 } else if (Kind == 's') {
17630 RC = &AMDGPU::SGPR_32RegClass;
17631 } else if (Kind == 'a') {
17632 RC = &AMDGPU::AGPR_32RegClass;
17633 }
17634
17635 if (RC) {
17636 if (NumRegs > 1) {
17637 if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
17638 return std::pair(0U, nullptr);
17639
17640 uint32_t Width = NumRegs * 32;
17641 // Prohibit constraints for register ranges with a width that does not
17642 // match the required type.
17643 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
17644 return std::pair(0U, nullptr);
17645
17646 MCRegister Reg = RC->getRegister(Idx);
17647 if (SIRegisterInfo::isVGPRClass(RC))
17648 RC = TRI->getVGPRClassForBitWidth(Width);
17649 else if (SIRegisterInfo::isSGPRClass(RC))
17650 RC = TRI->getSGPRClassForBitWidth(Width);
17651 else if (SIRegisterInfo::isAGPRClass(RC))
17652 RC = TRI->getAGPRClassForBitWidth(Width);
17653 if (RC) {
17654 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17655 if (!Reg) {
17656 // The register class does not contain the requested register,
17657 // e.g., because it is an SGPR pair that would violate alignment
17658 // requirements.
17659 return std::pair(0U, nullptr);
17660 }
17661 return std::pair(Reg, RC);
17662 }
17663 }
17664
17665 // Check for lossy scalar/vector conversions.
17666 if (VT.isVector() && VT.getSizeInBits() != 32)
17667 return std::pair(0U, nullptr);
17668 if (Idx < RC->getNumRegs())
17669 return std::pair(RC->getRegister(Idx), RC);
17670 return std::pair(0U, nullptr);
17671 }
17672 }
17673
17674 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17675 if (Ret.first)
17676 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
17677
17678 return Ret;
17679}
17680
17681static bool isImmConstraint(StringRef Constraint) {
17682 if (Constraint.size() == 1) {
17683 switch (Constraint[0]) {
17684 default:
17685 break;
17686 case 'I':
17687 case 'J':
17688 case 'A':
17689 case 'B':
17690 case 'C':
17691 return true;
17692 }
17693 } else if (Constraint == "DA" || Constraint == "DB") {
17694 return true;
17695 }
17696 return false;
17697}
17698
17699 SITargetLowering::ConstraintType
17700 SITargetLowering::getConstraintType(StringRef Constraint) const {
17701 if (Constraint.size() == 1) {
17702 switch (Constraint[0]) {
17703 default:
17704 break;
17705 case 's':
17706 case 'v':
17707 case 'a':
17708 return C_RegisterClass;
17709 }
17710 } else if (Constraint.size() == 2) {
17711 if (Constraint == "VA")
17712 return C_RegisterClass;
17713 }
17714 if (isImmConstraint(Constraint)) {
17715 return C_Other;
17716 }
17717 return TargetLowering::getConstraintType(Constraint);
17718}
17719
17720static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
17721 if (Size != 64) {
17722 Val = Val & maskTrailingOnes<uint64_t>(Size);
17723 }
17724 return Val;
17725}
17726
17727void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
17728 StringRef Constraint,
17729 std::vector<SDValue> &Ops,
17730 SelectionDAG &DAG) const {
17731 if (isImmConstraint(Constraint)) {
17732 uint64_t Val;
17733 if (getAsmOperandConstVal(Op, Val) &&
17734 checkAsmConstraintVal(Op, Constraint, Val)) {
17735 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
17736 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
17737 }
17738 } else {
17739 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
17740 }
17741}
17742
17743bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
17744 unsigned Size = Op.getScalarValueSizeInBits();
17745 if (Size > 64)
17746 return false;
17747
17748 if (Size == 16 && !Subtarget->has16BitInsts())
17749 return false;
17750
17751 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
17752 Val = C->getSExtValue();
17753 return true;
17754 }
17755 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
17756 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17757 return true;
17758 }
17759 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
17760 if (Size != 16 || Op.getNumOperands() != 2)
17761 return false;
17762 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17763 return false;
17764 if (ConstantSDNode *C = V->getConstantSplatNode()) {
17765 Val = C->getSExtValue();
17766 return true;
17767 }
17768 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
17769 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17770 return true;
17771 }
17772 }
17773
17774 return false;
17775}
17776
17777bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
17778 uint64_t Val) const {
17779 if (Constraint.size() == 1) {
17780 switch (Constraint[0]) {
17781 case 'I':
17782 return AMDGPU::isInlinableIntLiteral(Val);
17783 case 'J':
17784 return isInt<16>(Val);
17785 case 'A':
17786 return checkAsmConstraintValA(Op, Val);
17787 case 'B':
17788 return isInt<32>(Val);
17789 case 'C':
17790 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
17791 AMDGPU::isInlinableIntLiteral(Val);
17792 default:
17793 break;
17794 }
17795 } else if (Constraint.size() == 2) {
17796 if (Constraint == "DA") {
17797 int64_t HiBits = static_cast<int32_t>(Val >> 32);
17798 int64_t LoBits = static_cast<int32_t>(Val);
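 // Illustrative example: the 64-bit value 0x0000004000000001 satisfies "DA"
 // because both 32-bit halves (64 and 1) are integer inline constants, so
 // each of the two checkAsmConstraintValA calls below succeeds.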
17799 return checkAsmConstraintValA(Op, HiBits, 32) &&
17800 checkAsmConstraintValA(Op, LoBits, 32);
17801 }
17802 if (Constraint == "DB") {
17803 return true;
17804 }
17805 }
17806 llvm_unreachable("Invalid asm constraint");
17807}
17808
17809bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
17810 unsigned MaxSize) const {
17811 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
17812 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
17813 if (Size == 16) {
17814 MVT VT = Op.getSimpleValueType();
17815 switch (VT.SimpleTy) {
17816 default:
17817 return false;
17818 case MVT::i16:
17819 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
17820 case MVT::f16:
17821 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
17822 case MVT::bf16:
17823 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
17824 case MVT::v2i16:
17825 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
17826 case MVT::v2f16:
17827 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
17828 case MVT::v2bf16:
17829 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
17830 }
17831 }
17832 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
17833 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
17834 return true;
17835 return false;
17836}
17837
17838static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
17839 switch (UnalignedClassID) {
17840 case AMDGPU::VReg_64RegClassID:
17841 return AMDGPU::VReg_64_Align2RegClassID;
17842 case AMDGPU::VReg_96RegClassID:
17843 return AMDGPU::VReg_96_Align2RegClassID;
17844 case AMDGPU::VReg_128RegClassID:
17845 return AMDGPU::VReg_128_Align2RegClassID;
17846 case AMDGPU::VReg_160RegClassID:
17847 return AMDGPU::VReg_160_Align2RegClassID;
17848 case AMDGPU::VReg_192RegClassID:
17849 return AMDGPU::VReg_192_Align2RegClassID;
17850 case AMDGPU::VReg_224RegClassID:
17851 return AMDGPU::VReg_224_Align2RegClassID;
17852 case AMDGPU::VReg_256RegClassID:
17853 return AMDGPU::VReg_256_Align2RegClassID;
17854 case AMDGPU::VReg_288RegClassID:
17855 return AMDGPU::VReg_288_Align2RegClassID;
17856 case AMDGPU::VReg_320RegClassID:
17857 return AMDGPU::VReg_320_Align2RegClassID;
17858 case AMDGPU::VReg_352RegClassID:
17859 return AMDGPU::VReg_352_Align2RegClassID;
17860 case AMDGPU::VReg_384RegClassID:
17861 return AMDGPU::VReg_384_Align2RegClassID;
17862 case AMDGPU::VReg_512RegClassID:
17863 return AMDGPU::VReg_512_Align2RegClassID;
17864 case AMDGPU::VReg_1024RegClassID:
17865 return AMDGPU::VReg_1024_Align2RegClassID;
17866 case AMDGPU::AReg_64RegClassID:
17867 return AMDGPU::AReg_64_Align2RegClassID;
17868 case AMDGPU::AReg_96RegClassID:
17869 return AMDGPU::AReg_96_Align2RegClassID;
17870 case AMDGPU::AReg_128RegClassID:
17871 return AMDGPU::AReg_128_Align2RegClassID;
17872 case AMDGPU::AReg_160RegClassID:
17873 return AMDGPU::AReg_160_Align2RegClassID;
17874 case AMDGPU::AReg_192RegClassID:
17875 return AMDGPU::AReg_192_Align2RegClassID;
17876 case AMDGPU::AReg_256RegClassID:
17877 return AMDGPU::AReg_256_Align2RegClassID;
17878 case AMDGPU::AReg_512RegClassID:
17879 return AMDGPU::AReg_512_Align2RegClassID;
17880 case AMDGPU::AReg_1024RegClassID:
17881 return AMDGPU::AReg_1024_Align2RegClassID;
17882 default:
17883 return -1;
17884 }
17885}
17886
17887// Figure out which registers should be reserved for stack access. Only after
17888// the function is legalized do we know all of the non-spill stack objects or if
17889// calls are present.
17890void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
17891 MachineRegisterInfo &MRI = MF.getRegInfo();
17892 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
17893 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
17894 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17895 const SIInstrInfo *TII = ST.getInstrInfo();
17896
17897 if (Info->isEntryFunction()) {
17898 // Callable functions have fixed registers used for stack access.
17899 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
17900 }
17901
17902 // TODO: Move this logic to getReservedRegs()
17903 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
17904 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
17905 Register SReg = ST.isWave32()
17906 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
17907 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
17908 &AMDGPU::SGPR_64RegClass);
17909 Info->setSGPRForEXECCopy(SReg);
17910
17911 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
17912 Info->getStackPtrOffsetReg()));
17913 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
17914 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
17915
17916 // We need to worry about replacing the default register with itself in case
17917 // of MIR testcases missing the MFI.
17918 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
17919 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
17920
17921 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
17922 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
17923
17924 Info->limitOccupancy(MF);
17925
17926 if (ST.isWave32() && !MF.empty()) {
17927 for (auto &MBB : MF) {
17928 for (auto &MI : MBB) {
17929 TII->fixImplicitOperands(MI);
17930 }
17931 }
17932 }
17933
17934 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
17935 // classes if required. Ideally the register class constraints would differ
17936 // per-subtarget, but there's no easy way to achieve that right now. This is
17937 // not a problem for VGPRs because the correctly aligned VGPR class is implied
17938 // from using them as the register class for legal types.
17939 if (ST.needsAlignedVGPRs()) {
17940 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
17941 const Register Reg = Register::index2VirtReg(I);
17942 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
17943 if (!RC)
17944 continue;
17945 int NewClassID = getAlignedAGPRClassID(RC->getID());
17946 if (NewClassID != -1)
17947 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
17948 }
17949 }
17950
17951 TargetLoweringBase::finalizeLowering(MF);
17952}
17953
17954void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
17955 KnownBits &Known,
17956 const APInt &DemandedElts,
17957 const SelectionDAG &DAG,
17958 unsigned Depth) const {
17959 Known.resetAll();
17960 unsigned Opc = Op.getOpcode();
17961 switch (Opc) {
17962 case ISD::INTRINSIC_WO_CHAIN: {
17963 unsigned IID = Op.getConstantOperandVal(0);
17964 switch (IID) {
17965 case Intrinsic::amdgcn_mbcnt_lo:
17966 case Intrinsic::amdgcn_mbcnt_hi: {
17967 const GCNSubtarget &ST =
17968 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
17969 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
17970 // most 31 + src1.
17971 Known.Zero.setBitsFrom(
17972 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
17973 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
17974 Known = KnownBits::add(Known, Known2);
17975 return;
17976 }
17977 }
17978 break;
17979 }
17980 }
17981 return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
17982 Op, Known, DemandedElts, DAG, Depth);
17983}
17984
17985void SITargetLowering::computeKnownBitsForFrameIndex(
17986 const int FI, KnownBits &Known, const MachineFunction &MF) const {
17987 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
17988
17989 // Set the high bits to zero based on the maximum allowed scratch size per
17990 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
17991 // calculation won't overflow, so assume the sign bit is never set.
17992 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
17993}
17994
17995static void knownBitsForWorkitemID(const GCNSubtarget &ST,
17996 GISelValueTracking &VT, KnownBits &Known,
17997 unsigned Dim) {
17998 unsigned MaxValue =
17999 ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
18000 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
18001}
18002
18003static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT,
18004 KnownBits &Known, const APInt &DemandedElts,
18005 unsigned BFEWidth, bool SExt, unsigned Depth) {
18006 const MachineRegisterInfo &MRI = VT.getMachineFunction().getRegInfo();
18007 const MachineOperand &Src1 = MI.getOperand(2);
18008
18009 unsigned Src1Cst = 0;
18010 if (Src1.isImm()) {
18011 Src1Cst = Src1.getImm();
18012 } else if (Src1.isReg()) {
18013 auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
18014 if (!Cst)
18015 return;
18016 Src1Cst = Cst->Value.getZExtValue();
18017 } else {
18018 return;
18019 }
18020
18021 // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
18022 // Width is always [22:16].
18023 const unsigned Offset =
18024 Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
18025 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
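 // Illustrative example: with BFEWidth == 32 and Src1Cst == 0x00080010, the
 // decode above gives Offset = 16 and Width = 8, so the known bits of bits
 // [23:16] of the source are extracted and then extended back to 32 bits.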
18026
18027 if (Width >= BFEWidth) // Ill-formed.
18028 return;
18029
18030 VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
18031 Depth + 1);
18032
18033 Known = Known.extractBits(Width, Offset);
18034
18035 if (SExt)
18036 Known = Known.sext(BFEWidth);
18037 else
18038 Known = Known.zext(BFEWidth);
18039}
18040
18041void SITargetLowering::computeKnownBitsForTargetInstr(
18042 GISelValueTracking &VT, Register R, KnownBits &Known,
18043 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
18044 unsigned Depth) const {
18045 Known.resetAll();
18046 const MachineInstr *MI = MRI.getVRegDef(R);
18047 switch (MI->getOpcode()) {
18048 case AMDGPU::S_BFE_I32:
18049 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18050 /*SExt=*/true, Depth);
18051 case AMDGPU::S_BFE_U32:
18052 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18053 /*SExt=*/false, Depth);
18054 case AMDGPU::S_BFE_I64:
18055 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18056 /*SExt=*/true, Depth);
18057 case AMDGPU::S_BFE_U64:
18058 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18059 /*SExt=*/false, Depth);
18060 case AMDGPU::G_INTRINSIC:
18061 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18062 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
18063 switch (IID) {
18064 case Intrinsic::amdgcn_workitem_id_x:
18065 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
18066 break;
18067 case Intrinsic::amdgcn_workitem_id_y:
18068 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
18069 break;
18070 case Intrinsic::amdgcn_workitem_id_z:
18071 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
18072 break;
18073 case Intrinsic::amdgcn_mbcnt_lo:
18074 case Intrinsic::amdgcn_mbcnt_hi: {
18075 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18076 // most 31 + src1.
18077 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
18078 ? getSubtarget()->getWavefrontSizeLog2()
18079 : 5);
18080 KnownBits Known2;
18081 VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
18082 Depth + 1);
18083 Known = KnownBits::add(Known, Known2);
18084 break;
18085 }
18086 case Intrinsic::amdgcn_groupstaticsize: {
18087 // We can report everything over the maximum size as 0. We can't report
18088 // based on the actual size because we don't know if it's accurate or not
18089 // at any given point.
18090 Known.Zero.setHighBits(
18091 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
18092 break;
18093 }
18094 }
18095 break;
18096 }
18097 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18098 Known.Zero.setHighBits(24);
18099 break;
18100 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18101 Known.Zero.setHighBits(16);
18102 break;
18103 case AMDGPU::G_AMDGPU_SMED3:
18104 case AMDGPU::G_AMDGPU_UMED3: {
18105 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18106
18107 KnownBits Known2;
18108 VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
18109 if (Known2.isUnknown())
18110 break;
18111
18112 KnownBits Known1;
18113 VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
18114 if (Known1.isUnknown())
18115 break;
18116
18117 KnownBits Known0;
18118 VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
18119 if (Known0.isUnknown())
18120 break;
18121
18122 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
18123 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
18124 Known.One = Known0.One & Known1.One & Known2.One;
18125 break;
18126 }
18127 }
18128}
18129
18130Align SITargetLowering::computeKnownAlignForTargetInstr(
18131 GISelValueTracking &VT, Register R, const MachineRegisterInfo &MRI,
18132 unsigned Depth) const {
18133 const MachineInstr *MI = MRI.getVRegDef(R);
18134 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
18135 // FIXME: Can this move to generic code? What about the case where the call
18136 // site specifies a lower alignment?
18137 Intrinsic::ID IID = GI->getIntrinsicID();
18138 LLVMContext &Ctx = VT.getMachineFunction().getFunction().getContext();
18139 AttributeList Attrs =
18140 Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
18141 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18142 return *RetAlign;
18143 }
18144 return Align(1);
18145}
18146
18147Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
18148 const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
18149 const Align CacheLineAlign = Align(64);
18150
18151 // Pre-GFX10 targets did not benefit from loop alignment.
18152 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
18153 getSubtarget()->hasInstFwdPrefetchBug())
18154 return PrefAlign;
18155
18156 // On GFX10 I$ is 4 x 64 bytes cache lines.
18157 // By default prefetcher keeps one cache line behind and reads two ahead.
18158 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
18159 // behind and one ahead.
18160 // Therefore we can benefit from aligning loop headers if the loop fits in 192 bytes.
18161 // If the loop fits in 64 bytes it always spans no more than two cache lines and
18162 // does not need an alignment.
18163 // Else, if the loop is at most 128 bytes, we do not need to modify the prefetch.
18164 // Else, if the loop is at most 192 bytes, we need two lines behind.
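 // Illustrative example: a 160-byte loop aligned to a cache line spans three
 // of the four I$ lines, so the two-behind/one-ahead S_INST_PREFETCH mode set
 // up below keeps every line of the loop within the prefetch window; a
 // 100-byte loop only needs the cache-line alignment itself.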
18165
18166 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
18167 const MachineBasicBlock *Header = ML->getHeader();
18168 if (Header->getAlignment() != PrefAlign)
18169 return Header->getAlignment(); // Already processed.
18170
18171 unsigned LoopSize = 0;
18172 for (const MachineBasicBlock *MBB : ML->blocks()) {
18173 // If an inner loop block is aligned, assume on average half of the alignment
18174 // size is added as nops.
18175 if (MBB != Header)
18176 LoopSize += MBB->getAlignment().value() / 2;
18177
18178 for (const MachineInstr &MI : *MBB) {
18179 LoopSize += TII->getInstSizeInBytes(MI);
18180 if (LoopSize > 192)
18181 return PrefAlign;
18182 }
18183 }
18184
18185 if (LoopSize <= 64)
18186 return PrefAlign;
18187
18188 if (LoopSize <= 128)
18189 return CacheLineAlign;
18190
18191 // If any of parent loops is surrounded by prefetch instructions do not
18192 // insert new for inner loop, which would reset parent's settings.
18193 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
18194 if (MachineBasicBlock *Exit = P->getExitBlock()) {
18195 auto I = Exit->getFirstNonDebugInstr();
18196 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18197 return CacheLineAlign;
18198 }
18199 }
18200
18201 MachineBasicBlock *Pre = ML->getLoopPreheader();
18202 MachineBasicBlock *Exit = ML->getExitBlock();
18203
18204 if (Pre && Exit) {
18205 auto PreTerm = Pre->getFirstTerminator();
18206 if (PreTerm == Pre->begin() ||
18207 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18208 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18209 .addImm(1); // prefetch 2 lines behind PC
18210
18211 auto ExitHead = Exit->getFirstNonDebugInstr();
18212 if (ExitHead == Exit->end() ||
18213 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18214 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18215 .addImm(2); // prefetch 1 line behind PC
18216 }
18217
18218 return CacheLineAlign;
18219}
18220
18221LLVM_ATTRIBUTE_UNUSED
18222static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
18223 assert(N->getOpcode() == ISD::CopyFromReg);
18224 do {
18225 // Follow the chain until we find an INLINEASM node.
18226 N = N->getOperand(0).getNode();
18227 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
18228 return true;
18229 } while (N->getOpcode() == ISD::CopyFromReg);
18230 return false;
18231}
18232
18233bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
18234 FunctionLoweringInfo *FLI,
18235 UniformityInfo *UA) const {
18236 switch (N->getOpcode()) {
18237 case ISD::CopyFromReg: {
18238 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
18239 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
18240 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18241 Register Reg = R->getReg();
18242
18243 // FIXME: Why does this need to consider isLiveIn?
18244 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18245 return !TRI->isSGPRReg(MRI, Reg);
18246
18247 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
18248 return UA->isDivergent(V);
18249
18250 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
18251 return !TRI->isSGPRReg(MRI, Reg);
18252 }
18253 case ISD::LOAD: {
18254 const LoadSDNode *L = cast<LoadSDNode>(N);
18255 unsigned AS = L->getAddressSpace();
18256 // A flat load may access private memory.
18257 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
18258 }
18259 case ISD::CALLSEQ_END:
18260 return true;
18261 case ISD::INTRINSIC_WO_CHAIN:
18262 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
18263 case ISD::INTRINSIC_W_CHAIN:
18264 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
18283 // Target-specific read-modify-write atomics are sources of divergence.
18284 return true;
18285 default:
18286 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
18287 // Generic read-modify-write atomics are sources of divergence.
18288 return A->readMem() && A->writeMem();
18289 }
18290 return false;
18291 }
18292}
18293
18294bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
18295 EVT VT) const {
18296 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
18297 case MVT::f32:
18298 return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
18299 case MVT::f64:
18300 case MVT::f16:
18301 return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
18302 default:
18303 return false;
18304 }
18305}
18306
18307bool SITargetLowering::denormalsEnabledForType(
18308 LLT Ty, const MachineFunction &MF) const {
18309 switch (Ty.getScalarSizeInBits()) {
18310 case 32:
18311 return !denormalModeIsFlushAllF32(MF);
18312 case 64:
18313 case 16:
18314 return !denormalModeIsFlushAllF64F16(MF);
18315 default:
18316 return false;
18317 }
18318}
18319
18320bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
18321 const APInt &DemandedElts,
18322 const SelectionDAG &DAG,
18323 bool SNaN,
18324 unsigned Depth) const {
18325 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
18326 const MachineFunction &MF = DAG.getMachineFunction();
18327 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
18328
18329 if (Info->getMode().DX10Clamp)
18330 return true; // Clamped to 0.
18331 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
18332 }
18333
18334 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DemandedElts,
18335 DAG, SNaN, Depth);
18336}
18337
18338// On older subtargets, global FP atomic instructions have a hardcoded FP mode
18339// and do not support FP32 denormals, and only support v2f16/f64 denormals.
18340static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
18341 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
18342 return true;
18343
18344 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
18345 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
18346 if (DenormMode == DenormalMode::getPreserveSign())
18347 return true;
18348
18349 // TODO: Remove this.
18350 return RMW->getFunction()
18351 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
18352 .getValueAsBool();
18353}
18354
18355static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
18356 LLVMContext &Ctx = RMW->getContext();
18357 StringRef MemScope =
18358 Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");
18359
18360 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
18361 << "Hardware instruction generated for atomic "
18362 << RMW->getOperationName(RMW->getOperation())
18363 << " operation at memory scope " << MemScope;
18364}
18365
18366static bool isV2F16OrV2BF16(Type *Ty) {
18367 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
18368 Type *EltTy = VT->getElementType();
18369 return VT->getNumElements() == 2 &&
18370 (EltTy->isHalfTy() || EltTy->isBFloatTy());
18371 }
18372
18373 return false;
18374}
18375
18376static bool isV2F16(Type *Ty) {
18377 auto *VT = dyn_cast<FixedVectorType>(Ty);
18378 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
18379}
18380
18381static bool isV2BF16(Type *Ty) {
18382 auto *VT = dyn_cast<FixedVectorType>(Ty);
18383 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
18384}
18385
18386/// \return true if atomicrmw integer ops work for the type.
18387static bool isAtomicRMWLegalIntTy(Type *Ty) {
18388 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
18389 unsigned BW = IT->getBitWidth();
18390 return BW == 32 || BW == 64;
18391 }
18392
18393 return false;
18394}
18395
18396/// \return true if this atomicrmw xchg type can be selected.
18397static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
18398 Type *Ty = RMW->getType();
18399 if (isAtomicRMWLegalIntTy(Ty))
18400 return true;
18401
18402 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
18403 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
18404 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18405 return BW == 32 || BW == 64;
18406 }
18407
18408 if (Ty->isFloatTy() || Ty->isDoubleTy())
18409 return true;
18410
18411 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
18412 return VT->getNumElements() == 2 &&
18413 VT->getElementType()->getPrimitiveSizeInBits() == 16;
18414 }
18415
18416 return false;
18417}
18418
18419/// \returns true if it's valid to emit a native instruction for \p RMW, based
18420/// on the properties of the target memory.
18421static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
18422 const AtomicRMWInst *RMW,
18423 bool HasSystemScope) {
18424 // The remote/fine-grained access logic is different from the integer
18425 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
18426 // fine-grained access does not work, even for a device local allocation.
18427 //
18428 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
18429 // allocations work.
18430 if (HasSystemScope) {
18431 if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
18432 RMW->hasMetadata("amdgpu.no.remote.memory"))
18433 return true;
18434 if (Subtarget.hasEmulatedSystemScopeAtomics())
18435 return true;
18436 } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18437 return true;
18438
18439 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18440}
18441
18442/// \return Action to perform on AtomicRMWInsts for integer operations.
18443static TargetLowering::AtomicExpansionKind
18444atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
18445 return isAtomicRMWLegalIntTy(RMW->getType())
18446 ? TargetLowering::AtomicExpansionKind::None
18447 : TargetLowering::AtomicExpansionKind::CmpXChg;
18448}
18449
18450/// Return if a flat address space atomicrmw can access private memory.
18451static bool flatInstrMayAccessPrivate(const Instruction *I) {
18452 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18453 return !MD ||
18455}
18456
18464
18465TargetLowering::AtomicExpansionKind
18466SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
18467 unsigned AS = RMW->getPointerAddressSpace();
18468 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
18469 return getPrivateAtomicExpansionKind(*getSubtarget());
18470
18471 // 64-bit flat atomics that dynamically reside in private memory will silently
18472 // be dropped.
18473 //
18474 // Note that we will emit a new copy of the original atomic in the expansion,
18475 // which will be incrementally relegalized.
18476 const DataLayout &DL = RMW->getFunction()->getDataLayout();
18477 if (AS == AMDGPUAS::FLAT_ADDRESS &&
18478 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18479 flatInstrMayAccessPrivate(RMW))
18480 return AtomicExpansionKind::CustomExpand;
18481
18482 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
18483 OptimizationRemarkEmitter ORE(RMW->getFunction());
18484 ORE.emit([=]() {
18485 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
18486 });
18487 return Kind;
18488 };
18489
18490 auto SSID = RMW->getSyncScopeID();
18491 bool HasSystemScope =
18492 SSID == SyncScope::System ||
18493 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
18494
18495 auto Op = RMW->getOperation();
18496 switch (Op) {
18497 case AtomicRMWInst::Xchg:
18498 // PCIe supports add and xchg for system atomics.
18499 return isAtomicRMWLegalXChgTy(RMW)
18500 ? TargetLowering::AtomicExpansionKind::None
18501 : TargetLowering::AtomicExpansionKind::CmpXChg;
18502 case AtomicRMWInst::Add:
18503 // PCIe supports add and xchg for system atomics.
18504 return atomicSupportedIfLegalIntType(RMW);
18505 case AtomicRMWInst::Sub:
18506 case AtomicRMWInst::And:
18507 case AtomicRMWInst::Or:
18508 case AtomicRMWInst::Xor:
18509 case AtomicRMWInst::Max:
18510 case AtomicRMWInst::Min:
18511 case AtomicRMWInst::UMax:
18512 case AtomicRMWInst::UMin:
18513 case AtomicRMWInst::UIncWrap:
18514 case AtomicRMWInst::UDecWrap: {
18515 if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
18516 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
18517 if (Subtarget->hasEmulatedSystemScopeAtomics())
18518 return atomicSupportedIfLegalIntType(RMW);
18519
18520 // On most subtargets, for atomicrmw operations other than add/xchg,
18521 // whether or not the instructions will behave correctly depends on where
18522 // the address physically resides and what interconnect is used in the
18523 // system configuration. On some targets the instruction will nop,
18524 // and in others synchronization will only occur at degraded device scope.
18525 //
18526 // If the allocation is known local to the device, the instructions should
18527 // work correctly.
18528 if (RMW->hasMetadata("amdgpu.no.remote.memory"))
18529 return atomicSupportedIfLegalIntType(RMW);
18530
18531 // If fine-grained remote memory works at device scope, we don't need to
18532 // do anything.
18533 if (!HasSystemScope &&
18534 Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18535 return atomicSupportedIfLegalIntType(RMW);
18536
18537 // If we are targeting a remote allocated address, it depends what kind of
18538 // allocation the address belongs to.
18539 //
18540 // If the allocation is fine-grained (in host memory, or in PCIe peer
18541 // device memory), the operation will fail depending on the target.
18542 //
18543 // Note fine-grained host memory access does work on APUs or if XGMI is
18544 // used, but we do not know if we are targeting an APU or the system
18545 // configuration from the ISA version/target-cpu.
18546 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
18547 return atomicSupportedIfLegalIntType(RMW);
18548
18549 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
18550 Op == AtomicRMWInst::Xor) {
18551 // Atomic sub/or/xor do not work over PCI express, but atomic add
18552 // does. InstCombine transforms these with 0 to or, so undo that.
18553 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
18554 ConstVal && ConstVal->isNullValue())
18555 return AtomicExpansionKind::CustomExpand;
18556 }
18557
18558 // If the allocation could be in remote, fine-grained memory, the rmw
18559 // instructions may fail. cmpxchg should work, so emit that. On some
18560 // system configurations, PCIe atomics aren't supported so cmpxchg won't
18561 // even work, so you're out of luck anyway.
18562
18563 // In summary:
18564 //
18565 // Cases that may fail:
18566 // - fine-grained pinned host memory
18567 // - fine-grained migratable host memory
18568 // - fine-grained PCIe peer device
18569 //
18570 // Cases that should work, but may be treated overly conservatively.
18571 // - fine-grained host memory on an APU
18572 // - fine-grained XGMI peer device
18573 return AtomicExpansionKind::CmpXChg;
18574 }
18575
18576 return atomicSupportedIfLegalIntType(RMW);
18577 }
18578 case AtomicRMWInst::FAdd: {
18579 Type *Ty = RMW->getType();
18580
18581 // TODO: Handle REGION_ADDRESS
18582 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18583 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
18584 // is fixed to round-to-nearest-even.
18585 //
18586 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
18587 // round-to-nearest-even.
18588 //
18589 // We ignore the rounding mode problem, even in strictfp. The C++ standard
18590 // suggests it is OK if the floating-point mode may not match the calling
18591 // thread.
18592 if (Ty->isFloatTy()) {
18593 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
18594 : AtomicExpansionKind::CmpXChg;
18595 }
18596
18597 if (Ty->isDoubleTy()) {
18598 // Ignores denormal mode, but we don't consider flushing mandatory.
18599 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
18600 : AtomicExpansionKind::CmpXChg;
18601 }
18602
18603 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18604 return AtomicExpansionKind::None;
18605
18606 return AtomicExpansionKind::CmpXChg;
18607 }
18608
18609 // LDS atomics respect the denormal mode from the mode register.
18610 //
18611 // Traditionally f32 global/buffer memory atomics would unconditionally
18612 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
18613 // flush.
18614 //
18615 // On targets with flat atomic fadd, denormals would flush depending on
18616 // whether the target address resides in LDS or global memory. We consider
18617 // this flat-maybe-flush as will-flush.
18618 if (Ty->isFloatTy() &&
18619 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18620 !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
18621 return AtomicExpansionKind::CmpXChg;
18622
18623 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
18624 // safe. The message phrasing also should be better.
18625 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18626 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18627 // gfx942, gfx12
18628 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18629 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18630 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
18631 // gfx90a, gfx942, gfx12
18632 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18633 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18634
18635 // gfx942, gfx12
18636 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18637 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18638 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
18639 // gfx90a, gfx942, gfx12
18640 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18641 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18642
18643 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
18644 // buffer. gfx12 does have the buffer version.
18645 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18646 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18647 }
18648
18649 // global and flat atomic fadd f64: gfx90a, gfx942.
18650 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18651 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18652
18653 if (AS != AMDGPUAS::FLAT_ADDRESS) {
18654 if (Ty->isFloatTy()) {
18655 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
18656 // gfx11+.
18657 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18658 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18659 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
18660 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18661 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18662 } else {
18663 // gfx908
18664 if (RMW->use_empty() &&
18665 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18666 isV2F16(Ty))
18667 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18668 }
18669 }
18670
18671 // flat atomic fadd f32: gfx942, gfx11+.
18672 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
18673 if (Subtarget->hasFlatAtomicFaddF32Inst())
18674 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18675
18676 // If it is in flat address space, and the type is float, we will try to
18677 // expand it, if the target supports global and lds atomic fadd. The
18678 // reason we need that is, in the expansion, we emit the check of
18679 // address space. If it is in global address space, we emit the global
18680 // atomic fadd; if it is in shared address space, we emit the LDS atomic
18681 // fadd.
18682 if (Subtarget->hasLDSFPAtomicAddF32()) {
18683 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18684 return AtomicExpansionKind::CustomExpand;
18685 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18686 return AtomicExpansionKind::CustomExpand;
18687 }
18688 }
18689 }
18690
18691 return AtomicExpansionKind::CmpXChg;
18692 }
18693 case AtomicRMWInst::FMin:
18694 case AtomicRMWInst::FMax: {
18695 Type *Ty = RMW->getType();
18696
18697 // LDS float and double fmin/fmax were always supported.
18698 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18699 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
18700 : AtomicExpansionKind::CmpXChg;
18701 }
18702
18703 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18704 // For flat and global cases:
18705 // float, double in gfx7. Manual claims denormal support.
18706 // Removed in gfx8.
18707 // float, double restored in gfx10.
18708 // double removed again in gfx11, so only f32 for gfx11/gfx12.
18709 //
18710 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
18711 // no f32.
18712 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18713 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18714 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18715 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18716 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18717 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
18718 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
18719 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18720 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18721 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18722 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18723 }
18724 }
18725
18726 return AtomicExpansionKind::CmpXChg;
18727 }
18728 case AtomicRMWInst::Nand:
18729 case AtomicRMWInst::FSub:
18730 default:
18731 return AtomicExpansionKind::CmpXChg;
18732 }
18733
18734 llvm_unreachable("covered atomicrmw op switch");
18735}
18736
18743
18750
18751TargetLowering::AtomicExpansionKind
18752SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
18753 unsigned AddrSpace = CmpX->getPointerAddressSpace();
18754 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
18755 return getPrivateAtomicExpansionKind(*getSubtarget());
18756
18757 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
18758 return AtomicExpansionKind::None;
18759
18760 const DataLayout &DL = CmpX->getDataLayout();
18761
18762 Type *ValTy = CmpX->getNewValOperand()->getType();
18763
18764 // If a 64-bit flat atomic may alias private, we need to avoid using the
18765 // atomic in the private case.
18766 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
18768}
18769
18770const TargetRegisterClass *
18771SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
18772 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
18773 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18774 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18775 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18776 : &AMDGPU::SReg_32RegClass;
18777 if (!TRI->isSGPRClass(RC) && !isDivergent)
18778 return TRI->getEquivalentSGPRClass(RC);
18779 if (TRI->isSGPRClass(RC) && isDivergent)
18780 return TRI->getEquivalentVGPRClass(RC);
18781
18782 return RC;
18783}
18784
18785// FIXME: This is a workaround for DivergenceAnalysis not understanding always
18786// uniform values (as produced by the mask results of control flow intrinsics)
18787// used outside of divergent blocks. The phi users need to also be treated as
18788// always uniform.
18789//
18790// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
18791static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
18792 unsigned WaveSize) {
18793 // FIXME: We assume we never cast the mask results of a control flow
18794 // intrinsic.
18795 // Early exit if the type won't be consistent as a compile time hack.
18796 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
18797 if (!IT || IT->getBitWidth() != WaveSize)
18798 return false;
18799
18800 if (!isa<Instruction>(V))
18801 return false;
18802 if (!Visited.insert(V).second)
18803 return false;
18804 bool Result = false;
18805 for (const auto *U : V->users()) {
18806 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
18807 if (V == U->getOperand(1)) {
18808 switch (Intrinsic->getIntrinsicID()) {
18809 default:
18810 Result = false;
18811 break;
18812 case Intrinsic::amdgcn_if_break:
18813 case Intrinsic::amdgcn_if:
18814 case Intrinsic::amdgcn_else:
18815 Result = true;
18816 break;
18817 }
18818 }
18819 if (V == U->getOperand(0)) {
18820 switch (Intrinsic->getIntrinsicID()) {
18821 default:
18822 Result = false;
18823 break;
18824 case Intrinsic::amdgcn_end_cf:
18825 case Intrinsic::amdgcn_loop:
18826 Result = true;
18827 break;
18828 }
18829 }
18830 } else {
18831 Result = hasCFUser(U, Visited, WaveSize);
18832 }
18833 if (Result)
18834 break;
18835 }
18836 return Result;
18837}
18838
18839bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
18840 const Value *V) const {
18841 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
18842 if (CI->isInlineAsm()) {
18843 // FIXME: This cannot give a correct answer. This should only trigger in
18844 // the case where inline asm returns mixed SGPR and VGPR results, used
18845 // outside the defining block. We don't have a specific result to
18846 // consider, so this assumes if any value is SGPR, the overall register
18847 // also needs to be SGPR.
18848 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
18849 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
18850 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
18851 for (auto &TC : TargetConstraints) {
18852 if (TC.Type == InlineAsm::isOutput) {
18853 ComputeConstraintToUse(TC, SDValue());
18854 const TargetRegisterClass *RC =
18855 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
18856 TC.ConstraintVT)
18857 .second;
18858 if (RC && SIRI->isSGPRClass(RC))
18859 return true;
18860 }
18861 }
18862 }
18863 }
18864 SmallPtrSet<const Value *, 16> Visited;
18865 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
18866}
18867
18868bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
18869 for (SDUse &Use : N->uses()) {
18870 if (MemSDNode *M = dyn_cast<MemSDNode>(Use.getUser())) {
18871 if (getBasePtrIndex(M) == Use.getOperandNo())
18872 return true;
18873 }
18874 }
18875 return false;
18876}
18877
18878bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
18879 SDValue N1) const {
18880 if (!N0.hasOneUse())
18881 return false;
18882 // Take care of the opportunity to keep N0 uniform
18883 if (N0->isDivergent() || !N1->isDivergent())
18884 return true;
18885 // Check if we have a good chance to form the memory access pattern with the
18886 // base and offset
18887 return (DAG.isBaseWithConstantOffset(N0) &&
18888 hasMemSDNodeUser(*N0->user_begin()));
18889}
18890
18891bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
18892 Register N0, Register N1) const {
18893 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
18894}
18895
18898 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
18900 if (I.getMetadata("amdgpu.noclobber"))
18901 Flags |= MONoClobber;
18902 if (I.getMetadata("amdgpu.last.use"))
18903 Flags |= MOLastUse;
18904 return Flags;
18905}
18906
18907bool SITargetLowering::checkForPhysRegDependency(
18908 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
18909 const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const {
18910 if (User->getOpcode() != ISD::CopyToReg)
18911 return false;
18912 if (!Def->isMachineOpcode())
18913 return false;
18914 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
18915 if (!MDef)
18916 return false;
18917
18918 unsigned ResNo = User->getOperand(Op).getResNo();
18919 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
18920 return false;
18921 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
18922 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
18923 PhysReg = AMDGPU::SCC;
18924 const TargetRegisterClass *RC =
18925 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
18926 Cost = RC->getCopyCost();
18927 return true;
18928 }
18929 return false;
18930}
18931
18932void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
18933 Instruction *AI) const {
18934 // Given: atomicrmw fadd ptr %addr, float %val ordering
18935 //
18936 // With this expansion we produce the following code:
18937 // [...]
18938 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
18939 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
18940 //
18941 // atomicrmw.shared:
18942 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
18943 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
18944 // float %val ordering
18945 // br label %atomicrmw.phi
18946 //
18947 // atomicrmw.check.private:
18948 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
18949 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
18950 //
18951 // atomicrmw.private:
18952 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
18953 // %loaded.private = load float, ptr addrspace(5) %cast.private
18954 // %val.new = fadd float %loaded.private, %val
18955 // store float %val.new, ptr addrspace(5) %cast.private
18956 // br label %atomicrmw.phi
18957 //
18958 // atomicrmw.global:
18959 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
18960 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
18961 // float %val ordering
18962 // br label %atomicrmw.phi
18963 //
18964 // atomicrmw.phi:
18965 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
18966 // [ %loaded.private, %atomicrmw.private ],
18967 // [ %loaded.global, %atomicrmw.global ]
18968 // br label %atomicrmw.end
18969 //
18970 // atomicrmw.end:
18971 // [...]
18972 //
18973 //
18974 // For 64-bit atomics which may reside in private memory, we perform a simpler
18975 // version that only inserts the private check, and uses the flat operation.
18976
18977 IRBuilder<> Builder(AI);
18978 LLVMContext &Ctx = Builder.getContext();
18979
18980 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
18981 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
18982 : AtomicCmpXchgInst::getPointerOperandIndex();
18983 Value *Addr = AI->getOperand(PtrOpIdx);
18984
18985 /// TODO: Only need to check private, then emit flat-known-not private (no
18986 /// need for shared block, or cast to global).
18987 auto *CX = dyn_cast<AtomicCmpXchgInst>(AI);
18988
18989 Align Alignment;
18990 if (RMW)
18991 Alignment = RMW->getAlign();
18992 else if (CX)
18993 Alignment = CX->getAlign();
18994 else
18995 llvm_unreachable("unhandled atomic operation");
18996
18997 // FullFlatEmulation is true if we need to issue the private, shared, and
18998 // global cases.
18999 //
19000 // If this is false, we are only dealing with the flat-targeting-private case,
19001 // where we only insert a check for private and still use the flat instruction
19002 // for global and shared.
19003
19004 bool FullFlatEmulation =
19005 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
19006 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
19007 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
19008 RMW->getType()->isDoubleTy()));
19009
19010 // If the return value isn't used, do not introduce a false use in the phi.
19011 bool ReturnValueIsUsed = !AI->use_empty();
19012
19013 BasicBlock *BB = Builder.GetInsertBlock();
19014 Function *F = BB->getParent();
19015 BasicBlock *ExitBB =
19016 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
19017 BasicBlock *SharedBB = nullptr;
19018
19019 BasicBlock *CheckPrivateBB = BB;
19020 if (FullFlatEmulation) {
19021 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
19022 CheckPrivateBB =
19023 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
19024 }
19025
19026 BasicBlock *PrivateBB =
19027 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
19028 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
19029 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
19030
19031 std::prev(BB->end())->eraseFromParent();
19032 Builder.SetInsertPoint(BB);
19033
19034 Value *LoadedShared = nullptr;
19035 if (FullFlatEmulation) {
19036 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
19037 {Addr}, nullptr, "is.shared");
19038 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
19039 Builder.SetInsertPoint(SharedBB);
19040 Value *CastToLocal = Builder.CreateAddrSpaceCast(
19041 Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
19042
19043 Instruction *Clone = AI->clone();
19044 Clone->insertInto(SharedBB, SharedBB->end());
19045 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
19046 LoadedShared = Clone;
19047
19048 Builder.CreateBr(PhiBB);
19049 Builder.SetInsertPoint(CheckPrivateBB);
19050 }
19051
19052 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19053 {Addr}, nullptr, "is.private");
19054 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19055
19056 Builder.SetInsertPoint(PrivateBB);
19057
19058 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19059 Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
19060
19061 Value *LoadedPrivate;
19062 if (RMW) {
19063 LoadedPrivate = Builder.CreateAlignedLoad(
19064 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
19065
19066 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
19067 LoadedPrivate, RMW->getValOperand());
19068
19069 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19070 } else {
19071 auto [ResultLoad, Equal] =
19072 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
19073 CX->getNewValOperand(), CX->getAlign());
19074
19075 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
19076 ResultLoad, 0);
19077 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19078 }
19079
19080 Builder.CreateBr(PhiBB);
19081
19082 Builder.SetInsertPoint(GlobalBB);
19083
19084 // Continue using a flat instruction if we only emitted the check for private.
19085 Instruction *LoadedGlobal = AI;
19086 if (FullFlatEmulation) {
19087 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19088 Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
19089 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
19090 }
19091
19092 AI->removeFromParent();
19093 AI->insertInto(GlobalBB, GlobalBB->end());
19094
19095 // The new atomicrmw may go through another round of legalization later.
19096 if (!FullFlatEmulation) {
19097 // We inserted the runtime check already, make sure we do not try to
19098 // re-expand this.
19099 // TODO: Should union with any existing metadata.
19100 MDBuilder MDB(F->getContext());
19101 MDNode *RangeNotPrivate =
19102 MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
19103 APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
19104 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19105 RangeNotPrivate);
19106 }
19107
19108 Builder.CreateBr(PhiBB);
19109
19110 Builder.SetInsertPoint(PhiBB);
19111
19112 if (ReturnValueIsUsed) {
19113 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
19114 AI->replaceAllUsesWith(Loaded);
19115 if (FullFlatEmulation)
19116 Loaded->addIncoming(LoadedShared, SharedBB);
19117 Loaded->addIncoming(LoadedPrivate, PrivateBB);
19118 Loaded->addIncoming(LoadedGlobal, GlobalBB);
19119 Loaded->takeName(AI);
19120 }
19121
19122 Builder.CreateBr(ExitBB);
19123}
19124
19125static void convertScratchAtomicToFlatAtomic(Instruction *I,
19126 unsigned PtrOpIdx) {
19127 Value *PtrOp = I->getOperand(PtrOpIdx);
19130
19131 Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
19132 Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
19133 I->getIterator());
19134 I->setOperand(PtrOpIdx, ASCast);
19135}
19136
19137void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
19138 AtomicRMWInst::BinOp Op = AI->getOperation();
19139
19142
19145 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
19146 ConstVal && ConstVal->isNullValue()) {
19147 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
19148 AI->setOperation(AtomicRMWInst::Add);
19149
19150 // We may still need the private-alias-flat handling below.
19151
19152 // TODO: Skip this for cases where we cannot access remote memory.
19153 }
19154 }
19155
19156 // The non-flat expansions should only perform the de-canonicalization of
19157 // identity values.
19158 if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
19159 return;
19160
19161 emitExpandAtomicAddrSpacePredicate(AI);
19162}
19163
19170
19171void SITargetLowering::emitExpandAtomicLoad(LoadInst *LI) const {
19172 if (LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19173 return convertScratchAtomicToFlatAtomic(LI, LI->getPointerOperandIndex());
19174
19175 llvm_unreachable(
19176 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19177}
19178
19179void SITargetLowering::emitExpandAtomicStore(StoreInst *SI) const {
19180 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19181 return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
19182
19184 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19185}
19186
19187LoadInst *
19188SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
19189 IRBuilder<> Builder(AI);
19190 auto Order = AI->getOrdering();
19191
19192 // The optimization removes the store aspect of the atomicrmw. Therefore, the
19193 // cache must be flushed if the atomic ordering had release semantics. This is
19194 // not necessarily a fence; a release fence just happens to do that flush as
19195 // well. Avoid replacing an atomicrmw that has release semantics.
19196 if (isReleaseOrStronger(Order))
19197 return nullptr;
19198
19199 LoadInst *LI = Builder.CreateAlignedLoad(
19200 AI->getType(), AI->getPointerOperand(), AI->getAlign());
19201 LI->setAtomic(Order, AI->getSyncScopeID());
19202 LI->copyMetadata(*AI);
19203 LI->takeName(AI);
19204 AI->replaceAllUsesWith(LI);
19205 AI->eraseFromParent();
19206 return LI;
19207}
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
The Input class is used to parse a yaml document into in-memory structs and vectors.
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1120
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:6057
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1497
bool isNegative() const
Definition APFloat.h:1449
bool isNormal() const
Definition APFloat.h:1453
APInt bitcastToAPInt() const
Definition APFloat.h:1353
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1138
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1098
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1079
bool isInfinity() const
Definition APFloat.h:1446
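A minimal standalone sketch of the APFloat helpers listed above (semantics factories, bit-casting, and cross-semantics conversion); the specific constants are arbitrary examples and the header paths assume a recent LLVM tree:

#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
using namespace llvm;

int main() {
  // +inf in IEEE single precision; bitcastToAPInt() yields 0x7F800000.
  APFloat Inf = APFloat::getInf(APFloat::IEEEsingle(), /*Negative=*/false);
  bool IsInf = Inf.isInfinity();
  APInt Bits = Inf.bitcastToAPInt();

  // Widen an f16 quiet NaN to f32; LosesInfo reports whether precision was lost.
  APFloat Q = APFloat::getQNaN(APFloat::IEEEhalf());
  bool LosesInfo = false;
  APFloat::opStatus St =
      Q.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &LosesInfo);

  (void)IsInf; (void)Bits; (void)St; (void)LosesInfo;
  return 0;
}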
Class for arbitrary precision integers.
Definition APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1385
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:466
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
Definition APInt.h:366
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1237
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1221
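As a quick illustration of the APInt entries above (bit-block factories, bit counting, and the signed/unsigned comparison split), a small self-contained sketch with arbitrary widths and values:

#include "llvm/ADT/APInt.h"
using namespace llvm;

int main() {
  // 0x80000000: only the sign bit of a 32-bit value is set.
  APInt A = APInt::getHighBitsSet(/*numBits=*/32, /*hiBitsSet=*/1);
  bool IsSignMask = A.isSignMask();   // true
  unsigned TZ = A.countr_zero();      // 31 trailing zero bits

  // Set bits [16, 32) in place, giving 0xFFFF0000.
  APInt B(32, 0);
  B.setBitsFrom(16);

  // Signed and unsigned comparisons diverge once the sign bit is involved.
  APInt One(32, 1);
  bool SGE = A.sge(One);              // false: 0x80000000 is INT32_MIN as signed
  bool UGE = A.uge(One);              // true:  0x80000000 > 1 as unsigned

  (void)IsSignMask; (void)TZ; (void)B; (void)SGE; (void)UGE;
  return 0;
}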
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition Function.cpp:339
const Function * getParent() const
Definition Argument.h:44
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents a cmpxchg atomic operation, false otherwise.
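The AtomicRMWInst and BinOp entries above describe the read-modify-write operations this lowering has to legalize or expand. A hypothetical sketch of building one such instruction through IRBuilder (an atomicrmw add with monotonic ordering); the function name, module name, and ordering are illustrative choices, not taken from this file:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("atomicrmw_sketch", Ctx);

  // define void @inc(ptr %p) { atomicrmw add ptr %p, i32 1 monotonic; ret void }
  FunctionType *FTy = FunctionType::get(
      Type::getVoidTy(Ctx), {PointerType::get(Type::getInt32Ty(Ctx), 0)},
      /*isVarArg=*/false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage, "inc", M);
  BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", F);
  IRBuilder<> B(Entry);

  B.CreateAtomicRMW(AtomicRMWInst::Add, F->getArg(0), B.getInt32(1),
                    MaybeAlign(4), AtomicOrdering::Monotonic);
  B.CreateRetVoid();
  return 0;
}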
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:472
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
BitVector & set()
Definition BitVector.h:351
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents the known origin of an individual byte in a combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ ICMP_NE
not equal
Definition InstrTypes.h:700
bool isSigned() const
Definition InstrTypes.h:932
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:772
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:778
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition Constants.h:214
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isBigEndian() const
Definition DataLayout.h:199
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
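A brief sketch of the DataLayout queries referenced above (ABI alignment, allocation size, endianness); the layout string is a deliberately minimal example rather than any real target's layout:

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  DataLayout DL("e-p:64:64-i64:64-v128:128"); // little-endian, 64-bit pointers

  Type *V4I32 = FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
  Align ABIAlign = DL.getABITypeAlign(V4I32);      // 16 bytes, from v128:128
  TypeSize AllocSize = DL.getTypeAllocSize(V4I32); // 16 bytes
  bool BigEndian = DL.isBigEndian();               // false: 'e' means little-endian

  (void)ABIAlign; (void)AllocSize; (void)BigEndian;
  return 0;
}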
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowering::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
iterator_range< arg_iterator > args()
Definition Function.h:890
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition Function.cpp:803
Argument * getArg(unsigned i) const
Definition Function.h:884
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMadF16() const
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasPrivateSegmentBuffer() const
const MachineFunction & getMachineFunction() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2780
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:33
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96
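As a quick illustration of MDBuilder::createRange above, a minimal sketch that builds a !range node for the half-open interval [0, 1024); the bounds are arbitrary, and attaching the node to a load is shown only as a comment since it needs an instruction to hang it on:

#include "llvm/ADT/APInt.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  MDBuilder MDB(Ctx);

  // Metadata describing the range [0, 1024) for a 32-bit value.
  MDNode *Range = MDB.createRange(APInt(32, 0), APInt(32, 1024));

  // A pass would typically attach it to a load with:
  //   LI->setMetadata(LLVMContext::MD_range, Range);
  (void)Range;
  return 0;
}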
Metadata node.
Definition Metadata.h:1077
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1445
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
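A short sketch of the MVT queries listed above (vector construction, element type, and size comparisons); the values are arbitrary and the header path assumes a recent tree where MVT lives under CodeGenTypes:

#include "llvm/CodeGenTypes/MachineValueType.h"
#include <cstdint>
using namespace llvm;

int main() {
  MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);
  MVT I64 = MVT::getIntegerVT(64);

  bool IsVec = V4I32.isVector();                               // true
  unsigned NumElts = V4I32.getVectorNumElements();             // 4
  MVT Elt = V4I32.getScalarType();                             // MVT::i32
  uint64_t Bits = V4I32.getSizeInBits().getFixedValue();       // 128
  uint64_t StoreBytes = V4I32.getStoreSize().getFixedValue();  // 16
  bool Fits = I64.bitsLE(V4I32);                               // true: 64 <= 128 bits

  (void)IsVec; (void)NumElts; (void)Elt; (void)Bits; (void)StoreBytes; (void)Fits;
  return 0;
}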
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
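The MachineInstrBuilder methods above are normally driven through BuildMI. A hypothetical helper sketch (the helper name, opcode parameter, and operand shape are illustrative, not taken from this file) showing the usual addReg/addImm/cloneMemRefs chaining:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;

// Emit `DstReg = <Opcode> SrcReg, 0` in front of MI and copy MI's memory
// operands onto the new instruction.
static MachineInstr *emitWithZeroImm(MachineBasicBlock &MBB, MachineInstr &MI,
                                     const TargetInstrInfo &TII,
                                     unsigned Opcode, Register DstReg,
                                     Register SrcReg) {
  return BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(Opcode), DstReg)
      .addReg(SrcReg)
      .addImm(0)
      .cloneMemRefs(MI);
}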
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition ModRef.h:221
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:215
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition ModRef.h:218
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition Register.h:67
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:78
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node can be combined to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0-terminated array of rounding control registers that can be attached to strict FP calls.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store in a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load in a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns whether it is reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
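A minimal usage sketch (not code from this file), assuming the caller supplies the DAG, a debug location, and a per-element callback that preserves the element type: extract every lane, transform it, and rebuild the vector with getBuildVector.

#include "llvm/ADT/STLFunctionalExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/SelectionDAG.h"

using namespace llvm;

// Scalarize Vec, apply TransformElt to each lane, and reassemble the result.
// TransformElt is assumed to return a value of the original element type.
static SDValue scalarizePerElement(
    SelectionDAG &DAG, const SDLoc &DL, SDValue Vec,
    function_ref<SDValue(SelectionDAG &, const SDLoc &, SDValue)> TransformElt) {
  SmallVector<SDValue, 8> Elts;
  DAG.ExtractVectorElements(Vec, Elts); // Start=0, Count=0 extracts all lanes.
  for (SDValue &Elt : Elts)
    Elt = TransformElt(DAG, DL, Elt);
  return DAG.getBuildVector(Vec.getValueType(), DL, Elts);
}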
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
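A hedged sketch of getSetCC together with getConstant: build the predicate "Val == 0". Real lowering code would normally ask TargetLowering::getSetCCResultType for the result type; MVT::i1 is hard-coded here only to keep the example self-contained.

#include "llvm/CodeGen/SelectionDAG.h"

using namespace llvm;

static SDValue emitIsZero(SelectionDAG &DAG, const SDLoc &DL, SDValue Val) {
  EVT VT = Val.getValueType();
  SDValue Zero = DAG.getConstant(0, DL, VT); // integer zero of matching type
  return DAG.getSetCC(DL, MVT::i1, Val, Zero, ISD::SETEQ);
}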
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
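A small sketch of the equivalence the line above states: getNOT is shorthand for XOR with an all-ones constant, so both calls below should yield the same node after the DAG's automatic CSE.

#include "llvm/CodeGen/SelectionDAG.h"
#include <cassert>

using namespace llvm;

static SDValue buildNotTwoWays(SelectionDAG &DAG, const SDLoc &DL, SDValue Val) {
  EVT VT = Val.getValueType();
  SDValue A = DAG.getNOT(DL, Val, VT);
  SDValue B = DAG.getNode(ISD::XOR, DL, VT, Val,
                          DAG.getAllOnesConstant(DL, VT));
  assert(A == B && "expected the two forms to CSE to one node");
  return A;
}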
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
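A brief sketch, assuming a caller that needs to address the second half of a split access: advance Base by a fixed byte count using TypeSize::getFixed, which is also listed further down on this page.

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Support/TypeSize.h"

using namespace llvm;

// Return Base + Bytes as pointer arithmetic with default node flags.
static SDValue advancePtr(SelectionDAG &DAG, const SDLoc &DL, SDValue Base,
                          uint64_t Bytes) {
  return DAG.getMemBasePlusOffset(Base, TypeSize::getFixed(Bytes), DL);
}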
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, which starts a new call frame in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
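A minimal sketch: materialize a vector whose lanes all hold the same immediate, e.g. a v4i32 of 42s. getConstant produces the scalar and getSplatBuildVector replicates it into every BUILD_VECTOR operand.

#include "llvm/CodeGen/SelectionDAG.h"

using namespace llvm;

static SDValue splatImmediate(SelectionDAG &DAG, const SDLoc &DL, EVT VecVT,
                              uint64_t Imm) {
  SDValue Scalar = DAG.getConstant(Imm, DL, VecVT.getVectorElementType());
  return DAG.getSplatBuildVector(VecVT, DL, Scalar);
}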
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
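A hedged sketch showing why the ext-or-trunc helpers are convenient: normalize an index of arbitrary integer width to the pointer-sized type of an (assumed) address space AS without branching on which width is larger.

#include "llvm/CodeGen/SelectionDAG.h"

using namespace llvm;

static SDValue indexToPtrWidth(SelectionDAG &DAG, const SDLoc &DL, SDValue Idx,
                               unsigned AS) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout(), AS);
  return DAG.getZExtOrTrunc(Idx, DL, PtrVT); // zext, trunc, or no-op as needed
}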
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
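A small sketch, assuming an i64 input: SplitScalar emits the two EXTRACT_ELEMENT nodes that select the low and high 32-bit halves.

#include "llvm/CodeGen/SelectionDAG.h"
#include <cassert>
#include <utility>

using namespace llvm;

static std::pair<SDValue, SDValue> splitIntoHalves(SelectionDAG &DAG,
                                                   const SDLoc &DL,
                                                   SDValue Val64) {
  assert(Val64.getValueType() == MVT::i64 && "sketch assumes an i64 value");
  return DAG.SplitScalar(Val64, DL, MVT::i32, MVT::i32); // {lo, hi}
}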
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:151
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:154
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:420
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition Type.h:258
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:107
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI void set(Value *Val)
Definition Value.h:905
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
const Use & getOperandUse(unsigned i) const
Definition User.h:245
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1101
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:396
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isZero() const
Definition TypeSize.h:154
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:134
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the inline values intended for floating-point operands.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ GlobalAddress
Definition ISDOpcodes.h:88
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:656
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:773
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:528
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:535
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined; 0 Round to 0; 1 Round to nearest, ties to even; 2 Round to ...
Definition ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
Offsets
Offsets in bytes from the start of the input buffer.
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
Definition MathExtras.h:54
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:311
@ Offset
Definition DWP.cpp:477
Value
Definition InstrProf.h:137
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition Analysis.cpp:241
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1714
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:834
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
@ Done
Definition Threading.h:60
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition MathExtras.h:232
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:289
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2125
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is congruent to Skew modulo Align.
Definition MathExtras.h:557
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
Definition ModRef.h:296
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:396
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition bit.h:186
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:282
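A standalone sketch of how the bit utilities listed here compose to decode a contiguous bit field: isShiftedMask_64 validates the shape, countr_zero yields the field offset, and popcount yields its width. For Mask = 0x000FF000 this prints "offset 12, width 8".

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>
#include <cstdio>

int main() {
  uint64_t Mask = 0x000FF000ull;
  if (llvm::isShiftedMask_64(Mask)) // a run of ones, everything else zero
    std::printf("offset %d, width %d\n", llvm::countr_zero(Mask),
                llvm::popcount(Mask));
  return 0;
}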
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1721
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant bit, stopping at the first 1.
Definition bit.h:222
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition Analysis.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164
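A standalone sketch: split a 64-bit immediate into the 32-bit halves a pair of 32-bit moves would need, then check that they recombine to the original value.

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

int main() {
  uint64_t Imm = 0x123456789ABCDEF0ull;
  uint32_t Hi = llvm::Hi_32(Imm); // 0x12345678
  uint32_t Lo = llvm::Lo_32(Imm); // 0x9ABCDEF0
  assert(((uint64_t)Hi << 32 | Lo) == Imm && "halves must recombine");
  return 0;
}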
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
Data
Definition InstrProf.h:189
unsigned getUndefRegState(bool B)
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:155
Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition MathExtras.h:241
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT, returns its...
Definition Utils.cpp:433
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1747
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1886
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:212
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:86
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853
#define N
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:304
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
Definition APFloat.cpp:264
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:85
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition SCCPSolver.h:42
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:113
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
bool isPow2VectorType() const
Returns true if the given vector type's number of elements is a power of 2.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
unsigned getOrigArgIndex() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:165
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:218
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:173
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:340
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:241
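A standalone sketch of the KnownBits entries above: model two 8-bit values known to fit in their low four bits, combine them with the static KnownBits::add, and query countMinLeadingZeros on the result. Since each operand is at most 15, the sum is at most 30, so at least three leading zero bits remain known.

#include "llvm/Support/KnownBits.h"
#include <cassert>

int main() {
  llvm::KnownBits LHS(8), RHS(8);
  LHS.Zero.setHighBits(4); // high nibble known to be zero
  RHS.Zero.setHighBits(4);
  llvm::KnownBits Sum = llvm::KnownBits::add(LHS, RHS);
  assert(Sum.countMinLeadingZeros() >= 3 && "sum fits in five bits");
  return 0;
}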
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs