1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/IRBuilder.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
42#include "llvm/IR/MDBuilder.h"
45#include "llvm/Support/ModRef.h"
47#include <optional>
48
49using namespace llvm;
50using namespace llvm::SDPatternMatch;
51
52#define DEBUG_TYPE "si-lower"
53
54STATISTIC(NumTailCalls, "Number of tail calls");
55
56static cl::opt<bool>
57 DisableLoopAlignment("amdgpu-disable-loop-alignment",
58 cl::desc("Do not align and prefetch loops"),
59 cl::init(false));
60
62 "amdgpu-use-divergent-register-indexing", cl::Hidden,
63 cl::desc("Use indirect register addressing for divergent indexes"),
64 cl::init(false));
65
66// TODO: This option should be removed once we switch to always using PTRADD in
67// the SelectionDAG.
69 "amdgpu-use-sdag-ptradd", cl::Hidden,
70 cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the "
71 "SelectionDAG ISel"),
72 cl::init(false));
73
74static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
75 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
76 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
77}
78
79static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
80 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
81 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
82}
83
84static unsigned findFirstFreeSGPR(CCState &CCInfo) {
85 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
86 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
87 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
88 return AMDGPU::SGPR0 + Reg;
89 }
90 }
91 llvm_unreachable("Cannot allocate sgpr");
92}
93
94SITargetLowering::SITargetLowering(const TargetMachine &TM,
95 const GCNSubtarget &STI)
96 : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
97 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
98 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
99
100 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
101 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
102
103 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
104
105 const SIRegisterInfo *TRI = STI.getRegisterInfo();
106 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
107
108 addRegisterClass(MVT::f64, V64RegClass);
109 addRegisterClass(MVT::v2f32, V64RegClass);
110 addRegisterClass(MVT::Untyped, V64RegClass);
111
112 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
113 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
114
115 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
116 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
117
118 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
119 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
120
121 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
122 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
123
124 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
125 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
126
127 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
128 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
129
130 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
131 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
132
133 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
134 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
135
136 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
137 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
138
139 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
140 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
141
142 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
143 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
144
145 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
146 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
147
148 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
149 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
150
151 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
152 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
153
154 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
155 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
156
157 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
158 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
159
160 if (Subtarget->has16BitInsts()) {
161 if (Subtarget->useRealTrue16Insts()) {
162 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
163 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
164 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
165 } else {
166 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
167 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
168 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
169 }
170
171 // Unless there are also VOP3P operations, no operations are really legal.
172 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
173 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
174 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
175 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
176 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
177 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
178 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
179 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
180 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
181 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
182 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
183 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
184 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
185 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
186 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
187 }
188
189 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
190 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
191
192 computeRegisterProperties(Subtarget->getRegisterInfo());
193
194 // The boolean content concept here is too inflexible. Compares only ever
195 // really produce a 1-bit result. Any copy/extend from these will turn into a
196 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
197 // it's what most targets use.
198 setBooleanContents(ZeroOrOneBooleanContent);
199 setBooleanVectorContents(ZeroOrOneBooleanContent);
200
201 // We need to custom lower vector stores from local memory
202 setOperationAction(ISD::LOAD,
203 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
204 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
205 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
206 MVT::i1, MVT::v32i32},
207 Custom);
208
209 setOperationAction(ISD::STORE,
210 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
211 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
212 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
213 MVT::i1, MVT::v32i32},
214 Custom);
215
216 if (isTypeLegal(MVT::bf16)) {
217 for (unsigned Opc :
218 {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV,
219 ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
220 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
221 ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
222 ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
223 ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
224 ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
225 ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
226 ISD::SETCC}) {
227 // FIXME: The promoted to type shouldn't need to be explicit
228 setOperationAction(Opc, MVT::bf16, Promote);
229 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
230 }
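// Illustrative sketch, not from this file: with the Promote action and
// AddPromotedToType above, a bf16 arithmetic node is legalized by
// round-tripping through f32, conceptually:
//   fadd bf16 %a, %b
//   ==> fptrunc (fadd (fpext %a to f32), (fpext %b to f32)) to bf16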
231
233
234 setOperationAction(ISD::SELECT, MVT::bf16, Promote);
235 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
236
237 setOperationAction(ISD::FABS, MVT::bf16, Legal);
238 setOperationAction(ISD::FNEG, MVT::bf16, Legal);
240
241 // We only need to custom lower because we can't specify an action for bf16
242 // sources.
245 }
246
247 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
248 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
249 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
250 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
251 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
252 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
253 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
254 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
255 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
256 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
257 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
258 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
259 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
260 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
261 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
262 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
263
264 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
265 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
266 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
267 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
268 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
269 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
270 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
271
272 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
273
276 setOperationAction(ISD::SELECT, MVT::f64, Promote);
277 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
278
279 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
280
281 setOperationAction(ISD::SELECT_CC,
282 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
283
284 setOperationAction(ISD::SETCC, MVT::i1, Promote);
285 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
286 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
287
289 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
290 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
291 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
292 Expand);
294 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
295 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
296 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
297 Expand);
298
300 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
301 MVT::v3i16, MVT::v4i16, MVT::Other},
302 Custom);
303
304 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
305 setOperationAction(ISD::BR_CC,
306 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
307
309
311
313 Expand);
314
315#if 0
317#endif
318
319 // We only support LOAD/STORE and vector manipulation ops for vectors
320 // with > 4 elements.
321 for (MVT VT :
322 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
323 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
324 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
325 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
326 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
327 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
328 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
329 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
330 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
331 switch (Op) {
332 case ISD::LOAD:
333 case ISD::STORE:
335 case ISD::BITCAST:
336 case ISD::UNDEF:
340 case ISD::IS_FPCLASS:
341 break;
346 break;
347 default:
349 break;
350 }
351 }
352 }
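// Illustrative sketch, not from this file: the default case of the switch
// above (elided in this listing) presumably marks every opcode not listed as
// Expand for these wide vector types, so e.g. an integer add on v8i32 is
// unrolled into per-element operations, while loads, stores, bitcasts and the
// insert/extract/shuffle family keep their custom or legal handling.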
353
354 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
355
356 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
357 // is expanded to avoid having two separate loops in case the index is a VGPR.
358
359 // Most operations are naturally 32-bit vector operations. We only support
360 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
361 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
363 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
364
366 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
367
369 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
370
372 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
373 }
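// Illustrative sketch, not from this file: after the promotions above, a
// v2i64 build_vector is handled as if it were v4i32, roughly:
//   (v2i64 build_vector %x, %y)
//   ==> bitcast to v2i64 of (v4i32 build_vector %x.lo, %x.hi, %y.lo, %y.hi)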
374
375 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
377 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
378
380 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
381
383 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
384
386 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
387 }
388
389 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
391 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
392
394 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
395
397 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
398
400 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
401 }
402
403 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
405 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
406
408 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
409
411 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
412
414 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
415 }
416
417 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
419 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
420
422 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
423
425 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
426
428 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
429 }
430
432 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
433 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
434 Custom);
435
436 if (Subtarget->hasPkMovB32()) {
437 // TODO: 16-bit element vectors should be legal with even aligned elements.
438 // TODO: Can be legal with wider source types than the result with
439 // subregister extracts.
440 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
441 }
442
443 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
444 Custom);
445
446 // Avoid stack access for these.
447 // TODO: Generalize to more vector types.
449 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
450 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
451 Custom);
452
453 // Deal with vec3 vector operations when widened to vec4.
455 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
456
457 // Deal with vec5/6/7 vector operations when widened to vec8.
459 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
460 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
461 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
462 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
463 Custom);
464
465 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
466 // and output demarshalling
467 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
468
469 // We can't return success/failure, only the old value,
470 // let LLVM add the comparison
471 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64},
472 Expand);
473
474 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
475
476 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
477
478 // FIXME: This should be narrowed to i32, but that only happens if i64 is
479 // illegal.
480 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
481 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
482
483 // On SI this is s_memtime and s_memrealtime on VI.
484 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
485
486 if (Subtarget->hasSMemRealTime() ||
487 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
488 setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
489 setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom);
490
491 if (Subtarget->has16BitInsts()) {
492 setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
493 setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
494 } else {
495 setOperationAction(ISD::FSQRT, MVT::f16, Custom);
496 }
497
498 if (Subtarget->hasMadMacF32Insts())
500
501 if (!Subtarget->hasBFI())
502 // fcopysign can be done in a single instruction with BFI.
503 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
504
505 if (!Subtarget->hasBCNT(32))
507
508 if (!Subtarget->hasBCNT(64))
510
511 if (Subtarget->hasFFBH())
513
514 if (Subtarget->hasFFBL())
516
517 // We only really have 32-bit BFE instructions (and 16-bit on VI).
518 //
519 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
520 // effort to match them now. We want this to be false for i64 cases when the
521 // extraction isn't restricted to the upper or lower half. Ideally we would
522 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
523 // span the midpoint are probably relatively rare, so don't worry about them
524 // for now.
525 if (Subtarget->hasBFE())
527
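// Illustrative sketch, not from this file: the 32-bit BFE patterns referred to
// above cover shift-and-mask sequences such as
//   (x >> 8) & 0xff
// which can be selected as a single s_bfe_u32 / v_bfe_u32 with offset 8 and
// width 8; 64-bit extracts that span the 32-bit midpoint are left to generic
// expansion, as the comment explains.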
528 // Clamp modifier on add/sub
529 if (Subtarget->hasIntClamp())
531
532 if (Subtarget->hasAddNoCarry())
533 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
534 Legal);
535
537 {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
538 {MVT::f32, MVT::f64}, Custom);
539
540 // These are really only legal for ieee_mode functions. We should be avoiding
541 // them for functions that don't have ieee_mode enabled, so just say they are
542 // legal.
543 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
544 {MVT::f32, MVT::f64}, Legal);
545
546 if (Subtarget->haveRoundOpsF64())
547 setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64,
548 Legal);
549 else
550 setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
551 MVT::f64, Custom);
552
553 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
554 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
555 Legal);
556 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
557
558 setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
560
561 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
562 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
563
564 // Custom lower these because we can't specify a rule based on an illegal
565 // source bf16.
566 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom);
567 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom);
568
569 if (Subtarget->has16BitInsts()) {
572 MVT::i16, Legal);
573
574 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
575
577 MVT::i16, Expand);
578
582 ISD::CTPOP},
583 MVT::i16, Promote);
584
585 setOperationAction(ISD::LOAD, MVT::i16, Custom);
586
587 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
588
589 setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
590 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
591 setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
592 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
593
597
599
600 // F16 - Constant Actions.
603
604 // F16 - Load/Store Actions.
605 setOperationAction(ISD::LOAD, MVT::f16, Promote);
606 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
607 setOperationAction(ISD::STORE, MVT::f16, Promote);
608 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
609
610 // BF16 - Load/Store Actions.
611 setOperationAction(ISD::LOAD, MVT::bf16, Promote);
612 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
613 setOperationAction(ISD::STORE, MVT::bf16, Promote);
614 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
615
616 // F16 - VOP1 Actions.
618 ISD::FSIN, ISD::FROUND},
619 MVT::f16, Custom);
620
621 // BF16 - VOP1 Actions.
622 if (Subtarget->hasBF16TransInsts())
623 setOperationAction({ISD::FCOS, ISD::FSIN, ISD::FDIV}, MVT::bf16, Custom);
624
627
628 // F16 - VOP2 Actions.
629 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
630 Expand);
631 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);
632 setOperationAction(ISD::FFREXP, MVT::f16, Custom);
634
635 // F16 - VOP3 Actions.
637 if (STI.hasMadF16())
639
640 for (MVT VT :
641 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
642 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
643 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
644 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
645 switch (Op) {
646 case ISD::LOAD:
647 case ISD::STORE:
649 case ISD::BITCAST:
650 case ISD::UNDEF:
655 case ISD::IS_FPCLASS:
656 break;
660 break;
661 default:
663 break;
664 }
665 }
666 }
667
668 // v_perm_b32 can handle either of these.
669 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
671
672 // XXX - Do these do anything? Vector constants turn into build_vector.
673 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
674
675 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
676 Legal);
677
678 setOperationAction(ISD::STORE, MVT::v2i16, Promote);
679 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
680 setOperationAction(ISD::STORE, MVT::v2f16, Promote);
681 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
682
683 setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
684 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
685 setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
686 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
687
688 setOperationAction(ISD::AND, MVT::v2i16, Promote);
689 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
690 setOperationAction(ISD::OR, MVT::v2i16, Promote);
691 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
692 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
693 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
694
695 setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
696 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
697 setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
698 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
699 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
700 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
701
702 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
703 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
704 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
705 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
706 setOperationAction(ISD::STORE, MVT::v4bf16, Promote);
707 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
708
709 setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
710 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
711 setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
712 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
713 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
714 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
715
716 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
717 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
718 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
719 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
720
721 setOperationAction(ISD::STORE, MVT::v8i16, Promote);
722 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
723 setOperationAction(ISD::STORE, MVT::v8f16, Promote);
724 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
725 setOperationAction(ISD::STORE, MVT::v8bf16, Promote);
726 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
727
728 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
729 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
730 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
731 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
732 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
733 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
734
735 setOperationAction(ISD::STORE, MVT::v16i16, Promote);
736 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
737 setOperationAction(ISD::STORE, MVT::v16f16, Promote);
738 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
739 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
740 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
741
742 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
743 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
744 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
745 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
746 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
747 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
748
749 setOperationAction(ISD::STORE, MVT::v32i16, Promote);
750 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
751 setOperationAction(ISD::STORE, MVT::v32f16, Promote);
752 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
753 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
754 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
755
757 MVT::v2i32, Expand);
758 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
759
761 MVT::v4i32, Expand);
762
764 MVT::v8i32, Expand);
765
766 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
767 Subtarget->hasVOP3PInsts() ? Legal : Custom);
768
769 setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
770 // This isn't really legal, but this avoids the legalizer unrolling it (and
771 // allows matching fneg (fabs x) patterns)
772 setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);
773
774 // Can do this in one BFI plus a constant materialize.
776 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
777 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
778 MVT::v32f16, MVT::v32bf16},
779 Custom);
780
782 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
783 MVT::f16, Custom);
784 setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
785
786 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
787 ISD::FMAXIMUMNUM},
788 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
789 Custom);
790
791 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
792 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
793 Expand);
794
795 for (MVT Vec16 :
796 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
797 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
800 Vec16, Custom);
802 }
803 }
804
805 if (Subtarget->hasVOP3PInsts()) {
809 MVT::v2i16, Legal);
810
811 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
812 ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
813 MVT::v2f16, Legal);
814
816 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
817
819 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
820 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
821 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
822 Custom);
823
824 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
825 // Split vector operations.
830 VT, Custom);
831
832 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
833 // Split vector operations.
835 VT, Custom);
836
838 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
839 {MVT::v2f16, MVT::v4f16}, Custom);
840
841 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
842 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
843 Custom);
844
845 if (Subtarget->hasPackedFP32Ops()) {
847 MVT::v2f32, Legal);
849 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
850 Custom);
851 }
852 }
853
854 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);
855
856 if (Subtarget->has16BitInsts()) {
858 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
860 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
861 } else {
862 // Legalization hack.
863 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
864
865 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
866 }
867
869 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
870 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
871 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
872 MVT::v32f16, MVT::v32bf16},
873 Custom);
874
876
877 if (Subtarget->hasVectorMulU64())
879 else if (Subtarget->hasScalarSMulU64())
881
882 if (Subtarget->hasMad64_32())
884
885 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
886 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
887
888 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
889 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
890 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
891 } else {
892 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
893 if (Subtarget->hasMinimum3Maximum3F32())
894 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);
895
896 if (Subtarget->hasMinimum3Maximum3PKF16()) {
897 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal);
898
899 // If only the vector form is available, we need to widen to a vector.
900 if (!Subtarget->hasMinimum3Maximum3F16())
901 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16, Custom);
902 }
903 }
904
905 if (Subtarget->hasVOP3PInsts()) {
906 // We want to break these into v2f16 pieces, not scalarize.
907 setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
908 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
909 Custom);
910 }
911
912 if (Subtarget->hasIntMinMax64())
914 Legal);
915
917 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
918 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
919 MVT::i8},
920 Custom);
921
923 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
924 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
925 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
926 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
927 Custom);
928
930 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
931 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
932 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
933 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
934 Custom);
935
936 setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
938 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
939 setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
940 setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);
941
942 // TODO: Could move this to custom lowering, could benefit from combines on
943 // extract of relevant bits.
944 setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
945
947
948 if (Subtarget->hasBF16ConversionInsts()) {
949 setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
951 }
952
953 if (Subtarget->hasBF16PackedInsts()) {
955 {ISD::FADD, ISD::FMUL, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMA},
956 MVT::v2bf16, Legal);
957 }
958
959 if (Subtarget->hasBF16TransInsts()) {
960 setOperationAction({ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, MVT::bf16, Legal);
961 }
962
963 if (Subtarget->hasCvtPkF16F32Inst()) {
965 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
966 Custom);
967 }
968
970 ISD::PTRADD,
972 ISD::SUB,
974 ISD::MUL,
975 ISD::FADD,
976 ISD::FSUB,
977 ISD::FDIV,
978 ISD::FMUL,
979 ISD::FMINNUM,
980 ISD::FMAXNUM,
981 ISD::FMINNUM_IEEE,
982 ISD::FMAXNUM_IEEE,
983 ISD::FMINIMUM,
984 ISD::FMAXIMUM,
985 ISD::FMINIMUMNUM,
986 ISD::FMAXIMUMNUM,
987 ISD::FMA,
988 ISD::SMIN,
989 ISD::SMAX,
990 ISD::UMIN,
991 ISD::UMAX,
994 ISD::SMIN,
995 ISD::SMAX,
996 ISD::UMIN,
997 ISD::UMAX,
998 ISD::AND,
999 ISD::OR,
1000 ISD::XOR,
1001 ISD::SHL,
1002 ISD::SRL,
1003 ISD::SRA,
1004 ISD::FSHR,
1014
1015 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1017
1018 // All memory operations. Some folding on the pointer operand is done to help
1019 // matching the constant offsets in the addressing modes.
1020 setTargetDAGCombine({ISD::LOAD,
1021 ISD::STORE,
1022 ISD::ATOMIC_LOAD,
1023 ISD::ATOMIC_STORE,
1024 ISD::ATOMIC_CMP_SWAP,
1025 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
1026 ISD::ATOMIC_SWAP,
1027 ISD::ATOMIC_LOAD_ADD,
1028 ISD::ATOMIC_LOAD_SUB,
1029 ISD::ATOMIC_LOAD_AND,
1030 ISD::ATOMIC_LOAD_OR,
1031 ISD::ATOMIC_LOAD_XOR,
1032 ISD::ATOMIC_LOAD_NAND,
1033 ISD::ATOMIC_LOAD_MIN,
1034 ISD::ATOMIC_LOAD_MAX,
1035 ISD::ATOMIC_LOAD_UMIN,
1036 ISD::ATOMIC_LOAD_UMAX,
1037 ISD::ATOMIC_LOAD_FADD,
1038 ISD::ATOMIC_LOAD_FMIN,
1039 ISD::ATOMIC_LOAD_FMAX,
1040 ISD::ATOMIC_LOAD_UINC_WRAP,
1041 ISD::ATOMIC_LOAD_UDEC_WRAP,
1044
1045 // FIXME: In other contexts we pretend this is a per-function property.
1047
1049}
1050
1051const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1052
1054 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1055 return RCRegs;
1056}
1057
1058//===----------------------------------------------------------------------===//
1059// TargetLowering queries
1060//===----------------------------------------------------------------------===//
1061
1062// v_mad_mix* support a conversion from f16 to f32.
1063//
1064// There is only one special case, when denormals are enabled, that we don't
1065// currently handle, where this is OK to use.
1066bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1067 EVT DestVT, EVT SrcVT) const {
1068 return DestVT.getScalarType() == MVT::f32 &&
1069 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1070 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1071 SrcVT.getScalarType() == MVT::f16) ||
1072 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1073 SrcVT.getScalarType() == MVT::bf16)) &&
1074 // TODO: This probably only requires no input flushing?
1075 denormalModeIsFlushAllF32(DAG.getMachineFunction());
1076}
1077
1079 LLT DestTy, LLT SrcTy) const {
1080 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1081 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1082 DestTy.getScalarSizeInBits() == 32 &&
1083 SrcTy.getScalarSizeInBits() == 16 &&
1084 // TODO: This probably only requires no input flushing?
1085 denormalModeIsFlushAllF32(*MI.getMF());
1086}
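// Illustrative sketch, not from this file: these queries let the combiner fold
// the extends of a mixed-precision FMA into one operation, e.g.
//   fma (fpext f16 %a), (fpext f16 %b), f32 %c
// can become a single v_fma_mix_f32 when f32 denormals are flushed, instead of
// two conversions plus an FMA.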
1087
1089 // SI has some legal vector types, but no legal vector operations. Say no
1090 // shuffles are legal in order to prefer scalarizing some vector operations.
1091 return false;
1092}
1093
1095 CallingConv::ID CC,
1096 EVT VT) const {
1098 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1099
1100 if (VT.isVector()) {
1101 EVT ScalarVT = VT.getScalarType();
1102 unsigned Size = ScalarVT.getSizeInBits();
1103 if (Size == 16) {
1104 if (Subtarget->has16BitInsts()) {
1105 if (VT.isInteger())
1106 return MVT::v2i16;
1107 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1108 }
1109 return VT.isInteger() ? MVT::i32 : MVT::f32;
1110 }
1111
1112 if (Size < 16)
1113 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1114 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1115 }
1116
1117 if (VT.getSizeInBits() > 32)
1118 return MVT::i32;
1119
1120 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1121}
1122
1124 CallingConv::ID CC,
1125 EVT VT) const {
1127 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1128
1129 if (VT.isVector()) {
1130 unsigned NumElts = VT.getVectorNumElements();
1131 EVT ScalarVT = VT.getScalarType();
1132 unsigned Size = ScalarVT.getSizeInBits();
1133
1134 // FIXME: Should probably promote 8-bit vectors to i16.
1135 if (Size == 16 && Subtarget->has16BitInsts())
1136 return (NumElts + 1) / 2;
1137
1138 if (Size <= 32)
1139 return NumElts;
1140
1141 if (Size > 32)
1142 return NumElts * ((Size + 31) / 32);
1143 } else if (VT.getSizeInBits() > 32)
1144 return (VT.getSizeInBits() + 31) / 32;
1145
1146 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1147}
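// Illustrative sketch, not from this file: for a v3f16 argument on a non-kernel
// calling convention with 16-bit instructions, getRegisterTypeForCallingConv
// returns v2f16 and getNumRegistersForCallingConv returns (3 + 1) / 2 == 2,
// i.e. the value travels in two 32-bit registers with the last lane unused.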
1148
1150 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1151 unsigned &NumIntermediates, MVT &RegisterVT) const {
1152 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1153 unsigned NumElts = VT.getVectorNumElements();
1154 EVT ScalarVT = VT.getScalarType();
1155 unsigned Size = ScalarVT.getSizeInBits();
1156 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1157 // support, but unless we can properly handle 3-vectors, it will still be
1158 // inconsistent.
1159 if (Size == 16 && Subtarget->has16BitInsts()) {
1160 if (ScalarVT == MVT::bf16) {
1161 RegisterVT = MVT::i32;
1162 IntermediateVT = MVT::v2bf16;
1163 } else {
1164 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1165 IntermediateVT = RegisterVT;
1166 }
1167 NumIntermediates = (NumElts + 1) / 2;
1168 return NumIntermediates;
1169 }
1170
1171 if (Size == 32) {
1172 RegisterVT = ScalarVT.getSimpleVT();
1173 IntermediateVT = RegisterVT;
1174 NumIntermediates = NumElts;
1175 return NumIntermediates;
1176 }
1177
1178 if (Size < 16 && Subtarget->has16BitInsts()) {
1179 // FIXME: Should probably form v2i16 pieces
1180 RegisterVT = MVT::i16;
1181 IntermediateVT = ScalarVT;
1182 NumIntermediates = NumElts;
1183 return NumIntermediates;
1184 }
1185
1186 if (Size != 16 && Size <= 32) {
1187 RegisterVT = MVT::i32;
1188 IntermediateVT = ScalarVT;
1189 NumIntermediates = NumElts;
1190 return NumIntermediates;
1191 }
1192
1193 if (Size > 32) {
1194 RegisterVT = MVT::i32;
1195 IntermediateVT = RegisterVT;
1196 NumIntermediates = NumElts * ((Size + 31) / 32);
1197 return NumIntermediates;
1198 }
1199 }
1200
1202 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1203}
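// Illustrative sketch, not from this file: a v5i32 argument on a non-kernel
// calling convention takes the Size == 32 path above, so both RegisterVT and
// IntermediateVT are i32 and NumIntermediates is 5, one register per element.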
1204
1206 const DataLayout &DL, Type *Ty,
1207 unsigned MaxNumLanes) {
1208 assert(MaxNumLanes != 0);
1209
1210 LLVMContext &Ctx = Ty->getContext();
1211 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1212 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1213 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1214 NumElts);
1215 }
1216
1217 return TLI.getValueType(DL, Ty);
1218}
1219
1220// Peek through TFE struct returns to only use the data size.
1222 const DataLayout &DL, Type *Ty,
1223 unsigned MaxNumLanes) {
1224 auto *ST = dyn_cast<StructType>(Ty);
1225 if (!ST)
1226 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1227
1228 // TFE intrinsics return an aggregate type.
1229 assert(ST->getNumContainedTypes() == 2 &&
1230 ST->getContainedType(1)->isIntegerTy(32));
1231 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1232}
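// Illustrative sketch, not from this file: for an image load with TFE whose IR
// type is { <4 x float>, i32 }, the struct is peeled back to <4 x float> here,
// and a dmask selecting two lanes makes the caller report a memory VT of v2f32
// rather than the full v4f32.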
1233
1234/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1235/// in-memory representation. This return value is a custom type because there
1236/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1237/// could cause issues during codegen, these address space 7 pointers will be
1238/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1239/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1240/// for cost modeling, to work. (This also sets us up decently for doing the
1241/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1242MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
1243 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1244 return MVT::amdgpuBufferFatPointer;
1245 if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1246 DL.getPointerSizeInBits(AS) == 192)
1247 return MVT::amdgpuBufferStridedPointer;
1248 return AMDGPUTargetLowering::getPointerTy(DL, AS);
1249}
1250/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1251/// v8i32 when padding is added.
1252/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1253/// also v8i32 with padding.
1254MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
1255 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1256 DL.getPointerSizeInBits(AS) == 160) ||
1257 (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1258 DL.getPointerSizeInBits(AS) == 192))
1259 return MVT::v8i32;
1260 return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
1261}
1262
1263static unsigned getIntrMemWidth(unsigned IntrID) {
1264 switch (IntrID) {
1265 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1266 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1267 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1268 return 8;
1269 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1270 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1271 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1272 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1273 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1274 return 32;
1275 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1276 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1277 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1278 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1279 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1280 return 64;
1281 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1282 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1283 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1284 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1285 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1286 return 128;
1287 default:
1288 llvm_unreachable("Unknown width");
1289 }
1290}
1291
1292static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad,
1294 Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2);
1295 unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
1296 switch (AtomicOrderingCABI(Ord)) {
1299 break;
1302 break;
1305 break;
1306 default:
1308 break;
1309 }
1310
1311 Info.flags =
1313 Info.flags |= MOCooperative;
1314
1315 MDNode *ScopeMD = cast<MDNode>(
1316 cast<MetadataAsValue>(CI.getArgOperand(IsLoad ? 2 : 3))->getMetadata());
1317 StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
1318 Info.ssid = CI.getContext().getOrInsertSyncScopeID(Scope);
1319}
1320
1322 const CallInst &CI,
1323 MachineFunction &MF,
1324 unsigned IntrID) const {
1325 Info.flags = MachineMemOperand::MONone;
1326 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1327 Info.flags |= MachineMemOperand::MOInvariant;
1328 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1330 Info.flags |= getTargetMMOFlags(CI);
1331
1332 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1334 AttributeSet Attr =
1336 MemoryEffects ME = Attr.getMemoryEffects();
1337 if (ME.doesNotAccessMemory())
1338 return false;
1339
1340 // TODO: Should images get their own address space?
1341 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1342
1343 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1344 if (RsrcIntr->IsImage) {
1345 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1347 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1348 Info.align.reset();
1349 }
1350
1351 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1352 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1353 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1354 // We conservatively set the memory operand of a buffer intrinsic to the
1355 // base resource pointer, so that we can access alias information about
1356 // those pointers. Cases like "this points at the same value
1357 // but with a different offset" are handled in
1358 // areMemAccessesTriviallyDisjoint.
1359 Info.ptrVal = RsrcArg;
1360 }
1361
1362 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1363 if (!IsSPrefetch) {
1364 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1365 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1366 Info.flags |= MachineMemOperand::MOVolatile;
1367 }
1368
1370 if (ME.onlyReadsMemory()) {
1371 if (RsrcIntr->IsImage) {
1372 unsigned MaxNumLanes = 4;
1373
1374 if (!BaseOpcode->Gather4) {
1375 // If this isn't a gather, we may have excess loaded elements in the
1376 // IR type. Check the dmask for the real number of elements loaded.
1377 unsigned DMask =
1378 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1379 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1380 }
1381
1382 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1383 CI.getType(), MaxNumLanes);
1384 } else {
1385 Info.memVT =
1387 std::numeric_limits<unsigned>::max());
1388 }
1389
1390 // FIXME: What does alignment mean for an image?
1391 Info.opc = ISD::INTRINSIC_W_CHAIN;
1392 Info.flags |= MachineMemOperand::MOLoad;
1393 } else if (ME.onlyWritesMemory()) {
1394 Info.opc = ISD::INTRINSIC_VOID;
1395
1396 Type *DataTy = CI.getArgOperand(0)->getType();
1397 if (RsrcIntr->IsImage) {
1398 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1399 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1400 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1401 DMaskLanes);
1402 } else
1403 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1404
1405 Info.flags |= MachineMemOperand::MOStore;
1406 } else {
1407 // Atomic, NoReturn Sampler or prefetch
1408 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1410 Info.flags |=
1412
1413 if (!IsSPrefetch)
1414 Info.flags |= MachineMemOperand::MOStore;
1415
1416 switch (IntrID) {
1417 default:
1418 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1419 // Fake memory access type for no return sampler intrinsics
1420 Info.memVT = MVT::i32;
1421 } else {
1422 // XXX - Should this be volatile without known ordering?
1423 Info.flags |= MachineMemOperand::MOVolatile;
1424 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1425 }
1426 break;
1427 case Intrinsic::amdgcn_raw_buffer_load_lds:
1428 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1429 case Intrinsic::amdgcn_struct_buffer_load_lds:
1430 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1431 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1432 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1433 Info.ptrVal = CI.getArgOperand(1);
1434 return true;
1435 }
1436 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1437 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1438 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1439 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1440 Info.memVT =
1442 std::numeric_limits<unsigned>::max());
1443 Info.flags &= ~MachineMemOperand::MOStore;
1444 return true;
1445 }
1446 }
1447 }
1448 return true;
1449 }
1450
1451 switch (IntrID) {
1452 case Intrinsic::amdgcn_ds_ordered_add:
1453 case Intrinsic::amdgcn_ds_ordered_swap: {
1454 Info.opc = ISD::INTRINSIC_W_CHAIN;
1455 Info.memVT = MVT::getVT(CI.getType());
1456 Info.ptrVal = CI.getOperand(0);
1457 Info.align.reset();
1459
1460 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1461 if (!Vol->isZero())
1462 Info.flags |= MachineMemOperand::MOVolatile;
1463
1464 return true;
1465 }
1466 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1467 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1468 Info.opc = ISD::INTRINSIC_W_CHAIN;
1469 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1470 Info.ptrVal = nullptr;
1471 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1473 return true;
1474 }
1475 case Intrinsic::amdgcn_ds_append:
1476 case Intrinsic::amdgcn_ds_consume: {
1477 Info.opc = ISD::INTRINSIC_W_CHAIN;
1478 Info.memVT = MVT::getVT(CI.getType());
1479 Info.ptrVal = CI.getOperand(0);
1480 Info.align.reset();
1482
1483 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1484 if (!Vol->isZero())
1485 Info.flags |= MachineMemOperand::MOVolatile;
1486
1487 return true;
1488 }
1489 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1490 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1491 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1494 Info.memVT = MVT::getVT(CI.getType());
1495 Info.ptrVal = CI.getOperand(0);
1496 Info.memVT = MVT::i64;
1497 Info.size = 8;
1498 Info.align.reset();
1500 return true;
1501 }
1502 case Intrinsic::amdgcn_global_atomic_csub: {
1503 Info.opc = ISD::INTRINSIC_W_CHAIN;
1504 Info.memVT = MVT::getVT(CI.getType());
1505 Info.ptrVal = CI.getOperand(0);
1506 Info.align.reset();
1509 return true;
1510 }
1511 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1512 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1513 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1514 Info.opc = ISD::INTRINSIC_W_CHAIN;
1515 Info.memVT =
1516 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1517 ? CI.getType()
1519 ->getElementType(0)); // XXX: what is correct VT?
1520
1521 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1522 Info.align.reset();
1523 Info.flags |=
1525 return true;
1526 }
1527 case Intrinsic::amdgcn_global_atomic_fmin_num:
1528 case Intrinsic::amdgcn_global_atomic_fmax_num:
1529 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1530 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1531 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1532 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1533 Info.opc = ISD::INTRINSIC_W_CHAIN;
1534 Info.memVT = MVT::getVT(CI.getType());
1535 Info.ptrVal = CI.getOperand(0);
1536 Info.align.reset();
1540 return true;
1541 }
1542 case Intrinsic::amdgcn_flat_load_monitor_b32:
1543 case Intrinsic::amdgcn_flat_load_monitor_b64:
1544 case Intrinsic::amdgcn_flat_load_monitor_b128:
1545 case Intrinsic::amdgcn_global_load_monitor_b32:
1546 case Intrinsic::amdgcn_global_load_monitor_b64:
1547 case Intrinsic::amdgcn_global_load_monitor_b128:
1548 case Intrinsic::amdgcn_cluster_load_b32:
1549 case Intrinsic::amdgcn_cluster_load_b64:
1550 case Intrinsic::amdgcn_cluster_load_b128:
1551 case Intrinsic::amdgcn_ds_load_tr6_b96:
1552 case Intrinsic::amdgcn_ds_load_tr4_b64:
1553 case Intrinsic::amdgcn_ds_load_tr8_b64:
1554 case Intrinsic::amdgcn_ds_load_tr16_b128:
1555 case Intrinsic::amdgcn_global_load_tr6_b96:
1556 case Intrinsic::amdgcn_global_load_tr4_b64:
1557 case Intrinsic::amdgcn_global_load_tr_b64:
1558 case Intrinsic::amdgcn_global_load_tr_b128:
1559 case Intrinsic::amdgcn_ds_read_tr4_b64:
1560 case Intrinsic::amdgcn_ds_read_tr6_b96:
1561 case Intrinsic::amdgcn_ds_read_tr8_b64:
1562 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1563 Info.opc = ISD::INTRINSIC_W_CHAIN;
1564 Info.memVT = MVT::getVT(CI.getType());
1565 Info.ptrVal = CI.getOperand(0);
1566 Info.align.reset();
1567 Info.flags |= MachineMemOperand::MOLoad;
1568 return true;
1569 }
1570 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1571 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1572 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1573 Info.opc = ISD::INTRINSIC_W_CHAIN;
1574 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1575 Info.ptrVal = CI.getOperand(0);
1576 Info.align.reset();
1577 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/true, Info);
1578 return true;
1579 }
1580 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1581 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1582 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1583 Info.opc = ISD::INTRINSIC_VOID;
1584 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1585 Info.ptrVal = CI.getArgOperand(0);
1586 Info.align.reset();
1587 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/false, Info);
1588 return true;
1589 }
1590 case Intrinsic::amdgcn_ds_gws_init:
1591 case Intrinsic::amdgcn_ds_gws_barrier:
1592 case Intrinsic::amdgcn_ds_gws_sema_v:
1593 case Intrinsic::amdgcn_ds_gws_sema_br:
1594 case Intrinsic::amdgcn_ds_gws_sema_p:
1595 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1596 Info.opc = ISD::INTRINSIC_VOID;
1597
1598 const GCNTargetMachine &TM =
1599 static_cast<const GCNTargetMachine &>(getTargetMachine());
1600
1602 Info.ptrVal = MFI->getGWSPSV(TM);
1603
1604 // This is an abstract access, but we need to specify a type and size.
1605 Info.memVT = MVT::i32;
1606 Info.size = 4;
1607 Info.align = Align(4);
1608
1609 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1610 Info.flags |= MachineMemOperand::MOLoad;
1611 else
1612 Info.flags |= MachineMemOperand::MOStore;
1613 return true;
1614 }
1615 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1616 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1617 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1618 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1619 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1620 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1621 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1622 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1623 Info.opc = ISD::INTRINSIC_VOID;
1624 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1625 Info.ptrVal = CI.getArgOperand(1);
1627 return true;
1628 }
1629 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1630 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1631 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1632 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1633 Info.opc = ISD::INTRINSIC_VOID;
1634 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1635 Info.ptrVal = CI.getArgOperand(0);
1637 return true;
1638 }
1639 case Intrinsic::amdgcn_load_to_lds:
1640 case Intrinsic::amdgcn_global_load_lds: {
1641 Info.opc = ISD::INTRINSIC_VOID;
1642 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1643 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1644 Info.ptrVal = CI.getArgOperand(1);
1646 return true;
1647 }
1648 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1649 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1650 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1651 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1652 Info.opc = ISD::INTRINSIC_W_CHAIN;
1653
1654 const GCNTargetMachine &TM =
1655 static_cast<const GCNTargetMachine &>(getTargetMachine());
1656
1658 Info.ptrVal = MFI->getGWSPSV(TM);
1659
1660 // This is an abstract access, but we need to specify a type and size.
1661 Info.memVT = MVT::i32;
1662 Info.size = 4;
1663 Info.align = Align(4);
1664
1666 return true;
1667 }
1668 case Intrinsic::amdgcn_s_prefetch_data:
1669 case Intrinsic::amdgcn_flat_prefetch:
1670 case Intrinsic::amdgcn_global_prefetch: {
1671 Info.opc = ISD::INTRINSIC_VOID;
1672 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1673 Info.ptrVal = CI.getArgOperand(0);
1674 Info.flags |= MachineMemOperand::MOLoad;
1675 return true;
1676 }
1677 default:
1678 return false;
1679 }
1680}
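// Illustrative sketch with an assumed call shape, not from this file: for
//   call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %rsrc, ...)
// the RsrcIntrinsic path above reports ISD::INTRINSIC_W_CHAIN, a v4f32 memVT,
// ptrVal == %rsrc and MOLoad, so alias analysis sees the access through the
// base resource pointer as described in the comment above.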
1681
1683 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1685 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1686 // The DAG's ValueType loses the addrspaces.
1687 // Add them as 2 extra Constant operands "from" and "to".
1688 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1689 unsigned DstAS = I.getType()->getPointerAddressSpace();
1690 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1691 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1692 break;
1693 }
1694 default:
1695 break;
1696 }
1697}
1698
1701 Type *&AccessTy) const {
1702 Value *Ptr = nullptr;
1703 switch (II->getIntrinsicID()) {
1704 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1705 case Intrinsic::amdgcn_cluster_load_b128:
1706 case Intrinsic::amdgcn_cluster_load_b64:
1707 case Intrinsic::amdgcn_cluster_load_b32:
1708 case Intrinsic::amdgcn_ds_append:
1709 case Intrinsic::amdgcn_ds_consume:
1710 case Intrinsic::amdgcn_ds_load_tr8_b64:
1711 case Intrinsic::amdgcn_ds_load_tr16_b128:
1712 case Intrinsic::amdgcn_ds_load_tr4_b64:
1713 case Intrinsic::amdgcn_ds_load_tr6_b96:
1714 case Intrinsic::amdgcn_ds_read_tr4_b64:
1715 case Intrinsic::amdgcn_ds_read_tr6_b96:
1716 case Intrinsic::amdgcn_ds_read_tr8_b64:
1717 case Intrinsic::amdgcn_ds_read_tr16_b64:
1718 case Intrinsic::amdgcn_ds_ordered_add:
1719 case Intrinsic::amdgcn_ds_ordered_swap:
1720 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1721 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1722 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1723 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1724 case Intrinsic::amdgcn_flat_load_monitor_b128:
1725 case Intrinsic::amdgcn_flat_load_monitor_b32:
1726 case Intrinsic::amdgcn_flat_load_monitor_b64:
1727 case Intrinsic::amdgcn_global_atomic_csub:
1728 case Intrinsic::amdgcn_global_atomic_fmax_num:
1729 case Intrinsic::amdgcn_global_atomic_fmin_num:
1730 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1731 case Intrinsic::amdgcn_global_load_monitor_b128:
1732 case Intrinsic::amdgcn_global_load_monitor_b32:
1733 case Intrinsic::amdgcn_global_load_monitor_b64:
1734 case Intrinsic::amdgcn_global_load_tr_b64:
1735 case Intrinsic::amdgcn_global_load_tr_b128:
1736 case Intrinsic::amdgcn_global_load_tr4_b64:
1737 case Intrinsic::amdgcn_global_load_tr6_b96:
1738 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1739 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1740 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1741 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1742 Ptr = II->getArgOperand(0);
1743 break;
1744 case Intrinsic::amdgcn_load_to_lds:
1745 case Intrinsic::amdgcn_global_load_lds:
1746 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1747 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1748 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1749 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1750 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1751 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1752 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1753 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1754 Ptr = II->getArgOperand(1);
1755 break;
1756 default:
1757 return false;
1758 }
1759 AccessTy = II->getType();
1760 Ops.push_back(Ptr);
1761 return true;
1762}
1763
1765 unsigned AddrSpace) const {
1766 if (!Subtarget->hasFlatInstOffsets()) {
1767 // Flat instructions do not have offsets, and only have the register
1768 // address.
1769 return AM.BaseOffs == 0 && AM.Scale == 0;
1770 }
1771
1772 decltype(SIInstrFlags::FLAT) FlatVariant =
AddrSpace == AMDGPUAS::GLOBAL_ADDRESS    ? SIInstrFlags::FlatGlobal
: AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ? SIInstrFlags::FlatScratch
: SIInstrFlags::FLAT;
1776
1777 return AM.Scale == 0 &&
1778 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1779 AM.BaseOffs, AddrSpace, FlatVariant));
1780}
1781
1783 if (Subtarget->hasFlatGlobalInsts())
1785
1786 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1787 // Assume that we will use FLAT for all global memory accesses
1788 // on VI.
1789 // FIXME: This assumption is currently wrong. On VI we still use
1790 // MUBUF instructions for the r + i addressing mode. As currently
1791 // implemented, the MUBUF instructions only work on buffers < 4GB.
1792 // It may be possible to support > 4GB buffers with MUBUF instructions,
1793 // by setting the stride value in the resource descriptor which would
1794 // increase the size limit to (stride * 4GB). However, this is risky,
1795 // because it has never been validated.
1797 }
1798
1799 return isLegalMUBUFAddressingMode(AM);
1800}
1801
1802bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1803 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1804 // additionally can do r + r + i with addr64. 32-bit has more addressing
1805 // mode options. Depending on the resource constant, it can also do
1806 // (i64 r0) + (i32 r1) * (i14 i).
1807 //
1808 // Private arrays end up using a scratch buffer most of the time, so also
1809 // assume those use MUBUF instructions. Scratch loads / stores are currently
1810 // implemented as mubuf instructions with the offen bit set, so they are
1811 // slightly different from the normal addr64 mode.
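// E.g. an addressing mode {BaseOffs = 16, HasBaseReg = true, Scale = 1}
// corresponds to the r + r + i form and is accepted below, since 16 is a
// legal MUBUF immediate offset and Scale == 1 falls through to "r + r".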
1812 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1813 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1814 return false;
1815
1816 // FIXME: Since we can split immediate into soffset and immediate offset,
1817 // would it make sense to allow any immediate?
1818
1819 switch (AM.Scale) {
1820 case 0: // r + i or just i, depending on HasBaseReg.
1821 return true;
1822 case 1:
1823 return true; // We have r + r or r + i.
1824 case 2:
1825 if (AM.HasBaseReg) {
1826 // Reject 2 * r + r.
1827 return false;
1828 }
1829
1830 // Allow 2 * r as r + r
1831 // Or 2 * r + i is allowed as r + r + i.
1832 return true;
1833 default: // Don't allow n * r
1834 return false;
1835 }
1836}
1837
1839 const AddrMode &AM, Type *Ty,
1840 unsigned AS,
1841 Instruction *I) const {
1842 // No global is ever allowed as a base.
1843 if (AM.BaseGV)
1844 return false;
1845
1846 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1847 return isLegalGlobalAddressingMode(AM);
1848
1849 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1853 // If the offset isn't a multiple of 4, it probably isn't going to be
1854 // correctly aligned.
1855 // FIXME: Can we get the real alignment here?
1856 if (AM.BaseOffs % 4 != 0)
1857 return isLegalMUBUFAddressingMode(AM);
1858
1859 if (!Subtarget->hasScalarSubwordLoads()) {
1860 // There are no SMRD extloads, so if we have to do a small type access we
1861 // will use a MUBUF load.
1862 // FIXME?: We also need to do this if unaligned, but we don't know the
1863 // alignment here.
1864 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1865 return isLegalGlobalAddressingMode(AM);
1866 }
1867
1868 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1869 // SMRD instructions have an 8-bit, dword offset on SI.
1870 if (!isUInt<8>(AM.BaseOffs / 4))
1871 return false;
1872 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1873 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1874 // in 8-bits, it can use a smaller encoding.
1875 if (!isUInt<32>(AM.BaseOffs / 4))
1876 return false;
1877 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1878 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1879 if (!isUInt<20>(AM.BaseOffs))
1880 return false;
1881 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1882 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1883 // for S_BUFFER_* instructions).
1884 if (!isInt<21>(AM.BaseOffs))
1885 return false;
1886 } else {
1887 // On GFX12, all offsets are signed 24-bit in bytes.
1888 if (!isInt<24>(AM.BaseOffs))
1889 return false;
1890 }
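// E.g. on SI the largest byte offset accepted here is 1020 (dword offset 255,
// since BaseOffs is a multiple of 4 at this point), while VI accepts byte
// offsets up to 2^20 - 1.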
1891
1892 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1894 AM.BaseOffs < 0) {
1895 // Scalar (non-buffer) loads can only use a negative offset if
1896 // soffset+offset is non-negative. Since the compiler can only prove that
1897 // in a few special cases, it is safer to claim that negative offsets are
1898 // not supported.
1899 return false;
1900 }
1901
1902 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1903 return true;
1904
1905 if (AM.Scale == 1 && AM.HasBaseReg)
1906 return true;
1907
1908 return false;
1909 }
1910
1911 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1912 return Subtarget->enableFlatScratch()
1914 : isLegalMUBUFAddressingMode(AM);
1915
1916 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1917 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1918 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1919 // field.
1920 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1921 // an 8-bit dword offset but we don't know the alignment here.
1922 if (!isUInt<16>(AM.BaseOffs))
1923 return false;
1924
1925 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1926 return true;
1927
1928 if (AM.Scale == 1 && AM.HasBaseReg)
1929 return true;
1930
1931 return false;
1932 }
1933
1935 // For an unknown address space, this usually means that this is for some
1936 // reason being used for pure arithmetic, and not based on some addressing
1937 // computation. We don't have instructions that compute pointers with any
1938 // addressing modes, so treat them as having no offset like flat
1939 // instructions.
1941 }
1942
1943 // Assume a user alias of global for unknown address spaces.
1944 return isLegalGlobalAddressingMode(AM);
1945}
1946
1948 const MachineFunction &MF) const {
1950 return (MemVT.getSizeInBits() <= 4 * 32);
1951 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1952 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1953 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1954 }
1956 return (MemVT.getSizeInBits() <= 2 * 32);
1957 return true;
1958}
1959
1961 unsigned Size, unsigned AddrSpace, Align Alignment,
1962 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1963 if (IsFast)
1964 *IsFast = 0;
1965
1966 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1967 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1968 // Check if alignment requirements for ds_read/write instructions are
1969 // disabled.
1970 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1971 return false;
1972
1973 Align RequiredAlignment(
1974 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
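// E.g. Size == 96 gives divideCeil(96, 8) == 12 bytes, rounded up to a
// power-of-two natural alignment of 16.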
1975 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1976 Alignment < RequiredAlignment)
1977 return false;
1978
1979 // Either the alignment requirements are "enabled", or there is an
1980 // unaligned LDS access related hardware bug even though alignment requirements
1981 // are "disabled". In either case, we need to check for proper alignment
1982 // requirements.
1983 //
1984 switch (Size) {
1985 case 64:
1986 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1987 // address is negative, then the instruction is incorrectly treated as
1988 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1989 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1990 // load later in the SILoadStoreOptimizer.
1991 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1992 return false;
1993
1994 // 8 byte accesses via ds_read/write_b64 require 8-byte alignment, but we
1995 // can do a 4 byte aligned, 8 byte access in a single operation using
1996 // ds_read2/write2_b32 with adjacent offsets.
1997 RequiredAlignment = Align(4);
1998
1999 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2000 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
2001 // ds_write2_b32 depending on the alignment. In either case with either
2002 // alignment there is no faster way of doing this.
2003
2004 // The numbers returned here and below are not additive, it is a 'speed
2005 // rank'. They are just meant to be compared to decide if a certain way
2006 // of lowering an operation is faster than another. For that purpose
2007 // naturally aligned operation gets it bitsize to indicate that "it
2008 // operates with a speed comparable to N-bit wide load". With the full
2009 // alignment ds128 is slower than ds96 for example. If underaligned it
2010 // is comparable to a speed of a single dword access, which would then
2011 // mean 32 < 128 and it is faster to issue a wide load regardless.
2012 // 1 is simply "slow, don't do it". I.e. when comparing an aligned load to a
2013 // wider load which will not be aligned anymore, the latter is slower.
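// E.g. in this 64-bit case RequiredAlignment is 4, so an alignment of 4 or
// more reports 64, while a 1- or 2-byte aligned access reports 32.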
2014 if (IsFast)
2015 *IsFast = (Alignment >= RequiredAlignment) ? 64
2016 : (Alignment < Align(4)) ? 32
2017 : 1;
2018 return true;
2019 }
2020
2021 break;
2022 case 96:
2023 if (!Subtarget->hasDS96AndDS128())
2024 return false;
2025
2026 // 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
2027 // gfx8 and older.
2028
2029 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2030 // Naturally aligned access is fastest. However, also report it is Fast
2031 // if memory is aligned less than DWORD. A narrow load or store will
2032 // be equally slow as a single ds_read_b96/ds_write_b96, but there will
2033 // be more of them, so overall we will pay less penalty issuing a single
2034 // instruction.
2035
2036 // See comment on the values above.
2037 if (IsFast)
2038 *IsFast = (Alignment >= RequiredAlignment) ? 96
2039 : (Alignment < Align(4)) ? 32
2040 : 1;
2041 return true;
2042 }
2043
2044 break;
2045 case 128:
2046 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2047 return false;
2048
2049 // 16 byte accessing via ds_read/write_b128 require 16-byte alignment on
2050 // gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a
2051 // single operation using ds_read2/write2_b64.
2052 RequiredAlignment = Align(8);
2053
2054 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2055 // Naturally aligned access is fastest. However, also report it is Fast
2056 // if memory is aligned less than DWORD. A narrow load or store will
2057 // be equally slow as a single ds_read_b128/ds_write_b128, but there
2058 // will be more of them, so overall we will pay less penalty issuing a
2059 // single instruction.
2060
2061 // See comment on the values above.
2062 if (IsFast)
2063 *IsFast = (Alignment >= RequiredAlignment) ? 128
2064 : (Alignment < Align(4)) ? 32
2065 : 1;
2066 return true;
2067 }
2068
2069 break;
2070 default:
2071 if (Size > 32)
2072 return false;
2073
2074 break;
2075 }
2076
2077 // See comment on the values above.
2078 // Note that we have a single-dword or sub-dword here, so if underaligned
2079 // it is a slowest possible access, hence returned value is 0.
2080 if (IsFast)
2081 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2082
2083 return Alignment >= RequiredAlignment ||
2084 Subtarget->hasUnalignedDSAccessEnabled();
2085 }
2086
2087 // FIXME: We have to be conservative here and assume that flat operations
2088 // will access scratch. If we had access to the IR function, then we
2089 // could determine if any private memory was used in the function.
2090 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2091 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2092 bool AlignedBy4 = Alignment >= Align(4);
2093 if (IsFast)
2094 *IsFast = AlignedBy4;
2095
2096 return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
2097 }
2098
2099 // So long as they are correct, wide global memory operations perform better
2100 // than multiple smaller memory ops -- even when misaligned
2101 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
2102 if (IsFast)
2103 *IsFast = Size;
2104
2105 return Alignment >= Align(4) ||
2106 Subtarget->hasUnalignedBufferAccessEnabled();
2107 }
2108
2109 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
2110 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
2111 // out-of-bounds behavior, but in the edge case where an access starts
2112 // out-of-bounds and then enters in-bounds, the entire access would be treated
2113 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2114 // natural alignment of buffer accesses.
2115 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2116 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2117 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2118 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2119 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
2120 return false;
2121 }
2122
2123 // Smaller than dword value must be aligned.
2124 if (Size < 32)
2125 return false;
2126
2127 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2128 // byte-address are ignored, thus forcing Dword alignment.
2129 // This applies to private, global, and constant memory.
2130 if (IsFast)
2131 *IsFast = 1;
2132
2133 return Size >= 32 && Alignment >= Align(4);
2134}
2135
2137 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2138 unsigned *IsFast) const {
2140 Alignment, Flags, IsFast);
2141}
2142
2144 LLVMContext &Context, const MemOp &Op,
2145 const AttributeList &FuncAttributes) const {
2146 // FIXME: Should account for address space here.
2147
2148 // The default fallback uses the private pointer size as a guess for a type to
2149 // use. Make sure we switch these to 64-bit accesses.
2150
2151 if (Op.size() >= 16 &&
2152 Op.isDstAligned(Align(4))) // XXX: Should only do for global
2153 return MVT::v4i32;
2154
2155 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2156 return MVT::v2i32;
2157
2158 // Use the default.
2159 return MVT::Other;
2160}
2161
2163 const MemSDNode *MemNode = cast<MemSDNode>(N);
2164 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2165}
2166
2171
2173 unsigned DestAS) const {
2174 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2175 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2176 Subtarget->hasGloballyAddressableScratch()) {
2177 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2178 return false;
2179 }
2180
2181 // Flat -> private/local is a simple truncate.
2182 // Flat -> global is no-op
2183 return true;
2184 }
2185
2186 const GCNTargetMachine &TM =
2187 static_cast<const GCNTargetMachine &>(getTargetMachine());
2188 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2189}
2190
2198
2200 Type *Ty) const {
2201 // FIXME: Could be smarter if called for vector constants.
2202 return true;
2203}
2204
2206 unsigned Index) const {
2208 return false;
2209
2210 // TODO: Add more cases that are cheap.
2211 return Index == 0;
2212}
2213
2214bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2215 // TODO: This should be more aggressive, particular for 16-bit element
2216 // vectors. However there are some mixed improvements and regressions.
2217 EVT EltTy = VT.getVectorElementType();
2218 return EltTy.getSizeInBits() % 32 == 0;
2219}
2220
2222 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2223 switch (Op) {
2224 case ISD::LOAD:
2225 case ISD::STORE:
2226 return true;
2227 default:
2228 return false;
2229 }
2230 }
2231
2232 // SimplifySetCC uses this function to determine whether or not it should
2233 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2234 if (VT == MVT::i1 && Op == ISD::SETCC)
2235 return false;
2236
2238}
2239
2240SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2241 const SDLoc &SL,
2242 SDValue Chain,
2243 uint64_t Offset) const {
2244 const DataLayout &DL = DAG.getDataLayout();
2248
2249 auto [InputPtrReg, RC, ArgTy] =
2250 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2251
2252 // We may not have the kernarg segment argument if we have no kernel
2253 // arguments.
2254 if (!InputPtrReg)
2255 return DAG.getConstant(Offset, SL, PtrVT);
2256
2258 SDValue BasePtr = DAG.getCopyFromReg(
2259 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2260
2261 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2262}
2263
2264SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2265 const SDLoc &SL) const {
2268 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2269}
2270
2271SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2272 const SDLoc &SL) const {
2273
2275 std::optional<uint32_t> KnownSize =
2277 if (KnownSize.has_value())
2278 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2279 return SDValue();
2280}
2281
2282SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2283 const SDLoc &SL, SDValue Val,
2284 bool Signed,
2285 const ISD::InputArg *Arg) const {
2286 // First, if it is a widened vector, narrow it.
2287 if (VT.isVector() &&
2289 EVT NarrowedVT =
2292 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2293 DAG.getConstant(0, SL, MVT::i32));
2294 }
2295
2296 // Then convert the vector elements or scalar value.
2297 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2298 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2299 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2300 }
2301
2302 if (MemVT.isFloatingPoint())
2303 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2304 else if (Signed)
2305 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2306 else
2307 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2308
2309 return Val;
2310}
2311
2312SDValue SITargetLowering::lowerKernargMemParameter(
2313 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2314 uint64_t Offset, Align Alignment, bool Signed,
2315 const ISD::InputArg *Arg) const {
2316 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2317
2318 // Try to avoid using an extload by loading earlier than the argument address,
2319 // and extracting the relevant bits. The load should hopefully be merged with
2320 // the previous argument.
2321 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2322 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2323 int64_t AlignDownOffset = alignDown(Offset, 4);
2324 int64_t OffsetDiff = Offset - AlignDownOffset;
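// E.g. a 2-byte argument at Offset == 6 loads the i32 at offset 4 and then
// shifts right by (6 - 4) * 8 == 16 bits before truncating to the argument's
// memory type.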
2325
2326 EVT IntVT = MemVT.changeTypeToInteger();
2327
2328 // TODO: If we passed in the base kernel offset we could have a better
2329 // alignment than 4, but we don't really need it.
2330 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2331 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2334
2335 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2336 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2337
2338 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2339 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2340 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2341
2342 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2343 }
2344
2345 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2346 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2349
2350 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2351 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2352}
2353
2354SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2355 CCValAssign &VA, const SDLoc &SL,
2356 SDValue Chain,
2357 const ISD::InputArg &Arg) const {
2358 MachineFunction &MF = DAG.getMachineFunction();
2359 MachineFrameInfo &MFI = MF.getFrameInfo();
2360
2361 if (Arg.Flags.isByVal()) {
2362 unsigned Size = Arg.Flags.getByValSize();
2363 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2364 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2365 }
2366
2367 unsigned ArgOffset = VA.getLocMemOffset();
2368 unsigned ArgSize = VA.getValVT().getStoreSize();
2369
2370 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2371
2372 // Create load nodes to retrieve arguments from the stack.
2373 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2374 SDValue ArgValue;
2375
2376 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2378 MVT MemVT = VA.getValVT();
2379
2380 switch (VA.getLocInfo()) {
2381 default:
2382 break;
2383 case CCValAssign::BCvt:
2384 MemVT = VA.getLocVT();
2385 break;
2386 case CCValAssign::SExt:
2387 ExtType = ISD::SEXTLOAD;
2388 break;
2389 case CCValAssign::ZExt:
2390 ExtType = ISD::ZEXTLOAD;
2391 break;
2392 case CCValAssign::AExt:
2393 ExtType = ISD::EXTLOAD;
2394 break;
2395 }
2396
2397 ArgValue = DAG.getExtLoad(
2398 ExtType, SL, VA.getLocVT(), Chain, FIN,
2400 return ArgValue;
2401}
2402
2403SDValue SITargetLowering::getPreloadedValue(
2404 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2406 const ArgDescriptor *Reg = nullptr;
2407 const TargetRegisterClass *RC;
2408 LLT Ty;
2409
2411 const ArgDescriptor WorkGroupIDX =
2412 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2413 // If GridZ is not programmed in an entry function then the hardware will set
2414 // it to all zeros, so there is no need to mask the GridY value in the low
2415 // order bits.
2416 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2417 AMDGPU::TTMP7,
2418 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2419 const ArgDescriptor WorkGroupIDZ =
2420 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
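// I.e. with architected SGPRs, TTMP9 carries the workgroup ID X, while TTMP7
// packs the Y ID in its low 16 bits and the Z ID in its high 16 bits.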
2421 if (Subtarget->hasArchitectedSGPRs() &&
2424 switch (PVID) {
2426 Reg = &WorkGroupIDX;
2427 RC = &AMDGPU::SReg_32RegClass;
2428 Ty = LLT::scalar(32);
2429 break;
2431 Reg = &WorkGroupIDY;
2432 RC = &AMDGPU::SReg_32RegClass;
2433 Ty = LLT::scalar(32);
2434 break;
2436 Reg = &WorkGroupIDZ;
2437 RC = &AMDGPU::SReg_32RegClass;
2438 Ty = LLT::scalar(32);
2439 break;
2440 default:
2441 break;
2442 }
2443 }
2444
2445 if (!Reg)
2446 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2447 if (!Reg) {
2449 // It's possible for a kernarg intrinsic call to appear in a kernel with
2450 // no allocated segment, in which case we do not add the user sgpr
2451 // argument, so just return null.
2452 return DAG.getConstant(0, SDLoc(), VT);
2453 }
2454
2455 // It's undefined behavior if a function marked with the amdgpu-no-*
2456 // attributes uses the corresponding intrinsic.
2457 return DAG.getPOISON(VT);
2458 }
2459
2460 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2461}
2462
2464 CallingConv::ID CallConv,
2465 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2466 FunctionType *FType,
2468 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2469 const ISD::InputArg *Arg = &Ins[I];
2470
2471 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2472 "vector type argument should have been split");
2473
2474 // First check if it's a PS input addr.
2475 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2476 PSInputNum <= 15) {
2477 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2478
2479 // Inconveniently only the first part of the split is marked as isSplit,
2480 // so skip to the end. We only want to increment PSInputNum once for the
2481 // entire split argument.
2482 if (Arg->Flags.isSplit()) {
2483 while (!Arg->Flags.isSplitEnd()) {
2484 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2485 "unexpected vector split in ps argument type");
2486 if (!SkipArg)
2487 Splits.push_back(*Arg);
2488 Arg = &Ins[++I];
2489 }
2490 }
2491
2492 if (SkipArg) {
2493 // We can safely skip PS inputs.
2494 Skipped.set(Arg->getOrigArgIndex());
2495 ++PSInputNum;
2496 continue;
2497 }
2498
2499 Info->markPSInputAllocated(PSInputNum);
2500 if (Arg->Used)
2501 Info->markPSInputEnabled(PSInputNum);
2502
2503 ++PSInputNum;
2504 }
2505
2506 Splits.push_back(*Arg);
2507 }
2508}
2509
2510// Allocate special inputs passed in VGPRs.
2512 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2513 SIMachineFunctionInfo &Info) const {
2514 const LLT S32 = LLT::scalar(32);
2516
2517 if (Info.hasWorkItemIDX()) {
2518 Register Reg = AMDGPU::VGPR0;
2519 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2520
2521 CCInfo.AllocateReg(Reg);
2522 unsigned Mask =
2523 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2524 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2525 }
2526
2527 if (Info.hasWorkItemIDY()) {
2528 assert(Info.hasWorkItemIDX());
2529 if (Subtarget->hasPackedTID()) {
2530 Info.setWorkItemIDY(
2531 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2532 } else {
2533 unsigned Reg = AMDGPU::VGPR1;
2534 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2535
2536 CCInfo.AllocateReg(Reg);
2537 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2538 }
2539 }
2540
2541 if (Info.hasWorkItemIDZ()) {
2542 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2543 if (Subtarget->hasPackedTID()) {
2544 Info.setWorkItemIDZ(
2545 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2546 } else {
2547 unsigned Reg = AMDGPU::VGPR2;
2548 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2549
2550 CCInfo.AllocateReg(Reg);
2551 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2552 }
2553 }
2554}
2555
2556// Try to allocate a VGPR at the end of the argument list, or if no argument
2557// VGPRs are left allocating a stack slot.
2558 // If \p Mask is given it indicates the bitfield position in the register.
2559 // If \p Arg is given, use it with the new \p Mask instead of allocating a new register.
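// E.g. the packed work-item ID case reuses one VGPR with masks 0x3ff,
// 0x3ff << 10 and 0x3ff << 20 for the X, Y and Z components (see
// allocateSpecialInputVGPRsFixed below).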
2560static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2561 ArgDescriptor Arg = ArgDescriptor()) {
2562 if (Arg.isSet())
2563 return ArgDescriptor::createArg(Arg, Mask);
2564
2565 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2566 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2567 if (RegIdx == ArgVGPRs.size()) {
2568 // Spill to stack required.
2569 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2570
2571 return ArgDescriptor::createStack(Offset, Mask);
2572 }
2573
2574 unsigned Reg = ArgVGPRs[RegIdx];
2575 Reg = CCInfo.AllocateReg(Reg);
2576 assert(Reg != AMDGPU::NoRegister);
2577
2578 MachineFunction &MF = CCInfo.getMachineFunction();
2579 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2580 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2581 return ArgDescriptor::createRegister(Reg, Mask);
2582}
2583
2585 const TargetRegisterClass *RC,
2586 unsigned NumArgRegs) {
2587 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2588 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2589 if (RegIdx == ArgSGPRs.size())
2590 report_fatal_error("ran out of SGPRs for arguments");
2591
2592 unsigned Reg = ArgSGPRs[RegIdx];
2593 Reg = CCInfo.AllocateReg(Reg);
2594 assert(Reg != AMDGPU::NoRegister);
2595
2596 MachineFunction &MF = CCInfo.getMachineFunction();
2597 MF.addLiveIn(Reg, RC);
2599}
2600
2601// If this has a fixed position, we still should allocate the register in the
2602// CCInfo state. Technically we could get away with this for values passed
2603// outside of the normal argument range.
2605 const TargetRegisterClass *RC,
2606 MCRegister Reg) {
2607 Reg = CCInfo.AllocateReg(Reg);
2608 assert(Reg != AMDGPU::NoRegister);
2609 MachineFunction &MF = CCInfo.getMachineFunction();
2610 MF.addLiveIn(Reg, RC);
2611}
2612
2613static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2614 if (Arg) {
2615 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2616 Arg.getRegister());
2617 } else
2618 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2619}
2620
2621static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2622 if (Arg) {
2623 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2624 Arg.getRegister());
2625 } else
2626 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2627}
2628
2629/// Allocate implicit function VGPR arguments at the end of allocated user
2630/// arguments.
2632 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2633 SIMachineFunctionInfo &Info) const {
2634 const unsigned Mask = 0x3ff;
2635 ArgDescriptor Arg;
2636
2637 if (Info.hasWorkItemIDX()) {
2638 Arg = allocateVGPR32Input(CCInfo, Mask);
2639 Info.setWorkItemIDX(Arg);
2640 }
2641
2642 if (Info.hasWorkItemIDY()) {
2643 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2644 Info.setWorkItemIDY(Arg);
2645 }
2646
2647 if (Info.hasWorkItemIDZ())
2648 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2649}
2650
2651/// Allocate implicit function VGPR arguments in fixed registers.
2653 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2654 SIMachineFunctionInfo &Info) const {
2655 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2656 if (!Reg)
2657 report_fatal_error("failed to allocate VGPR for implicit arguments");
2658
2659 const unsigned Mask = 0x3ff;
2660 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2661 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2662 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2663}
2664
2666 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2667 SIMachineFunctionInfo &Info) const {
2668 auto &ArgInfo = Info.getArgInfo();
2669 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2670
2671 // TODO: Unify handling with private memory pointers.
2672 if (UserSGPRInfo.hasDispatchPtr())
2673 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2674
2675 if (UserSGPRInfo.hasQueuePtr())
2676 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2677
2678 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2679 // constant offset from the kernarg segment.
2680 if (Info.hasImplicitArgPtr())
2681 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2682
2683 if (UserSGPRInfo.hasDispatchID())
2684 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2685
2686 // flat_scratch_init is not applicable for non-kernel functions.
2687
2688 if (Info.hasWorkGroupIDX())
2689 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2690
2691 if (Info.hasWorkGroupIDY())
2692 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2693
2694 if (Info.hasWorkGroupIDZ())
2695 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2696
2697 if (Info.hasLDSKernelId())
2698 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2699}
2700
2701// Allocate special inputs passed in user SGPRs.
2703 MachineFunction &MF,
2704 const SIRegisterInfo &TRI,
2705 SIMachineFunctionInfo &Info) const {
2706 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2707 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2708 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2709 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2710 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2711 }
2712
2713 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2714 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2715 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2716 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2717 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2718 }
2719
2720 if (UserSGPRInfo.hasDispatchPtr()) {
2721 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2722 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2723 CCInfo.AllocateReg(DispatchPtrReg);
2724 }
2725
2726 if (UserSGPRInfo.hasQueuePtr()) {
2727 Register QueuePtrReg = Info.addQueuePtr(TRI);
2728 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2729 CCInfo.AllocateReg(QueuePtrReg);
2730 }
2731
2732 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2734 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2735 CCInfo.AllocateReg(InputPtrReg);
2736
2737 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2738 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2739 }
2740
2741 if (UserSGPRInfo.hasDispatchID()) {
2742 Register DispatchIDReg = Info.addDispatchID(TRI);
2743 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2744 CCInfo.AllocateReg(DispatchIDReg);
2745 }
2746
2747 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2748 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2749 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2750 CCInfo.AllocateReg(FlatScratchInitReg);
2751 }
2752
2753 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2754 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2755 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2756 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2757 }
2758
2759 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2760 // these from the dispatch pointer.
2761}
2762
2763 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2764// sequential starting from the first argument.
2766 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2768 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2769 Function &F = MF.getFunction();
2770 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2771 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2772 bool InPreloadSequence = true;
2773 unsigned InIdx = 0;
2774 bool AlignedForImplictArgs = false;
2775 unsigned ImplicitArgOffset = 0;
2776 for (auto &Arg : F.args()) {
2777 if (!InPreloadSequence || !Arg.hasInRegAttr())
2778 break;
2779
2780 unsigned ArgIdx = Arg.getArgNo();
2781 // Don't preload non-original args or parts not in the current preload
2782 // sequence.
2783 if (InIdx < Ins.size() &&
2784 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2785 break;
2786
2787 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2788 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2789 InIdx++) {
2790 assert(ArgLocs[ArgIdx].isMemLoc());
2791 auto &ArgLoc = ArgLocs[InIdx];
2792 const Align KernelArgBaseAlign = Align(16);
2793 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2794 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2795 unsigned NumAllocSGPRs =
2796 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
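// E.g. a 32-bit argument needs one user SGPR here, while an i64 argument
// needs alignTo(64, 32) / 32 == 2.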
2797
2798 // Fix alignment for hidden arguments.
2799 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2800 if (!AlignedForImplictArgs) {
2801 ImplicitArgOffset =
2802 alignTo(LastExplicitArgOffset,
2803 Subtarget->getAlignmentForImplicitArgPtr()) -
2804 LastExplicitArgOffset;
2805 AlignedForImplictArgs = true;
2806 }
2807 ArgOffset += ImplicitArgOffset;
2808 }
2809
2810 // Arg is preloaded into the previous SGPR.
2811 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2812 assert(InIdx >= 1 && "No previous SGPR");
2813 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2814 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2815 continue;
2816 }
2817
2818 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2819 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
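// E.g. a 4-byte gap between the end of the previous argument and ArgOffset
// costs one extra user SGPR of padding.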
2820 // Check for free user SGPRs for preloading.
2821 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2822 InPreloadSequence = false;
2823 break;
2824 }
2825
2826 // Preload this argument.
2827 const TargetRegisterClass *RC =
2828 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2829 SmallVectorImpl<MCRegister> *PreloadRegs =
2830 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2831
2832 if (PreloadRegs->size() > 1)
2833 RC = &AMDGPU::SGPR_32RegClass;
2834 for (auto &Reg : *PreloadRegs) {
2835 assert(Reg);
2836 MF.addLiveIn(Reg, RC);
2837 CCInfo.AllocateReg(Reg);
2838 }
2839
2840 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2841 }
2842 }
2843}
2844
2846 const SIRegisterInfo &TRI,
2847 SIMachineFunctionInfo &Info) const {
2848 // Always allocate this last since it is a synthetic preload.
2849 if (Info.hasLDSKernelId()) {
2850 Register Reg = Info.addLDSKernelId();
2851 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2852 CCInfo.AllocateReg(Reg);
2853 }
2854}
2855
2856// Allocate special input registers that are initialized per-wave.
2859 CallingConv::ID CallConv,
2860 bool IsShader) const {
2861 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2862 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2863 // Note: user SGPRs are handled by the front-end for graphics shaders
2864 // Pad up the used user SGPRs with dead inputs.
2865
2866 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2867 // before enabling architected SGPRs for workgroup IDs.
2868 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2869
2870 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2871 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2872 // rely on it to reach 16 since if we end up having no stack usage, it will
2873 // not really be added.
2874 unsigned NumRequiredSystemSGPRs =
2875 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
2876 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
2877 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2878 Register Reg = Info.addReservedUserSGPR();
2879 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2880 CCInfo.AllocateReg(Reg);
2881 }
2882 }
2883
2884 if (!HasArchitectedSGPRs) {
2885 if (Info.hasWorkGroupIDX()) {
2886 Register Reg = Info.addWorkGroupIDX();
2887 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2888 CCInfo.AllocateReg(Reg);
2889 }
2890
2891 if (Info.hasWorkGroupIDY()) {
2892 Register Reg = Info.addWorkGroupIDY();
2893 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2894 CCInfo.AllocateReg(Reg);
2895 }
2896
2897 if (Info.hasWorkGroupIDZ()) {
2898 Register Reg = Info.addWorkGroupIDZ();
2899 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2900 CCInfo.AllocateReg(Reg);
2901 }
2902 }
2903
2904 if (Info.hasWorkGroupInfo()) {
2905 Register Reg = Info.addWorkGroupInfo();
2906 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2907 CCInfo.AllocateReg(Reg);
2908 }
2909
2910 if (Info.hasPrivateSegmentWaveByteOffset()) {
2911 // Scratch wave offset passed in system SGPR.
2912 unsigned PrivateSegmentWaveByteOffsetReg;
2913
2914 if (IsShader) {
2915 PrivateSegmentWaveByteOffsetReg =
2916 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2917
2918 // This is true if the scratch wave byte offset doesn't have a fixed
2919 // location.
2920 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2921 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2922 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2923 }
2924 } else
2925 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2926
2927 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2928 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2929 }
2930
2931 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2932 Info.getNumPreloadedSGPRs() >= 16);
2933}
2934
2936 MachineFunction &MF,
2937 const SIRegisterInfo &TRI,
2939 // Now that we've figured out where the scratch register inputs are, see if
2940 // we should reserve the arguments and use them directly.
2941 MachineFrameInfo &MFI = MF.getFrameInfo();
2942 bool HasStackObjects = MFI.hasStackObjects();
2943 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2944
2945 // Record that we know we have non-spill stack objects so we don't need to
2946 // check all stack objects later.
2947 if (HasStackObjects)
2948 Info.setHasNonSpillStackObjects(true);
2949
2950 // Everything live out of a block is spilled with fast regalloc, so it's
2951 // almost certain that spilling will be required.
2952 if (TM.getOptLevel() == CodeGenOptLevel::None)
2953 HasStackObjects = true;
2954
2955 // For now assume stack access is needed in any callee functions, so we need
2956 // the scratch registers to pass in.
2957 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2958
2959 if (!ST.enableFlatScratch()) {
2960 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2961 // If we have stack objects, we unquestionably need the private buffer
2962 // resource. For the Code Object V2 ABI, this will be the first 4 user
2963 // SGPR inputs. We can reserve those and use them directly.
2964
2965 Register PrivateSegmentBufferReg =
2967 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2968 } else {
2969 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2970 // We tentatively reserve the last registers (skipping the last registers
2971 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
2972 // we'll replace these with the ones immediately after those which were
2973 // really allocated. In the prologue copies will be inserted from the
2974 // argument to these reserved registers.
2975
2976 // Without HSA, relocations are used for the scratch pointer and the
2977 // buffer resource setup is always inserted in the prologue. Scratch wave
2978 // offset is still in an input SGPR.
2979 Info.setScratchRSrcReg(ReservedBufferReg);
2980 }
2981 }
2982
2984
2985 // For entry functions we have to set up the stack pointer if we use it,
2986 // whereas non-entry functions get this "for free". This means there is no
2987 // intrinsic advantage to using S32 over S34 in cases where we do not have
2988 // calls but do need a frame pointer (i.e. if we are requested to have one
2989 // because frame pointer elimination is disabled). To keep things simple we
2990 // only ever use S32 as the call ABI stack pointer, and so using it does not
2991 // imply we need a separate frame pointer.
2992 //
2993 // Try to use s32 as the SP, but move it if it would interfere with input
2994 // arguments. This won't work with calls though.
2995 //
2996 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2997 // registers.
2998 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2999 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3000 } else {
3002
3003 if (MFI.hasCalls())
3004 report_fatal_error("call in graphics shader with too many input SGPRs");
3005
3006 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3007 if (!MRI.isLiveIn(Reg)) {
3008 Info.setStackPtrOffsetReg(Reg);
3009 break;
3010 }
3011 }
3012
3013 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3014 report_fatal_error("failed to find register for SP");
3015 }
3016
3017 // hasFP should be accurate for entry functions even before the frame is
3018 // finalized, because it does not rely on the known stack size, only
3019 // properties like whether variable sized objects are present.
3020 if (ST.getFrameLowering()->hasFP(MF)) {
3021 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3022 }
3023}
3024
3027 return !Info->isEntryFunction();
3028}
3029
3031
3033 MachineBasicBlock *Entry,
3034 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
3036
3037 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3038 if (!IStart)
3039 return;
3040
3041 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3042 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
3043 MachineBasicBlock::iterator MBBI = Entry->begin();
3044 for (const MCPhysReg *I = IStart; *I; ++I) {
3045 const TargetRegisterClass *RC = nullptr;
3046 if (AMDGPU::SReg_64RegClass.contains(*I))
3047 RC = &AMDGPU::SGPR_64RegClass;
3048 else if (AMDGPU::SReg_32RegClass.contains(*I))
3049 RC = &AMDGPU::SGPR_32RegClass;
3050 else
3051 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3052
3053 Register NewVR = MRI->createVirtualRegister(RC);
3054 // Create copy from CSR to a virtual register.
3055 Entry->addLiveIn(*I);
3056 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
3057 .addReg(*I);
3058
3059 // Insert the copy-back instructions right before the terminator.
3060 for (auto *Exit : Exits)
3061 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
3062 TII->get(TargetOpcode::COPY), *I)
3063 .addReg(NewVR);
3064 }
3065}
3066
3068 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3069 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3070 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3072
3074 const Function &Fn = MF.getFunction();
3077 bool IsError = false;
3078
3079 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
3081 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3082 IsError = true;
3083 }
3084
3087 BitVector Skipped(Ins.size());
3088 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3089 *DAG.getContext());
3090
3091 bool IsGraphics = AMDGPU::isGraphics(CallConv);
3092 bool IsKernel = AMDGPU::isKernel(CallConv);
3093 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
3094
3095 if (IsGraphics) {
3096 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3097 assert(!UserSGPRInfo.hasDispatchPtr() &&
3098 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3099 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3100 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3101 (void)UserSGPRInfo;
3102 if (!Subtarget->enableFlatScratch())
3103 assert(!UserSGPRInfo.hasFlatScratchInit());
3104 if ((CallConv != CallingConv::AMDGPU_CS &&
3105 CallConv != CallingConv::AMDGPU_Gfx &&
3106 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3107 !Subtarget->hasArchitectedSGPRs())
3108 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3109 !Info->hasWorkGroupIDZ());
3110 }
3111
3112 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3113
3114 if (CallConv == CallingConv::AMDGPU_PS) {
3115 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3116
3117 // At least one interpolation mode must be enabled or else the GPU will
3118 // hang.
3119 //
3120 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3121 // set PSInputAddr, the user wants to enable some bits after the compilation
3122 // based on run-time states. Since we can't know what the final PSInputEna
3123 // will look like, so we shouldn't do anything here and the user should take
3124 // responsibility for the correct programming.
3125 //
3126 // Otherwise, the following restrictions apply:
3127 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3128 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3129 // enabled too.
3130 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3131 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3132 CCInfo.AllocateReg(AMDGPU::VGPR0);
3133 CCInfo.AllocateReg(AMDGPU::VGPR1);
3134 Info->markPSInputAllocated(0);
3135 Info->markPSInputEnabled(0);
3136 }
3137 if (Subtarget->isAmdPalOS()) {
3138 // For isAmdPalOS, the user does not enable some bits after compilation
3139 // based on run-time states; the register values being generated here are
3140 // the final ones set in hardware. Therefore we need to apply the
3141 // workaround to PSInputAddr and PSInputEnable together. (The case where
3142 // a bit is set in PSInputAddr but not PSInputEnable is where the
3143 // frontend set up an input arg for a particular interpolation mode, but
3144 // nothing uses that input arg. Really we should have an earlier pass
3145 // that removes such an arg.)
3146 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3147 if ((PsInputBits & 0x7F) == 0 ||
3148 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3149 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
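// That is, if the enabled set still violates the restrictions above,
// force-enable the lowest input the frontend declared (the first set bit of
// PSInputAddr).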
3150 }
3151 } else if (IsKernel) {
3152 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3153 } else {
3154 Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3155 Ins.end());
3156 }
3157
3158 if (IsKernel)
3159 analyzeFormalArgumentsCompute(CCInfo, Ins);
3160
3161 if (IsEntryFunc) {
3162 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
3163 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
3164 if (IsKernel && Subtarget->hasKernargPreload())
3165 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
3166
3167 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
3168 } else if (!IsGraphics) {
3169 // For the fixed ABI, pass workitem IDs in the last argument register.
3170 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3171
3172 // FIXME: Sink this into allocateSpecialInputSGPRs
3173 if (!Subtarget->enableFlatScratch())
3174 CCInfo.AllocateReg(Info->getScratchRSrcReg());
3175
3176 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3177 }
3178
3179 if (!IsKernel) {
3180 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3181 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3182
3183 // This assumes the registers are allocated by CCInfo in ascending order
3184 // with no gaps.
3185 Info->setNumWaveDispatchSGPRs(
3186 CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3187 Info->setNumWaveDispatchVGPRs(
3188 CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3189 } else if (Info->getNumKernargPreloadedSGPRs()) {
3190 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3191 }
3192
3194
3195 if (IsWholeWaveFunc) {
3197 {MVT::i1, MVT::Other}, Chain);
3198 InVals.push_back(Setup.getValue(0));
3199 Chains.push_back(Setup.getValue(1));
3200 }
3201
3202 // FIXME: This is the minimum kernel argument alignment. We should improve
3203 // this to the maximum alignment of the arguments.
3204 //
3205 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3206 // kern arg offset.
3207 const Align KernelArgBaseAlign = Align(16);
3208
3209 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3210 ++i) {
3211 const ISD::InputArg &Arg = Ins[i];
3212 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3213 InVals.push_back(DAG.getPOISON(Arg.VT));
3214 continue;
3215 }
3216
3217 CCValAssign &VA = ArgLocs[ArgIdx++];
3218 MVT VT = VA.getLocVT();
3219
3220 if (IsEntryFunc && VA.isMemLoc()) {
3221 VT = Ins[i].VT;
3222 EVT MemVT = VA.getLocVT();
3223
3224 const uint64_t Offset = VA.getLocMemOffset();
3225 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
3226
3227 if (Arg.Flags.isByRef()) {
3228 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3229
3230 const GCNTargetMachine &TM =
3231 static_cast<const GCNTargetMachine &>(getTargetMachine());
3232 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3233 Arg.Flags.getPointerAddrSpace())) {
3236 }
3237
3238 InVals.push_back(Ptr);
3239 continue;
3240 }
3241
3242 SDValue NewArg;
3243 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3244 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3245 // In this case the argument is packed into the previous preload SGPR.
3246 int64_t AlignDownOffset = alignDown(Offset, 4);
3247 int64_t OffsetDiff = Offset - AlignDownOffset;
3248 EVT IntVT = MemVT.changeTypeToInteger();
3249
3250 const SIMachineFunctionInfo *Info =
3253 Register Reg =
3254 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3255
3256 assert(Reg);
3257 Register VReg = MRI.getLiveInVirtReg(Reg);
3258 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3259
3260 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3261 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3262
3263 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3264 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3265 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3266 Ins[i].Flags.isSExt(), &Ins[i]);
3267
3268 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3269 } else {
3270 const SIMachineFunctionInfo *Info =
3273 const SmallVectorImpl<MCRegister> &PreloadRegs =
3274 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3275
3276 SDValue Copy;
3277 if (PreloadRegs.size() == 1) {
3278 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3279 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3280 NewArg = DAG.getCopyFromReg(
3281 Chain, DL, VReg,
3283 TRI->getRegSizeInBits(*RC)));
3284
3285 } else {
3286 // If the kernarg alignment does not match the alignment of the SGPR
3287 // tuple RC that can accommodate this argument, it will be built up
3288 // via copies from the individual SGPRs that the argument was
3289 // preloaded to.
3291 for (auto Reg : PreloadRegs) {
3292 Register VReg = MRI.getLiveInVirtReg(Reg);
3293 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3294 Elts.push_back(Copy);
3295 }
3296 NewArg =
3297 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3298 PreloadRegs.size()),
3299 DL, Elts);
3300 }
3301
3302 // If the argument was preloaded to multiple consecutive 32-bit
3303 // registers because of misalignment between addressable SGPR tuples
3304 // and the argument size, we can still assume, because of kernarg
3305 // segment alignment restrictions, that NewArg's size is the same as
3306 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
3307 // truncate since we cannot preload to less than a single SGPR and the
3308 // MemVT may be smaller.
3309 EVT MemVTInt =
3311 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3312 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3313
3314 NewArg = DAG.getBitcast(MemVT, NewArg);
3315 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3316 Ins[i].Flags.isSExt(), &Ins[i]);
3317 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3318 }
3319 } else {
3320 // Hidden arguments that are in the kernel signature must be preloaded
3321 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3322 // the argument list and is not preloaded.
3323 if (Arg.isOrigArg()) {
3324 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3325 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3327 *OrigArg->getParent(),
3328 "hidden argument in kernel signature was not preloaded",
3329 DL.getDebugLoc()));
3330 }
3331 }
3332
3333 NewArg =
3334 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3335 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3336 }
3337 Chains.push_back(NewArg.getValue(1));
3338
3339 auto *ParamTy =
3340 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3341 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3342 ParamTy &&
3343 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3344 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3345 // On SI local pointers are just offsets into LDS, so they are always
3346 // less than 16-bits. On CI and newer they could potentially be
3347 // real pointers, so we can't guarantee their size.
3348 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3349 DAG.getValueType(MVT::i16));
3350 }
3351
3352 InVals.push_back(NewArg);
3353 continue;
3354 }
3355 if (!IsEntryFunc && VA.isMemLoc()) {
3356 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3357 InVals.push_back(Val);
3358 if (!Arg.Flags.isByVal())
3359 Chains.push_back(Val.getValue(1));
3360 continue;
3361 }
3362
3363 assert(VA.isRegLoc() && "Parameter must be in a register!");
3364
3365 Register Reg = VA.getLocReg();
3366 const TargetRegisterClass *RC = nullptr;
3367 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3368 RC = &AMDGPU::VGPR_32RegClass;
3369 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3370 RC = &AMDGPU::SGPR_32RegClass;
3371 else
3372 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3373 EVT ValVT = VA.getValVT();
3374
3375 Reg = MF.addLiveIn(Reg, RC);
3376 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3377
3378 if (Arg.Flags.isSRet()) {
3379 // The return object should be reasonably addressable.
3380
3381 // FIXME: This helps when the return is a real sret. If it is an
3382 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3383 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3384 unsigned NumBits =
3385 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
3386 Val = DAG.getNode(
3387 ISD::AssertZext, DL, VT, Val,
3388 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3389 }
3390
3391 // If this is an 8 or 16-bit value, it is really passed promoted
3392 // to 32 bits. Insert an assert[sz]ext to capture this, then
3393 // truncate to the right size.
3394 switch (VA.getLocInfo()) {
3395 case CCValAssign::Full:
3396 break;
3397 case CCValAssign::BCvt:
3398 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3399 break;
3400 case CCValAssign::SExt:
3401 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, DAG.getValueType(ValVT));
3402 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3403 break;
3404 case CCValAssign::ZExt:
3405 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, DAG.getValueType(ValVT));
3406 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3407 break;
3408 case CCValAssign::AExt:
3409 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3410 break;
3411 default:
3412 llvm_unreachable("Unknown loc info!");
3413 }
3414
3415 InVals.push_back(Val);
3416 }
3417
3418 // Start adding system SGPRs.
3419 if (IsEntryFunc)
3420 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3421
3422 // DAG.getPass() returns nullptr when using new pass manager.
3423 // TODO: Use DAG.getMFAM() to access analysis result.
3424 if (DAG.getPass()) {
3425 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3426 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3427 }
3428
3429 unsigned StackArgSize = CCInfo.getStackSize();
3430 Info->setBytesInStackArgArea(StackArgSize);
3431
3432 return Chains.empty() ? Chain
3433 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3434}
3435
3436// TODO: If return values can't fit in registers, we should return as many as
3437// possible in registers before passing on stack.
3438 bool SITargetLowering::CanLowerReturn(
3439 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3440 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3441 const Type *RetTy) const {
3442 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3443 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3444 // for shaders. Vector types should be explicitly handled by CC.
3445 if (AMDGPU::isEntryFunctionCC(CallConv))
3446 return true;
3447
3449 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3450 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3451 return false;
3452
3453 // We must use the stack if return would require unavailable registers.
3454 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3455 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3456 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3457 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3458 return false;
3459
3460 return true;
3461}
3462
3463 SDValue
3464 SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3465 bool isVarArg,
3466 const SmallVectorImpl<ISD::OutputArg> &Outs,
3467 const SmallVectorImpl<SDValue> &OutVals,
3468 const SDLoc &DL, SelectionDAG &DAG) const {
3472
3473 if (AMDGPU::isKernel(CallConv)) {
3474 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3475 OutVals, DL, DAG);
3476 }
3477
3478 bool IsShader = AMDGPU::isShader(CallConv);
3479
3480 Info->setIfReturnsVoid(Outs.empty());
3481 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3482
3483 // CCValAssign - represent the assignment of the return value to a location.
3485
3486 // CCState - Info about the registers and stack slots.
3487 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3488 *DAG.getContext());
3489
3490 // Analyze outgoing return values.
3491 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3492
3493 SDValue Glue;
3495 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3496
3497 SDValue ReadFirstLane =
3498 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3499 // Copy the result values into the output registers.
3500 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3501 ++I, ++RealRVLocIdx) {
3502 CCValAssign &VA = RVLocs[I];
3503 assert(VA.isRegLoc() && "Can only return in registers!");
3504 // TODO: Partially return in registers if return values don't fit.
3505 SDValue Arg = OutVals[RealRVLocIdx];
3506
3507 // Copied from other backends.
3508 switch (VA.getLocInfo()) {
3509 case CCValAssign::Full:
3510 break;
3511 case CCValAssign::BCvt:
3512 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3513 break;
3514 case CCValAssign::SExt:
3515 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3516 break;
3517 case CCValAssign::ZExt:
3518 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3519 break;
3520 case CCValAssign::AExt:
3521 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3522 break;
3523 default:
3524 llvm_unreachable("Unknown loc info!");
3525 }
3526 if (TRI->isSGPRPhysReg(VA.getLocReg()))
3528 ReadFirstLane, Arg);
3529 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3530 Glue = Chain.getValue(1);
3531 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3532 }
3533
3534 // FIXME: Does sret work properly?
3535 if (!Info->isEntryFunction()) {
3536 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3537 const MCPhysReg *I =
3538 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3539 if (I) {
3540 for (; *I; ++I) {
3541 if (AMDGPU::SReg_64RegClass.contains(*I))
3542 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3543 else if (AMDGPU::SReg_32RegClass.contains(*I))
3544 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3545 else
3546 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3547 }
3548 }
3549 }
3550
3551 // Update chain and glue.
3552 RetOps[0] = Chain;
3553 if (Glue.getNode())
3554 RetOps.push_back(Glue);
3555
3556 unsigned Opc = AMDGPUISD::ENDPGM;
3557 if (!IsWaveEnd)
3558 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3559 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3561 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3562}
3563
3565 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3566 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3567 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3568 SDValue ThisVal) const {
3569 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3570
3571 // Assign locations to each value returned by this call.
3573 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3574 *DAG.getContext());
3575 CCInfo.AnalyzeCallResult(Ins, RetCC);
3576
3577 // Copy all of the result registers out of their specified physreg.
3578 for (CCValAssign VA : RVLocs) {
3579 SDValue Val;
3580
3581 if (VA.isRegLoc()) {
3582 Val =
3583 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3584 Chain = Val.getValue(1);
3585 InGlue = Val.getValue(2);
3586 } else if (VA.isMemLoc()) {
3587 report_fatal_error("TODO: return values in memory");
3588 } else
3589 llvm_unreachable("unknown argument location type");
3590
3591 switch (VA.getLocInfo()) {
3592 case CCValAssign::Full:
3593 break;
3594 case CCValAssign::BCvt:
3595 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3596 break;
3597 case CCValAssign::ZExt:
3598 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3599 DAG.getValueType(VA.getValVT()));
3600 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3601 break;
3602 case CCValAssign::SExt:
3603 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3604 DAG.getValueType(VA.getValVT()));
3605 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3606 break;
3607 case CCValAssign::AExt:
3608 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3609 break;
3610 default:
3611 llvm_unreachable("Unknown loc info!");
3612 }
3613
3614 InVals.push_back(Val);
3615 }
3616
3617 return Chain;
3618}
3619
3620 // Add code to pass the special inputs required by the used features,
3621 // separate from the explicit user arguments present in the IR.
3623 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3624 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3625 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3626 // If we don't have a call site, this was a call inserted by
3627 // legalization. These can never use special inputs.
3628 if (!CLI.CB)
3629 return;
3630
3631 SelectionDAG &DAG = CLI.DAG;
3632 const SDLoc &DL = CLI.DL;
3633 const Function &F = DAG.getMachineFunction().getFunction();
3634
3635 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3636 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3637
3638 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3640 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3641 // DAG.getPass() returns nullptr when using new pass manager.
3642 // TODO: Use DAG.getMFAM() to access analysis result.
3643 if (DAG.getPass()) {
3644 auto &ArgUsageInfo =
3646 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3647 }
3648 }
3649
3650 // TODO: Unify with private memory register handling. This is complicated by
3651 // the fact that at least in kernels, the input argument is not necessarily
3652 // in the same location as the input.
3653 // clang-format off
3654 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3655 StringLiteral> ImplicitAttrs[] = {
3656 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3657 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3658 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3659 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3660 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3661 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y"},
3662 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z"},
3663 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, "amdgpu-no-lds-kernel-id"},
3664 };
3665 // clang-format on
3666
3667 for (auto [InputID, Attr] : ImplicitAttrs) {
3668 // If the callee does not use the attribute value, skip copying the value.
3669 if (CLI.CB->hasFnAttr(Attr))
3670 continue;
3671
3672 const auto [OutgoingArg, ArgRC, ArgTy] =
3673 CalleeArgInfo->getPreloadedValue(InputID);
3674 if (!OutgoingArg)
3675 continue;
3676
3677 const auto [IncomingArg, IncomingArgRC, Ty] =
3678 CallerArgInfo.getPreloadedValue(InputID);
3679 assert(IncomingArgRC == ArgRC);
3680
3681 // All special arguments are ints for now.
3682 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3683 SDValue InputReg;
3684
3685 if (IncomingArg) {
3686 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3687 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3688 // The implicit arg ptr is special because it doesn't have a corresponding
3689 // input for kernels, and is computed from the kernarg segment pointer.
3690 InputReg = getImplicitArgPtr(DAG, DL);
3691 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3692 std::optional<uint32_t> Id =
3694 if (Id.has_value()) {
3695 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3696 } else {
3697 InputReg = DAG.getPOISON(ArgVT);
3698 }
3699 } else {
3700 // We may have proven the input wasn't needed, although the ABI still
3701 // requires it. We just need to allocate the register appropriately.
3702 InputReg = DAG.getPOISON(ArgVT);
3703 }
3704
3705 if (OutgoingArg->isRegister()) {
3706 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3707 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3708 report_fatal_error("failed to allocate implicit input argument");
3709 } else {
3710 unsigned SpecialArgOffset =
3711 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3712 SDValue ArgStore =
3713 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3714 MemOpChains.push_back(ArgStore);
3715 }
3716 }
3717
3718 // Pack workitem IDs into a single register, or pass them as-is if
3719 // already packed.
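// The packed layout produced below is a single 32-bit value: bits [9:0]
// hold the X id, bits [19:10] the Y id (shifted left by 10), and bits
// [29:20] the Z id (shifted left by 20).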
3720
3721 auto [OutgoingArg, ArgRC, Ty] =
3723 if (!OutgoingArg)
3724 std::tie(OutgoingArg, ArgRC, Ty) =
3726 if (!OutgoingArg)
3727 std::tie(OutgoingArg, ArgRC, Ty) =
3729 if (!OutgoingArg)
3730 return;
3731
3732 const ArgDescriptor *IncomingArgX = std::get<0>(
3734 const ArgDescriptor *IncomingArgY = std::get<0>(
3736 const ArgDescriptor *IncomingArgZ = std::get<0>(
3738
3739 SDValue InputReg;
3740 SDLoc SL;
3741
3742 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3743 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3744 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3745
3746 // If the incoming IDs are not packed, we need to pack them.
3747 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3748 NeedWorkItemIDX) {
3749 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3750 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3751 } else {
3752 InputReg = DAG.getConstant(0, DL, MVT::i32);
3753 }
3754 }
3755
3756 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3757 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3758 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3759 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3760 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3761 InputReg = InputReg.getNode()
3762 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3763 : Y;
3764 }
3765
3766 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3767 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3768 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3769 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3770 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3771 InputReg = InputReg.getNode()
3772 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3773 : Z;
3774 }
3775
3776 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3777 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3778 // We're in a situation where the outgoing function requires the workitem
3779 // ID, but the calling function does not have it (e.g. a graphics function
3780 // calling a C calling convention function). This is illegal, but we need
3781 // to produce something.
3782 InputReg = DAG.getPOISON(MVT::i32);
3783 } else {
3784 // Workitem IDs are already packed; any of the present incoming
3785 // arguments will carry all the required fields.
3786 ArgDescriptor IncomingArg =
3787 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3788 : IncomingArgY ? *IncomingArgY
3789 : *IncomingArgZ,
3790 ~0u);
3791 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3792 }
3793 }
3794
3795 if (OutgoingArg->isRegister()) {
3796 if (InputReg)
3797 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3798
3799 CCInfo.AllocateReg(OutgoingArg->getRegister());
3800 } else {
3801 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3802 if (InputReg) {
3803 SDValue ArgStore =
3804 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3805 MemOpChains.push_back(ArgStore);
3806 }
3807 }
3808}
3809
3811 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3813 const SmallVectorImpl<SDValue> &OutVals,
3814 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3815 if (AMDGPU::isChainCC(CalleeCC))
3816 return true;
3817
3818 if (!AMDGPU::mayTailCallThisCC(CalleeCC))
3819 return false;
3820
3821 // For a divergent call target, we need to do a waterfall loop over the
3822 // possible callees which precludes us from using a simple jump.
3823 if (Callee->isDivergent())
3824 return false;
3825
3827 const Function &CallerF = MF.getFunction();
3828 CallingConv::ID CallerCC = CallerF.getCallingConv();
3830 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3831
3832 // Kernels aren't callable, and don't have a live-in return address, so it
3833 // doesn't make sense to do a tail call with entry functions.
3834 if (!CallerPreserved)
3835 return false;
3836
3837 bool CCMatch = CallerCC == CalleeCC;
3838
3840 if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
3841 return true;
3842 return false;
3843 }
3844
3845 // TODO: Can we handle var args?
3846 if (IsVarArg)
3847 return false;
3848
3849 for (const Argument &Arg : CallerF.args()) {
3850 if (Arg.hasByValAttr())
3851 return false;
3852 }
3853
3854 LLVMContext &Ctx = *DAG.getContext();
3855
3856 // Check that the call results are passed in the same way.
3857 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3858 CCAssignFnForCall(CalleeCC, IsVarArg),
3859 CCAssignFnForCall(CallerCC, IsVarArg)))
3860 return false;
3861
3862 // The callee has to preserve all registers the caller needs to preserve.
3863 if (!CCMatch) {
3864 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3865 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3866 return false;
3867 }
3868
3869 // Nothing more to check if the callee is taking no arguments.
3870 if (Outs.empty())
3871 return true;
3872
3874 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3875
3876 // FIXME: We are not allocating special input registers, so we will be
3877 // deciding based on incorrect register assignments.
3878 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3879
3880 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3881 // If the stack arguments for this call do not fit into our own save area then
3882 // the call cannot be made tail.
3883 // TODO: Is this really necessary?
3884 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3885 return false;
3886
3887 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
3888 // FIXME: What about inreg arguments that end up passed in memory?
3889 if (!CCVA.isRegLoc())
3890 continue;
3891
3892 // If we are passing an argument in an SGPR, and the value is divergent,
3893 // this call requires a waterfall loop.
3894 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
3895 LLVM_DEBUG(
3896 dbgs() << "Cannot tail call due to divergent outgoing argument in "
3897 << printReg(CCVA.getLocReg(), TRI) << '\n');
3898 return false;
3899 }
3900 }
3901
3902 const MachineRegisterInfo &MRI = MF.getRegInfo();
3903 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3904}
3905
3907 if (!CI->isTailCall())
3908 return false;
3909
3910 const Function *ParentFn = CI->getParent()->getParent();
3912 return false;
3913 return true;
3914}
3915
3916namespace {
3917 // Chain calls have special arguments that we need to handle. These tag
3918 // along at the end of the argument list(s), after the SGPR and VGPR
3919 // arguments (indices 0 and 1 respectively).
3920enum ChainCallArgIdx {
3921 Exec = 2,
3922 Flags,
3923 NumVGPRs,
3924 FallbackExec,
3925 FallbackCallee
3926};
3927} // anonymous namespace
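// As an illustration of the flag handling below: with Flags == 0 no further
// special arguments are accepted, while Flags bit 0 (dynamic VGPR mode,
// wave32 only) additionally expects NumVGPRs, FallbackExec and
// FallbackCallee.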
3928
3929// The wave scratch offset register is used as the global base pointer.
3931 SmallVectorImpl<SDValue> &InVals) const {
3932 CallingConv::ID CallConv = CLI.CallConv;
3933 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3934
3935 SelectionDAG &DAG = CLI.DAG;
3936
3937 const SDLoc &DL = CLI.DL;
3938 SDValue Chain = CLI.Chain;
3939 SDValue Callee = CLI.Callee;
3940
3941 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
3942 bool UsesDynamicVGPRs = false;
3943 if (IsChainCallConv) {
3944 // The last arguments should be the value that we need to put in EXEC,
3945 // followed by the flags and any other arguments with special meanings.
3946 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
3947 // we don't treat them like the "real" arguments.
3948 auto RequestedExecIt =
3949 llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
3950 return Arg.OrigArgIndex == 2;
3951 });
3952 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
3953
3954 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
3955 CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
3956 CLI.OutVals.end());
3957 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
3958
3959 assert(CLI.Outs.back().OrigArgIndex < 2 &&
3960 "Haven't popped all the special args");
3961
3962 TargetLowering::ArgListEntry RequestedExecArg =
3963 CLI.Args[ChainCallArgIdx::Exec];
3964 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3965 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3966
3967 // Convert constants into TargetConstants, so they become immediate operands
3968 // instead of being selected into S_MOV.
3969 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
3970 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
3971 ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
3972 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
3973 } else
3974 ChainCallSpecialArgs.push_back(Arg.Node);
3975 };
3976
3977 PushNodeOrTargetConstant(RequestedExecArg);
3978
3979 // Process any other special arguments depending on the value of the flags.
3980 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
3981
3982 const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
3983 if (FlagsValue.isZero()) {
3984 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
3985 return lowerUnhandledCall(CLI, InVals,
3986 "no additional args allowed if flags == 0");
3987 } else if (FlagsValue.isOneBitSet(0)) {
3988 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
3989 return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
3990 }
3991
3992 if (!Subtarget->isWave32()) {
3993 return lowerUnhandledCall(
3994 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
3995 }
3996
3997 UsesDynamicVGPRs = true;
3998 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
3999 CLI.Args.end(), PushNodeOrTargetConstant);
4000 }
4001 }
4002
4004 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
4006 bool &IsTailCall = CLI.IsTailCall;
4007 bool IsVarArg = CLI.IsVarArg;
4008 bool IsSibCall = false;
4010
4011 if (Callee.isUndef() || isNullConstant(Callee)) {
4012 if (!CLI.IsTailCall) {
4013 for (ISD::InputArg &Arg : CLI.Ins)
4014 InVals.push_back(DAG.getPOISON(Arg.VT));
4015 }
4016
4017 return Chain;
4018 }
4019
4020 if (IsVarArg) {
4021 return lowerUnhandledCall(CLI, InVals,
4022 "unsupported call to variadic function ");
4023 }
4024
4025 if (!CLI.CB)
4026 return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
4027
4028 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
4029 return lowerUnhandledCall(CLI, InVals,
4030 "unsupported required tail call to function ");
4031 }
4032
4033 if (IsTailCall) {
4034 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
4035 Outs, OutVals, Ins, DAG);
4036 if (!IsTailCall &&
4037 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
4038 report_fatal_error("failed to perform tail call elimination on a call "
4039 "site marked musttail or on llvm.amdgcn.cs.chain");
4040 }
4041
4042 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4043
4044 // A sibling call is one where we're under the usual C ABI and not planning
4045 // to change that but can still do a tail call:
4046 if (!TailCallOpt && IsTailCall)
4047 IsSibCall = true;
4048
4049 if (IsTailCall)
4050 ++NumTailCalls;
4051 }
4052
4055 SmallVector<SDValue, 8> MemOpChains;
4056
4057 // Analyze operands of the call, assigning locations to each operand.
4059 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4060 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
4061
4062 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
4064 // With a fixed ABI, allocate fixed registers before user arguments.
4065 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
4066 }
4067
4068 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
4069
4070 // Get a count of how many bytes are to be pushed on the stack.
4071 unsigned NumBytes = CCInfo.getStackSize();
4072
4073 if (IsSibCall) {
4074 // Since we're not changing the ABI to make this a tail call, the memory
4075 // operands are already available in the caller's incoming argument space.
4076 NumBytes = 0;
4077 }
4078
4079 // FPDiff is the byte offset of the call's argument area from the callee's.
4080 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4081 // by this amount for a tail call. In a sibling call it must be 0 because the
4082 // caller will deallocate the entire stack and the callee still expects its
4083 // arguments to begin at SP+0. Completely unused for non-tail calls.
4084 int32_t FPDiff = 0;
4085 MachineFrameInfo &MFI = MF.getFrameInfo();
4086 auto *TRI = Subtarget->getRegisterInfo();
4087
4088 // Adjust the stack pointer for the new arguments...
4089 // These operations are automatically eliminated by the prolog/epilog pass
4090 if (!IsSibCall)
4091 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
4092
4093 if (!IsSibCall || IsChainCallConv) {
4094 if (!Subtarget->enableFlatScratch()) {
4095 SmallVector<SDValue, 4> CopyFromChains;
4096
4097 // In the HSA case, this should be an identity copy.
4098 SDValue ScratchRSrcReg =
4099 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
4100 RegsToPass.emplace_back(IsChainCallConv
4101 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4102 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4103 ScratchRSrcReg);
4104 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
4105 Chain = DAG.getTokenFactor(DL, CopyFromChains);
4106 }
4107 }
4108
4109 const unsigned NumSpecialInputs = RegsToPass.size();
4110
4111 MVT PtrVT = MVT::i32;
4112
4113 // Walk the register/memloc assignments, inserting copies/loads.
4114 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4115 CCValAssign &VA = ArgLocs[i];
4116 SDValue Arg = OutVals[i];
4117
4118 // Promote the value if needed.
4119 switch (VA.getLocInfo()) {
4120 case CCValAssign::Full:
4121 break;
4122 case CCValAssign::BCvt:
4123 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4124 break;
4125 case CCValAssign::ZExt:
4126 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4127 break;
4128 case CCValAssign::SExt:
4129 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
4130 break;
4131 case CCValAssign::AExt:
4132 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
4133 break;
4134 case CCValAssign::FPExt:
4135 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
4136 break;
4137 default:
4138 llvm_unreachable("Unknown loc info!");
4139 }
4140
4141 if (VA.isRegLoc()) {
4142 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
4143 } else {
4144 assert(VA.isMemLoc());
4145
4146 SDValue DstAddr;
4147 MachinePointerInfo DstInfo;
4148
4149 unsigned LocMemOffset = VA.getLocMemOffset();
4150 int32_t Offset = LocMemOffset;
4151
4152 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
4153 MaybeAlign Alignment;
4154
4155 if (IsTailCall) {
4156 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4157 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4158 : VA.getValVT().getStoreSize();
4159
4160 // FIXME: We can have better than the minimum byval required alignment.
4161 Alignment =
4162 Flags.isByVal()
4163 ? Flags.getNonZeroByValAlign()
4164 : commonAlignment(Subtarget->getStackAlignment(), Offset);
4165
4166 Offset = Offset + FPDiff;
4167 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
4168
4169 DstAddr = DAG.getFrameIndex(FI, PtrVT);
4170 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4171
4172 // Make sure any stack arguments overlapping with where we're storing
4173 // are loaded before this eventual operation. Otherwise they'll be
4174 // clobbered.
4175
4176 // FIXME: Why is this really necessary? This seems to just result in a
4177 // lot of code to copy the stack and write them back to the same
4178 // locations, which are supposed to be immutable?
4179 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
4180 } else {
4181 // Stores to the argument stack area are relative to the stack pointer.
4182 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
4183 MVT::i32);
4184 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
4185 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
4186 Alignment =
4187 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
4188 }
4189
4190 if (Outs[i].Flags.isByVal()) {
4191 SDValue SizeNode =
4192 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4193 SDValue Cpy =
4194 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
4195 Outs[i].Flags.getNonZeroByValAlign(),
4196 /*isVol = */ false, /*AlwaysInline = */ true,
4197 /*CI=*/nullptr, std::nullopt, DstInfo,
4199
4200 MemOpChains.push_back(Cpy);
4201 } else {
4202 SDValue Store =
4203 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4204 MemOpChains.push_back(Store);
4205 }
4206 }
4207 }
4208
4209 if (!MemOpChains.empty())
4210 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4211
4212 SDValue ReadFirstLaneID =
4213 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4214
4215 SDValue TokenGlue;
4216 if (CLI.ConvergenceControlToken) {
4217 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4219 }
4220
4221 // Build a sequence of copy-to-reg nodes chained together with token chain
4222 // and flag operands which copy the outgoing args into the appropriate regs.
4223 SDValue InGlue;
4224
4225 unsigned ArgIdx = 0;
4226 for (auto [Reg, Val] : RegsToPass) {
4227 if (ArgIdx++ >= NumSpecialInputs &&
4228 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4229 // For chain calls, the inreg arguments are required to be
4230 // uniform. Speculatively insert a readfirstlane in case we cannot prove
4231 // they are uniform.
4232 //
4233 // For other calls, if an inreg argument is known to be uniform,
4234 // speculatively insert a readfirstlane in case it is in a VGPR.
4235 //
4236 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4237 // value, so let that continue to produce invalid code.
4238
4239 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4240 if (TokenGlue)
4241 ReadfirstlaneArgs.push_back(TokenGlue);
4243 ReadfirstlaneArgs);
4244 }
4245
4246 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
4247 InGlue = Chain.getValue(1);
4248 }
4249
4250 // We don't usually want to end the call-sequence here because we would tidy
4251 // the frame up *after* the call. However, in the ABI-changing tail-call case
4252 // we've carefully laid out the parameters so that when sp is reset they'll be
4253 // in the correct location.
4254 if (IsTailCall && !IsSibCall) {
4255 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
4256 InGlue = Chain.getValue(1);
4257 }
4258
4259 std::vector<SDValue> Ops({Chain});
4260
4261 // Add a redundant copy of the callee global which will not be legalized, as
4262 // we need direct access to the callee later.
4264 const GlobalValue *GV = GSD->getGlobal();
4265 Ops.push_back(Callee);
4266 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
4267 } else {
4268 if (IsTailCall) {
4269 // isEligibleForTailCallOptimization considered whether the call target is
4270 // divergent, but we may still end up with a uniform value in a VGPR.
4271 // Insert a readfirstlane just in case.
4272 SDValue ReadFirstLaneID =
4273 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4274
4275 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4276 if (TokenGlue)
4277 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
4278 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
4279 ReadfirstlaneArgs);
4280 }
4281
4282 Ops.push_back(Callee);
4283 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
4284 }
4285
4286 if (IsTailCall) {
4287 // Each tail call may have to adjust the stack by a different amount, so
4288 // this information must travel along with the operation for eventual
4289 // consumption by emitEpilogue.
4290 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4291 }
4292
4293 if (IsChainCallConv)
4294 llvm::append_range(Ops, ChainCallSpecialArgs);
4295
4296 // Add argument registers to the end of the list so that they are known live
4297 // into the call.
4298 for (auto &[Reg, Val] : RegsToPass)
4299 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
4300
4301 // Add a register mask operand representing the call-preserved registers.
4302 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4303 assert(Mask && "Missing call preserved mask for calling convention");
4304 Ops.push_back(DAG.getRegisterMask(Mask));
4305
4306 if (SDValue Token = CLI.ConvergenceControlToken) {
4308 GlueOps.push_back(Token);
4309 if (InGlue)
4310 GlueOps.push_back(InGlue);
4311
4312 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
4313 MVT::Glue, GlueOps),
4314 0);
4315 }
4316
4317 if (InGlue)
4318 Ops.push_back(InGlue);
4319
4320 // If we're doing a tail call, use a TC_RETURN here rather than an
4321 // actual call instruction.
4322 if (IsTailCall) {
4323 MFI.setHasTailCall();
4324 unsigned OPC = AMDGPUISD::TC_RETURN;
4325 switch (CallConv) {
4328 break;
4331 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4333 break;
4334 }
4335
4336 // If the caller is a whole wave function, we need to use a special opcode
4337 // so we can patch up EXEC.
4338 if (Info->isWholeWaveFunction())
4340
4341 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4342 }
4343
4344 // Returns a chain and a flag for retval copy to use.
4345 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4346 Chain = Call.getValue(0);
4347 InGlue = Call.getValue(1);
4348
4349 uint64_t CalleePopBytes = NumBytes;
4350 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4351 if (!Ins.empty())
4352 InGlue = Chain.getValue(1);
4353
4354 // Handle result values, copying them out of physregs into vregs that we
4355 // return.
4356 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4357 InVals, /*IsThisReturn=*/false, SDValue());
4358}
4359
4360// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4361// except for:
4362 // 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4363 // 2. Size scaling, where scaled size = wave-reduction(alloca-size) * wave-size
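// As an illustration: on a wave64 target, an alloca of 16 bytes per lane
// reserves 16 << 6 = 1024 bytes of scratch and moves SP upward by that
// amount; a divergent size is first reduced to its wave-wide maximum.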
4365 SelectionDAG &DAG) const {
4366 const MachineFunction &MF = DAG.getMachineFunction();
4368
4369 SDLoc dl(Op);
4370 EVT VT = Op.getValueType();
4371 SDValue Chain = Op.getOperand(0);
4372 Register SPReg = Info->getStackPtrOffsetReg();
4373
4374 // Chain the dynamic stack allocation so that it doesn't modify the stack
4375 // pointer when other instructions are using the stack.
4376 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4377
4378 SDValue Size = Op.getOperand(1);
4379 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4380 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4381
4382 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4384 "Stack grows upwards for AMDGPU");
4385
4386 Chain = BaseAddr.getValue(1);
4387 Align StackAlign = TFL->getStackAlign();
4388 if (Alignment > StackAlign) {
4389 uint64_t ScaledAlignment = Alignment.value()
4390 << Subtarget->getWavefrontSizeLog2();
4391 uint64_t StackAlignMask = ScaledAlignment - 1;
4392 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4393 DAG.getConstant(StackAlignMask, dl, VT));
4394 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4395 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4396 }
4397
4398 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4399 SDValue NewSP;
4401 // For constant sized alloca, scale alloca size by wave-size
4402 SDValue ScaledSize = DAG.getNode(
4403 ISD::SHL, dl, VT, Size,
4404 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4405 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4406 } else {
4407 // For a dynamically sized alloca, perform a wave-wide reduction to get the
4408 // max of the (divergent) alloca size and then scale it by the wave size.
4409 SDValue WaveReduction =
4410 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4411 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4412 Size, DAG.getConstant(0, dl, MVT::i32));
4413 SDValue ScaledSize = DAG.getNode(
4414 ISD::SHL, dl, VT, Size,
4415 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4416 NewSP =
4417 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4418 SDValue ReadFirstLaneID =
4419 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4420 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4421 NewSP);
4422 }
4423
4424 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4425 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4426
4427 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4428}
4429
4431 if (Op.getValueType() != MVT::i32)
4432 return Op; // Defer to cannot select error.
4433
4435 SDLoc SL(Op);
4436
4437 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4438
4439 // Convert from wave uniform to swizzled vector address. This should protect
4440 // from any edge cases where the stacksave result isn't directly used with
4441 // stackrestore.
4442 SDValue VectorAddress =
4443 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4444 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4445}
4446
4448 SelectionDAG &DAG) const {
4449 SDLoc SL(Op);
4450 assert(Op.getValueType() == MVT::i32);
4451
4452 uint32_t BothRoundHwReg =
4454 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4455
4456 SDValue IntrinID =
4457 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4458 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4459 Op.getOperand(0), IntrinID, GetRoundBothImm);
4460
4461 // There are two rounding modes, one for f32 and one for f64/f16. We only
4462 // report in the standard value range if both are the same.
4463 //
4464 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4465 // ties away from zero is not supported, and the other values are rotated by
4466 // 1.
4467 //
4468 // If the two rounding modes are not the same, report a target defined value.
4469
4470 // Mode register rounding mode fields:
4471 //
4472 // [1:0] Single-precision round mode.
4473 // [3:2] Double/Half-precision round mode.
4474 //
4475 // 0 = nearest even, 1 = +infinity, 2 = -infinity, 3 = toward zero.
4476 //
4477 //                Hardware   Spec
4478 // Toward-0          3         0
4479 // Nearest Even      0         1
4480 // +Inf              1         2
4481 // -Inf              2         3
4482 // NearestAway0     N/A        4
4483 //
4484 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4485 // table we can index by the raw hardware mode.
4486 //
4487 // (trunc (FltRoundConversionTable >> (MODE.fp_round * 4))) & 0xf
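// Worked example, following the table above: when both fields are
// round-to-nearest-even, the raw MODE.fp_round value is 0b0000 and the 4-bit
// table entry is 1, the standard FLT_ROUNDS value, which is returned as-is.
// Entries >= 4 describe mixed/extended modes and are reported offset by 4,
// i.e. starting at 8.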
4488
4489 SDValue BitTable =
4491
4492 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4493 SDValue RoundModeTimesNumBits =
4494 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4495
4496 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4497 // knew only one mode was demanded.
4498 SDValue TableValue =
4499 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4500 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4501
4502 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4503 SDValue TableEntry =
4504 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4505
4506 // There's a gap between the 4-bit encoded table values and the actual enum
4507 // values, so offset the result if it's an extended value.
4508 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4509 SDValue IsStandardValue =
4510 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4511 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4512 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4513 TableEntry, EnumOffset);
4514
4515 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4516}
4517
4519 SelectionDAG &DAG) const {
4520 SDLoc SL(Op);
4521
4522 SDValue NewMode = Op.getOperand(1);
4523 assert(NewMode.getValueType() == MVT::i32);
4524
4525 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4526 // hardware MODE.fp_round values.
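// As an illustration: a compile-time-constant FLT_ROUNDS value is folded
// directly to its 4-bit MODE.fp_round encoding in the branch below; only
// non-constant inputs need the shift-based table lookup.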
4527 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4528 uint32_t ClampedVal = std::min(
4529 static_cast<uint32_t>(ConstMode->getZExtValue()),
4531 NewMode = DAG.getConstant(
4532 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4533 } else {
4534 // If we know the input can only be one of the supported standard modes in
4535 // the range 0-3, we can use a simplified mapping to hardware values.
4536 KnownBits KB = DAG.computeKnownBits(NewMode);
4537 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4538 // The supported standard values are 0-3. The extended values start at 8. We
4539 // need to offset by 4 if the value is in the extended range.
4540
4541 if (UseReducedTable) {
4542 // Truncate to the low 32-bits.
4543 SDValue BitTable = DAG.getConstant(
4544 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4545
4546 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4547 SDValue RoundModeTimesNumBits =
4548 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4549
4550 NewMode =
4551 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4552
4553 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4554 // the table extracted bits into inline immediates.
4555 } else {
4556 // table_index = umin(value, value - 4)
4557 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
4558 SDValue BitTable =
4560
4561 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4562 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4563 SDValue IndexVal =
4564 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4565
4566 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4567 SDValue RoundModeTimesNumBits =
4568 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4569
4570 SDValue TableValue =
4571 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4572 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4573
4574 // No need to mask out the high bits since the setreg will ignore them
4575 // anyway.
4576 NewMode = TruncTable;
4577 }
4578
4579 // Insert a readfirstlane in case the value is in a VGPR. We could do this
4580 // earlier and keep more operations scalar, but that interferes with
4581 // combining the source.
4582 SDValue ReadFirstLaneID =
4583 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4584 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4585 ReadFirstLaneID, NewMode);
4586 }
4587
4588 // N.B. The setreg will be later folded into s_round_mode on supported
4589 // targets.
4590 SDValue IntrinID =
4591 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4592 uint32_t BothRoundHwReg =
4594 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4595
4596 SDValue SetReg =
4597 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4598 IntrinID, RoundBothImm, NewMode);
4599
4600 return SetReg;
4601}
4602
4604 if (Op->isDivergent() &&
4605 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4606 // Cannot do I$ prefetch with divergent pointer.
4607 return SDValue();
4608
4609 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4613 break;
4615 if (Subtarget->hasSafeSmemPrefetch())
4616 break;
4617 [[fallthrough]];
4618 default:
4619 return SDValue();
4620 }
4621
4622 // I$ prefetch
4623 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4624 return SDValue();
4625
4626 return Op;
4627}
4628
4629 // Work around DAG legality rules that are only based on the result type.
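// As an illustration: (f32 (fp_extend bf16:$x)) is rewritten here as
// (bf16_to_fp (bitcast $x to i16)), sidestepping the result-type-only
// legality check.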
4631 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4632 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4633 EVT SrcVT = Src.getValueType();
4634
4635 if (SrcVT.getScalarType() != MVT::bf16)
4636 return Op;
4637
4638 SDLoc SL(Op);
4639 SDValue BitCast =
4640 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4641
4642 EVT DstVT = Op.getValueType();
4643 if (IsStrict)
4644 llvm_unreachable("Need STRICT_BF16_TO_FP");
4645
4646 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4647}
4648
4650 SDLoc SL(Op);
4651 if (Op.getValueType() != MVT::i64)
4652 return Op;
4653
4654 uint32_t ModeHwReg =
4656 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4657 uint32_t TrapHwReg =
4659 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4660
4661 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4662 SDValue IntrinID =
4663 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4664 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4665 Op.getOperand(0), IntrinID, ModeHwRegImm);
4666 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4667 Op.getOperand(0), IntrinID, TrapHwRegImm);
4668 SDValue TokenReg =
4669 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4670 GetTrapReg.getValue(1));
4671
4672 SDValue CvtPtr =
4673 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4674 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4675
4676 return DAG.getMergeValues({Result, TokenReg}, SL);
4677}
4678
4680 SDLoc SL(Op);
4681 if (Op.getOperand(1).getValueType() != MVT::i64)
4682 return Op;
4683
4684 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4685 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4686 DAG.getConstant(0, SL, MVT::i32));
4687 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4688 DAG.getConstant(1, SL, MVT::i32));
4689
4690 SDValue ReadFirstLaneID =
4691 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4692 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4693 ReadFirstLaneID, NewModeReg);
4694 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4695 ReadFirstLaneID, NewTrapReg);
4696
4697 unsigned ModeHwReg =
4699 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4700 unsigned TrapHwReg =
4702 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4703
4704 SDValue IntrinID =
4705 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4706 SDValue SetModeReg =
4707 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4708 IntrinID, ModeHwRegImm, NewModeReg);
4709 SDValue SetTrapReg =
4710 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4711 IntrinID, TrapHwRegImm, NewTrapReg);
4712 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4713}
4714
4716 const MachineFunction &MF) const {
4717 const Function &Fn = MF.getFunction();
4718
4720 .Case("m0", AMDGPU::M0)
4721 .Case("exec", AMDGPU::EXEC)
4722 .Case("exec_lo", AMDGPU::EXEC_LO)
4723 .Case("exec_hi", AMDGPU::EXEC_HI)
4724 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4725 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4726 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4727 .Default(Register());
4728 if (!Reg)
4729 return Reg;
4730
4731 if (!Subtarget->hasFlatScrRegister() &&
4732 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4733 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
4734 "\" for subtarget."));
4735 }
4736
4737 switch (Reg) {
4738 case AMDGPU::M0:
4739 case AMDGPU::EXEC_LO:
4740 case AMDGPU::EXEC_HI:
4741 case AMDGPU::FLAT_SCR_LO:
4742 case AMDGPU::FLAT_SCR_HI:
4743 if (VT.getSizeInBits() == 32)
4744 return Reg;
4745 break;
4746 case AMDGPU::EXEC:
4747 case AMDGPU::FLAT_SCR:
4748 if (VT.getSizeInBits() == 64)
4749 return Reg;
4750 break;
4751 default:
4752 llvm_unreachable("missing register type checking");
4753 }
4754
4756 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4757}
4758
4759// If kill is not the last instruction, split the block so kill is always a
4760// proper terminator.
4763 MachineBasicBlock *BB) const {
4764 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
4766 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4767 return SplitBB;
4768}
4769
4770 // Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4771// \p MI will be the only instruction in the loop body block. Otherwise, it will
4772// be the first instruction in the remainder block.
4773//
4774/// \returns { LoopBody, Remainder }
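// Resulting CFG (sketch):
//   MBB --> LoopBB --> RemainderBB (inherits MBB's old successors)
//            ^   |
//            +---+   self-edge; the caller emits the actual backedge branch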
4775static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4777 MachineFunction *MF = MBB.getParent();
4779
4780 // To insert the loop we need to split the block. Move everything after this
4781 // point to a new block, and insert a new empty block between the two.
4783 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4785 ++MBBI;
4786
4787 MF->insert(MBBI, LoopBB);
4788 MF->insert(MBBI, RemainderBB);
4789
4790 LoopBB->addSuccessor(LoopBB);
4791 LoopBB->addSuccessor(RemainderBB);
4792
4793 // Move the rest of the block into a new block.
4794 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4795
4796 if (InstInLoop) {
4797 auto Next = std::next(I);
4798
4799 // Move instruction to loop body.
4800 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4801
4802 // Move the rest of the block.
4803 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4804 } else {
4805 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4806 }
4807
4808 MBB.addSuccessor(LoopBB);
4809
4810 return std::pair(LoopBB, RemainderBB);
4811}
4812
4813/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4815 MachineBasicBlock *MBB = MI.getParent();
4817 auto I = MI.getIterator();
4818 auto E = std::next(I);
4819
4820 // clang-format off
4821 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4822 .addImm(0);
4823 // clang-format on
4824
4825 MIBundleBuilder Bundler(*MBB, I, E);
4826 finalizeBundle(*MBB, Bundler.begin());
4827}
4828
4831 MachineBasicBlock *BB) const {
4832 const DebugLoc &DL = MI.getDebugLoc();
4833
4835
4837
4838 // Apparently kill flags are only valid if the def is in the same block?
4839 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4840 Src->setIsKill(false);
4841
4842 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
4843
4844 MachineBasicBlock::iterator I = LoopBB->end();
4845
4846 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4848
4849 // Clear TRAP_STS.MEM_VIOL
4850 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4851 .addImm(0)
4852 .addImm(EncodedReg);
4853
4855
4856 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4857
4858 // Load and check TRAP_STS.MEM_VIOL
4859 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4860 .addImm(EncodedReg);
4861
4862 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4863 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4864 .addReg(Reg, RegState::Kill)
4865 .addImm(0);
4866 // clang-format off
4867 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4868 .addMBB(LoopBB);
4869 // clang-format on
4870
4871 return RemainderBB;
4872}
4873
4874// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4875// wavefront. If the value is uniform and just happens to be in a VGPR, this
4876// will only do one iteration. In the worst case, this will loop 64 times.
4877//
4878// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
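// Sketch of the loop emitted below: each iteration reads one lane's index
// with V_READFIRSTLANE_B32, compares it against \p Idx across the wave,
// restricts EXEC to the matching lanes while M0 (or \p SGPRIdxReg) is set
// up, then removes those lanes from the pending mask and repeats with
// S_CBRANCH_EXECNZ until every unique index value has been handled.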
4881 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4882 const DebugLoc &DL, const MachineOperand &Idx,
4883 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4884 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4885 Register &SGPRIdxReg) {
4886
4887 MachineFunction *MF = OrigBB.getParent();
4888 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4889 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4891
4892 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4893 Register PhiExec = MRI.createVirtualRegister(BoolRC);
4894 Register NewExec = MRI.createVirtualRegister(BoolRC);
4895 Register CurrentIdxReg =
4896 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4897 Register CondReg = MRI.createVirtualRegister(BoolRC);
4898
4899 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4900 .addReg(InitReg)
4901 .addMBB(&OrigBB)
4902 .addReg(ResultReg)
4903 .addMBB(&LoopBB);
4904
4905 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4906 .addReg(InitSaveExecReg)
4907 .addMBB(&OrigBB)
4908 .addReg(NewExec)
4909 .addMBB(&LoopBB);
4910
4911 // Read the next variant <- also loop target.
4912 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4913 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
4914
4915 // Compare the just read M0 value to all possible Idx values.
4916 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4917 .addReg(CurrentIdxReg)
4918 .addReg(Idx.getReg(), 0, Idx.getSubReg());
4919
4920 // Update EXEC, save the original EXEC value to VCC.
4921 BuildMI(LoopBB, I, DL,
4922 TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4923 : AMDGPU::S_AND_SAVEEXEC_B64),
4924 NewExec)
4925 .addReg(CondReg, RegState::Kill);
4926
4927 MRI.setSimpleHint(NewExec, CondReg);
4928
4929 if (UseGPRIdxMode) {
4930 if (Offset == 0) {
4931 SGPRIdxReg = CurrentIdxReg;
4932 } else {
4933 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4934 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4935 .addReg(CurrentIdxReg, RegState::Kill)
4936 .addImm(Offset);
4937 }
4938 } else {
4939 // Move index from VCC into M0
4940 if (Offset == 0) {
4941 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
4942 .addReg(CurrentIdxReg, RegState::Kill);
4943 } else {
4944 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4945 .addReg(CurrentIdxReg, RegState::Kill)
4946 .addImm(Offset);
4947 }
4948 }
4949
4950 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4951 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4952 MachineInstr *InsertPt =
4953 BuildMI(LoopBB, I, DL,
4954 TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4955 : AMDGPU::S_XOR_B64_term),
4956 Exec)
4957 .addReg(Exec)
4958 .addReg(NewExec);
4959
4960 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4961 // s_cbranch_scc0?
4962
4963 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4964 // clang-format off
4965 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4966 .addMBB(&LoopBB);
4967 // clang-format on
4968
4969 return InsertPt->getIterator();
4970}
4971
4972 // This has slightly sub-optimal regalloc when the source vector is killed by
4973 // the read. The register allocator does not understand that the kill is
4974 // per-workitem, so the vector is kept alive for the whole loop and we end up
4975 // not reusing a subregister from it, using one more VGPR than necessary. This
4976 // was saved when this was expanded after register allocation.
4979 unsigned InitResultReg, unsigned PhiReg, int Offset,
4980 bool UseGPRIdxMode, Register &SGPRIdxReg) {
4981 MachineFunction *MF = MBB.getParent();
4982 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4983 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4985 const DebugLoc &DL = MI.getDebugLoc();
4987
4988 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
4989 Register DstReg = MI.getOperand(0).getReg();
4990 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4991 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
4992 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4993 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4994
4995 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4996
4997 // Save the EXEC mask
4998 // clang-format off
4999 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
5000 .addReg(Exec);
5001 // clang-format on
5002
5003 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
5004
5005 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5006
5007 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
5008 InitResultReg, DstReg, PhiReg, TmpExec,
5009 Offset, UseGPRIdxMode, SGPRIdxReg);
5010
5011 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
5013 ++MBBI;
5014 MF->insert(MBBI, LandingPad);
5015 LoopBB->removeSuccessor(RemainderBB);
5016 LandingPad->addSuccessor(RemainderBB);
5017 LoopBB->addSuccessor(LandingPad);
5018 MachineBasicBlock::iterator First = LandingPad->begin();
5019 // clang-format off
5020 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
5021 .addReg(SaveExec);
5022 // clang-format on
5023
5024 return InsPt;
5025}
5026
5027// Returns subreg index, offset
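// For example, for a 128-bit (4 x 32-bit) super-register an in-range Offset of
// 2 yields {sub2, 0}; an out-of-range Offset is passed through unchanged with
// sub0 so the callers can fold it into the dynamic index instead.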
5028static std::pair<unsigned, int>
5029computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
5030                            const TargetRegisterClass *SuperRC, unsigned VecReg,
5031 int Offset) {
5032 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
5033
5034 // Skip out of bounds offsets, or else we would end up using an undefined
5035 // register.
5036 if (Offset >= NumElts || Offset < 0)
5037 return std::pair(AMDGPU::sub0, Offset);
5038
5039 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
5040}
5041
5042static void setM0ToIndexFromSGPR(const SIInstrInfo *TII,
5043                                 MachineRegisterInfo &MRI, MachineInstr &MI,
5044                                 int Offset) {
5045 MachineBasicBlock *MBB = MI.getParent();
5046 const DebugLoc &DL = MI.getDebugLoc();
5048
5049 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5050
5051 assert(Idx->getReg() != AMDGPU::NoRegister);
5052
5053 if (Offset == 0) {
5054 // clang-format off
5055 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5056 .add(*Idx);
5057 // clang-format on
5058 } else {
5059 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5060 .add(*Idx)
5061 .addImm(Offset);
5062 }
5063}
5064
5065static Register getIndirectSGPRIdx(const SIInstrInfo *TII,
5066                                   MachineRegisterInfo &MRI, MachineInstr &MI,
5067                                   int Offset) {
5068 MachineBasicBlock *MBB = MI.getParent();
5069 const DebugLoc &DL = MI.getDebugLoc();
5071
5072 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5073
5074 if (Offset == 0)
5075 return Idx->getReg();
5076
5077 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5078 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
5079 .add(*Idx)
5080 .addImm(Offset);
5081 return Tmp;
5082}
5083
5084static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
5085                                          MachineBasicBlock &MBB,
5086                                          const GCNSubtarget &ST) {
5087 const SIInstrInfo *TII = ST.getInstrInfo();
5088 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5089 MachineFunction *MF = MBB.getParent();
5091
5092 Register Dst = MI.getOperand(0).getReg();
5093 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5094 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5095 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5096
5097 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
5098 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5099
5100 unsigned SubReg;
5101 std::tie(SubReg, Offset) =
5102 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
5103
5104 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5105
5106 // Check for a SGPR index.
5107 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5109 const DebugLoc &DL = MI.getDebugLoc();
5110
5111 if (UseGPRIdxMode) {
5112 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5113 // to avoid interfering with other uses, so probably requires a new
5114 // optimization pass.
5115      Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
5116
5117 const MCInstrDesc &GPRIDXDesc =
5118 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5119 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5120 .addReg(SrcReg)
5121 .addReg(Idx)
5122 .addImm(SubReg);
5123 } else {
5124      setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
5125
5126 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5127 .addReg(SrcReg, 0, SubReg)
5128 .addReg(SrcReg, RegState::Implicit);
5129 }
5130
5131 MI.eraseFromParent();
5132
5133 return &MBB;
5134 }
5135
5136 // Control flow needs to be inserted if indexing with a VGPR.
5137 const DebugLoc &DL = MI.getDebugLoc();
5139
5140 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5141 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5142
5143 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
5144
5145 Register SGPRIdxReg;
5146 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
5147 UseGPRIdxMode, SGPRIdxReg);
5148
5149 MachineBasicBlock *LoopBB = InsPt->getParent();
5150
5151 if (UseGPRIdxMode) {
5152 const MCInstrDesc &GPRIDXDesc =
5153 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5154
5155 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5156 .addReg(SrcReg)
5157 .addReg(SGPRIdxReg)
5158 .addImm(SubReg);
5159 } else {
5160 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5161 .addReg(SrcReg, 0, SubReg)
5162 .addReg(SrcReg, RegState::Implicit);
5163 }
5164
5165 MI.eraseFromParent();
5166
5167 return LoopBB;
5168}
5169
5170static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
5171                                          MachineBasicBlock &MBB,
5172                                          const GCNSubtarget &ST) {
5173 const SIInstrInfo *TII = ST.getInstrInfo();
5174 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5175 MachineFunction *MF = MBB.getParent();
5177
5178 Register Dst = MI.getOperand(0).getReg();
5179 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
5180 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5181 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
5182 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5183 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
5184 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5185
5186 // This can be an immediate, but will be folded later.
5187 assert(Val->getReg());
5188
5189 unsigned SubReg;
5190 std::tie(SubReg, Offset) =
5191 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
5192 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5193
5194 if (Idx->getReg() == AMDGPU::NoRegister) {
5196 const DebugLoc &DL = MI.getDebugLoc();
5197
5198 assert(Offset == 0);
5199
5200 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
5201 .add(*SrcVec)
5202 .add(*Val)
5203 .addImm(SubReg);
5204
5205 MI.eraseFromParent();
5206 return &MBB;
5207 }
5208
5209 // Check for a SGPR index.
5210 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5212 const DebugLoc &DL = MI.getDebugLoc();
5213
5214 if (UseGPRIdxMode) {
5215      Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
5216
5217 const MCInstrDesc &GPRIDXDesc =
5218 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5219 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5220 .addReg(SrcVec->getReg())
5221 .add(*Val)
5222 .addReg(Idx)
5223 .addImm(SubReg);
5224 } else {
5225      setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
5226
5227 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5228 TRI.getRegSizeInBits(*VecRC), 32, false);
5229 BuildMI(MBB, I, DL, MovRelDesc, Dst)
5230 .addReg(SrcVec->getReg())
5231 .add(*Val)
5232 .addImm(SubReg);
5233 }
5234 MI.eraseFromParent();
5235 return &MBB;
5236 }
5237
5238 // Control flow needs to be inserted if indexing with a VGPR.
5239 if (Val->isReg())
5240 MRI.clearKillFlags(Val->getReg());
5241
5242 const DebugLoc &DL = MI.getDebugLoc();
5243
5244 Register PhiReg = MRI.createVirtualRegister(VecRC);
5245
5246 Register SGPRIdxReg;
5247 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5248 UseGPRIdxMode, SGPRIdxReg);
5249 MachineBasicBlock *LoopBB = InsPt->getParent();
5250
5251 if (UseGPRIdxMode) {
5252 const MCInstrDesc &GPRIDXDesc =
5253 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5254
5255 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5256 .addReg(PhiReg)
5257 .add(*Val)
5258 .addReg(SGPRIdxReg)
5259 .addImm(SubReg);
5260 } else {
5261 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5262 TRI.getRegSizeInBits(*VecRC), 32, false);
5263 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5264 .addReg(PhiReg)
5265 .add(*Val)
5266 .addImm(SubReg);
5267 }
5268
5269 MI.eraseFromParent();
5270 return LoopBB;
5271}
5272
5273static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
5274  switch (Opc) {
5275 case AMDGPU::S_MIN_U32:
5276 return std::numeric_limits<uint32_t>::max();
5277 case AMDGPU::S_MIN_I32:
5278 return std::numeric_limits<int32_t>::max();
5279 case AMDGPU::S_MAX_U32:
5280 return std::numeric_limits<uint32_t>::min();
5281 case AMDGPU::S_MAX_I32:
5282 return std::numeric_limits<int32_t>::min();
5283 case AMDGPU::S_ADD_I32:
5284 case AMDGPU::S_SUB_I32:
5285 case AMDGPU::S_OR_B32:
5286 case AMDGPU::S_XOR_B32:
5287 return std::numeric_limits<uint32_t>::min();
5288 case AMDGPU::S_AND_B32:
5289 return std::numeric_limits<uint32_t>::max();
5290 default:
5291 llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
5292 }
5293}
5294
5295static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5296                                          MachineBasicBlock &BB,
5297                                          const GCNSubtarget &ST,
5298 unsigned Opc) {
5300 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5301 const DebugLoc &DL = MI.getDebugLoc();
5302 const SIInstrInfo *TII = ST.getInstrInfo();
5303
5304 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5305 Register SrcReg = MI.getOperand(1).getReg();
5306 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5307 Register DstReg = MI.getOperand(0).getReg();
5308 MachineBasicBlock *RetBB = nullptr;
5309 if (isSGPR) {
5310 switch (Opc) {
5311 case AMDGPU::S_MIN_U32:
5312 case AMDGPU::S_MIN_I32:
5313 case AMDGPU::S_MAX_U32:
5314 case AMDGPU::S_MAX_I32:
5315 case AMDGPU::S_AND_B32:
5316 case AMDGPU::S_OR_B32: {
5317 // Idempotent operations.
5318 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5319 RetBB = &BB;
5320 break;
5321 }
5322 case AMDGPU::S_XOR_B32:
5323 case AMDGPU::S_ADD_I32:
5324 case AMDGPU::S_SUB_I32: {
5325 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5326 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5327 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5328 Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
5329
5330 bool IsWave32 = ST.isWave32();
5331 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5332 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5333 unsigned CountReg =
5334 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5335
5336 auto Exec =
5337 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5338
5339 auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
5340 .addReg(Exec->getOperand(0).getReg());
5341
5342 switch (Opc) {
5343 case AMDGPU::S_XOR_B32: {
5344 // Performing an XOR operation on a uniform value
5345 // depends on the parity of the number of active lanes.
5346 // For even parity, the result will be 0, for odd
5347 // parity the result will be the same as the input value.
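      // e.g. with 5 active lanes x^x^x^x^x == x, while with 4 the terms cancel
      // pairwise and the result is 0; hence DstReg = SrcReg * (LaneCount & 1).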
5348 Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
5349
5350 auto ParityReg =
5351 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5352 .addReg(NewAccumulator->getOperand(0).getReg())
5353 .addImm(1);
5354 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5355 .addReg(SrcReg)
5356 .addReg(ParityReg->getOperand(0).getReg());
5357 break;
5358 }
5359 case AMDGPU::S_SUB_I32: {
5360 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5361
5362 // Take the negation of the source operand.
5363 auto InvertedValReg =
5364 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
5365 .addImm(-1)
5366 .addReg(SrcReg);
5367 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5368 .addReg(InvertedValReg->getOperand(0).getReg())
5369 .addReg(NewAccumulator->getOperand(0).getReg());
5370 break;
5371 }
5372 case AMDGPU::S_ADD_I32: {
5373 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5374 .addReg(SrcReg)
5375 .addReg(NewAccumulator->getOperand(0).getReg());
5376 break;
5377 }
5378 }
5379 RetBB = &BB;
5380 }
5381 }
5382 } else {
5383    // TODO: Implement the DPP strategy and switch based on the immediate
5384    // strategy operand. For now, for all the cases (default, Iterative and
5385    // DPP) we use the iterative approach by default.
5386
5387    // To reduce the VGPR using the iterative approach, we need to iterate over
5388    // all the active lanes. Lowering consists of a ComputeLoop which iterates
5389    // over only the active lanes. We use a copy of the EXEC register as the
5390    // induction variable, and each iteration clears the just-processed lane's
5391    // bit with bitset0 so that the next iteration picks up the next active lane.
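    // Roughly: ActiveBits = EXEC; Accum = identity;
    // do { L = s_ff1(ActiveBits); Accum = Opc(Accum, v_readlane(Src, L));
    //      s_bitset0(ActiveBits, L); } while (ActiveBits != 0);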
5392    MachineBasicBlock::iterator I = BB.end();
5393    Register SrcReg = MI.getOperand(1).getReg();
5394
5395 // Create Control flow for loop
5396 // Split MI's Machine Basic block into For loop
5397 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
5398
5399 // Create virtual registers required for lowering.
5400 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5401 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5402 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5403 Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
5404
5405 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5406 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5407 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5408
5409 Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
5410 Register LaneValueReg =
5411 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5412
5413 bool IsWave32 = ST.isWave32();
5414 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5415 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5416
5417    // Create the initial values of the induction variable (from EXEC) and the
5418    // accumulator, and insert a branch to the newly created ComputeLoop block.
5419    uint32_t InitalValue = getIdentityValueForWaveReduction(Opc);
5420    auto TmpSReg =
5421 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
5422 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
5423 .addImm(InitalValue);
5424 // clang-format off
5425 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5426 .addMBB(ComputeLoop);
5427 // clang-format on
5428
5429 // Start constructing ComputeLoop
5430 I = ComputeLoop->end();
5431 auto Accumulator =
5432 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5433 .addReg(InitalValReg)
5434 .addMBB(&BB);
5435 auto ActiveBits =
5436 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5437 .addReg(TmpSReg->getOperand(0).getReg())
5438 .addMBB(&BB);
5439
5440 // Perform the computations
5441 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5442 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5443 .addReg(ActiveBits->getOperand(0).getReg());
5444 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5445 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
5446 .addReg(SrcReg)
5447 .addReg(FF1->getOperand(0).getReg());
5448 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5449 .addReg(Accumulator->getOperand(0).getReg())
5450 .addReg(LaneValue->getOperand(0).getReg());
5451
5452 // Manipulate the iterator to get the next active lane
5453 unsigned BITSETOpc =
5454 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5455 auto NewActiveBits =
5456 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5457 .addReg(FF1->getOperand(0).getReg())
5458 .addReg(ActiveBits->getOperand(0).getReg());
5459
5460 // Add phi nodes
5461 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
5462 .addMBB(ComputeLoop);
5463 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
5464 .addMBB(ComputeLoop);
5465
5466    // Create the conditional branch back to ComputeLoop.
5467 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5468 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5469 .addReg(NewActiveBits->getOperand(0).getReg())
5470 .addImm(0);
5471 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5472 .addMBB(ComputeLoop);
5473
5474 RetBB = ComputeEnd;
5475 }
5476 MI.eraseFromParent();
5477 return RetBB;
5478}
5479
5480MachineBasicBlock *
5481SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
5482                                              MachineBasicBlock *BB) const {
5483
5484  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5485  MachineFunction *MF = BB->getParent();
5486  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
5487
5488 switch (MI.getOpcode()) {
5489 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5490 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5491 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5492 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
5493 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5494 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5495 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5496 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
5497 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5498 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5499 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5500 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5501 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5502 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
5503 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5504 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
5505 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5506 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
5507 case AMDGPU::S_UADDO_PSEUDO:
5508 case AMDGPU::S_USUBO_PSEUDO: {
5509 const DebugLoc &DL = MI.getDebugLoc();
5510 MachineOperand &Dest0 = MI.getOperand(0);
5511 MachineOperand &Dest1 = MI.getOperand(1);
5512 MachineOperand &Src0 = MI.getOperand(2);
5513 MachineOperand &Src1 = MI.getOperand(3);
5514
5515 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5516 ? AMDGPU::S_ADD_I32
5517 : AMDGPU::S_SUB_I32;
5518 // clang-format off
5519 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
5520 .add(Src0)
5521 .add(Src1);
5522 // clang-format on
5523
5524 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
5525 .addImm(1)
5526 .addImm(0);
5527
5528 MI.eraseFromParent();
5529 return BB;
5530 }
5531 case AMDGPU::S_ADD_U64_PSEUDO:
5532 case AMDGPU::S_SUB_U64_PSEUDO: {
5533 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5534 // For GFX12, we emit s_add_u64 and s_sub_u64.
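    // The 32-bit expansion below is a standard carry chain: s_add_u32/s_sub_u32
    // on the low halves, then s_addc_u32/s_subb_u32 on the high halves
    // consuming the SCC carry bit.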
5535 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5537 const DebugLoc &DL = MI.getDebugLoc();
5538 MachineOperand &Dest = MI.getOperand(0);
5539 MachineOperand &Src0 = MI.getOperand(1);
5540 MachineOperand &Src1 = MI.getOperand(2);
5541 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5542 if (Subtarget->hasScalarAddSub64()) {
5543 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5544 // clang-format off
5545 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5546 .add(Src0)
5547 .add(Src1);
5548 // clang-format on
5549 } else {
5550 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5551 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5552
5553 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5554 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5555
5556 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5557 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5558 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5559 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5560
5561 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5562 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5563 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5564 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5565
5566 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5567 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5568 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5569 .add(Src0Sub0)
5570 .add(Src1Sub0);
5571 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5572 .add(Src0Sub1)
5573 .add(Src1Sub1);
5574 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5575 .addReg(DestSub0)
5576 .addImm(AMDGPU::sub0)
5577 .addReg(DestSub1)
5578 .addImm(AMDGPU::sub1);
5579 }
5580 MI.eraseFromParent();
5581 return BB;
5582 }
5583 case AMDGPU::V_ADD_U64_PSEUDO:
5584 case AMDGPU::V_SUB_U64_PSEUDO: {
5586 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5587 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5588 const DebugLoc &DL = MI.getDebugLoc();
5589
5590 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5591
5592 MachineOperand &Dest = MI.getOperand(0);
5593 MachineOperand &Src0 = MI.getOperand(1);
5594 MachineOperand &Src1 = MI.getOperand(2);
5595
5596 if (ST.hasAddSubU64Insts()) {
5597 auto I = BuildMI(*BB, MI, DL,
5598 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
5599 : AMDGPU::V_SUB_U64_e64),
5600 Dest.getReg())
5601 .add(Src0)
5602 .add(Src1)
5603 .addImm(0); // clamp
5604 TII->legalizeOperands(*I);
5605 MI.eraseFromParent();
5606 return BB;
5607 }
5608
5609 if (IsAdd && ST.hasLshlAddU64Inst()) {
5610 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
5611 Dest.getReg())
5612 .add(Src0)
5613 .addImm(0)
5614 .add(Src1);
5615 TII->legalizeOperands(*Add);
5616 MI.eraseFromParent();
5617 return BB;
5618 }
5619
5620 const auto *CarryRC = TRI->getWaveMaskRegClass();
5621
5622 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5623 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5624
5625 Register CarryReg = MRI.createVirtualRegister(CarryRC);
5626 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
5627
5628 const TargetRegisterClass *Src0RC = Src0.isReg()
5629 ? MRI.getRegClass(Src0.getReg())
5630 : &AMDGPU::VReg_64RegClass;
5631 const TargetRegisterClass *Src1RC = Src1.isReg()
5632 ? MRI.getRegClass(Src1.getReg())
5633 : &AMDGPU::VReg_64RegClass;
5634
5635 const TargetRegisterClass *Src0SubRC =
5636 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5637 const TargetRegisterClass *Src1SubRC =
5638 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5639
5640 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
5641 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5642 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
5643 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5644
5645 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
5646 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5647 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
5648 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5649
5650 unsigned LoOpc =
5651 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5652 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5653 .addReg(CarryReg, RegState::Define)
5654 .add(SrcReg0Sub0)
5655 .add(SrcReg1Sub0)
5656 .addImm(0); // clamp bit
5657
5658 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5659 MachineInstr *HiHalf =
5660 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5661 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
5662 .add(SrcReg0Sub1)
5663 .add(SrcReg1Sub1)
5664 .addReg(CarryReg, RegState::Kill)
5665 .addImm(0); // clamp bit
5666
5667 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5668 .addReg(DestSub0)
5669 .addImm(AMDGPU::sub0)
5670 .addReg(DestSub1)
5671 .addImm(AMDGPU::sub1);
5672 TII->legalizeOperands(*LoHalf);
5673 TII->legalizeOperands(*HiHalf);
5674 MI.eraseFromParent();
5675 return BB;
5676 }
5677 case AMDGPU::S_ADD_CO_PSEUDO:
5678 case AMDGPU::S_SUB_CO_PSEUDO: {
5679    // This pseudo can only be selected from a uniform add/subcarry node.
5680    // All of its VGPR operands are therefore assumed to hold values that are
5681    // uniform (splat) across the wave.
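    // Because of that uniformity, the V_READFIRSTLANE_B32 copies below can
    // legally narrow any VGPR operand to an SGPR without changing the result.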
5683 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5684 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5686 const DebugLoc &DL = MI.getDebugLoc();
5687 MachineOperand &Dest = MI.getOperand(0);
5688 MachineOperand &CarryDest = MI.getOperand(1);
5689 MachineOperand &Src0 = MI.getOperand(2);
5690 MachineOperand &Src1 = MI.getOperand(3);
5691 MachineOperand &Src2 = MI.getOperand(4);
5692 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5693 ? AMDGPU::S_ADDC_U32
5694 : AMDGPU::S_SUBB_U32;
5695 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5696 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5697 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5698 .addReg(Src0.getReg());
5699 Src0.setReg(RegOp0);
5700 }
5701 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5702 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5703 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5704 .addReg(Src1.getReg());
5705 Src1.setReg(RegOp1);
5706 }
5707 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5708 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
5709 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5710 .addReg(Src2.getReg());
5711 Src2.setReg(RegOp2);
5712 }
5713
5714 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
5715 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5716 assert(WaveSize == 64 || WaveSize == 32);
5717
5718 if (WaveSize == 64) {
5719 if (ST.hasScalarCompareEq64()) {
5720 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5721 .addReg(Src2.getReg())
5722 .addImm(0);
5723 } else {
5724 const TargetRegisterClass *SubRC =
5725 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5726 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5727 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5728 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5729 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5730 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5731
5732 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5733 .add(Src2Sub0)
5734 .add(Src2Sub1);
5735
5736 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5737 .addReg(Src2_32, RegState::Kill)
5738 .addImm(0);
5739 }
5740 } else {
5741 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5742 .addReg(Src2.getReg())
5743 .addImm(0);
5744 }
5745
5746 // clang-format off
5747 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg())
5748 .add(Src0)
5749 .add(Src1);
5750 // clang-format on
5751
5752 unsigned SelOpc =
5753 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5754
5755 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
5756 .addImm(-1)
5757 .addImm(0);
5758
5759 MI.eraseFromParent();
5760 return BB;
5761 }
5762 case AMDGPU::SI_INIT_M0: {
5763 MachineOperand &M0Init = MI.getOperand(0);
5764 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5765 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
5766 AMDGPU::M0)
5767 .add(M0Init);
5768 MI.eraseFromParent();
5769 return BB;
5770 }
5771 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
5772 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
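    // s_cmp_eq_u32 0, 0 always compares equal, so this unconditionally sets
    // SCC to 1 with no other side effects.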
5773 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5774 TII->get(AMDGPU::S_CMP_EQ_U32))
5775 .addImm(0)
5776 .addImm(0);
5777 return BB;
5778 }
5779 case AMDGPU::GET_GROUPSTATICSIZE: {
5780 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5781 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5782 DebugLoc DL = MI.getDebugLoc();
5783 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
5784 .add(MI.getOperand(0))
5785 .addImm(MFI->getLDSSize());
5786 MI.eraseFromParent();
5787 return BB;
5788 }
5789 case AMDGPU::GET_SHADERCYCLESHILO: {
5792 const DebugLoc &DL = MI.getDebugLoc();
5793 // The algorithm is:
5794 //
5795 // hi1 = getreg(SHADER_CYCLES_HI)
5796 // lo1 = getreg(SHADER_CYCLES_LO)
5797 // hi2 = getreg(SHADER_CYCLES_HI)
5798 //
5799 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
5800 // Otherwise there was overflow and the result is hi2:0. In both cases the
5801 // result should represent the actual time at some point during the sequence
5802 // of three getregs.
5803 using namespace AMDGPU::Hwreg;
5804 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5805 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
5806 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5807 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5808 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
5809 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5810 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5811 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
5812 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5813 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
5814 .addReg(RegHi1)
5815 .addReg(RegHi2);
5816 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5817 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
5818 .addReg(RegLo1)
5819 .addImm(0);
5820 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
5821 .add(MI.getOperand(0))
5822 .addReg(RegLo)
5823 .addImm(AMDGPU::sub0)
5824 .addReg(RegHi2)
5825 .addImm(AMDGPU::sub1);
5826 MI.eraseFromParent();
5827 return BB;
5828 }
5829 case AMDGPU::SI_INDIRECT_SRC_V1:
5830 case AMDGPU::SI_INDIRECT_SRC_V2:
5831 case AMDGPU::SI_INDIRECT_SRC_V4:
5832 case AMDGPU::SI_INDIRECT_SRC_V8:
5833 case AMDGPU::SI_INDIRECT_SRC_V9:
5834 case AMDGPU::SI_INDIRECT_SRC_V10:
5835 case AMDGPU::SI_INDIRECT_SRC_V11:
5836 case AMDGPU::SI_INDIRECT_SRC_V12:
5837 case AMDGPU::SI_INDIRECT_SRC_V16:
5838 case AMDGPU::SI_INDIRECT_SRC_V32:
5839 return emitIndirectSrc(MI, *BB, *getSubtarget());
5840 case AMDGPU::SI_INDIRECT_DST_V1:
5841 case AMDGPU::SI_INDIRECT_DST_V2:
5842 case AMDGPU::SI_INDIRECT_DST_V4:
5843 case AMDGPU::SI_INDIRECT_DST_V8:
5844 case AMDGPU::SI_INDIRECT_DST_V9:
5845 case AMDGPU::SI_INDIRECT_DST_V10:
5846 case AMDGPU::SI_INDIRECT_DST_V11:
5847 case AMDGPU::SI_INDIRECT_DST_V12:
5848 case AMDGPU::SI_INDIRECT_DST_V16:
5849 case AMDGPU::SI_INDIRECT_DST_V32:
5850 return emitIndirectDst(MI, *BB, *getSubtarget());
5851 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5852 case AMDGPU::SI_KILL_I1_PSEUDO:
5853 return splitKillBlock(MI, BB);
5854 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5856 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5857 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5858
5859 Register Dst = MI.getOperand(0).getReg();
5860 const MachineOperand &Src0 = MI.getOperand(1);
5861 const MachineOperand &Src1 = MI.getOperand(2);
5862 const DebugLoc &DL = MI.getDebugLoc();
5863 Register SrcCond = MI.getOperand(3).getReg();
5864
5865 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5866 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5867 const auto *CondRC = TRI->getWaveMaskRegClass();
5868 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5869
5870 const TargetRegisterClass *Src0RC = Src0.isReg()
5871 ? MRI.getRegClass(Src0.getReg())
5872 : &AMDGPU::VReg_64RegClass;
5873 const TargetRegisterClass *Src1RC = Src1.isReg()
5874 ? MRI.getRegClass(Src1.getReg())
5875 : &AMDGPU::VReg_64RegClass;
5876
5877 const TargetRegisterClass *Src0SubRC =
5878 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5879 const TargetRegisterClass *Src1SubRC =
5880 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5881
5882 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5883 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5884 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5885 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5886
5887 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5888 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5889 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5890 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5891
5892 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
5893 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5894 .addImm(0)
5895 .add(Src0Sub0)
5896 .addImm(0)
5897 .add(Src1Sub0)
5898 .addReg(SrcCondCopy);
5899 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5900 .addImm(0)
5901 .add(Src0Sub1)
5902 .addImm(0)
5903 .add(Src1Sub1)
5904 .addReg(SrcCondCopy);
5905
5906 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5907 .addReg(DstLo)
5908 .addImm(AMDGPU::sub0)
5909 .addReg(DstHi)
5910 .addImm(AMDGPU::sub1);
5911 MI.eraseFromParent();
5912 return BB;
5913 }
5914 case AMDGPU::SI_BR_UNDEF: {
5916 const DebugLoc &DL = MI.getDebugLoc();
5917 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5918 .add(MI.getOperand(0));
5919 Br->getOperand(1).setIsUndef(); // read undef SCC
5920 MI.eraseFromParent();
5921 return BB;
5922 }
5923 case AMDGPU::ADJCALLSTACKUP:
5924 case AMDGPU::ADJCALLSTACKDOWN: {
5926 MachineInstrBuilder MIB(*MF, &MI);
5927 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
5928 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
5929 return BB;
5930 }
5931 case AMDGPU::SI_CALL_ISEL: {
5933 const DebugLoc &DL = MI.getDebugLoc();
5934
5935 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
5936
5938 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5939
5940 for (const MachineOperand &MO : MI.operands())
5941 MIB.add(MO);
5942
5943 MIB.cloneMemRefs(MI);
5944 MI.eraseFromParent();
5945 return BB;
5946 }
5947 case AMDGPU::V_ADD_CO_U32_e32:
5948 case AMDGPU::V_SUB_CO_U32_e32:
5949 case AMDGPU::V_SUBREV_CO_U32_e32: {
5950 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5951 const DebugLoc &DL = MI.getDebugLoc();
5952 unsigned Opc = MI.getOpcode();
5953
5954 bool NeedClampOperand = false;
5955 if (TII->pseudoToMCOpcode(Opc) == -1) {
5956      Opc = AMDGPU::getVOPe64(Opc);
5957      NeedClampOperand = true;
5958 }
5959
5960 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
5961 if (TII->isVOP3(*I)) {
5962 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5963 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5964 I.addReg(TRI->getVCC(), RegState::Define);
5965 }
5966 I.add(MI.getOperand(1)).add(MI.getOperand(2));
5967 if (NeedClampOperand)
5968 I.addImm(0); // clamp bit for e64 encoding
5969
5970 TII->legalizeOperands(*I);
5971
5972 MI.eraseFromParent();
5973 return BB;
5974 }
5975 case AMDGPU::V_ADDC_U32_e32:
5976 case AMDGPU::V_SUBB_U32_e32:
5977 case AMDGPU::V_SUBBREV_U32_e32:
5978 // These instructions have an implicit use of vcc which counts towards the
5979 // constant bus limit.
5980 TII->legalizeOperands(MI);
5981 return BB;
5982 case AMDGPU::DS_GWS_INIT:
5983 case AMDGPU::DS_GWS_SEMA_BR:
5984 case AMDGPU::DS_GWS_BARRIER:
5985 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
5986 [[fallthrough]];
5987 case AMDGPU::DS_GWS_SEMA_V:
5988 case AMDGPU::DS_GWS_SEMA_P:
5989 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5990    // An s_waitcnt 0 is required to be the instruction immediately following.
5991 if (getSubtarget()->hasGWSAutoReplay()) {
5993 return BB;
5994 }
5995
5996 return emitGWSMemViolTestLoop(MI, BB);
5997 case AMDGPU::S_SETREG_B32: {
5998 // Try to optimize cases that only set the denormal mode or rounding mode.
5999 //
6000 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
6001 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
6002 // instead.
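    // The FP rounding field occupies MODE bits [3:0] and the FP denormal field
    // bits [7:4], which is why the constant is split with '& 0xf' and '>>= 4'
    // further down.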
6003 //
6004    // FIXME: This could be predicated on the immediate, but tablegen doesn't
6005    // allow you to have a no-side-effect instruction in the output of a
6006    // side-effecting pattern.
6007 auto [ID, Offset, Width] =
6008 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
6009    if (ID != AMDGPU::Hwreg::ID_MODE)
6010      return BB;
6011
6012 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
6013 const unsigned SetMask = WidthMask << Offset;
6014
6015 if (getSubtarget()->hasDenormModeInst()) {
6016 unsigned SetDenormOp = 0;
6017 unsigned SetRoundOp = 0;
6018
6019 // The dedicated instructions can only set the whole denorm or round mode
6020 // at once, not a subset of bits in either.
6021      if (SetMask ==
6022          (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
6023 // If this fully sets both the round and denorm mode, emit the two
6024 // dedicated instructions for these.
6025 SetRoundOp = AMDGPU::S_ROUND_MODE;
6026 SetDenormOp = AMDGPU::S_DENORM_MODE;
6027 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
6028 SetRoundOp = AMDGPU::S_ROUND_MODE;
6029 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
6030 SetDenormOp = AMDGPU::S_DENORM_MODE;
6031 }
6032
6033 if (SetRoundOp || SetDenormOp) {
6034        MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
6035        MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
6036 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6037 unsigned ImmVal = Def->getOperand(1).getImm();
6038 if (SetRoundOp) {
6039 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
6040 .addImm(ImmVal & 0xf);
6041
6042 // If we also have the denorm mode, get just the denorm mode bits.
6043 ImmVal >>= 4;
6044 }
6045
6046 if (SetDenormOp) {
6047 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
6048 .addImm(ImmVal & 0xf);
6049 }
6050
6051 MI.eraseFromParent();
6052 return BB;
6053 }
6054 }
6055 }
6056
6057    // If only FP bits are touched, use the no-side-effects pseudo.
6058 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
6059 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
6060 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
6061
6062 return BB;
6063 }
6064 case AMDGPU::S_INVERSE_BALLOT_U32:
6065 case AMDGPU::S_INVERSE_BALLOT_U64:
6066 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
6067 // necessary. After that they are equivalent to a COPY.
6068 MI.setDesc(TII->get(AMDGPU::COPY));
6069 return BB;
6070 case AMDGPU::ENDPGM_TRAP: {
6071 const DebugLoc &DL = MI.getDebugLoc();
6072 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
6073 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
6074 MI.addOperand(MachineOperand::CreateImm(0));
6075 return BB;
6076 }
6077
6078 // We need a block split to make the real endpgm a terminator. We also don't
6079 // want to break phis in successor blocks, so we can't just delete to the
6080 // end of the block.
6081
6082 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
6083    MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6084    MF->push_back(TrapBB);
6085 // clang-format off
6086 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
6087 .addImm(0);
6088 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
6089 .addMBB(TrapBB);
6090 // clang-format on
6091
6092 BB->addSuccessor(TrapBB);
6093 MI.eraseFromParent();
6094 return SplitBB;
6095 }
6096 case AMDGPU::SIMULATED_TRAP: {
6097 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6099 MachineBasicBlock *SplitBB =
6100 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6101 MI.eraseFromParent();
6102 return SplitBB;
6103 }
6104 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6105 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6107
6108 // During ISel, it's difficult to propagate the original EXEC mask to use as
6109 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
6110 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
6111 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6112 Register OriginalExec = Setup->getOperand(0).getReg();
6113 MF->getRegInfo().clearKillFlags(OriginalExec);
6114 MI.getOperand(0).setReg(OriginalExec);
6115 return BB;
6116 }
6117 default:
6118 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
6119 if (!MI.mayStore())
6121 return BB;
6122 }
6123    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
6124  }
6125}
6126
6128 // This currently forces unfolding various combinations of fsub into fma with
6129 // free fneg'd operands. As long as we have fast FMA (controlled by
6130 // isFMAFasterThanFMulAndFAdd), we should perform these.
6131
6132 // When fma is quarter rate, for f64 where add / sub are at best half rate,
6133 // most of these combines appear to be cycle neutral but save on instruction
6134 // count / code size.
6135 return true;
6136}
6137
6139
6141 EVT VT) const {
6142 if (!VT.isVector()) {
6143 return MVT::i1;
6144 }
6145 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
6146}
6147
6149 // TODO: Should i16 be used always if legal? For now it would force VALU
6150 // shifts.
6151 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6152}
6153
6155 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6156 ? Ty.changeElementSize(16)
6157 : Ty.changeElementSize(32);
6158}
6159
6160// Answering this is somewhat tricky and depends on the specific device, since
6161// different devices have different rates for fma and for f64 operations.
6162//
6163// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
6164// regardless of which device (although the number of cycles differs between
6165// devices), so it is always profitable for f64.
6166//
6167// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
6168// only on full rate devices. Normally, we should prefer selecting v_mad_f32
6169// which we can always do even without fused FP ops since it returns the same
6170// result as the separate operations and since it is always full
6171// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
6172// however does not support denormals, so we do report fma as faster if we have
6173// a fast fma device and require denormals.
6174//
6175bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
6176                                                  EVT VT) const {
6177 VT = VT.getScalarType();
6178
6179 switch (VT.getSimpleVT().SimpleTy) {
6180 case MVT::f32: {
6181 // If mad is not available this depends only on if f32 fma is full rate.
6182 if (!Subtarget->hasMadMacF32Insts())
6183 return Subtarget->hasFastFMAF32();
6184
6185 // Otherwise f32 mad is always full rate and returns the same result as
6186 // the separate operations so should be preferred over fma.
6187    // However, mad does not support denormals.
6188    if (!denormalModeIsFlushAllF32(MF))
6189      return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6190
6191 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
6192 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6193 }
6194 case MVT::f64:
6195 return true;
6196 case MVT::f16:
6197 case MVT::bf16:
6198 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
6199 default:
6200 break;
6201 }
6202
6203 return false;
6204}
6205
6206bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
6207                                                  LLT Ty) const {
6208 switch (Ty.getScalarSizeInBits()) {
6209 case 16:
6210 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
6211 case 32:
6212 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
6213 case 64:
6214 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
6215 default:
6216 break;
6217 }
6218
6219 return false;
6220}
6221
6222bool SITargetLowering::isFMADLegal(const MachineInstr &MI, const LLT Ty) const {
6223  if (!Ty.isScalar())
6224 return false;
6225
6226 if (Ty.getScalarSizeInBits() == 16)
6227 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
6228 if (Ty.getScalarSizeInBits() == 32)
6229 return Subtarget->hasMadMacF32Insts() &&
6230 denormalModeIsFlushAllF32(*MI.getMF());
6231
6232 return false;
6233}
6234
6235bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
6236                                   const SDNode *N) const {
6237 // TODO: Check future ftz flag
6238 // v_mad_f32/v_mac_f32 do not support denormals.
6239 EVT VT = N->getValueType(0);
6240 if (VT == MVT::f32)
6241 return Subtarget->hasMadMacF32Insts() &&
6242           denormalModeIsFlushAllF32(DAG.getMachineFunction());
6243  if (VT == MVT::f16) {
6244 return Subtarget->hasMadF16() &&
6245           denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
6246  }
6247
6248 return false;
6249}
6250
6251//===----------------------------------------------------------------------===//
6252// Custom DAG Lowering Operations
6253//===----------------------------------------------------------------------===//
6254
6255// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6256// wider vector type is legal.
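// For example, a v4f16 operation is split into two v2f16 halves that are each
// legal, operated on independently, and then recombined with CONCAT_VECTORS.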
6257SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
6258                                             SelectionDAG &DAG) const {
6259 unsigned Opc = Op.getOpcode();
6260 EVT VT = Op.getValueType();
6261 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
6262 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
6263 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6264 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
6265
6266 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
6267
6268 SDLoc SL(Op);
6269 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
6270 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
6271
6272 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6273}
6274
6275// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6276// wider vector type is legal.
6277SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
6278                                              SelectionDAG &DAG) const {
6279 unsigned Opc = Op.getOpcode();
6280 EVT VT = Op.getValueType();
6281 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6282 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6283 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6284 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6285 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6286 VT == MVT::v32bf16);
6287
6288 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
6289 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6290
6291 SDLoc SL(Op);
6292
6293 SDValue OpLo =
6294 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6295 SDValue OpHi =
6296 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6297
6298 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6299}
6300
6301SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
6302                                              SelectionDAG &DAG) const {
6303 unsigned Opc = Op.getOpcode();
6304 EVT VT = Op.getValueType();
6305 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6306 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6307 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6308 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6309 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6310 VT == MVT::v32bf16);
6311
6312 SDValue Op0 = Op.getOperand(0);
6313 auto [Lo0, Hi0] = Op0.getValueType().isVector()
6314 ? DAG.SplitVectorOperand(Op.getNode(), 0)
6315 : std::pair(Op0, Op0);
6316
6317 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6318 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
6319
6320 SDLoc SL(Op);
6321 auto ResVT = DAG.GetSplitDestVTs(VT);
6322
6323 SDValue OpLo =
6324 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
6325 SDValue OpHi =
6326 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
6327
6328 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6329}
6330
6331SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
6332  switch (Op.getOpcode()) {
6333 default:
6334    return AMDGPUTargetLowering::LowerOperation(Op, DAG);
6335  case ISD::BRCOND:
6336 return LowerBRCOND(Op, DAG);
6337 case ISD::RETURNADDR:
6338 return LowerRETURNADDR(Op, DAG);
6339 case ISD::LOAD: {
6340 SDValue Result = LowerLOAD(Op, DAG);
6341 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6342 "Load should return a value and a chain");
6343 return Result;
6344 }
6345 case ISD::FSQRT: {
6346 EVT VT = Op.getValueType();
6347 if (VT == MVT::f32)
6348 return lowerFSQRTF32(Op, DAG);
6349 if (VT == MVT::f64)
6350 return lowerFSQRTF64(Op, DAG);
6351 return SDValue();
6352 }
6353 case ISD::FSIN:
6354 case ISD::FCOS:
6355 return LowerTrig(Op, DAG);
6356 case ISD::SELECT:
6357 return LowerSELECT(Op, DAG);
6358 case ISD::FDIV:
6359 return LowerFDIV(Op, DAG);
6360 case ISD::FFREXP:
6361 return LowerFFREXP(Op, DAG);
6362 case ISD::ATOMIC_CMP_SWAP:
6363 return LowerATOMIC_CMP_SWAP(Op, DAG);
6364 case ISD::STORE:
6365 return LowerSTORE(Op, DAG);
6366 case ISD::GlobalAddress: {
6369 return LowerGlobalAddress(MFI, Op, DAG);
6370 }
6371  case ISD::INTRINSIC_WO_CHAIN:
6372    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6373  case ISD::INTRINSIC_W_CHAIN:
6374    return LowerINTRINSIC_W_CHAIN(Op, DAG);
6375  case ISD::INTRINSIC_VOID:
6376    return LowerINTRINSIC_VOID(Op, DAG);
6377 case ISD::ADDRSPACECAST:
6378 return lowerADDRSPACECAST(Op, DAG);
6379  case ISD::INSERT_SUBVECTOR:
6380    return lowerINSERT_SUBVECTOR(Op, DAG);
6381  case ISD::INSERT_VECTOR_ELT:
6382    return lowerINSERT_VECTOR_ELT(Op, DAG);
6383  case ISD::EXTRACT_VECTOR_ELT:
6384    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
6385  case ISD::VECTOR_SHUFFLE:
6386    return lowerVECTOR_SHUFFLE(Op, DAG);
6387  case ISD::SCALAR_TO_VECTOR:
6388    return lowerSCALAR_TO_VECTOR(Op, DAG);
6389 case ISD::BUILD_VECTOR:
6390 return lowerBUILD_VECTOR(Op, DAG);
6391 case ISD::FP_ROUND:
6392  case ISD::STRICT_FP_ROUND:
6393    return lowerFP_ROUND(Op, DAG);
6394 case ISD::TRAP:
6395 return lowerTRAP(Op, DAG);
6396 case ISD::DEBUGTRAP:
6397 return lowerDEBUGTRAP(Op, DAG);
6398 case ISD::ABS:
6399 case ISD::FABS:
6400 case ISD::FNEG:
6401 case ISD::FCANONICALIZE:
6402 case ISD::BSWAP:
6403 return splitUnaryVectorOp(Op, DAG);
6404 case ISD::FMINNUM:
6405 case ISD::FMAXNUM:
6406 return lowerFMINNUM_FMAXNUM(Op, DAG);
6407 case ISD::FMINIMUMNUM:
6408 case ISD::FMAXIMUMNUM:
6409 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
6410 case ISD::FMINIMUM:
6411 case ISD::FMAXIMUM:
6412 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
6413 case ISD::FLDEXP:
6414 case ISD::STRICT_FLDEXP:
6415 return lowerFLDEXP(Op, DAG);
6416 case ISD::FMA:
6417 return splitTernaryVectorOp(Op, DAG);
6418 case ISD::FP_TO_SINT:
6419 case ISD::FP_TO_UINT:
6420 return LowerFP_TO_INT(Op, DAG);
6421 case ISD::SHL:
6422 case ISD::SRA:
6423 case ISD::SRL:
6424 case ISD::ADD:
6425 case ISD::SUB:
6426 case ISD::SMIN:
6427 case ISD::SMAX:
6428 case ISD::UMIN:
6429 case ISD::UMAX:
6430 case ISD::FADD:
6431 case ISD::FMUL:
6432 case ISD::FMINNUM_IEEE:
6433 case ISD::FMAXNUM_IEEE:
6434 case ISD::UADDSAT:
6435 case ISD::USUBSAT:
6436 case ISD::SADDSAT:
6437 case ISD::SSUBSAT:
6438 return splitBinaryVectorOp(Op, DAG);
6439 case ISD::FCOPYSIGN:
6440 return lowerFCOPYSIGN(Op, DAG);
6441 case ISD::MUL:
6442 return lowerMUL(Op, DAG);
6443 case ISD::SMULO:
6444 case ISD::UMULO:
6445 return lowerXMULO(Op, DAG);
6446 case ISD::SMUL_LOHI:
6447 case ISD::UMUL_LOHI:
6448 return lowerXMUL_LOHI(Op, DAG);
6449 case ISD::DYNAMIC_STACKALLOC:
6450 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6451 case ISD::STACKSAVE:
6452 return LowerSTACKSAVE(Op, DAG);
6453 case ISD::GET_ROUNDING:
6454 return lowerGET_ROUNDING(Op, DAG);
6455 case ISD::SET_ROUNDING:
6456 return lowerSET_ROUNDING(Op, DAG);
6457 case ISD::PREFETCH:
6458 return lowerPREFETCH(Op, DAG);
6459 case ISD::FP_EXTEND:
6460  case ISD::STRICT_FP_EXTEND:
6461    return lowerFP_EXTEND(Op, DAG);
6462 case ISD::GET_FPENV:
6463 return lowerGET_FPENV(Op, DAG);
6464 case ISD::SET_FPENV:
6465 return lowerSET_FPENV(Op, DAG);
6466 }
6467 return SDValue();
6468}
6469
6470// Used for D16: Casts the result of an instruction into the right vector,
6471// packs values if loads return unpacked values.
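// For example, on unpacked-D16 hardware a v4f16 result comes back as v4i32;
// each element is truncated to i16, an odd element count is padded out, and the
// whole thing is bitcast back to the requested FP vector type.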
6472static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
6473                                       const SDLoc &DL, SelectionDAG &DAG,
6474 bool Unpacked) {
6475 if (!LoadVT.isVector())
6476 return Result;
6477
6478 // Cast back to the original packed type or to a larger type that is a
6479  // multiple of 32 bits for D16. Widening the return type is required for
6480 // legalization.
6481 EVT FittingLoadVT = LoadVT;
6482 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6483 FittingLoadVT =
6485 LoadVT.getVectorNumElements() + 1);
6486 }
6487
6488 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6489 // Truncate to v2i16/v4i16.
6490 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6491
6492 // Workaround legalizer not scalarizing truncate after vector op
6493 // legalization but not creating intermediate vector trunc.
6495 DAG.ExtractVectorElements(Result, Elts);
6496 for (SDValue &Elt : Elts)
6497 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6498
6499    // Pad illegal v1i16/v3f16 to v4i16
6500 if ((LoadVT.getVectorNumElements() % 2) == 1)
6501 Elts.push_back(DAG.getPOISON(MVT::i16));
6502
6503 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6504
6505 // Bitcast to original type (v2f16/v4f16).
6506 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6507 }
6508
6509 // Cast back to the original packed type.
6510 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6511}
6512
6513SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6514 SelectionDAG &DAG,
6515                                              ArrayRef<SDValue> Ops,
6516                                              bool IsIntrinsic) const {
6517 SDLoc DL(M);
6518
6519 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6520 EVT LoadVT = M->getValueType(0);
6521
6522 EVT EquivLoadVT = LoadVT;
6523 if (LoadVT.isVector()) {
6524 if (Unpacked) {
6525 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6526 LoadVT.getVectorNumElements());
6527 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
6528 // Widen v3f16 to legal type
6529 EquivLoadVT =
6531 LoadVT.getVectorNumElements() + 1);
6532 }
6533 }
6534
6535 // Change from v4f16/v2f16 to EquivLoadVT.
6536 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
6537
6538  SDValue Load = DAG.getMemIntrinsicNode(
6539      IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
6540 M->getMemoryVT(), M->getMemOperand());
6541
6542 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
6543
6544 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
6545}
6546
6547SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
6548 SelectionDAG &DAG,
6549 ArrayRef<SDValue> Ops) const {
6550 SDLoc DL(M);
6551 EVT LoadVT = M->getValueType(0);
6552 EVT EltType = LoadVT.getScalarType();
6553 EVT IntVT = LoadVT.changeTypeToInteger();
6554
6555 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
6556
6557 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6558 bool IsTFE = M->getNumValues() == 3;
6559
6560 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
6561                                   : AMDGPUISD::BUFFER_LOAD_FORMAT)
6562                 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
6563 : AMDGPUISD::BUFFER_LOAD;
6564
6565 if (IsD16) {
6566 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
6567 }
6568
6569 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6570 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
6571 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6572 IsTFE);
6573
6574 if (isTypeLegal(LoadVT)) {
6575 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6576 M->getMemOperand(), DAG);
6577 }
6578
6579 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
6580 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6581 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
6582 M->getMemOperand(), DAG);
6583 return DAG.getMergeValues(
6584 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6585 DL);
6586}
6587
6589 SelectionDAG &DAG) {
6590 EVT VT = N->getValueType(0);
6591 unsigned CondCode = N->getConstantOperandVal(3);
6592 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6593 return DAG.getPOISON(VT);
6594
6595 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6596
6597 SDValue LHS = N->getOperand(1);
6598 SDValue RHS = N->getOperand(2);
6599
6600 SDLoc DL(N);
6601
6602 EVT CmpVT = LHS.getValueType();
6603 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
6604 unsigned PromoteOp =
6605        ICmpInst::isSigned(IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6606    LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
6607 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
6608 }
6609
6610 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
6611
6612 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6613 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6614
6615 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
6616 DAG.getCondCode(CCOpcode));
6617 if (VT.bitsEq(CCVT))
6618 return SetCC;
6619 return DAG.getZExtOrTrunc(SetCC, DL, VT);
6620}
6621
6623 SelectionDAG &DAG) {
6624 EVT VT = N->getValueType(0);
6625
6626 unsigned CondCode = N->getConstantOperandVal(3);
6627 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
6628 return DAG.getPOISON(VT);
6629
6630 SDValue Src0 = N->getOperand(1);
6631 SDValue Src1 = N->getOperand(2);
6632 EVT CmpVT = Src0.getValueType();
6633 SDLoc SL(N);
6634
6635 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
6636 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6637 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6638 }
6639
6640 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
6641 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
6642 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6643 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6644 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
6645 DAG.getCondCode(CCOpcode));
6646 if (VT.bitsEq(CCVT))
6647 return SetCC;
6648 return DAG.getZExtOrTrunc(SetCC, SL, VT);
6649}
6650
6652 SelectionDAG &DAG) {
6653 EVT VT = N->getValueType(0);
6654 SDValue Src = N->getOperand(1);
6655 SDLoc SL(N);
6656
6657 if (Src.getOpcode() == ISD::SETCC) {
6658 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
6659 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
6660 Src.getOperand(1), Src.getOperand(2));
6661 }
6662 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
6663 // (ballot 0) -> 0
6664 if (Arg->isZero())
6665 return DAG.getConstant(0, SL, VT);
6666
6667 // (ballot 1) -> EXEC/EXEC_LO
6668 if (Arg->isOne()) {
6669 Register Exec;
6670 if (VT.getScalarSizeInBits() == 32)
6671 Exec = AMDGPU::EXEC_LO;
6672 else if (VT.getScalarSizeInBits() == 64)
6673 Exec = AMDGPU::EXEC;
6674 else
6675 return SDValue();
6676
6677 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
6678 }
6679 }
6680
6681 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
6682 // ISD::SETNE)
6683 return DAG.getNode(
6684 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
6685 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
6686}
6687
6689 SelectionDAG &DAG) {
6690 EVT VT = N->getValueType(0);
6691 unsigned ValSize = VT.getSizeInBits();
6692 unsigned IID = N->getConstantOperandVal(0);
6693 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6694 IID == Intrinsic::amdgcn_permlanex16;
6695 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6696 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6697 SDLoc SL(N);
6698 MVT IntVT = MVT::getIntegerVT(ValSize);
6699 const GCNSubtarget *ST = TLI.getSubtarget();
6700 unsigned SplitSize = 32;
6701 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
6702 ST->hasDPALU_DPP() &&
6703 AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
6704 SplitSize = 64;
6705
6706 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
6707 SDValue Src2, MVT ValT) -> SDValue {
6708 SmallVector<SDValue, 4> Operands;
6709 switch (IID) {
6710 case Intrinsic::amdgcn_permlane16:
6711 case Intrinsic::amdgcn_permlanex16:
6712 case Intrinsic::amdgcn_update_dpp:
6713 Operands.push_back(N->getOperand(6));
6714 Operands.push_back(N->getOperand(5));
6715 Operands.push_back(N->getOperand(4));
6716 [[fallthrough]];
6717 case Intrinsic::amdgcn_writelane:
6718 Operands.push_back(Src2);
6719 [[fallthrough]];
6720 case Intrinsic::amdgcn_readlane:
6721 case Intrinsic::amdgcn_set_inactive:
6722 case Intrinsic::amdgcn_set_inactive_chain_arg:
6723 case Intrinsic::amdgcn_mov_dpp8:
6724 Operands.push_back(Src1);
6725 [[fallthrough]];
6726 case Intrinsic::amdgcn_readfirstlane:
6727 case Intrinsic::amdgcn_permlane64:
6728 Operands.push_back(Src0);
6729 break;
6730 default:
6731 llvm_unreachable("unhandled lane op");
6732 }
6733
6734 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
6735 std::reverse(Operands.begin(), Operands.end());
6736
6737 if (SDNode *GL = N->getGluedNode()) {
6738 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6739 GL = GL->getOperand(0).getNode();
6740 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6741 SDValue(GL, 0)));
6742 }
6743
6744 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
6745 };
6746
6747 SDValue Src0 = N->getOperand(1);
6748 SDValue Src1, Src2;
6749 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6750 IID == Intrinsic::amdgcn_mov_dpp8 ||
6751 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6752 Src1 = N->getOperand(2);
6753 if (IID == Intrinsic::amdgcn_writelane ||
6754 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
6755 Src2 = N->getOperand(3);
6756 }
6757
6758 if (ValSize == SplitSize) {
6759 // Already legal
6760 return SDValue();
6761 }
6762
6763 if (ValSize < 32) {
6764 bool IsFloat = VT.isFloatingPoint();
6765 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
6766 SL, MVT::i32);
6767
6768 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6769 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
6770 SL, MVT::i32);
6771 }
6772
6773 if (IID == Intrinsic::amdgcn_writelane) {
6774 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
6775 SL, MVT::i32);
6776 }
6777
6778 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
6779 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
6780 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
6781 }
6782
6783 if (ValSize % SplitSize != 0)
6784 return SDValue();
6785
6786 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
6787 EVT VT = N->getValueType(0);
6788 unsigned NE = VT.getVectorNumElements();
6789 EVT EltVT = VT.getVectorElementType();
6790 SmallVector<SDValue, 8> Scalars;
6791 unsigned NumOperands = N->getNumOperands();
6792 SmallVector<SDValue, 4> Operands(NumOperands);
6793 SDNode *GL = N->getGluedNode();
6794
6795 // only handle convergencectrl_glue
6796 assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6797
6798 for (unsigned i = 0; i != NE; ++i) {
6799 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
6800 ++j) {
6801 SDValue Operand = N->getOperand(j);
6802 EVT OperandVT = Operand.getValueType();
6803 if (OperandVT.isVector()) {
6804 // A vector operand; extract a single element.
6805 EVT OperandEltVT = OperandVT.getVectorElementType();
6806 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
6807 Operand, DAG.getVectorIdxConstant(i, SL));
6808 } else {
6809 // A scalar operand; just use it as is.
6810 Operands[j] = Operand;
6811 }
6812 }
6813
6814 if (GL)
6815 Operands[NumOperands - 1] =
6816 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6817 SDValue(GL->getOperand(0).getNode(), 0));
6818
6819 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
6820 }
6821
6822 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
6823 return DAG.getBuildVector(VecVT, SL, Scalars);
6824 };
6825
6826 if (VT.isVector()) {
6827 switch (MVT::SimpleValueType EltTy =
6828 VT.getVectorElementType().getSimpleVT().SimpleTy) {
6829 case MVT::i32:
6830 case MVT::f32:
6831 if (SplitSize == 32) {
6832 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
6833 return unrollLaneOp(LaneOp.getNode());
6834 }
6835 [[fallthrough]];
6836 case MVT::i16:
6837 case MVT::f16:
6838 case MVT::bf16: {
6839 unsigned SubVecNumElt =
6840 SplitSize / VT.getVectorElementType().getSizeInBits();
6841 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
6842 SmallVector<SDValue, 4> Pieces;
6843 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
6844 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
6845 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
6846 DAG.getConstant(EltIdx, SL, MVT::i32));
6847
6848 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
6849 IsPermLane16)
6850 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
6851 DAG.getConstant(EltIdx, SL, MVT::i32));
6852
6853 if (IID == Intrinsic::amdgcn_writelane)
6854 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
6855 DAG.getConstant(EltIdx, SL, MVT::i32));
6856
6857 Pieces.push_back(
6858 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
6859 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
6860 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6861 EltIdx += SubVecNumElt;
6862 }
6863 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
6864 }
6865 default:
6866 // Handle all other cases by bitcasting to i32 vectors
6867 break;
6868 }
6869 }
6870
6871 MVT VecVT =
6872 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
6873 Src0 = DAG.getBitcast(VecVT, Src0);
6874
6875 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6876 Src1 = DAG.getBitcast(VecVT, Src1);
6877
6878 if (IID == Intrinsic::amdgcn_writelane)
6879 Src2 = DAG.getBitcast(VecVT, Src2);
6880
6881 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
6882 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
6883 return DAG.getBitcast(VT, UnrolledLaneOp);
6884}
6885
6886void SITargetLowering::ReplaceNodeResults(SDNode *N,
6887 SmallVectorImpl<SDValue> &Results,
6888 SelectionDAG &DAG) const {
6889 switch (N->getOpcode()) {
6890 case ISD::INSERT_VECTOR_ELT: {
6891 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
6892 Results.push_back(Res);
6893 return;
6894 }
6895 case ISD::EXTRACT_VECTOR_ELT: {
6896 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
6897 Results.push_back(Res);
6898 return;
6899 }
6900 case ISD::INTRINSIC_WO_CHAIN: {
6901 unsigned IID = N->getConstantOperandVal(0);
6902 switch (IID) {
6903 case Intrinsic::amdgcn_make_buffer_rsrc:
6904 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
6905 return;
6906 case Intrinsic::amdgcn_cvt_pkrtz: {
6907 SDValue Src0 = N->getOperand(1);
6908 SDValue Src1 = N->getOperand(2);
6909 SDLoc SL(N);
6910 SDValue Cvt =
6911 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
6912 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
6913 return;
6914 }
6915 case Intrinsic::amdgcn_cvt_pknorm_i16:
6916 case Intrinsic::amdgcn_cvt_pknorm_u16:
6917 case Intrinsic::amdgcn_cvt_pk_i16:
6918 case Intrinsic::amdgcn_cvt_pk_u16: {
6919 SDValue Src0 = N->getOperand(1);
6920 SDValue Src1 = N->getOperand(2);
6921 SDLoc SL(N);
6922 unsigned Opcode;
6923
6924 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6925 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
6926 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6927 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
6928 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6929 Opcode = AMDGPUISD::CVT_PK_I16_I32;
6930 else
6931 Opcode = AMDGPUISD::CVT_PK_U16_U32;
6932
6933 EVT VT = N->getValueType(0);
6934 if (isTypeLegal(VT))
6935 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
6936 else {
6937 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
6938 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
6939 }
6940 return;
6941 }
6942 case Intrinsic::amdgcn_s_buffer_load: {
6943 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
6944 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
6945 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
6946 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
6947 // s_buffer_load_i8.
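 // Put differently: only uses that sign-extend the loaded byte end up as
 // s_buffer_load_i8, while zero-extending uses keep the s_buffer_load_u8 form
 // generated here.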
6948 if (!Subtarget->hasScalarSubwordLoads())
6949 return;
6950 SDValue Op = SDValue(N, 0);
6951 SDValue Rsrc = Op.getOperand(1);
6952 SDValue Offset = Op.getOperand(2);
6953 SDValue CachePolicy = Op.getOperand(3);
6954 EVT VT = Op.getValueType();
6955 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6956 SDLoc DL(Op);
6957 MachineFunction &MF = DAG.getMachineFunction();
6958 const DataLayout &DataLayout = DAG.getDataLayout();
6959 Align Alignment =
6960 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
6961 MachineMemOperand *MMO = MF.getMachineMemOperand(
6962 MachinePointerInfo(),
6963 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6964 MachineMemOperand::MOInvariant,
6965 VT.getStoreSize(), Alignment);
6966 SDValue LoadVal;
6967 if (!Offset->isDivergent()) {
6968 SDValue Ops[] = {Rsrc, // source register
6969 Offset, CachePolicy};
6970 SDValue BufferLoad =
6971 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
6972 DAG.getVTList(MVT::i32), Ops, VT, MMO);
6973 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
6974 } else {
6975 SDValue Ops[] = {
6976 DAG.getEntryNode(), // Chain
6977 Rsrc, // rsrc
6978 DAG.getConstant(0, DL, MVT::i32), // vindex
6979 {}, // voffset
6980 {}, // soffset
6981 {}, // offset
6982 CachePolicy, // cachepolicy
6983 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6984 };
6985 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
6986 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
6987 }
6988 Results.push_back(LoadVal);
6989 return;
6990 }
6991 case Intrinsic::amdgcn_dead: {
6992 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
6993 Results.push_back(DAG.getPOISON(N->getValueType(I)));
6994 return;
6995 }
6996 }
6997 break;
6998 }
6999 case ISD::INTRINSIC_W_CHAIN: {
7000 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
7001 if (Res.getOpcode() == ISD::MERGE_VALUES) {
7002 // FIXME: Hacky
7003 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
7004 Results.push_back(Res.getOperand(I));
7005 }
7006 } else {
7007 Results.push_back(Res);
7008 Results.push_back(Res.getValue(1));
7009 }
7010 return;
7011 }
7012
7013 break;
7014 }
7015 case ISD::SELECT: {
7016 SDLoc SL(N);
7017 EVT VT = N->getValueType(0);
7018 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
7019 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
7020 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
7021
7022 EVT SelectVT = NewVT;
7023 if (NewVT.bitsLT(MVT::i32)) {
7024 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
7025 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
7026 SelectVT = MVT::i32;
7027 }
7028
7029 SDValue NewSelect =
7030 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
7031
7032 if (NewVT != SelectVT)
7033 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
7034 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
7035 return;
7036 }
7037 case ISD::FNEG: {
7038 if (N->getValueType(0) != MVT::v2f16)
7039 break;
7040
7041 SDLoc SL(N);
7042 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7043
7044 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
7045 DAG.getConstant(0x80008000, SL, MVT::i32));
7046 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7047 return;
7048 }
7049 case ISD::FABS: {
7050 if (N->getValueType(0) != MVT::v2f16)
7051 break;
7052
7053 SDLoc SL(N);
7054 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7055
7056 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
7057 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
7058 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7059 return;
7060 }
7061 case ISD::FSQRT: {
7062 if (N->getValueType(0) != MVT::f16)
7063 break;
7064 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
7065 break;
7066 }
7067 default:
7068 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
7069 break;
7070 }
7071}
7072
7073/// Helper function for LowerBRCOND
7074static SDNode *findUser(SDValue Value, unsigned Opcode) {
7075
7076 for (SDUse &U : Value->uses()) {
7077 if (U.get() != Value)
7078 continue;
7079
7080 if (U.getUser()->getOpcode() == Opcode)
7081 return U.getUser();
7082 }
7083 return nullptr;
7084}
7085
7086unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7087 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
7088 switch (Intr->getConstantOperandVal(1)) {
7089 case Intrinsic::amdgcn_if:
7090 return AMDGPUISD::IF;
7091 case Intrinsic::amdgcn_else:
7092 return AMDGPUISD::ELSE;
7093 case Intrinsic::amdgcn_loop:
7094 return AMDGPUISD::LOOP;
7095 case Intrinsic::amdgcn_end_cf:
7096 llvm_unreachable("should not occur");
7097 default:
7098 return 0;
7099 }
7100 }
7101
7102 // break, if_break, else_break are all only used as inputs to loop, not
7103 // directly as branch conditions.
7104 return 0;
7105}
7106
7107bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
7108 const Triple &TT = getTargetMachine().getTargetTriple();
7109 return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
7110 GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
7111 AMDGPU::shouldEmitConstantsToTextSection(TT);
7112}
7113
7114bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
7115 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7116 return false;
7117
7118 // FIXME: Either avoid relying on address space here or change the default
7119 // address space for functions to avoid the explicit check.
7120 return (GV->getValueType()->isFunctionTy() ||
7123}
7124
7125bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
7126 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
7127}
7128
7129bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
7130 if (!GV->hasExternalLinkage())
7131 return true;
7132
7133 const auto OS = getTargetMachine().getTargetTriple().getOS();
7134 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
7135}
7136
7137/// This transforms the control flow intrinsics to get the branch destination as
7138/// last parameter, and also switches the branch target with BR if the need arises.
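/// For example, a BRCOND fed by llvm.amdgcn.if is rebuilt below as an
/// AMDGPUISD::IF node that carries the branch destination as its final
/// operand, and any unconditional BR user is updated to match.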
7139SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
7140 SDLoc DL(BRCOND);
7141
7142 SDNode *Intr = BRCOND.getOperand(1).getNode();
7143 SDValue Target = BRCOND.getOperand(2);
7144 SDNode *BR = nullptr;
7145 SDNode *SetCC = nullptr;
7146
7147 if (Intr->getOpcode() == ISD::SETCC) {
7148 // As long as we negate the condition everything is fine
7149 SetCC = Intr;
7150 Intr = SetCC->getOperand(0).getNode();
7151
7152 } else {
7153 // Get the target from BR if we don't negate the condition
7154 BR = findUser(BRCOND, ISD::BR);
7155 assert(BR && "brcond missing unconditional branch user");
7156 Target = BR->getOperand(1);
7157 }
7158
7159 unsigned CFNode = isCFIntrinsic(Intr);
7160 if (CFNode == 0) {
7161 // This is a uniform branch so we don't need to legalize.
7162 return BRCOND;
7163 }
7164
7165 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
7166 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
7167
7168 assert(!SetCC ||
7169 (SetCC->getConstantOperandVal(1) == 1 &&
7170 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
7171 ISD::SETNE));
7172
7173 // operands of the new intrinsic call
7174 SmallVector<SDValue, 8> Ops;
7175 if (HaveChain)
7176 Ops.push_back(BRCOND.getOperand(0));
7177
7178 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
7179 Ops.push_back(Target);
7180
7181 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
7182
7183 // build the new intrinsic call
7184 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
7185
7186 if (!HaveChain) {
7187 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
7188
7189 Result = DAG.getMergeValues(Ops, DL).getNode();
7190 }
7191
7192 if (BR) {
7193 // Give the branch instruction our target
7194 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
7195 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
7196 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
7197 }
7198
7199 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
7200
7201 // Copy the intrinsic results to registers
7202 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
7203 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
7204 if (!CopyToReg)
7205 continue;
7206
7207 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
7208 SDValue(Result, i - 1), SDValue());
7209
7210 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
7211 }
7212
7213 // Remove the old intrinsic from the chain
7214 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
7215 Intr->getOperand(0));
7216
7217 return Chain;
7218}
7219
7220SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
7221 MVT VT = Op.getSimpleValueType();
7222 SDLoc DL(Op);
7223 // Checking the depth
7224 if (Op.getConstantOperandVal(0) != 0)
7225 return DAG.getConstant(0, DL, VT);
7226
7227 MachineFunction &MF = DAG.getMachineFunction();
7228 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7229 // Check for kernel and shader functions
7230 if (Info->isEntryFunction())
7231 return DAG.getConstant(0, DL, VT);
7232
7233 MachineFrameInfo &MFI = MF.getFrameInfo();
7234 // There is a call to @llvm.returnaddress in this function
7235 MFI.setReturnAddressIsTaken(true);
7236
7237 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
7238 // Get the return address reg and mark it as an implicit live-in
7239 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
7240 getRegClassFor(VT, Op.getNode()->isDivergent()));
7241
7242 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
7243}
7244
7245SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
7246 const SDLoc &DL, EVT VT) const {
7247 return Op.getValueType().bitsLE(VT)
7248 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
7249 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
7250 DAG.getTargetConstant(0, DL, MVT::i32));
7251}
7252
7253SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
7254 SelectionDAG &DAG) const {
7255 EVT DstVT = Op.getValueType();
7256 unsigned NumElts = DstVT.getVectorNumElements();
7257 assert(NumElts > 2 && isPowerOf2_32(NumElts));
7258
7259 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
7260
7261 SDLoc DL(Op);
7262 unsigned Opc = Op.getOpcode();
7263 SDValue Flags = Op.getOperand(1);
7264 EVT HalfDstVT =
7265 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
7266 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
7267 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
7268
7269 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
7270}
7271
7272SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
7273 SDValue Src = Op.getOperand(0);
7274 EVT SrcVT = Src.getValueType();
7275 EVT DstVT = Op.getValueType();
7276
7277 if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
7278 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
7279 if (SrcVT.getScalarType() != MVT::f32)
7280 return SDValue();
7281 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
7282 }
7283
7284 if (SrcVT.getScalarType() != MVT::f64)
7285 return Op;
7286
7287 SDLoc DL(Op);
7288 if (DstVT == MVT::f16) {
7289 // TODO: Handle strictfp
7290 if (Op.getOpcode() != ISD::FP_ROUND)
7291 return Op;
7292
7293 if (!Subtarget->has16BitInsts()) {
7294 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
7295 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7296 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7297 }
7298 if (Op->getFlags().hasApproximateFuncs()) {
7299 SDValue Flags = Op.getOperand(1);
7300 SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
7301 return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
7302 }
7303 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
7304 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7305 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7306 }
7307
7308 assert(DstVT.getScalarType() == MVT::bf16 &&
7309 "custom lower FP_ROUND for f16 or bf16");
7310 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
7311
7312 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
7313 // hardware f32 -> bf16 instruction.
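 // Rounding f64 to f32 and then f32 to bf16 with round-to-nearest at both
 // steps could double-round; the round-to-odd intermediate avoids that.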
7314 EVT F32VT = SrcVT.isVector() ? SrcVT.changeVectorElementType(MVT::f32) :
7315 MVT::f32;
7316 SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
7317 return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
7318 DAG.getTargetConstant(0, DL, MVT::i32));
7319}
7320
7321SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
7322 SelectionDAG &DAG) const {
7323 EVT VT = Op.getValueType();
7324 const MachineFunction &MF = DAG.getMachineFunction();
7325 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7326 bool IsIEEEMode = Info->getMode().IEEE;
7327
7328 // FIXME: Assert during selection that this is only selected for
7329 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
7330 // mode functions, but this happens to be OK since it's only done in cases
7331 // where it is known that there is no sNaN.
7332 if (IsIEEEMode)
7333 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
7334
7335 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7336 VT == MVT::v16bf16)
7337 return splitBinaryVectorOp(Op, DAG);
7338 return Op;
7339}
7340
7341SDValue
7342SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
7343 SelectionDAG &DAG) const {
7344 EVT VT = Op.getValueType();
7345 const MachineFunction &MF = DAG.getMachineFunction();
7346 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7347 bool IsIEEEMode = Info->getMode().IEEE;
7348
7349 if (IsIEEEMode)
7350 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
7351
7352 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7353 VT == MVT::v16bf16)
7354 return splitBinaryVectorOp(Op, DAG);
7355 return Op;
7356}
7357
7358SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
7359 SelectionDAG &DAG) const {
7360 EVT VT = Op.getValueType();
7361 if (VT.isVector())
7362 return splitBinaryVectorOp(Op, DAG);
7363
7364 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7365 !Subtarget->hasMinimum3Maximum3F16() &&
7366 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7367 "should not need to widen f16 minimum/maximum to v2f16");
7368
7369 // Widen f16 operation to v2f16
7370
7371 // fminimum f16:x, f16:y ->
7372 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
7373 // (v2f16 (scalar_to_vector y))), 0
7374 SDLoc SL(Op);
7375 SDValue WideSrc0 =
7376 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
7377 SDValue WideSrc1 =
7378 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
7379
7380 SDValue Widened =
7381 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7382
7383 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
7384 DAG.getConstant(0, SL, MVT::i32));
7385}
7386
7387SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
7388 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
7389 EVT VT = Op.getValueType();
7390 assert(VT == MVT::f16);
7391
7392 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
7393 EVT ExpVT = Exp.getValueType();
7394 if (ExpVT == MVT::i16)
7395 return Op;
7396
7397 SDLoc DL(Op);
7398
7399 // Correct the exponent type for f16 to i16.
7400 // Clamp the range of the exponent to the instruction's range.
7401
7402 // TODO: This should be a generic narrowing legalization, and can easily be
7403 // done for GlobalISel.
7404
7405 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
7406 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
7407
7408 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
7409 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
7410
7411 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
7412
7413 if (IsStrict) {
7414 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
7415 {Op.getOperand(0), Op.getOperand(1), TruncExp});
7416 }
7417
7418 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
7419}
7420
7422 switch (Op->getOpcode()) {
7423 case ISD::SRA:
7424 case ISD::SMIN:
7425 case ISD::SMAX:
7426 return ISD::SIGN_EXTEND;
7427 case ISD::SRL:
7428 case ISD::UMIN:
7429 case ISD::UMAX:
7430 return ISD::ZERO_EXTEND;
7431 case ISD::ADD:
7432 case ISD::SUB:
7433 case ISD::AND:
7434 case ISD::OR:
7435 case ISD::XOR:
7436 case ISD::SHL:
7437 case ISD::SELECT:
7438 case ISD::MUL:
7439 // operation result won't be influenced by garbage high bits.
7440 // TODO: are all of those cases correct, and are there more?
7441 return ISD::ANY_EXTEND;
7442 case ISD::SETCC: {
7443 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7444 return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7445 }
7446 default:
7447 llvm_unreachable("unexpected opcode!");
7448 }
7449}
7450
7451SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
7452 DAGCombinerInfo &DCI) const {
7453 const unsigned Opc = Op.getOpcode();
7454 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
7455 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
7456 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
7457 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
7458 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
7459
7460 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
7461 : Op->getOperand(0).getValueType();
7462 auto ExtTy = OpTy.changeElementType(MVT::i32);
7463
7464 if (DCI.isBeforeLegalizeOps() ||
7465 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
7466 return SDValue();
7467
7468 auto &DAG = DCI.DAG;
7469
7470 SDLoc DL(Op);
7471 SDValue LHS;
7472 SDValue RHS;
7473 if (Opc == ISD::SELECT) {
7474 LHS = Op->getOperand(1);
7475 RHS = Op->getOperand(2);
7476 } else {
7477 LHS = Op->getOperand(0);
7478 RHS = Op->getOperand(1);
7479 }
7480
7481 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
7482 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
7483
7484 // Special case: for shifts, the RHS always needs a zext.
7485 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
7486 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
7487 else
7488 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
7489
7490 // setcc always returns i1 or an i1 vector, so there is no need to truncate after.
7491 if (Opc == ISD::SETCC) {
7492 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7493 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
7494 }
7495
7496 // For other ops, we extend the operation's return type as well so we need to
7497 // truncate back to the original type.
7498 SDValue NewVal;
7499 if (Opc == ISD::SELECT)
7500 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
7501 else
7502 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
7503
7504 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
7505}
7506
7507SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
7508 SDValue Mag = Op.getOperand(0);
7509 EVT MagVT = Mag.getValueType();
7510
7511 if (MagVT.getVectorNumElements() > 2)
7512 return splitBinaryVectorOp(Op, DAG);
7513
7514 SDValue Sign = Op.getOperand(1);
7515 EVT SignVT = Sign.getValueType();
7516
7517 if (MagVT == SignVT)
7518 return Op;
7519
7520 // fcopysign v2f16:mag, v2f32:sign ->
7521 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
7522
7523 SDLoc SL(Op);
7524 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
7525 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
7526
7527 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
7528
7529 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
7530}
7531
7532// Custom lowering for vector multiplications and s_mul_u64.
7533SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
7534 EVT VT = Op.getValueType();
7535
7536 // Split vector operands.
7537 if (VT.isVector())
7538 return splitBinaryVectorOp(Op, DAG);
7539
7540 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
7541
7542 // There are four ways to lower s_mul_u64:
7543 //
7544 // 1. If all the operands are uniform, then we lower it as it is.
7545 //
7546 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
7547 // multiplications because there is not a vector equivalent of s_mul_u64.
7548 //
7549 // 3. If the cost model decides that it is more efficient to use vector
7550 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
7551 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp.
7552 //
7553 // 4. If the cost model decides to use vector registers and both of the
7554 // operands are zero-extended/sign-extended from 32-bits, then we split the
7555 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
7556 // possible to check if the operands are zero-extended or sign-extended in
7557 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
7558 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
7559 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
7560 // If the cost model decides that we have to use vector registers, then
7561 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
7562 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
7563 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
7564 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
7565 // SIInstrInfo.cpp.
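 //
 // For example, a uniform i64 multiply whose operands are both known to be
 // zero-extended from 32 bits is emitted here as S_MUL_U64_U32_PSEUDO; later
 // passes either split it into 32-bit multiplies (if it ends up on the VALU)
 // or rewrite it back to s_mul_u64 (if it stays on the SALU).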
7566
7567 if (Op->isDivergent())
7568 return SDValue();
7569
7570 SDValue Op0 = Op.getOperand(0);
7571 SDValue Op1 = Op.getOperand(1);
7572 // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64
7573 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
7574 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
7575 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
7576 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
7577 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
7578 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
7579 SDLoc SL(Op);
7580 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7581 return SDValue(
7582 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
7583 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
7584 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
7585 if (Op0SignBits >= 33 && Op1SignBits >= 33)
7586 return SDValue(
7587 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
7588 // If all the operands are uniform, then we lower s_mul_u64 as it is.
7589 return Op;
7590}
7591
7592SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
7593 EVT VT = Op.getValueType();
7594 SDLoc SL(Op);
7595 SDValue LHS = Op.getOperand(0);
7596 SDValue RHS = Op.getOperand(1);
7597 bool isSigned = Op.getOpcode() == ISD::SMULO;
7598
7599 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
7600 const APInt &C = RHSC->getAPIntValue();
7601 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
7602 if (C.isPowerOf2()) {
7603 // smulo(x, signed_min) is same as umulo(x, signed_min).
7604 bool UseArithShift = isSigned && !C.isMinSignedValue();
7605 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
7606 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
7607 SDValue Overflow =
7608 DAG.getSetCC(SL, MVT::i1,
7609 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
7610 Result, ShiftAmt),
7611 LHS, ISD::SETNE);
7612 return DAG.getMergeValues({Result, Overflow}, SL);
7613 }
7614 }
7615
7616 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
7617 SDValue Top =
7618 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
7619
7620 SDValue Sign = isSigned
7621 ? DAG.getNode(ISD::SRA, SL, VT, Result,
7622 DAG.getConstant(VT.getScalarSizeInBits() - 1,
7623 SL, MVT::i32))
7624 : DAG.getConstant(0, SL, VT);
7625 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
7626
7627 return DAG.getMergeValues({Result, Overflow}, SL);
7628}
7629
7630SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
7631 if (Op->isDivergent()) {
7632 // Select to V_MAD_[IU]64_[IU]32.
7633 return Op;
7634 }
7635 if (Subtarget->hasSMulHi()) {
7636 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
7637 return SDValue();
7638 }
7639 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
7640 // calculate the high part, so we might as well do the whole thing with
7641 // V_MAD_[IU]64_[IU]32.
7642 return Op;
7643}
7644
7645SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
7646 if (!Subtarget->isTrapHandlerEnabled() ||
7647 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7648 return lowerTrapEndpgm(Op, DAG);
7649
7650 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
7651 : lowerTrapHsaQueuePtr(Op, DAG);
7652}
7653
7654SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
7655 SDLoc SL(Op);
7656 SDValue Chain = Op.getOperand(0);
7657 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
7658}
7659
7660SDValue
7661SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
7662 const SDLoc &DL, Align Alignment,
7663 ImplicitParameter Param) const {
7664 MachineFunction &MF = DAG.getMachineFunction();
7665 uint64_t Offset = getImplicitParameterOffset(MF, Param);
7666 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
7667 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
7668 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
7669 MachineMemOperand::MODereferenceable |
7670 MachineMemOperand::MOInvariant);
7671}
7672
7673SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
7674 SelectionDAG &DAG) const {
7675 SDLoc SL(Op);
7676 SDValue Chain = Op.getOperand(0);
7677
7678 SDValue QueuePtr;
7679 // For code object version 5, QueuePtr is passed through implicit kernarg.
7680 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7681 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
7682 QueuePtr =
7683 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
7684 } else {
7685 MachineFunction &MF = DAG.getMachineFunction();
7686 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7687 Register UserSGPR = Info->getQueuePtrUserSGPR();
7688
7689 if (UserSGPR == AMDGPU::NoRegister) {
7690 // We probably are in a function incorrectly marked with
7691 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
7692 // trap, so just use a null pointer.
7693 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
7694 } else {
7695 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
7696 MVT::i64);
7697 }
7698 }
7699
7700 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
7701 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
7702
7703 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
7704 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
7705 ToReg.getValue(1)};
7706 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7707}
7708
7709SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
7710 SDLoc SL(Op);
7711 SDValue Chain = Op.getOperand(0);
7712
7713 // We need to simulate the 's_trap 2' instruction on targets that run in
7714 // PRIV=1 (where it is treated as a nop).
7715 if (Subtarget->hasPrivEnabledTrap2NopBug())
7716 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
7717
7718 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
7719 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7720 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7721}
7722
7723SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
7724 SDLoc SL(Op);
7725 SDValue Chain = Op.getOperand(0);
7726 MachineFunction &MF = DAG.getMachineFunction();
7727
7728 if (!Subtarget->isTrapHandlerEnabled() ||
7729 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7730 LLVMContext &Ctx = MF.getFunction().getContext();
7731 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
7732 "debugtrap handler not supported",
7733 Op.getDebugLoc(), DS_Warning));
7734 return Chain;
7735 }
7736
7737 uint64_t TrapID =
7738 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
7739 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7740 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7741}
7742
7743SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
7744 SelectionDAG &DAG) const {
7745 if (Subtarget->hasApertureRegs()) {
7746 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
7747 ? AMDGPU::SRC_SHARED_BASE
7748 : AMDGPU::SRC_PRIVATE_BASE;
7749 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
7750 !Subtarget->hasGloballyAddressableScratch()) &&
7751 "Cannot use src_private_base with globally addressable scratch!");
7752 // Note: this feature (register) is broken. When used as a 32-bit operand,
7753 // it returns a wrong value (all zeroes?). The real value is in the upper 32
7754 // bits.
7755 //
7756 // To work around the issue, directly emit a 64 bit mov from this register
7757 // then extract the high bits. Note that this shouldn't even result in a
7758 // shift being emitted and simply become a pair of registers (e.g.):
7759 // s_mov_b64 s[6:7], src_shared_base
7760 // v_mov_b32_e32 v1, s7
7761 //
7762 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
7763 // coalescing would kick in and it would think it's okay to use the "HI"
7764 // subregister directly (instead of extracting the HI 32 bits) which is an
7765 // artificial (unusable) register.
7766 // Register TableGen definitions would need an overhaul to get rid of the
7767 // artificial "HI" aperture registers and prevent this kind of issue from
7768 // happening.
7769 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
7770 DAG.getRegister(ApertureRegNo, MVT::i64));
7771 return DAG.getNode(
7772 ISD::TRUNCATE, DL, MVT::i32,
7773 DAG.getNode(ISD::SRL, DL, MVT::i64,
7774 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
7775 }
7776
7777 // For code object version 5, private_base and shared_base are passed through
7778 // implicit kernargs.
7779 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7780 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
7781 ImplicitParameter Param =
7782 (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
7783 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
7784 }
7785
7786 MachineFunction &MF = DAG.getMachineFunction();
7787 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7788 Register UserSGPR = Info->getQueuePtrUserSGPR();
7789 if (UserSGPR == AMDGPU::NoRegister) {
7790 // We probably are in a function incorrectly marked with
7791 // amdgpu-no-queue-ptr. This is undefined.
7792 return DAG.getPOISON(MVT::i32);
7793 }
7794
7795 SDValue QueuePtr =
7796 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
7797
7798 // Offset into amd_queue_t for group_segment_aperture_base_hi /
7799 // private_segment_aperture_base_hi.
7800 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
7801
7802 SDValue Ptr =
7803 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
7804
7805 // TODO: Use custom target PseudoSourceValue.
7806 // TODO: We should use the value from the IR intrinsic call, but it might not
7807 // be available and how do we get it?
7808 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
7809 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
7810 commonAlignment(Align(64), StructOffset),
7811 MachineMemOperand::MODereferenceable |
7812 MachineMemOperand::MOInvariant);
7813}
7814
7815/// Return true if the value is a known valid address, such that a null check is
7816/// not necessary.
7817static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
7818 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
7819 if (isa<FrameIndexSDNode, GlobalAddressSDNode, BasicBlockSDNode>(Val))
7820 return true;
7821
7822 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
7823 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
7824
7825 // TODO: Search through arithmetic, handle arguments and loads
7826 // marked nonnull.
7827 return false;
7828}
7829
7830SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
7831 SelectionDAG &DAG) const {
7832 SDLoc SL(Op);
7833
7834 const AMDGPUTargetMachine &TM =
7835 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
7836
7837 unsigned DestAS, SrcAS;
7838 SDValue Src;
7839 bool IsNonNull = false;
7840 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
7841 SrcAS = ASC->getSrcAddressSpace();
7842 Src = ASC->getOperand(0);
7843 DestAS = ASC->getDestAddressSpace();
7844 } else {
7845 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
7846 Op.getConstantOperandVal(0) ==
7847 Intrinsic::amdgcn_addrspacecast_nonnull);
7848 Src = Op->getOperand(1);
7849 SrcAS = Op->getConstantOperandVal(2);
7850 DestAS = Op->getConstantOperandVal(3);
7851 IsNonNull = true;
7852 }
7853
7854 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
7855
7856 // flat -> local/private
7857 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
7858 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
7859 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
7860 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7861
7862 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
7863 Subtarget->hasGloballyAddressableScratch()) {
7864 // flat -> private with globally addressable scratch: subtract
7865 // src_flat_scratch_base_lo.
7866 SDValue FlatScratchBaseLo(
7867 DAG.getMachineNode(
7868 AMDGPU::S_MOV_B32, SL, MVT::i32,
7869 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
7870 0);
7871 Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
7872 }
7873
7874 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7875 return Ptr;
7876
7877 unsigned NullVal = TM.getNullPointerValue(DestAS);
7878 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7879 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
7880
7881 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
7882 SegmentNullPtr);
7883 }
7884 }
7885
7886 // local/private -> flat
7887 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
7888 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
7889 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
7890 SDValue CvtPtr;
7891 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
7892 Subtarget->hasGloballyAddressableScratch()) {
7893 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
7894 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
7895 SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
7896 SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
7897 ThreadID = DAG.getNode(
7898 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
7899 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
7900 AllOnes, ThreadID);
7901 if (Subtarget->isWave64())
7902 ThreadID = DAG.getNode(
7903 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
7904 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
7905 AllOnes, ThreadID);
7906 SDValue ShAmt = DAG.getShiftAmountConstant(
7907 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
7908 SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
7909 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
7910 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
7911 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
7912 // 64-bit hi:lo value.
7913 SDValue FlatScratchBase = {
7914 DAG.getMachineNode(
7915 AMDGPU::S_MOV_B64, SL, MVT::i64,
7916 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
7917 0};
7918 CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
7919 } else {
7920 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
7921 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
7922 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
7923 }
7924
7925 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7926 return CvtPtr;
7927
7928 unsigned NullVal = TM.getNullPointerValue(SrcAS);
7929 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7930
7931 SDValue NonNull =
7932 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
7933
7934 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
7935 FlatNullPtr);
7936 }
7937 }
7938
7939 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7940 Op.getValueType() == MVT::i64) {
7941 const SIMachineFunctionInfo *Info =
7942 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
7943 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
7944 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
7945 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
7946 }
7947
7948 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7949 Src.getValueType() == MVT::i64)
7950 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7951
7952 // global <-> flat are no-ops and never emitted.
7953
7954 // Invalid casts are poison.
7955 return DAG.getPOISON(Op->getValueType(0));
7956}
7957
7958// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
7959// the small vector and inserting them into the big vector. That is better than
7960// the default expansion of doing it via a stack slot. Even though the use of
7961// the stack slot would be optimized away afterwards, the stack slot itself
7962// remains.
7963SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
7964 SelectionDAG &DAG) const {
7965 SDValue Vec = Op.getOperand(0);
7966 SDValue Ins = Op.getOperand(1);
7967 SDValue Idx = Op.getOperand(2);
7968 EVT VecVT = Vec.getValueType();
7969 EVT InsVT = Ins.getValueType();
7970 EVT EltVT = VecVT.getVectorElementType();
7971 unsigned InsNumElts = InsVT.getVectorNumElements();
7972 unsigned IdxVal = Idx->getAsZExtVal();
7973 SDLoc SL(Op);
7974
7975 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
7976 // Insert 32-bit registers at a time.
7977 assert(InsNumElts % 2 == 0 && "expect legal vector types");
7978
7979 unsigned VecNumElts = VecVT.getVectorNumElements();
7980 EVT NewVecVT =
7981 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
7982 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
7983 : EVT::getVectorVT(*DAG.getContext(),
7984 MVT::i32, InsNumElts / 2);
7985
7986 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
7987 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
7988
7989 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
7990 SDValue Elt;
7991 if (InsNumElts == 2) {
7992 Elt = Ins;
7993 } else {
7994 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
7995 DAG.getConstant(I, SL, MVT::i32));
7996 }
7997 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
7998 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
7999 }
8000
8001 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
8002 }
8003
8004 for (unsigned I = 0; I != InsNumElts; ++I) {
8005 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
8006 DAG.getConstant(I, SL, MVT::i32));
8007 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
8008 DAG.getConstant(IdxVal + I, SL, MVT::i32));
8009 }
8010 return Vec;
8011}
8012
8013SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
8014 SelectionDAG &DAG) const {
8015 SDValue Vec = Op.getOperand(0);
8016 SDValue InsVal = Op.getOperand(1);
8017 SDValue Idx = Op.getOperand(2);
8018 EVT VecVT = Vec.getValueType();
8019 EVT EltVT = VecVT.getVectorElementType();
8020 unsigned VecSize = VecVT.getSizeInBits();
8021 unsigned EltSize = EltVT.getSizeInBits();
8022 SDLoc SL(Op);
8023
8024 // Specially handle the case of v4i16 with static indexing.
8025 unsigned NumElts = VecVT.getVectorNumElements();
8026 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
8027 if (NumElts == 4 && EltSize == 16 && KIdx) {
8028 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
8029
8030 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8031 DAG.getConstant(0, SL, MVT::i32));
8032 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8033 DAG.getConstant(1, SL, MVT::i32));
8034
8035 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
8036 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
8037
8038 unsigned Idx = KIdx->getZExtValue();
8039 bool InsertLo = Idx < 2;
8040 SDValue InsHalf = DAG.getNode(
8041 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
8042 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
8043 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8044
8045 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
8046
8047 SDValue Concat =
8048 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
8049 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8050
8051 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
8052 }
8053
8054 // Static indexing does not lower to stack access, and hence there is no need
8055 // for special custom lowering to avoid stack access.
8056 if (isa<ConstantSDNode>(Idx))
8057 return SDValue();
8058
8059 // Avoid stack access for dynamic indexing by custom lowering to
8060 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
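 // In effect, the index is scaled to a bit offset, the new element is splatted
 // across an integer the size of the whole vector, and a bitfield insert
 // merges it into the original bits, so no scratch memory is needed.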
8061
8062 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
8063
8064 MVT IntVT = MVT::getIntegerVT(VecSize);
8065
8066 // Convert vector index to bit-index and get the required bit mask.
8067 assert(isPowerOf2_32(EltSize));
8068 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
8069 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8070 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8071 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
8072 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
8073
8074 // 1. Create a congruent vector with the target value in each element.
8075 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
8076 DAG.getSplatBuildVector(VecVT, SL, InsVal));
8077
8078 // 2. Mask off all other indices except the required index within (1).
8079 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
8080
8081 // 3. Mask off the required index within the target vector.
8082 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8083 SDValue RHS =
8084 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
8085
8086 // 4. Get (2) and (3) ORed into the target vector.
8087 SDValue BFI =
8088 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
8089
8090 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
8091}
8092
8093SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
8094 SelectionDAG &DAG) const {
8095 SDLoc SL(Op);
8096
8097 EVT ResultVT = Op.getValueType();
8098 SDValue Vec = Op.getOperand(0);
8099 SDValue Idx = Op.getOperand(1);
8100 EVT VecVT = Vec.getValueType();
8101 unsigned VecSize = VecVT.getSizeInBits();
8102 EVT EltVT = VecVT.getVectorElementType();
8103
8104 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
8105
8106 // Make sure we do any optimizations that will make it easier to fold
8107 // source modifiers before obscuring it with bit operations.
8108
8109 // XXX - Why doesn't this get called when vector_shuffle is expanded?
8110 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
8111 return Combined;
8112
8113 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8114 SDValue Lo, Hi;
8115 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
8116
8117 if (VecSize == 128) {
8118 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
8119 Lo = DAG.getBitcast(LoVT,
8120 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8121 DAG.getConstant(0, SL, MVT::i32)));
8122 Hi = DAG.getBitcast(HiVT,
8123 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8124 DAG.getConstant(1, SL, MVT::i32)));
8125 } else if (VecSize == 256) {
8126 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
8127 SDValue Parts[4];
8128 for (unsigned P = 0; P < 4; ++P) {
8129 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8130 DAG.getConstant(P, SL, MVT::i32));
8131 }
8132
8133 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8134 Parts[0], Parts[1]));
8135 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8136 Parts[2], Parts[3]));
8137 } else {
8138 assert(VecSize == 512);
8139
8140 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
8141 SDValue Parts[8];
8142 for (unsigned P = 0; P < 8; ++P) {
8143 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8144 DAG.getConstant(P, SL, MVT::i32));
8145 }
8146
8147 Lo = DAG.getBitcast(LoVT,
8148 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8149 Parts[0], Parts[1], Parts[2], Parts[3]));
8150 Hi = DAG.getBitcast(HiVT,
8151 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8152 Parts[4], Parts[5], Parts[6], Parts[7]));
8153 }
8154
8155 EVT IdxVT = Idx.getValueType();
8156 unsigned NElem = VecVT.getVectorNumElements();
8157 assert(isPowerOf2_32(NElem));
8158 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
8159 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
8160 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
8161 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
8162 }
8163
8164 assert(VecSize <= 64);
8165
8166 MVT IntVT = MVT::getIntegerVT(VecSize);
8167
8168 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
8169 SDValue VecBC = peekThroughBitcasts(Vec);
8170 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8171 SDValue Src = VecBC.getOperand(0);
8172 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8173 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
8174 }
8175
8176 unsigned EltSize = EltVT.getSizeInBits();
8177 assert(isPowerOf2_32(EltSize));
8178
8179 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8180
8181 // Convert vector index to bit-index (* EltSize)
8182 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8183
8184 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8185 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
8186
8187 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8188 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
8189 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
8190 }
8191
8192 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
8193}
8194
8195static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
8196 assert(Elt % 2 == 0);
8197 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8198}
8199
8200static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
8201 assert(Elt % 2 == 0);
8202 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8203 !(Mask[Elt + 1] & 1);
8204}
8205
8206SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
8207 SelectionDAG &DAG) const {
8208 SDLoc SL(Op);
8209 EVT ResultVT = Op.getValueType();
8210 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
8211 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
8212 const int NewSrcNumElts = 2;
8213 MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
8214 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
8215
8216 // Break up the shuffle into registers sized pieces.
8217 //
8218 // We're trying to form sub-shuffles that the register allocation pipeline
8219 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
8220 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
8221 // pair of copies into a consecutive register copy, so use the ordinary
8222 // extract_vector_elt lowering unless we can use the shuffle.
8223 //
8224 // TODO: This is a bit of a hack, and we should probably always use
8225 // extract_subvector for the largest possible subvector we can (or at least
8226 // use it for PackVT aligned pieces). However, we have worse support for
8227 // combines on them and don't directly treat extract_subvector / insert_subvector
8228 // as legal. The DAG scheduler also ends up doing a worse job with the
8229 // extract_subvectors.
8230 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
8231
8232 // vector_shuffle <0,1,6,7> lhs, rhs
8233 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
8234 //
8235 // vector_shuffle <6,7,2,3> lhs, rhs
8236 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
8237 //
8238 // vector_shuffle <6,7,0,1> lhs, rhs
8239 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
8240
8241 // Avoid scalarizing when both halves are reading from consecutive elements.
8242
8243 // If we're treating 2 element shuffles as legal, also create odd-to-even
8244 // shuffles of neighboring pairs.
8245 //
8246 // vector_shuffle <3,2,7,6> lhs, rhs
8247 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
8248 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
8249
8251 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
8252 if (ShouldUseConsecutiveExtract &&
8253 elementPairIsContiguous(SVN->getMask(), I)) {
8254 const int Idx = SVN->getMaskElt(I);
8255 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8256 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8257 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
8258 SVN->getOperand(VecIdx),
8259 DAG.getConstant(EltIdx, SL, MVT::i32));
8260 Pieces.push_back(SubVec);
8261 } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
8262 isOperationLegal(ISD::VECTOR_SHUFFLE, PackVT)) {
8263 int Idx0 = SVN->getMaskElt(I);
8264 int Idx1 = SVN->getMaskElt(I + 1);
8265
8266 SDValue SrcOp0 = SVN->getOperand(0);
8267 SDValue SrcOp1 = SrcOp0;
8268 if (Idx0 >= SrcNumElts) {
8269 SrcOp0 = SVN->getOperand(1);
8270 Idx0 -= SrcNumElts;
8271 }
8272
8273 if (Idx1 >= SrcNumElts) {
8274 SrcOp1 = SVN->getOperand(1);
8275 Idx1 -= SrcNumElts;
8276 }
8277
8278 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8279 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8280
8281 // Extract nearest even aligned piece.
8282 SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
8283 DAG.getConstant(AlignedIdx0, SL, MVT::i32));
8284 SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
8285 DAG.getConstant(AlignedIdx1, SL, MVT::i32));
8286
8287 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8288 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8289
8290 SDValue Result0 = SubVec0;
8291 SDValue Result1 = SubVec0;
8292
8293 if (SubVec0 != SubVec1) {
8294 NewMaskIdx1 += NewSrcNumElts;
8295 Result1 = SubVec1;
8296 } else {
8297 Result1 = DAG.getPOISON(PackVT);
8298 }
8299
8300 SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
8301 {NewMaskIdx0, NewMaskIdx1});
8302 Pieces.push_back(Shuf);
8303 } else {
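// Fallback: scalarize this pair with two extract_vector_elts and rebuild it
// as a two-element build_vector.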
8304 const int Idx0 = SVN->getMaskElt(I);
8305 const int Idx1 = SVN->getMaskElt(I + 1);
8306 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8307 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8308 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8309 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8310
8311 SDValue Vec0 = SVN->getOperand(VecIdx0);
8312 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
8313 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
8314
8315 SDValue Vec1 = SVN->getOperand(VecIdx1);
8316 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
8317 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
8318 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
8319 }
8320 }
8321
8322 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
8323}
8324
8325SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
8326 SelectionDAG &DAG) const {
8327 SDValue SVal = Op.getOperand(0);
8328 EVT ResultVT = Op.getValueType();
8329 EVT SValVT = SVal.getValueType();
8330 SDValue UndefVal = DAG.getPOISON(SValVT);
8331 SDLoc SL(Op);
8332
8334 VElts.push_back(SVal);
8335 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
8336 VElts.push_back(UndefVal);
8337
8338 return DAG.getBuildVector(ResultVT, SL, VElts);
8339}
8340
8341SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
8342 SelectionDAG &DAG) const {
8343 SDLoc SL(Op);
8344 EVT VT = Op.getValueType();
8345
8346 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8347 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
8348
8349 SDValue Lo = Op.getOperand(0);
8350 SDValue Hi = Op.getOperand(1);
8351
8352 // Avoid adding defined bits with the zero_extend.
8353 if (Hi.isUndef()) {
8354 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8355 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
8356 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
8357 }
8358
8359 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
8360 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
8361
8362 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
8363 DAG.getConstant(16, SL, MVT::i32));
8364 if (Lo.isUndef())
8365 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
8366
8367 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8368 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
8369
8370 SDValue Or =
8371 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
8372 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
8373 }
8374
8375 // Split into 2-element chunks.
8376 const unsigned NumParts = VT.getVectorNumElements() / 2;
8377 EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
8378 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
8379
8381 for (unsigned P = 0; P < NumParts; ++P) {
8382 SDValue Vec = DAG.getBuildVector(
8383 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
8384 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
8385 }
8386
8387 SDValue Blend =
8388 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
8389 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
8390}
8391
8392 bool SITargetLowering::isOffsetFoldingLegal(
8393 const GlobalAddressSDNode *GA) const {
8394 // OSes that use ELF REL relocations (instead of RELA) can only store a
8395 // 32-bit addend in the instruction, so it is not safe to allow offset folding
8396 // which can create arbitrary 64-bit addends. (This is only a problem for
8397 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
8398 // the high 32 bits of the addend.)
8399 //
8400 // This should be kept in sync with how HasRelocationAddend is initialized in
8401 // the constructor of ELFAMDGPUAsmBackend.
8402 if (!Subtarget->isAmdHsaOS())
8403 return false;
8404
8405 // We can fold offsets for anything that doesn't require a GOT relocation.
8406 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
8407 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
8408 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
8409 !shouldEmitGOTReloc(GA->getGlobal());
8410}
8411
8412static SDValue
8413 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
8414 const SDLoc &DL, int64_t Offset, EVT PtrVT,
8415 unsigned GAFlags = SIInstrInfo::MO_NONE) {
8416 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
8417 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
8418 // lowered to the following code sequence:
8419 //
8420 // For constant address space:
8421 // s_getpc_b64 s[0:1]
8422 // s_add_u32 s0, s0, $symbol
8423 // s_addc_u32 s1, s1, 0
8424 //
8425 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8426 // a fixup or relocation is emitted to replace $symbol with a literal
8427 // constant, which is a pc-relative offset from the encoding of the $symbol
8428 // operand to the global variable.
8429 //
8430 // For global address space:
8431 // s_getpc_b64 s[0:1]
8432 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
8433 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
8434 //
8435 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8436 // fixups or relocations are emitted to replace $symbol@*@lo and
8437 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
8438 // which is a 64-bit pc-relative offset from the encoding of the $symbol
8439 // operand to the global variable.
8440 if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
8441 assert(GAFlags != SIInstrInfo::MO_NONE);
8442
8443 SDValue Ptr =
8444 DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
8445 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
8446 }
8447
8448 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
8449 SDValue PtrHi;
8450 if (GAFlags == SIInstrInfo::MO_NONE)
8451 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
8452 else
8453 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
8454 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
8455}
8456
8457SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
8458 SDValue Op,
8459 SelectionDAG &DAG) const {
8460 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
8461 SDLoc DL(GSD);
8462 EVT PtrVT = Op.getValueType();
8463
8464 const GlobalValue *GV = GSD->getGlobal();
8465 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8466 shouldUseLDSConstAddress(GV)) ||
8467 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
8468 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
8469 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8470 GV->hasExternalLinkage()) {
8471 Type *Ty = GV->getValueType();
8472 // HIP uses an unsized array `extern __shared__ T s[]` (or a similar
8473 // zero-sized type in other languages) to declare dynamic shared
8474 // memory whose size is not known at compile time. Such arrays are
8475 // allocated by the runtime and placed directly after the statically
8476 // allocated ones, so they all share the same offset.
8477 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
8478 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
8479 // Adjust alignment for that dynamic shared memory array.
8480 Function &F = DAG.getMachineFunction().getFunction();
8481 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
8482 MFI->setUsesDynamicLDS(true);
8483 return SDValue(
8484 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
8485 }
8486 }
8487 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
8488 }
8489
8490 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
8491 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
8492 SIInstrInfo::MO_ABS32_LO);
8493 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
8494 }
8495
8496 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
8497 if (Subtarget->has64BitLiterals()) {
8498 SDValue Addr = DAG.getTargetGlobalAddress(
8499 GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
8500 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
8501 0);
8502 }
8503
8504 SDValue AddrLo = DAG.getTargetGlobalAddress(
8505 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
8506 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
8507
8508 SDValue AddrHi = DAG.getTargetGlobalAddress(
8509 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
8510 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
8511
8512 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
8513 }
8514
8515 if (shouldEmitFixup(GV))
8516 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
8517
8518 if (shouldEmitPCReloc(GV))
8519 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
8520 SIInstrInfo::MO_REL32);
8521
8522 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
8523 SIInstrInfo::MO_GOTPCREL32);
8524 PointerType *PtrTy =
8525 PointerType::get(*DAG.getContext(), AMDGPUAS::CONSTANT_ADDRESS);
8526 const DataLayout &DataLayout = DAG.getDataLayout();
8527 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
8528 MachinePointerInfo PtrInfo =
8529 MachinePointerInfo::getGOT(DAG.getMachineFunction());
8530
8531 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
8532 MachineMemOperand::MODereferenceable |
8533 MachineMemOperand::MOInvariant);
8534}
8535
8536 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
8537 const SDLoc &DL, SDValue V) const {
8538 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
8539 // the destination register.
8540 //
8541 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
8542 // so we will end up with redundant moves to m0.
8543 //
8544 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
8545
8546 // A Null SDValue creates a glue result.
8547 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
8548 V, Chain);
8549 return SDValue(M0, 0);
8550}
8551
8552SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
8553 MVT VT,
8554 unsigned Offset) const {
8555 SDLoc SL(Op);
8556 SDValue Param = lowerKernargMemParameter(
8557 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
8558 // The local size values will have the hi 16-bits as zero.
8559 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
8560 DAG.getValueType(VT));
8561}
8562
8563 static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8564 EVT VT) {
8565 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8566 DAG.getMachineFunction().getFunction(),
8567 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
8568 return DAG.getPOISON(VT);
8569}
8570
8571 static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8572 EVT VT) {
8573 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8574 DAG.getMachineFunction().getFunction(),
8575 "intrinsic not supported on subtarget", DL.getDebugLoc()));
8576 return DAG.getPOISON(VT);
8577}
8578
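// Bitcast the given elements to f32 and build a single vector of dwords,
// padding unused lanes with poison; element counts above 12 are widened to a
// 16-element vector.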
8579 static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
8580 ArrayRef<SDValue> Elts) {
8581 assert(!Elts.empty());
8582 MVT Type;
8583 unsigned NumElts = Elts.size();
8584
8585 if (NumElts <= 12) {
8586 Type = MVT::getVectorVT(MVT::f32, NumElts);
8587 } else {
8588 assert(Elts.size() <= 16);
8589 Type = MVT::v16f32;
8590 NumElts = 16;
8591 }
8592
8593 SmallVector<SDValue, 16> VecElts(NumElts);
8594 for (unsigned i = 0; i < Elts.size(); ++i) {
8595 SDValue Elt = Elts[i];
8596 if (Elt.getValueType() != MVT::f32)
8597 Elt = DAG.getBitcast(MVT::f32, Elt);
8598 VecElts[i] = Elt;
8599 }
8600 for (unsigned i = Elts.size(); i < NumElts; ++i)
8601 VecElts[i] = DAG.getPOISON(MVT::f32);
8602
8603 if (NumElts == 1)
8604 return VecElts[0];
8605 return DAG.getBuildVector(Type, DL, VecElts);
8606}
8607
8608static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
8609 SDValue Src, int ExtraElts) {
8610 EVT SrcVT = Src.getValueType();
8611
8613
8614 if (SrcVT.isVector())
8615 DAG.ExtractVectorElements(Src, Elts);
8616 else
8617 Elts.push_back(Src);
8618
8619 SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
8620 while (ExtraElts--)
8621 Elts.push_back(Undef);
8622
8623 return DAG.getBuildVector(CastVT, DL, Elts);
8624}
8625
8626 // Re-construct the required return value for an image load intrinsic.
8627 // This is more complicated due to the optional use of TexFailCtrl, which
8628 // means the required return type is an aggregate.
8629 static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
8630 ArrayRef<EVT> ResultTypes, bool IsTexFail,
8631 bool Unpacked, bool IsD16, int DMaskPop,
8632 int NumVDataDwords, bool IsAtomicPacked16Bit,
8633 const SDLoc &DL) {
8634 // Determine the required return type. This is the same regardless of
8635 // IsTexFail flag
8636 EVT ReqRetVT = ResultTypes[0];
8637 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
8638 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
8639 ? (ReqRetNumElts + 1) / 2
8640 : ReqRetNumElts;
8641
8642 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
8643
8644 MVT DataDwordVT =
8645 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
8646
8647 MVT MaskPopVT =
8648 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
8649
8650 SDValue Data(Result, 0);
8651 SDValue TexFail;
8652
8653 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
8654 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
8655 if (MaskPopVT.isVector()) {
8656 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
8657 SDValue(Result, 0), ZeroIdx);
8658 } else {
8659 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
8660 SDValue(Result, 0), ZeroIdx);
8661 }
8662 }
8663
8664 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
8665 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
8666 NumDataDwords - MaskPopDwords);
8667
8668 if (IsD16)
8669 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
8670
8671 EVT LegalReqRetVT = ReqRetVT;
8672 if (!ReqRetVT.isVector()) {
8673 if (!Data.getValueType().isInteger())
8674 Data = DAG.getNode(ISD::BITCAST, DL,
8675 Data.getValueType().changeTypeToInteger(), Data);
8676 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
8677 } else {
8678 // We need to widen the return vector to a legal type
8679 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
8680 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
8681 LegalReqRetVT =
8682 EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
8683 ReqRetVT.getVectorNumElements() + 1);
8684 }
8685 }
8686 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
8687
8688 if (IsTexFail) {
8689 TexFail =
8690 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
8691 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
8692
8693 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
8694 }
8695
8696 if (Result->getNumValues() == 1)
8697 return Data;
8698
8699 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
8700}
8701
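// Decode the TFE and LWE bits from the texfailctrl immediate. Returns false
// if any bits other than TFE/LWE are set, in which case the caller gives up
// on custom lowering of the intrinsic.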
8702static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
8703 SDValue *LWE, bool &IsTexFail) {
8704 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
8705
8706 uint64_t Value = TexFailCtrlConst->getZExtValue();
8707 if (Value) {
8708 IsTexFail = true;
8709 }
8710
8711 SDLoc DL(TexFailCtrlConst);
8712 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
8713 Value &= ~(uint64_t)0x1;
8714 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
8715 Value &= ~(uint64_t)0x2;
8716
8717 return Value == 0;
8718}
8719
8720 static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
8721 MVT PackVectorVT,
8722 SmallVectorImpl<SDValue> &PackedAddrs,
8723 unsigned DimIdx, unsigned EndIdx,
8724 unsigned NumGradients) {
8725 SDLoc DL(Op);
8726 for (unsigned I = DimIdx; I < EndIdx; I++) {
8727 SDValue Addr = Op.getOperand(I);
8728
8729 // Gradients are packed with undef for each coordinate.
8730 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
8731 // 1D: undef,dx/dh; undef,dx/dv
8732 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
8733 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
8734 if (((I + 1) >= EndIdx) ||
8735 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
8736 I == DimIdx + NumGradients - 1))) {
8737 if (Addr.getValueType() != MVT::i16)
8738 Addr = DAG.getBitcast(MVT::i16, Addr);
8739 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
8740 } else {
8741 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
8742 I++;
8743 }
8744 Addr = DAG.getBitcast(MVT::f32, Addr);
8745 PackedAddrs.push_back(Addr);
8746 }
8747}
8748
8749SDValue SITargetLowering::lowerImage(SDValue Op,
8750 const AMDGPU::ImageDimIntrinsicInfo *Intr,
8751 SelectionDAG &DAG, bool WithChain) const {
8752 SDLoc DL(Op);
8753 MachineFunction &MF = DAG.getMachineFunction();
8754 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
8755 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
8756 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
8757 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
8758 unsigned IntrOpcode = Intr->BaseOpcode;
8759 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
8760 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
8761 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
8762
8763 SmallVector<EVT, 3> ResultTypes(Op->values());
8764 SmallVector<EVT, 3> OrigResultTypes(Op->values());
8765 bool IsD16 = false;
8766 bool IsG16 = false;
8767 bool IsA16 = false;
8768 SDValue VData;
8769 int NumVDataDwords = 0;
8770 bool AdjustRetType = false;
8771 bool IsAtomicPacked16Bit = false;
8772
8773 // Offset of intrinsic arguments
8774 const unsigned ArgOffset = WithChain ? 2 : 1;
8775
8776 unsigned DMask;
8777 unsigned DMaskLanes = 0;
8778
8779 if (BaseOpcode->Atomic) {
8780 VData = Op.getOperand(2);
8781
8782 IsAtomicPacked16Bit =
8783 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
8784 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
8785
8786 bool Is64Bit = VData.getValueSizeInBits() == 64;
8787 if (BaseOpcode->AtomicX2) {
8788 SDValue VData2 = Op.getOperand(3);
8789 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
8790 {VData, VData2});
8791 if (Is64Bit)
8792 VData = DAG.getBitcast(MVT::v4i32, VData);
8793
8794 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
8795 DMask = Is64Bit ? 0xf : 0x3;
8796 NumVDataDwords = Is64Bit ? 4 : 2;
8797 } else {
8798 DMask = Is64Bit ? 0x3 : 0x1;
8799 NumVDataDwords = Is64Bit ? 2 : 1;
8800 }
8801 } else {
8802 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
8803 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
8804
8805 if (BaseOpcode->Store) {
8806 VData = Op.getOperand(2);
8807
8808 MVT StoreVT = VData.getSimpleValueType();
8809 if (StoreVT.getScalarType() == MVT::f16) {
8810 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8811 return Op; // D16 is unsupported for this instruction
8812
8813 IsD16 = true;
8814 VData = handleD16VData(VData, DAG, true);
8815 }
8816
8817 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
8818 } else if (!BaseOpcode->NoReturn) {
8819 // Work out the num dwords based on the dmask popcount and underlying type
8820 // and whether packing is supported.
8821 MVT LoadVT = ResultTypes[0].getSimpleVT();
8822 if (LoadVT.getScalarType() == MVT::f16) {
8823 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8824 return Op; // D16 is unsupported for this instruction
8825
8826 IsD16 = true;
8827 }
8828
8829 // Confirm that the return type is large enough for the dmask specified
8830 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
8831 (!LoadVT.isVector() && DMaskLanes > 1))
8832 return Op;
8833
8834 // The sq block of gfx8 and gfx9 does not estimate register use correctly
8835 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
8836 // instructions.
8837 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
8838 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
8839 NumVDataDwords = (DMaskLanes + 1) / 2;
8840 else
8841 NumVDataDwords = DMaskLanes;
8842
8843 AdjustRetType = true;
8844 }
8845 }
8846
8847 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
8849
8850 // Check for 16 bit addresses or derivatives and pack if true.
8851 MVT VAddrVT =
8852 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
8853 MVT VAddrScalarVT = VAddrVT.getScalarType();
8854 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8855 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8856
8857 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
8858 VAddrScalarVT = VAddrVT.getScalarType();
8859 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8860 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8861
8862 // Push back extra arguments.
8863 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
8864 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
8865 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
8866 // Special handling of bias when A16 is on. Bias is of type half but
8867 // occupies full 32-bit.
8868 SDValue Bias = DAG.getBuildVector(
8869 MVT::v2f16, DL,
8870 {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
8871 VAddrs.push_back(Bias);
8872 } else {
8873 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
8874 "Bias needs to be converted to 16 bit in A16 mode");
8875 VAddrs.push_back(Op.getOperand(ArgOffset + I));
8876 }
8877 }
8878
8879 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
8880 // 16 bit gradients are supported, but are tied to the A16 control,
8881 // so both gradients and addresses must be 16 bit.
8882 LLVM_DEBUG(
8883 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
8884 "require 16 bit args for both gradients and addresses");
8885 return Op;
8886 }
8887
8888 if (IsA16) {
8889 if (!ST->hasA16()) {
8890 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
8891 "support 16 bit addresses\n");
8892 return Op;
8893 }
8894 }
8895
8896 // We've dealt with incorrect input, so we know that if IsA16 or IsG16
8897 // is set then we have to compress/pack operands (either addresses,
8898 // gradients, or both).
8899 // In the case where A16 and gradients are tied (no G16 support), we have
8900 // already verified that both IsA16 and IsG16 are true.
8901 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
8902 // Activate g16
8903 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
8904 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
8905 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
8906 }
8907
8908 // Add gradients (packed or unpacked)
8909 if (IsG16) {
8910 // Pack the gradients
8911 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
8912 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
8913 ArgOffset + Intr->GradientStart,
8914 ArgOffset + Intr->CoordStart, Intr->NumGradients);
8915 } else {
8916 for (unsigned I = ArgOffset + Intr->GradientStart;
8917 I < ArgOffset + Intr->CoordStart; I++)
8918 VAddrs.push_back(Op.getOperand(I));
8919 }
8920
8921 // Add addresses (packed or unpacked)
8922 if (IsA16) {
8923 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
8924 ArgOffset + Intr->CoordStart, VAddrEnd,
8925 0 /* No gradients */);
8926 } else {
8927 // Add uncompressed address
8928 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
8929 VAddrs.push_back(Op.getOperand(I));
8930 }
8931
8932 // If the register allocator cannot place the address registers contiguously
8933 // without introducing moves, then using the non-sequential address encoding
8934 // is always preferable, since it saves VALU instructions and is usually a
8935 // wash in terms of code size or even better.
8936 //
8937 // However, we currently have no way of hinting to the register allocator that
8938 // MIMG addresses should be placed contiguously when it is possible to do so,
8939 // so force non-NSA for the common 2-address case as a heuristic.
8940 //
8941 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
8942 // allocation when possible.
8943 //
8944 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
8945 // set of the remaining addresses.
8946 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
8947 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
8948 const bool UseNSA = ST->hasNSAEncoding() &&
8949 VAddrs.size() >= ST->getNSAThreshold(MF) &&
8950 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
8951 const bool UsePartialNSA =
8952 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
8953
8954 SDValue VAddr;
8955 if (UsePartialNSA) {
8956 VAddr = getBuildDwordsVector(DAG, DL,
8957 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
8958 } else if (!UseNSA) {
8959 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
8960 }
8961
8962 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
8963 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
8964 SDValue Unorm;
8965 if (!BaseOpcode->Sampler) {
8966 Unorm = True;
8967 } else {
8968 uint64_t UnormConst =
8969 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
8970
8971 Unorm = UnormConst ? True : False;
8972 }
8973
8974 SDValue TFE;
8975 SDValue LWE;
8976 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
8977 bool IsTexFail = false;
8978 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
8979 return Op;
8980
8981 if (IsTexFail) {
8982 if (!DMaskLanes) {
8983 // Expecting to get an error flag since TFC is on and dmask is 0.
8984 // Force dmask to be at least 1, otherwise the instruction will fail.
8985 DMask = 0x1;
8986 DMaskLanes = 1;
8987 NumVDataDwords = 1;
8988 }
8989 NumVDataDwords += 1;
8990 AdjustRetType = true;
8991 }
8992
8993 // Something earlier tagged that the return type needs adjusting.
8994 // This happens if the instruction is a load or has set TexFailCtrl flags.
8995 if (AdjustRetType) {
8996 // NumVDataDwords reflects the true number of dwords required in the return
8997 // type
8998 if (DMaskLanes == 0 && !BaseOpcode->Store) {
8999 // This is a no-op load. This can be eliminated
9000 SDValue Undef = DAG.getPOISON(Op.getValueType());
9001 if (isa<MemSDNode>(Op))
9002 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
9003 return Undef;
9004 }
9005
9006 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
9007 MVT::i32, NumVDataDwords)
9008 : MVT::i32;
9009
9010 ResultTypes[0] = NewVT;
9011 if (ResultTypes.size() == 3) {
9012 // Original result was aggregate type used for TexFailCtrl results
9013 // The actual instruction returns as a vector type which has now been
9014 // created. Remove the aggregate result.
9015 ResultTypes.erase(&ResultTypes[1]);
9016 }
9017 }
9018
9019 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
9020 if (BaseOpcode->Atomic)
9021 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
9022 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
9024 return Op;
9025
9027 if (BaseOpcode->Store || BaseOpcode->Atomic)
9028 Ops.push_back(VData); // vdata
9029 if (UsePartialNSA) {
9030 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
9031 Ops.push_back(VAddr);
9032 } else if (UseNSA)
9033 append_range(Ops, VAddrs);
9034 else
9035 Ops.push_back(VAddr);
9036 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
9037 EVT RsrcVT = Rsrc.getValueType();
9038 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9039 return Op;
9040 Ops.push_back(Rsrc);
9041 if (BaseOpcode->Sampler) {
9042 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
9043 if (Samp.getValueType() != MVT::v4i32)
9044 return Op;
9045 Ops.push_back(Samp);
9046 }
9047 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
9048 if (IsGFX10Plus)
9049 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
9050 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9051 Ops.push_back(Unorm);
9052 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
9053 Ops.push_back(IsA16 && // r128, a16 for gfx9
9054 ST->hasFeature(AMDGPU::FeatureR128A16)
9055 ? True
9056 : False);
9057 if (IsGFX10Plus)
9058 Ops.push_back(IsA16 ? True : False);
9059
9060 if (!Subtarget->hasGFX90AInsts())
9061 Ops.push_back(TFE); // tfe
9062 else if (TFE->getAsZExtVal()) {
9063 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9065 "TFE is not supported on this GPU", DL.getDebugLoc()));
9066 }
9067
9068 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9069 Ops.push_back(LWE); // lwe
9070 if (!IsGFX10Plus)
9071 Ops.push_back(DimInfo->DA ? True : False);
9072 if (BaseOpcode->HasD16)
9073 Ops.push_back(IsD16 ? True : False);
9074 if (isa<MemSDNode>(Op))
9075 Ops.push_back(Op.getOperand(0)); // chain
9076
9077 int NumVAddrDwords =
9078 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
9079 int Opcode = -1;
9080
9081 if (IsGFX12Plus) {
9082 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
9083 NumVDataDwords, NumVAddrDwords);
9084 } else if (IsGFX11Plus) {
9085 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9086 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9087 : AMDGPU::MIMGEncGfx11Default,
9088 NumVDataDwords, NumVAddrDwords);
9089 } else if (IsGFX10Plus) {
9090 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9091 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9092 : AMDGPU::MIMGEncGfx10Default,
9093 NumVDataDwords, NumVAddrDwords);
9094 } else {
9095 if (Subtarget->hasGFX90AInsts()) {
9096 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
9097 NumVDataDwords, NumVAddrDwords);
9098 if (Opcode == -1) {
9099 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9101 "requested image instruction is not supported on this GPU",
9102 DL.getDebugLoc()));
9103
9104 unsigned Idx = 0;
9105 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
9106 for (EVT VT : OrigResultTypes) {
9107 if (VT == MVT::Other)
9108 RetValues[Idx++] = Op.getOperand(0); // Chain
9109 else
9110 RetValues[Idx++] = DAG.getPOISON(VT);
9111 }
9112
9113 return DAG.getMergeValues(RetValues, DL);
9114 }
9115 }
9116 if (Opcode == -1 &&
9117 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9118 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
9119 NumVDataDwords, NumVAddrDwords);
9120 if (Opcode == -1)
9121 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
9122 NumVDataDwords, NumVAddrDwords);
9123 }
9124 if (Opcode == -1)
9125 return Op;
9126
9127 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
9128 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
9129 MachineMemOperand *MemRef = MemOp->getMemOperand();
9130 DAG.setNodeMemRefs(NewNode, {MemRef});
9131 }
9132
9133 if (BaseOpcode->AtomicX2) {
9135 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
9136 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
9137 }
9138 if (BaseOpcode->NoReturn)
9139 return SDValue(NewNode, 0);
9140 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
9141 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9142 NumVDataDwords, IsAtomicPacked16Bit, DL);
9143}
9144
9145SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
9146 SDValue Offset, SDValue CachePolicy,
9147 SelectionDAG &DAG) const {
9148 MachineFunction &MF = DAG.getMachineFunction();
9149
9150 const DataLayout &DataLayout = DAG.getDataLayout();
9151 Align Alignment =
9152 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
9153
9154 MachineMemOperand *MMO = MF.getMachineMemOperand(
9155 MachinePointerInfo(),
9156 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
9157 MachineMemOperand::MOInvariant,
9158 VT.getStoreSize(), Alignment);
9159
9160 if (!Offset->isDivergent()) {
9161 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
9162
9163 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
9164 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
9165 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
9166 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
9167 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9168 SDValue BufferLoad =
9169 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
9170 DAG.getVTList(MVT::i32), Ops, VT, MMO);
9171 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
9172 }
9173
9174 // Widen vec3 load to vec4.
9175 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
9176 !Subtarget->hasScalarDwordx3Loads()) {
9177 EVT WidenedVT =
9178 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
9179 auto WidenedOp = DAG.getMemIntrinsicNode(
9180 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
9181 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
9182 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
9183 DAG.getVectorIdxConstant(0, DL));
9184 return Subvector;
9185 }
9186
9187 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
9188 DAG.getVTList(VT), Ops, VT, MMO);
9189 }
9190
9191 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
9192 // assume that the buffer is unswizzled.
9193 SDValue Ops[] = {
9194 DAG.getEntryNode(), // Chain
9195 Rsrc, // rsrc
9196 DAG.getConstant(0, DL, MVT::i32), // vindex
9197 {}, // voffset
9198 {}, // soffset
9199 {}, // offset
9200 CachePolicy, // cachepolicy
9201 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9202 };
9203 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9204 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
9205 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
9206 }
9207
9209 unsigned NumLoads = 1;
9210 MVT LoadVT = VT.getSimpleVT();
9211 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
9212 assert((LoadVT.getScalarType() == MVT::i32 ||
9213 LoadVT.getScalarType() == MVT::f32));
9214
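// Results with 8 or 16 elements are split into multiple dwordx4 buffer loads
// and concatenated back together after the loop below.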
9215 if (NumElts == 8 || NumElts == 16) {
9216 NumLoads = NumElts / 4;
9217 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
9218 }
9219
9220 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
9221
9222 // Use the alignment to ensure that the required offsets will fit into the
9223 // immediate offsets.
9224 setBufferOffsets(Offset, DAG, &Ops[3],
9225 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
9226
9227 uint64_t InstOffset = Ops[5]->getAsZExtVal();
9228 for (unsigned i = 0; i < NumLoads; ++i) {
9229 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
9230 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
9231 LoadVT, MMO, DAG));
9232 }
9233
9234 if (NumElts == 8 || NumElts == 16)
9235 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
9236
9237 return Loads[0];
9238}
9239
9240SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
9241 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
9242 if (!Subtarget->hasArchitectedSGPRs())
9243 return {};
9244 SDLoc SL(Op);
9245 MVT VT = MVT::i32;
9246 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
9247 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
9248 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
9249}
9250
9251SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
9252 unsigned Dim,
9253 const ArgDescriptor &Arg) const {
9254 SDLoc SL(Op);
9255 MachineFunction &MF = DAG.getMachineFunction();
9256 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
9257 if (MaxID == 0)
9258 return DAG.getConstant(0, SL, MVT::i32);
9259
9260 // It's undefined behavior if a function marked with the amdgpu-no-*
9261 // attributes uses the corresponding intrinsic.
9262 if (!Arg)
9263 return DAG.getPOISON(Op->getValueType(0));
9264
9265 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
9266 SDLoc(DAG.getEntryNode()), Arg);
9267
9268 // Don't bother inserting AssertZext for packed IDs since we're emitting the
9269 // masking operations anyway.
9270 //
9271 // TODO: We could assert the top bit is 0 for the source copy.
9272 if (Arg.isMasked())
9273 return Val;
9274
9275 // Preserve the known bits after expansion to a copy.
9276 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
9277 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
9278 DAG.getValueType(SmallVT));
9279}
9280
9281SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
9282 SelectionDAG &DAG) const {
9283 MachineFunction &MF = DAG.getMachineFunction();
9284 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
9285
9286 EVT VT = Op.getValueType();
9287 SDLoc DL(Op);
9288 unsigned IntrinsicID = Op.getConstantOperandVal(0);
9289
9290 // TODO: Should this propagate fast-math-flags?
9291
9292 switch (IntrinsicID) {
9293 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9294 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
9295 return emitNonHSAIntrinsicError(DAG, DL, VT);
9296 return getPreloadedValue(DAG, *MFI, VT,
9297 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
9298 }
9299 case Intrinsic::amdgcn_dispatch_ptr:
9300 case Intrinsic::amdgcn_queue_ptr: {
9301 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
9302 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9303 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
9304 DL.getDebugLoc()));
9305 return DAG.getPOISON(VT);
9306 }
9307
9308 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9309 ? AMDGPUFunctionArgInfo::DISPATCH_PTR
9310 : AMDGPUFunctionArgInfo::QUEUE_PTR;
9311 return getPreloadedValue(DAG, *MFI, VT, RegID);
9312 }
9313 case Intrinsic::amdgcn_implicitarg_ptr: {
9314 if (MFI->isEntryFunction())
9315 return getImplicitArgPtr(DAG, DL);
9316 return getPreloadedValue(DAG, *MFI, VT,
9317 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
9318 }
9319 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9320 if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
9321 // This only makes sense to call in a kernel, so just lower to null.
9322 return DAG.getConstant(0, DL, VT);
9323 }
9324
9325 return getPreloadedValue(DAG, *MFI, VT,
9327 }
9328 case Intrinsic::amdgcn_dispatch_id: {
9329 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
9330 }
9331 case Intrinsic::amdgcn_rcp:
9332 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
9333 case Intrinsic::amdgcn_rsq:
9334 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9335 case Intrinsic::amdgcn_rsq_legacy:
9336 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9337 return emitRemovedIntrinsicError(DAG, DL, VT);
9338 return SDValue();
9339 case Intrinsic::amdgcn_rcp_legacy:
9340 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9341 return emitRemovedIntrinsicError(DAG, DL, VT);
9342 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
9343 case Intrinsic::amdgcn_rsq_clamp: {
9344 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9345 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
9346
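// On VI and newer there is no rsq_clamp instruction, so emulate it by
// clamping the RSQ result to +/- the largest finite value of the type.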
9347 Type *Type = VT.getTypeForEVT(*DAG.getContext());
9348 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
9349 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
9350
9351 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9352 SDValue Tmp =
9353 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
9354 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
9355 DAG.getConstantFP(Min, DL, VT));
9356 }
9357 case Intrinsic::r600_read_ngroups_x:
9358 if (Subtarget->isAmdHsaOS())
9359 return emitNonHSAIntrinsicError(DAG, DL, VT);
9360
9361 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9363 false);
9364 case Intrinsic::r600_read_ngroups_y:
9365 if (Subtarget->isAmdHsaOS())
9366 return emitNonHSAIntrinsicError(DAG, DL, VT);
9367
9368 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9370 false);
9371 case Intrinsic::r600_read_ngroups_z:
9372 if (Subtarget->isAmdHsaOS())
9373 return emitNonHSAIntrinsicError(DAG, DL, VT);
9374
9375 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9377 false);
9378 case Intrinsic::r600_read_local_size_x:
9379 if (Subtarget->isAmdHsaOS())
9380 return emitNonHSAIntrinsicError(DAG, DL, VT);
9381
9382 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9384 case Intrinsic::r600_read_local_size_y:
9385 if (Subtarget->isAmdHsaOS())
9386 return emitNonHSAIntrinsicError(DAG, DL, VT);
9387
9388 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9390 case Intrinsic::r600_read_local_size_z:
9391 if (Subtarget->isAmdHsaOS())
9392 return emitNonHSAIntrinsicError(DAG, DL, VT);
9393
9394 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9396 case Intrinsic::amdgcn_workgroup_id_x:
9397 return getPreloadedValue(DAG, *MFI, VT,
9398 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
9399 case Intrinsic::amdgcn_workgroup_id_y:
9400 return getPreloadedValue(DAG, *MFI, VT,
9401 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
9402 case Intrinsic::amdgcn_workgroup_id_z:
9403 return getPreloadedValue(DAG, *MFI, VT,
9404 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
9405 case Intrinsic::amdgcn_wave_id:
9406 return lowerWaveID(DAG, Op);
9407 case Intrinsic::amdgcn_lds_kernel_id: {
9408 if (MFI->isEntryFunction())
9409 return getLDSKernelId(DAG, DL);
9410 return getPreloadedValue(DAG, *MFI, VT,
9411 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
9412 }
9413 case Intrinsic::amdgcn_workitem_id_x:
9414 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
9415 case Intrinsic::amdgcn_workitem_id_y:
9416 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
9417 case Intrinsic::amdgcn_workitem_id_z:
9418 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
9419 case Intrinsic::amdgcn_wavefrontsize:
9420 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
9421 SDLoc(Op), MVT::i32);
9422 case Intrinsic::amdgcn_s_buffer_load: {
9423 unsigned CPol = Op.getConstantOperandVal(3);
9424 // s_buffer_load, because of how it's optimized, can't be volatile
9425 // so reject ones with the volatile bit set.
9426 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
9427 ? AMDGPU::CPol::ALL
9428 : AMDGPU::CPol::ALL_pregfx12))
9429 return Op;
9430 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
9431 Op.getOperand(3), DAG);
9432 }
9433 case Intrinsic::amdgcn_fdiv_fast:
9434 return lowerFDIV_FAST(Op, DAG);
9435 case Intrinsic::amdgcn_sin:
9436 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
9437
9438 case Intrinsic::amdgcn_cos:
9439 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
9440
9441 case Intrinsic::amdgcn_mul_u24:
9442 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
9443 Op.getOperand(2));
9444 case Intrinsic::amdgcn_mul_i24:
9445 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
9446 Op.getOperand(2));
9447
9448 case Intrinsic::amdgcn_log_clamp: {
9449 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9450 return SDValue();
9451
9452 return emitRemovedIntrinsicError(DAG, DL, VT);
9453 }
9454 case Intrinsic::amdgcn_fract:
9455 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
9456
9457 case Intrinsic::amdgcn_class:
9458 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
9459 Op.getOperand(2));
9460 case Intrinsic::amdgcn_div_fmas:
9461 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
9462 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9463
9464 case Intrinsic::amdgcn_div_fixup:
9465 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
9466 Op.getOperand(2), Op.getOperand(3));
9467
9468 case Intrinsic::amdgcn_div_scale: {
9469 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
9470
9471 // Translate to the operands expected by the machine instruction. The
9472 // first parameter must be the same as the first instruction.
9473 SDValue Numerator = Op.getOperand(1);
9474 SDValue Denominator = Op.getOperand(2);
9475
9476 // Note this order is opposite of the machine instruction's operations,
9477 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
9478 // intrinsic has the numerator as the first operand to match a normal
9479 // division operation.
9480
9481 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
9482
9483 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
9484 Denominator, Numerator);
9485 }
9486 case Intrinsic::amdgcn_icmp: {
9487 // There is a Pat that handles this variant, so return it as-is.
9488 if (Op.getOperand(1).getValueType() == MVT::i1 &&
9489 Op.getConstantOperandVal(2) == 0 &&
9490 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
9491 return Op;
9492 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
9493 }
9494 case Intrinsic::amdgcn_fcmp: {
9495 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
9496 }
9497 case Intrinsic::amdgcn_ballot:
9498 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
9499 case Intrinsic::amdgcn_fmed3:
9500 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
9501 Op.getOperand(2), Op.getOperand(3));
9502 case Intrinsic::amdgcn_fdot2:
9503 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
9504 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9505 case Intrinsic::amdgcn_fmul_legacy:
9506 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
9507 Op.getOperand(2));
9508 case Intrinsic::amdgcn_sffbh:
9509 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
9510 case Intrinsic::amdgcn_sbfe:
9511 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
9512 Op.getOperand(2), Op.getOperand(3));
9513 case Intrinsic::amdgcn_ubfe:
9514 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
9515 Op.getOperand(2), Op.getOperand(3));
9516 case Intrinsic::amdgcn_cvt_pkrtz:
9517 case Intrinsic::amdgcn_cvt_pknorm_i16:
9518 case Intrinsic::amdgcn_cvt_pknorm_u16:
9519 case Intrinsic::amdgcn_cvt_pk_i16:
9520 case Intrinsic::amdgcn_cvt_pk_u16: {
9521 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
9522 EVT VT = Op.getValueType();
9523 unsigned Opcode;
9524
9525 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
9526 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
9527 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
9528 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
9529 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
9530 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
9531 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
9532 Opcode = AMDGPUISD::CVT_PK_I16_I32;
9533 else
9534 Opcode = AMDGPUISD::CVT_PK_U16_U32;
9535
9536 if (isTypeLegal(VT))
9537 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
9538
9539 SDValue Node =
9540 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
9541 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
9542 }
9543 case Intrinsic::amdgcn_fmad_ftz:
9544 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
9545 Op.getOperand(2), Op.getOperand(3));
9546
9547 case Intrinsic::amdgcn_if_break:
9548 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
9549 Op->getOperand(1), Op->getOperand(2)),
9550 0);
9551
9552 case Intrinsic::amdgcn_groupstaticsize: {
9553 const Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
9554 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
9555 return Op;
9556
9557 const Module *M = MF.getFunction().getParent();
9558 const GlobalValue *GV =
9559 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
9560 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
9561 SIInstrInfo::MO_ABS32_LO);
9562 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
9563 }
9564 case Intrinsic::amdgcn_is_shared:
9565 case Intrinsic::amdgcn_is_private: {
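// The address space of a flat pointer is identified by its high 32 bits:
// compare them against the matching aperture base, or against the flat
// scratch base on targets with globally addressable scratch.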
9566 SDLoc SL(Op);
9567 SDValue SrcVec =
9568 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
9569 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
9570 DAG.getConstant(1, SL, MVT::i32));
9571
9572 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
9573 ? AMDGPUAS::LOCAL_ADDRESS
9574 : AMDGPUAS::PRIVATE_ADDRESS;
9575 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
9576 Subtarget->hasGloballyAddressableScratch()) {
9577 SDValue FlatScratchBaseHi(
9578 DAG.getMachineNode(
9579 AMDGPU::S_MOV_B32, DL, MVT::i32,
9580 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
9581 0);
9582 // Test bits 63..58 against the aperture address.
9583 return DAG.getSetCC(
9584 SL, MVT::i1,
9585 DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
9586 DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
9587 }
9588
9589 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
9590 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
9591 }
9592 case Intrinsic::amdgcn_perm:
9593 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
9594 Op.getOperand(2), Op.getOperand(3));
9595 case Intrinsic::amdgcn_reloc_constant: {
9596 Module *M = MF.getFunction().getParent();
9597 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
9598 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
9599 auto *RelocSymbol = cast<GlobalVariable>(
9600 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
9601 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
9602 SIInstrInfo::MO_ABS32_LO);
9603 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
9604 }
9605 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
9606 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
9607 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
9608 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
9609 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
9610 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
9611 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
9612 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
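// If the index key operand is not already an i32, canonicalize it with an
// any-extend/truncate and recreate the intrinsic with the canonical type.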
9613 if (Op.getOperand(4).getValueType() == MVT::i32)
9614 return SDValue();
9615
9616 SDLoc SL(Op);
9617 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
9618 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
9619 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9620 Op.getOperand(3), IndexKeyi32);
9621 }
9622 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
9623 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
9624 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
9625 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
9626 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
9627 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
9628 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
9629 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
9630 if (Op.getOperand(4).getValueType() == MVT::i64)
9631 return SDValue();
9632
9633 SDLoc SL(Op);
9634 auto IndexKeyi64 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
9635 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
9636 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9637 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
9638 Op.getOperand(6)});
9639 }
9640 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
9641 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
9642 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
9643 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
9644 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
9645 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
9646 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
9647 ? MVT::i64
9648 : MVT::i32;
9649 if (Op.getOperand(6).getValueType() == IndexKeyTy)
9650 return SDValue();
9651
9652 SDLoc SL(Op);
9653 auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
9654 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
9655 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9656 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
9657 IndexKey, Op.getOperand(7),
9658 Op.getOperand(8)}); // No clamp operand
9659 }
9660 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
9661 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
9662 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
9663 if (Op.getOperand(6).getValueType() == MVT::i32)
9664 return SDValue();
9665
9666 SDLoc SL(Op);
9667 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
9668 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
9669 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9670 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
9671 IndexKeyi32, Op.getOperand(7)});
9672 }
9673 case Intrinsic::amdgcn_addrspacecast_nonnull:
9674 return lowerADDRSPACECAST(Op, DAG);
9675 case Intrinsic::amdgcn_readlane:
9676 case Intrinsic::amdgcn_readfirstlane:
9677 case Intrinsic::amdgcn_writelane:
9678 case Intrinsic::amdgcn_permlane16:
9679 case Intrinsic::amdgcn_permlanex16:
9680 case Intrinsic::amdgcn_permlane64:
9681 case Intrinsic::amdgcn_set_inactive:
9682 case Intrinsic::amdgcn_set_inactive_chain_arg:
9683 case Intrinsic::amdgcn_mov_dpp8:
9684 case Intrinsic::amdgcn_update_dpp:
9685 return lowerLaneOp(*this, Op.getNode(), DAG);
9686 case Intrinsic::amdgcn_dead: {
9688 for (const EVT ValTy : Op.getNode()->values())
9689 Poisons.push_back(DAG.getPOISON(ValTy));
9690 return DAG.getMergeValues(Poisons, SDLoc(Op));
9691 }
9692 default:
9693 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9694 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
9695 return lowerImage(Op, ImageDimIntr, DAG, false);
9696
9697 return Op;
9698 }
9699}
9700
9701 // On targets that do not support a constant in the soffset field, turn a
9702 // zero soffset into SGPR_NULL to avoid generating an extra s_mov with zero.
9703 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
9704 const GCNSubtarget *Subtarget) {
9705 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
9706 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
9707 return SOffset;
9708}
9709
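// Common lowering for raw buffer atomic intrinsics: split the offset operand
// into voffset/soffset/immediate fields and rebuild the node as the target
// buffer-atomic opcode with a zero vindex (idxen = 0).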
9710SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
9711 SelectionDAG &DAG,
9712 unsigned NewOpcode) const {
9713 SDLoc DL(Op);
9714
9715 SDValue VData = Op.getOperand(2);
9716 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9717 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9718 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9719 SDValue Ops[] = {
9720 Op.getOperand(0), // Chain
9721 VData, // vdata
9722 Rsrc, // rsrc
9723 DAG.getConstant(0, DL, MVT::i32), // vindex
9724 VOffset, // voffset
9725 SOffset, // soffset
9726 Offset, // offset
9727 Op.getOperand(6), // cachepolicy
9728 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9729 };
9730
9731 auto *M = cast<MemSDNode>(Op);
9732
9733 EVT MemVT = VData.getValueType();
9734 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
9735 M->getMemOperand());
9736}
9737
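// Struct buffer atomics are lowered the same way, but keep the explicit
// vindex operand and set idxen = 1.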
9738SDValue
9739SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
9740 unsigned NewOpcode) const {
9741 SDLoc DL(Op);
9742
9743 SDValue VData = Op.getOperand(2);
9744 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9745 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9746 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9747 SDValue Ops[] = {
9748 Op.getOperand(0), // Chain
9749 VData, // vdata
9750 Rsrc, // rsrc
9751 Op.getOperand(4), // vindex
9752 VOffset, // voffset
9753 SOffset, // soffset
9754 Offset, // offset
9755 Op.getOperand(7), // cachepolicy
9756 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9757 };
9758
9759 auto *M = cast<MemSDNode>(Op);
9760
9761 EVT MemVT = VData.getValueType();
9762 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
9763 M->getMemOperand());
9764}
9765
9766SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
9767 SelectionDAG &DAG) const {
9768 unsigned IntrID = Op.getConstantOperandVal(1);
9769 SDLoc DL(Op);
9770
9771 switch (IntrID) {
9772 case Intrinsic::amdgcn_ds_ordered_add:
9773 case Intrinsic::amdgcn_ds_ordered_swap: {
9774 MemSDNode *M = cast<MemSDNode>(Op);
9775 SDValue Chain = M->getOperand(0);
9776 SDValue M0 = M->getOperand(2);
9777 SDValue Value = M->getOperand(3);
9778 unsigned IndexOperand = M->getConstantOperandVal(7);
9779 unsigned WaveRelease = M->getConstantOperandVal(8);
9780 unsigned WaveDone = M->getConstantOperandVal(9);
9781
9782 unsigned OrderedCountIndex = IndexOperand & 0x3f;
9783 IndexOperand &= ~0x3f;
9784 unsigned CountDw = 0;
9785
9786 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
9787 CountDw = (IndexOperand >> 24) & 0xf;
9788 IndexOperand &= ~(0xf << 24);
9789
9790 if (CountDw < 1 || CountDw > 4) {
9791 const Function &Fn = DAG.getMachineFunction().getFunction();
9792 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9793 Fn, "ds_ordered_count: dword count must be between 1 and 4",
9794 DL.getDebugLoc()));
9795 CountDw = 1;
9796 }
9797 }
9798
9799 if (IndexOperand) {
9800 const Function &Fn = DAG.getMachineFunction().getFunction();
9801 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9802 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
9803 }
9804
9805 if (WaveDone && !WaveRelease) {
9806 // TODO: Move this to IR verifier
9807 const Function &Fn = DAG.getMachineFunction().getFunction();
9808 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9809 Fn, "ds_ordered_count: wave_done requires wave_release",
9810 DL.getDebugLoc()));
9811 }
9812
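// Pack the ordered-count parameters (counter index, wave_release/wave_done,
// shader type, add/swap opcode and, on GFX10+, the dword count) into the
// 16-bit offset field of the DS instruction.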
9813 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
9814 unsigned ShaderType =
9816 unsigned Offset0 = OrderedCountIndex << 2;
9817 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
9818
9819 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
9820 Offset1 |= (CountDw - 1) << 6;
9821
9822 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
9823 Offset1 |= ShaderType << 2;
9824
9825 unsigned Offset = Offset0 | (Offset1 << 8);
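// As a rough worked example (hypothetical operand values): for ds_ordered_add
// with wave_release = 1, wave_done = 0, an ordered-count index of 3 and a
// dword count of 2 on GFX10, Offset0 is 3 << 2 = 0xc and Offset1 is
// 1 | ((2 - 1) << 6) | (ShaderType << 2), so Offset = 0xc | (Offset1 << 8).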
9826
9827 SDValue Ops[] = {
9828 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
9829 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
9830 };
9831 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
9832 M->getVTList(), Ops, M->getMemoryVT(),
9833 M->getMemOperand());
9834 }
9835 case Intrinsic::amdgcn_raw_buffer_load:
9836 case Intrinsic::amdgcn_raw_ptr_buffer_load:
9837 case Intrinsic::amdgcn_raw_atomic_buffer_load:
9838 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
9839 case Intrinsic::amdgcn_raw_buffer_load_format:
9840 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
9841 const bool IsFormat =
9842 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
9843 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
9844
9845 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9846 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9847 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
9848 SDValue Ops[] = {
9849 Op.getOperand(0), // Chain
9850 Rsrc, // rsrc
9851 DAG.getConstant(0, DL, MVT::i32), // vindex
9852 VOffset, // voffset
9853 SOffset, // soffset
9854 Offset, // offset
9855 Op.getOperand(5), // cachepolicy, swizzled buffer
9856 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9857 };
9858
9859 auto *M = cast<MemSDNode>(Op);
9860 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
9861 }
9862 case Intrinsic::amdgcn_struct_buffer_load:
9863 case Intrinsic::amdgcn_struct_ptr_buffer_load:
9864 case Intrinsic::amdgcn_struct_buffer_load_format:
9865 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
9866 case Intrinsic::amdgcn_struct_atomic_buffer_load:
9867 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
9868 const bool IsFormat =
9869 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
9870 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
9871
9872 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9873 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9874 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9875 SDValue Ops[] = {
9876 Op.getOperand(0), // Chain
9877 Rsrc, // rsrc
9878 Op.getOperand(3), // vindex
9879 VOffset, // voffset
9880 SOffset, // soffset
9881 Offset, // offset
9882 Op.getOperand(6), // cachepolicy, swizzled buffer
9883 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9884 };
9885
9886 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
9887 }
9888 case Intrinsic::amdgcn_raw_tbuffer_load:
9889 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
9890 MemSDNode *M = cast<MemSDNode>(Op);
9891 EVT LoadVT = Op.getValueType();
9892 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9893 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
9894 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
9895
9896 SDValue Ops[] = {
9897 Op.getOperand(0), // Chain
9898 Rsrc, // rsrc
9899 DAG.getConstant(0, DL, MVT::i32), // vindex
9900 VOffset, // voffset
9901 SOffset, // soffset
9902 Offset, // offset
9903 Op.getOperand(5), // format
9904 Op.getOperand(6), // cachepolicy, swizzled buffer
9905 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9906 };
9907
9908 if (LoadVT.getScalarType() == MVT::f16)
9909 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9910 Ops);
9911 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9912 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9913 DAG);
9914 }
9915 case Intrinsic::amdgcn_struct_tbuffer_load:
9916 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
9917 MemSDNode *M = cast<MemSDNode>(Op);
9918 EVT LoadVT = Op.getValueType();
9919 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9920 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9921 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9922
9923 SDValue Ops[] = {
9924 Op.getOperand(0), // Chain
9925 Rsrc, // rsrc
9926 Op.getOperand(3), // vindex
9927 VOffset, // voffset
9928 SOffset, // soffset
9929 Offset, // offset
9930 Op.getOperand(6), // format
9931 Op.getOperand(7), // cachepolicy, swizzled buffer
9932 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9933 };
9934
9935 if (LoadVT.getScalarType() == MVT::f16)
9936 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9937 Ops);
9938 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9939 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9940 DAG);
9941 }
9942 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9943 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9944 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
9945 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9946 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9947 return lowerStructBufferAtomicIntrin(Op, DAG,
9948 AMDGPUISD::BUFFER_ATOMIC_FADD);
9949 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9950 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9951 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
9952 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9953 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9954 return lowerStructBufferAtomicIntrin(Op, DAG,
9955 AMDGPUISD::BUFFER_ATOMIC_FMIN);
9956 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9957 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9958 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
9959 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9960 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9961 return lowerStructBufferAtomicIntrin(Op, DAG,
9962 AMDGPUISD::BUFFER_ATOMIC_FMAX);
9963 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9964 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9965 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
9966 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9967 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9968 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9969 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9970 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9971 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9972 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9973 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9974 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
9975 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9976 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9977 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
9978 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9979 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9980 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
9981 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9982 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9983 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
9984 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9985 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9986 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9987 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9988 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9989 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9990 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9991 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9992 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9993 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9994 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9995 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9996 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9997 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9998 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9999 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10000 return lowerRawBufferAtomicIntrin(Op, DAG,
10001 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10002 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10003 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10004 return lowerStructBufferAtomicIntrin(Op, DAG,
10005 AMDGPUISD::BUFFER_ATOMIC_SWAP);
10006 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10007 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10008 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10009 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10010 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10011 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10012 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10013 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10014 return lowerStructBufferAtomicIntrin(Op, DAG,
10015 AMDGPUISD::BUFFER_ATOMIC_SMIN);
10016 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10017 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10018 return lowerStructBufferAtomicIntrin(Op, DAG,
10019 AMDGPUISD::BUFFER_ATOMIC_UMIN);
10020 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10021 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10022 return lowerStructBufferAtomicIntrin(Op, DAG,
10023 AMDGPUISD::BUFFER_ATOMIC_SMAX);
10024 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10025 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10026 return lowerStructBufferAtomicIntrin(Op, DAG,
10027 AMDGPUISD::BUFFER_ATOMIC_UMAX);
10028 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10029 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10030 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10031 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10032 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10033 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10034 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10035 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10036 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10037 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10038 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10039 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10040 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10041 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10042 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10043 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10044 return lowerStructBufferAtomicIntrin(Op, DAG,
10045 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10046
10047 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10048 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10049 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
10050 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10051 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10052 SDValue Ops[] = {
10053 Op.getOperand(0), // Chain
10054 Op.getOperand(2), // src
10055 Op.getOperand(3), // cmp
10056 Rsrc, // rsrc
10057 DAG.getConstant(0, DL, MVT::i32), // vindex
10058 VOffset, // voffset
10059 SOffset, // soffset
10060 Offset, // offset
10061 Op.getOperand(7), // cachepolicy
10062 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10063 };
10064 EVT VT = Op.getValueType();
10065 auto *M = cast<MemSDNode>(Op);
10066
10067 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10068 Op->getVTList(), Ops, VT,
10069 M->getMemOperand());
10070 }
10071 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10072 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10073 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
10074 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
10075 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
10076 SDValue Ops[] = {
10077 Op.getOperand(0), // Chain
10078 Op.getOperand(2), // src
10079 Op.getOperand(3), // cmp
10080 Rsrc, // rsrc
10081 Op.getOperand(5), // vindex
10082 VOffset, // voffset
10083 SOffset, // soffset
10084 Offset, // offset
10085 Op.getOperand(8), // cachepolicy
10086 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10087 };
10088 EVT VT = Op.getValueType();
10089 auto *M = cast<MemSDNode>(Op);
10090
10091 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10092 Op->getVTList(), Ops, VT,
10093 M->getMemOperand());
10094 }
10095 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10096 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10097 MemSDNode *M = cast<MemSDNode>(Op);
10098 SDValue NodePtr = M->getOperand(2);
10099 SDValue RayExtent = M->getOperand(3);
10100 SDValue InstanceMask = M->getOperand(4);
10101 SDValue RayOrigin = M->getOperand(5);
10102 SDValue RayDir = M->getOperand(6);
10103 SDValue Offsets = M->getOperand(7);
10104 SDValue TDescr = M->getOperand(8);
10105
10106 assert(NodePtr.getValueType() == MVT::i64);
10107 assert(RayDir.getValueType() == MVT::v3f32);
10108
10109 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10110 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10111 return SDValue();
10112 }
10113
10114 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10115 const unsigned NumVDataDwords = 10;
10116 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10117 int Opcode = AMDGPU::getMIMGOpcode(
10118 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10119 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10120 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10121 assert(Opcode != -1);
10122
10123 SmallVector<SDValue, 7> Ops;
10124 Ops.push_back(NodePtr);
10125 Ops.push_back(DAG.getBuildVector(
10126 MVT::v2i32, DL,
10127 {DAG.getBitcast(MVT::i32, RayExtent),
10128 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10129 Ops.push_back(RayOrigin);
10130 Ops.push_back(RayDir);
10131 Ops.push_back(Offsets);
10132 Ops.push_back(TDescr);
10133 Ops.push_back(M->getChain());
10134
10135 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10136 MachineMemOperand *MemRef = M->getMemOperand();
10137 DAG.setNodeMemRefs(NewNode, {MemRef});
10138 return SDValue(NewNode, 0);
10139 }
10140 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10141 MemSDNode *M = cast<MemSDNode>(Op);
10142 SDValue NodePtr = M->getOperand(2);
10143 SDValue RayExtent = M->getOperand(3);
10144 SDValue RayOrigin = M->getOperand(4);
10145 SDValue RayDir = M->getOperand(5);
10146 SDValue RayInvDir = M->getOperand(6);
10147 SDValue TDescr = M->getOperand(7);
10148
10149 assert(NodePtr.getValueType() == MVT::i32 ||
10150 NodePtr.getValueType() == MVT::i64);
10151 assert(RayDir.getValueType() == MVT::v3f16 ||
10152 RayDir.getValueType() == MVT::v3f32);
10153
10154 if (!Subtarget->hasGFX10_AEncoding()) {
10155 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10156 return SDValue();
10157 }
10158
10159 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
10160 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
10161 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10162 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
10163 const bool Is64 = NodePtr.getValueType() == MVT::i64;
10164 const unsigned NumVDataDwords = 4;
10165 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10166 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10167 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10168 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
10169 IsGFX12Plus;
10170 const unsigned BaseOpcodes[2][2] = {
10171 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10172 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10173 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10174 int Opcode;
10175 if (UseNSA) {
10176 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10177 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10178 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10179 : AMDGPU::MIMGEncGfx10NSA,
10180 NumVDataDwords, NumVAddrDwords);
10181 } else {
10182 assert(!IsGFX12Plus);
10183 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10184 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10185 : AMDGPU::MIMGEncGfx10Default,
10186 NumVDataDwords, NumVAddrDwords);
10187 }
10188 assert(Opcode != -1);
10189
10190 SmallVector<SDValue, 16> Ops;
10191
10192 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
10193 SmallVector<SDValue, 3> Lanes;
10194 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
10195 if (Lanes[0].getValueSizeInBits() == 32) {
10196 for (unsigned I = 0; I < 3; ++I)
10197 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
10198 } else {
10199 if (IsAligned) {
10200 Ops.push_back(DAG.getBitcast(
10201 MVT::i32,
10202 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
10203 Ops.push_back(Lanes[2]);
10204 } else {
10205 SDValue Elt0 = Ops.pop_back_val();
10206 Ops.push_back(DAG.getBitcast(
10207 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
10208 Ops.push_back(DAG.getBitcast(
10209 MVT::i32,
10210 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
10211 }
10212 }
10213 };
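// packLanes, illustrated: three f32 lanes become three dwords as-is; three
// f16 lanes {a, b, c} become a packed {a, b} dword plus a trailing c when the
// data starts dword-aligned, otherwise the previous trailing element is
// re-paired with a and {b, c} forms the next dword.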
10214
10215 if (UseNSA && IsGFX11Plus) {
10216 Ops.push_back(NodePtr);
10217 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10218 Ops.push_back(RayOrigin);
10219 if (IsA16) {
10220 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
10221 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
10222 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
10223 for (unsigned I = 0; I < 3; ++I) {
10224 MergedLanes.push_back(DAG.getBitcast(
10225 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
10226 {DirLanes[I], InvDirLanes[I]})));
10227 }
10228 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
10229 } else {
10230 Ops.push_back(RayDir);
10231 Ops.push_back(RayInvDir);
10232 }
10233 } else {
10234 if (Is64)
10235 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
10236 2);
10237 else
10238 Ops.push_back(NodePtr);
10239
10240 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10241 packLanes(RayOrigin, true);
10242 packLanes(RayDir, true);
10243 packLanes(RayInvDir, false);
10244 }
10245
10246 if (!UseNSA) {
10247 // Build a single vector containing all the operands so far prepared.
10248 if (NumVAddrDwords > 12) {
10249 SDValue Undef = DAG.getPOISON(MVT::i32);
10250 Ops.append(16 - Ops.size(), Undef);
10251 }
10252 assert(Ops.size() >= 8 && Ops.size() <= 12);
10253 SDValue MergedOps =
10254 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
10255 Ops.clear();
10256 Ops.push_back(MergedOps);
10257 }
10258
10259 Ops.push_back(TDescr);
10260 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
10261 Ops.push_back(M->getChain());
10262
10263 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10264 MachineMemOperand *MemRef = M->getMemOperand();
10265 DAG.setNodeMemRefs(NewNode, {MemRef});
10266 return SDValue(NewNode, 0);
10267 }
10268 case Intrinsic::amdgcn_global_atomic_fmin_num:
10269 case Intrinsic::amdgcn_global_atomic_fmax_num:
10270 case Intrinsic::amdgcn_flat_atomic_fmin_num:
10271 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10272 MemSDNode *M = cast<MemSDNode>(Op);
10273 SDValue Ops[] = {
10274 M->getOperand(0), // Chain
10275 M->getOperand(2), // Ptr
10276 M->getOperand(3) // Value
10277 };
10278 unsigned Opcode = 0;
10279 switch (IntrID) {
10280 case Intrinsic::amdgcn_global_atomic_fmin_num:
10281 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10282 Opcode = ISD::ATOMIC_LOAD_FMIN;
10283 break;
10284 }
10285 case Intrinsic::amdgcn_global_atomic_fmax_num:
10286 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10287 Opcode = ISD::ATOMIC_LOAD_FMAX;
10288 break;
10289 }
10290 default:
10291 llvm_unreachable("unhandled atomic opcode");
10292 }
10293 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
10294 Ops, M->getMemOperand());
10295 }
10296 case Intrinsic::amdgcn_s_get_barrier_state:
10297 case Intrinsic::amdgcn_s_get_named_barrier_state: {
10298 SDValue Chain = Op->getOperand(0);
10299 SmallVector<SDValue, 2> Ops;
10300 unsigned Opc;
10301
10302 if (isa<ConstantSDNode>(Op->getOperand(2))) {
10303 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
10304 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10305 BarID = (BarID >> 4) & 0x3F;
10306 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10307 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10308 Ops.push_back(K);
10309 Ops.push_back(Chain);
10310 } else {
10311 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10312 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
10313 SDValue M0Val;
10314 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
10315 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10316 M0Val = SDValue(
10317 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10318 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10319 0);
10320 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10321 } else
10322 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
10323 }
10324
10325 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10326 return SDValue(NewMI, 0);
10327 }
10328 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
10329 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
10330 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
10331 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
10332 SDValue Chain = Op->getOperand(0);
10333 SDValue Ptr = Op->getOperand(2);
10334 EVT VT = Op->getValueType(0);
10335 return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
10336 Chain, Ptr, MII->getMemOperand());
10337 }
10338 default:
10339
10340 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10341 AMDGPU::getImageDimIntrinsicInfo(IntrID))
10342 return lowerImage(Op, ImageDimIntr, DAG, true);
10343
10344 return SDValue();
10345 }
10346}
10347
10348// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
10349// dwordx4 if on SI and handle TFE loads.
10350SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
10351 SDVTList VTList,
10352 ArrayRef<SDValue> Ops, EVT MemVT,
10353 MachineMemOperand *MMO,
10354 SelectionDAG &DAG) const {
10355 LLVMContext &C = *DAG.getContext();
10356 MachineFunction &MF = DAG.getMachineFunction();
10357 EVT VT = VTList.VTs[0];
10358
10359 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
10360 bool IsTFE = VTList.NumVTs == 3;
10361 if (IsTFE) {
10362 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
10363 unsigned NumOpDWords = NumValueDWords + 1;
10364 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
10365 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
10366 MachineMemOperand *OpDWordsMMO =
10367 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
10368 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
10369 OpDWordsVT, OpDWordsMMO, DAG);
10370 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10371 DAG.getVectorIdxConstant(NumValueDWords, DL));
10372 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
10373 SDValue ValueDWords =
10374 NumValueDWords == 1
10375 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
10376 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
10377 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
10378 ZeroIdx);
10379 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
10380 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10381 }
10382
10383 if (!Subtarget->hasDwordx3LoadStores() &&
10384 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10385 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
10386 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
10387 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
10388 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
10389 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
10390 WidenedMemVT, WidenedMMO);
10391 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
10392 DAG.getVectorIdxConstant(0, DL));
10393 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
10394 }
10395
10396 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
10397}
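// A minimal standalone sketch of the TFE dword accounting above (the helper
// name is made up for illustration and not used by the lowering): an N-dword
// result is fetched as N + 1 dwords, and the extra trailing dword carries the
// TFE status value that getMemIntrinsicNode extracts and merges back in.
static unsigned exampleNumTFEOpDwords(unsigned ValueSizeInBits) {
  unsigned NumValueDWords = (ValueSizeInBits + 31) / 32; // divideCeil(bits, 32)
  return NumValueDWords + 1;                             // plus the status dword
}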
10398
10399SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
10400 bool ImageStore) const {
10401 EVT StoreVT = VData.getValueType();
10402
10403 // No change for f16 and legal vector D16 types.
10404 if (!StoreVT.isVector())
10405 return VData;
10406
10407 SDLoc DL(VData);
10408 unsigned NumElements = StoreVT.getVectorNumElements();
10409
10410 if (Subtarget->hasUnpackedD16VMem()) {
10411 // We need to unpack the packed data to store.
10412 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10413 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10414
10415 EVT EquivStoreVT =
10416 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
10417 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
10418 return DAG.UnrollVectorOp(ZExt.getNode());
10419 }
10420
10421 // The sq block of gfx8.1 does not estimate register use correctly for d16
10422 // image store instructions. The data operand is computed as if it were not a
10423 // d16 image instruction.
10424 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
10425 // Bitcast to i16
10426 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10427 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10428
10429 // Decompose into scalars
10430 SmallVector<SDValue, 4> Elts;
10431 DAG.ExtractVectorElements(IntVData, Elts);
10432
10433 // Group pairs of i16 into v2i16 and bitcast to i32
10434 SmallVector<SDValue, 4> PackedElts;
10435 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
10436 SDValue Pair =
10437 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
10438 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10439 PackedElts.push_back(IntPair);
10440 }
10441 if ((NumElements % 2) == 1) {
10442 // Handle v3i16
10443 unsigned I = Elts.size() / 2;
10444 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
10445 {Elts[I * 2], DAG.getPOISON(MVT::i16)});
10446 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10447 PackedElts.push_back(IntPair);
10448 }
10449
10450 // Pad with poison values up to the original element count
10451 PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));
10452
10453 // Build final vector
10454 EVT VecVT =
10455 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
10456 return DAG.getBuildVector(VecVT, DL, PackedElts);
10457 }
10458
10459 if (NumElements == 3) {
10460 EVT IntStoreVT =
10461 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getSizeInBits());
10462 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10463
10464 EVT WidenedStoreVT = EVT::getVectorVT(
10465 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
10466 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
10467 WidenedStoreVT.getStoreSizeInBits());
10468 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
10469 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
10470 }
10471
10472 assert(isTypeLegal(StoreVT));
10473 return VData;
10474}
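// A small standalone sketch (hypothetical helper, not referenced by the
// lowering) of the pair packing used by the gfx8.1 image-store workaround
// above: two 16-bit elements are folded into one dword, element 0 in the low
// half, matching the v2i16 -> i32 bitcasts performed in handleD16VData.
static uint32_t examplePackD16Pair(uint16_t Lo, uint16_t Hi) {
  return uint32_t(Lo) | (uint32_t(Hi) << 16);
}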
10475
10476SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
10477 SelectionDAG &DAG) const {
10478 SDLoc DL(Op);
10479 SDValue Chain = Op.getOperand(0);
10480 unsigned IntrinsicID = Op.getConstantOperandVal(1);
10481 MachineFunction &MF = DAG.getMachineFunction();
10482
10483 switch (IntrinsicID) {
10484 case Intrinsic::amdgcn_exp_compr: {
10485 if (!Subtarget->hasCompressedExport()) {
10486 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10487 DAG.getMachineFunction().getFunction(),
10488 "intrinsic not supported on subtarget", DL.getDebugLoc()));
10489 }
10490 SDValue Src0 = Op.getOperand(4);
10491 SDValue Src1 = Op.getOperand(5);
10492 // Hack around illegal type on SI by directly selecting it.
10493 if (isTypeLegal(Src0.getValueType()))
10494 return SDValue();
10495
10496 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
10497 SDValue Undef = DAG.getPOISON(MVT::f32);
10498 const SDValue Ops[] = {
10499 Op.getOperand(2), // tgt
10500 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
10501 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
10502 Undef, // src2
10503 Undef, // src3
10504 Op.getOperand(7), // vm
10505 DAG.getTargetConstant(1, DL, MVT::i1), // compr
10506 Op.getOperand(3), // en
10507 Op.getOperand(0) // Chain
10508 };
10509
10510 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
10511 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
10512 }
10513
10514 case Intrinsic::amdgcn_struct_tbuffer_store:
10515 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
10516 SDValue VData = Op.getOperand(2);
10517 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
10518 if (IsD16)
10519 VData = handleD16VData(VData, DAG);
10520 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10521 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10522 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10523 SDValue Ops[] = {
10524 Chain,
10525 VData, // vdata
10526 Rsrc, // rsrc
10527 Op.getOperand(4), // vindex
10528 VOffset, // voffset
10529 SOffset, // soffset
10530 Offset, // offset
10531 Op.getOperand(7), // format
10532 Op.getOperand(8), // cachepolicy, swizzled buffer
10533 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10534 };
10535 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
10536 : AMDGPUISD::TBUFFER_STORE_FORMAT;
10537 MemSDNode *M = cast<MemSDNode>(Op);
10538 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
10539 M->getMemoryVT(), M->getMemOperand());
10540 }
10541
10542 case Intrinsic::amdgcn_raw_tbuffer_store:
10543 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
10544 SDValue VData = Op.getOperand(2);
10545 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
10546 if (IsD16)
10547 VData = handleD16VData(VData, DAG);
10548 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10549 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10550 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10551 SDValue Ops[] = {
10552 Chain,
10553 VData, // vdata
10554 Rsrc, // rsrc
10555 DAG.getConstant(0, DL, MVT::i32), // vindex
10556 VOffset, // voffset
10557 SOffset, // soffset
10558 Offset, // offset
10559 Op.getOperand(6), // format
10560 Op.getOperand(7), // cachepolicy, swizzled buffer
10561 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10562 };
10563 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
10564 : AMDGPUISD::TBUFFER_STORE_FORMAT;
10565 MemSDNode *M = cast<MemSDNode>(Op);
10566 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
10567 M->getMemoryVT(), M->getMemOperand());
10568 }
10569
10570 case Intrinsic::amdgcn_raw_buffer_store:
10571 case Intrinsic::amdgcn_raw_ptr_buffer_store:
10572 case Intrinsic::amdgcn_raw_buffer_store_format:
10573 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
10574 const bool IsFormat =
10575 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
10576 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
10577
10578 SDValue VData = Op.getOperand(2);
10579 EVT VDataVT = VData.getValueType();
10580 EVT EltType = VDataVT.getScalarType();
10581 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
10582 if (IsD16) {
10583 VData = handleD16VData(VData, DAG);
10584 VDataVT = VData.getValueType();
10585 }
10586
10587 if (!isTypeLegal(VDataVT)) {
10588 VData =
10589 DAG.getNode(ISD::BITCAST, DL,
10590 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
10591 }
10592
10593 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10594 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10595 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10596 SDValue Ops[] = {
10597 Chain,
10598 VData,
10599 Rsrc,
10600 DAG.getConstant(0, DL, MVT::i32), // vindex
10601 VOffset, // voffset
10602 SOffset, // soffset
10603 Offset, // offset
10604 Op.getOperand(6), // cachepolicy, swizzled buffer
10605 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10606 };
10607 unsigned Opc =
10608 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
10609 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
10610 MemSDNode *M = cast<MemSDNode>(Op);
10611
10612 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
10613 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
10614 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
10615
10616 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
10617 M->getMemoryVT(), M->getMemOperand());
10618 }
10619
10620 case Intrinsic::amdgcn_struct_buffer_store:
10621 case Intrinsic::amdgcn_struct_ptr_buffer_store:
10622 case Intrinsic::amdgcn_struct_buffer_store_format:
10623 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
10624 const bool IsFormat =
10625 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
10626 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
10627
10628 SDValue VData = Op.getOperand(2);
10629 EVT VDataVT = VData.getValueType();
10630 EVT EltType = VDataVT.getScalarType();
10631 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
10632
10633 if (IsD16) {
10634 VData = handleD16VData(VData, DAG);
10635 VDataVT = VData.getValueType();
10636 }
10637
10638 if (!isTypeLegal(VDataVT)) {
10639 VData =
10640 DAG.getNode(ISD::BITCAST, DL,
10641 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
10642 }
10643
10644 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10645 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10646 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10647 SDValue Ops[] = {
10648 Chain,
10649 VData,
10650 Rsrc,
10651 Op.getOperand(4), // vindex
10652 VOffset, // voffset
10653 SOffset, // soffset
10654 Offset, // offset
10655 Op.getOperand(7), // cachepolicy, swizzled buffer
10656 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10657 };
10658 unsigned Opc =
10659 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
10660 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
10661 MemSDNode *M = cast<MemSDNode>(Op);
10662
10663 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
10664 EVT VDataType = VData.getValueType().getScalarType();
10665 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
10666 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
10667
10668 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
10669 M->getMemoryVT(), M->getMemOperand());
10670 }
10671 case Intrinsic::amdgcn_raw_buffer_load_lds:
10672 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
10673 case Intrinsic::amdgcn_struct_buffer_load_lds:
10674 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
10675 if (!Subtarget->hasVMemToLDSLoad())
10676 return SDValue();
10677 unsigned Opc;
10678 bool HasVIndex =
10679 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
10680 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
10681 unsigned OpOffset = HasVIndex ? 1 : 0;
10682 SDValue VOffset = Op.getOperand(5 + OpOffset);
10683 bool HasVOffset = !isNullConstant(VOffset);
10684 unsigned Size = Op->getConstantOperandVal(4);
10685
10686 switch (Size) {
10687 default:
10688 return SDValue();
10689 case 1:
10690 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
10691 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
10692 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
10693 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
10694 break;
10695 case 2:
10696 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
10697 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
10698 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
10699 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
10700 break;
10701 case 4:
10702 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
10703 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
10704 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
10705 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
10706 break;
10707 case 12:
10708 if (!Subtarget->hasLDSLoadB96_B128())
10709 return SDValue();
10710 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
10711 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
10712 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
10713 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
10714 break;
10715 case 16:
10716 if (!Subtarget->hasLDSLoadB96_B128())
10717 return SDValue();
10718 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
10719 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
10720 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
10721 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
10722 break;
10723 }
10724
10725 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
10726
10727 SmallVector<SDValue, 8> Ops;
10728
10729 if (HasVIndex && HasVOffset)
10730 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
10731 {Op.getOperand(5), // VIndex
10732 VOffset}));
10733 else if (HasVIndex)
10734 Ops.push_back(Op.getOperand(5));
10735 else if (HasVOffset)
10736 Ops.push_back(VOffset);
10737
10738 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10739 Ops.push_back(Rsrc);
10740 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
10741 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
10742 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10743 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
10744 Ops.push_back(DAG.getTargetConstant(
10745 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
10746 DL, MVT::i8)); // cpol
10747 Ops.push_back(DAG.getTargetConstant(
10748 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
10749 ? 1
10750 : 0,
10751 DL, MVT::i8)); // swz
10752 Ops.push_back(M0Val.getValue(0)); // Chain
10753 Ops.push_back(M0Val.getValue(1)); // Glue
10754
10755 auto *M = cast<MemSDNode>(Op);
10756 MachineMemOperand *LoadMMO = M->getMemOperand();
10757 // Don't set the offset value here because the pointer points to the base of
10758 // the buffer.
10759 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
10760
10761 MachinePointerInfo StorePtrI = LoadPtrI;
10762 LoadPtrI.V = PoisonValue::get(
10763 PointerType::get(*DAG.getContext(), AMDGPUAS::BUFFER_RESOURCE));
10764 StorePtrI.V = nullptr;
10765 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
10766
10767 auto F = LoadMMO->getFlags() &
10768 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
10769 LoadMMO =
10770 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
10771 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10772
10773 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
10774 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
10775 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10776
10777 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
10778 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
10779
10780 return SDValue(Load, 0);
10781 }
10782 // Buffers are handled by LowerBufferFatPointers, and we're going to go
10783 // for "trust me" that the remaining cases are global pointers until
10784 // such time as we can put two mem operands on an intrinsic.
10785 case Intrinsic::amdgcn_load_to_lds:
10786 case Intrinsic::amdgcn_global_load_lds: {
10787 if (!Subtarget->hasVMemToLDSLoad())
10788 return SDValue();
10789
10790 unsigned Opc;
10791 unsigned Size = Op->getConstantOperandVal(4);
10792 switch (Size) {
10793 default:
10794 return SDValue();
10795 case 1:
10796 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
10797 break;
10798 case 2:
10799 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
10800 break;
10801 case 4:
10802 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
10803 break;
10804 case 12:
10805 if (!Subtarget->hasLDSLoadB96_B128())
10806 return SDValue();
10807 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
10808 break;
10809 case 16:
10810 if (!Subtarget->hasLDSLoadB96_B128())
10811 return SDValue();
10812 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
10813 break;
10814 }
10815
10816 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
10817
10818 SmallVector<SDValue, 6> Ops;
10819
10820 SDValue Addr = Op.getOperand(2); // Global ptr
10821 SDValue VOffset;
10822 // Try to split SAddr and VOffset. Global and LDS pointers share the same
10823 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
10824 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
10825 SDValue LHS = Addr.getOperand(0);
10826 SDValue RHS = Addr.getOperand(1);
10827
10828 if (LHS->isDivergent())
10829 std::swap(LHS, RHS);
10830
10831 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
10832 RHS.getOperand(0).getValueType() == MVT::i32) {
10833 // add (i64 sgpr), (zero_extend (i32 vgpr))
10834 Addr = LHS;
10835 VOffset = RHS.getOperand(0);
10836 }
10837 }
10838
10839 Ops.push_back(Addr);
10840 if (!Addr->isDivergent()) {
10841 Opc = AMDGPU::getGlobalSaddrOp(Opc);
10842 if (!VOffset)
10843 VOffset =
10844 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
10845 DAG.getTargetConstant(0, DL, MVT::i32)),
10846 0);
10847 Ops.push_back(VOffset);
10848 }
10849
10850 Ops.push_back(Op.getOperand(5)); // Offset
10851 Ops.push_back(Op.getOperand(6)); // CPol
10852 Ops.push_back(M0Val.getValue(0)); // Chain
10853 Ops.push_back(M0Val.getValue(1)); // Glue
10854
10855 auto *M = cast<MemSDNode>(Op);
10856 MachineMemOperand *LoadMMO = M->getMemOperand();
10857 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
10858 LoadPtrI.Offset = Op->getConstantOperandVal(5);
10859 MachinePointerInfo StorePtrI = LoadPtrI;
10860 LoadPtrI.V = PoisonValue::get(
10861 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
10862 StorePtrI.V = nullptr;
10863 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
10864 auto F = LoadMMO->getFlags() &
10865 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
10866 LoadMMO =
10867 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
10868 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10869 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
10870 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
10871 LoadMMO->getAAInfo());
10872
10873 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10874 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
10875
10876 return SDValue(Load, 0);
10877 }
10878 case Intrinsic::amdgcn_end_cf:
10879 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
10880 Op->getOperand(2), Chain),
10881 0);
10882 case Intrinsic::amdgcn_s_barrier_init:
10883 case Intrinsic::amdgcn_s_barrier_signal_var: {
10884 // these two intrinsics have two operands: barrier pointer and member count
10885 SDValue Chain = Op->getOperand(0);
10886 SmallVector<SDValue, 2> Ops;
10887 SDValue BarOp = Op->getOperand(2);
10888 SDValue CntOp = Op->getOperand(3);
10889 SDValue M0Val;
10890 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
10891 ? AMDGPU::S_BARRIER_INIT_M0
10892 : AMDGPU::S_BARRIER_SIGNAL_M0;
10893 // extract the BarrierID from bits 4-9 of BarOp
10894 SDValue BarID;
10895 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
10896 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10897 BarID =
10898 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
10899 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10900 0);
10901 // Member count should be put into M0[ShAmt:+6]
10902 // Barrier ID should be put into M0[5:0]
10903 M0Val =
10904 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
10905 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10906 0);
10907 constexpr unsigned ShAmt = 16;
10908 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
10909 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
10910
10911 M0Val = SDValue(
10912 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
10913
10914 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10915
10916 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10917 return SDValue(NewMI, 0);
10918 }
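// For illustration (hypothetical values): with BarOp = 0x50 and a member
// count of 12, the barrier ID is (0x50 >> 4) & 0x3F = 5, and M0 is set to
// (12 << 16) | 5 before the S_BARRIER_*_M0 instruction is emitted.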
10919 case Intrinsic::amdgcn_s_barrier_join: {
10920 // This intrinsic has one operand: the barrier pointer.
10921 SDValue Chain = Op->getOperand(0);
10922 SmallVector<SDValue, 2> Ops;
10923 SDValue BarOp = Op->getOperand(2);
10924 unsigned Opc;
10925
10926 if (isa<ConstantSDNode>(BarOp)) {
10927 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
10928 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
10929
10930 // extract the BarrierID from bits 4-9 of the immediate
10931 unsigned BarID = (BarVal >> 4) & 0x3F;
10932 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10933 Ops.push_back(K);
10934 Ops.push_back(Chain);
10935 } else {
10936 Opc = AMDGPU::S_BARRIER_JOIN_M0;
10937
10938 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
10939 SDValue M0Val;
10940 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
10941 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10942 M0Val =
10943 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10944 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10945 0);
10946 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10947 }
10948
10949 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10950 return SDValue(NewMI, 0);
10951 }
10952 case Intrinsic::amdgcn_s_prefetch_data: {
10953 // For non-global address space preserve the chain and remove the call.
10954 if (cast<MemSDNode>(Op)->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS)
10955 return Op.getOperand(0);
10956 return Op;
10957 }
10958 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
10959 SDValue Ops[] = {
10960 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
10961 Op.getOperand(3), // offset
10962 Op.getOperand(4), // length
10963 };
10964
10965 MemSDNode *M = cast<MemSDNode>(Op);
10966 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
10967 Op->getVTList(), Ops, M->getMemoryVT(),
10968 M->getMemOperand());
10969 }
10970 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
10971 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
10972 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
10973 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
10974 SDValue Chain = Op->getOperand(0);
10975 SDValue Ptr = Op->getOperand(2);
10976 SDValue Val = Op->getOperand(3);
10977 return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
10978 Ptr, MII->getMemOperand());
10979 }
10980 default: {
10981 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10982 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
10983 return lowerImage(Op, ImageDimIntr, DAG, true);
10984
10985 return Op;
10986 }
10987 }
10988}
10989
10990// Return whether the operation has NoUnsignedWrap property.
10991static bool isNoUnsignedWrap(SDValue Addr) {
10992 return (Addr.getOpcode() == ISD::ADD &&
10993 Addr->getFlags().hasNoUnsignedWrap()) ||
10994 Addr->getOpcode() == ISD::OR;
10995}
10996
10997bool SITargetLowering::shouldPreservePtrArith(const Function &F,
10998 EVT PtrVT) const {
10999 return UseSelectionDAGPTRADD && PtrVT == MVT::i64;
11000}
11001
11002// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
11003// offset (the offset that is included in bounds checking and swizzling, to be
11004// split between the instruction's voffset and immoffset fields) and soffset
11005// (the offset that is excluded from bounds checking and swizzling, to go in
11006// the instruction's soffset field). This function takes the first kind of
11007// offset and figures out how to split it between voffset and immoffset.
11008std::pair<SDValue, SDValue>
11009SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
11010 SDLoc DL(Offset);
11011 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
11012 SDValue N0 = Offset;
11013 ConstantSDNode *C1 = nullptr;
11014
11015 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
11016 N0 = SDValue();
11017 else if (DAG.isBaseWithConstantOffset(N0)) {
11018 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
11019 // being added, so we can only safely match a 32-bit addition with no
11020 // unsigned overflow.
11021 bool CheckNUW = AMDGPU::isGFX1250(*Subtarget);
11022 if (!CheckNUW || isNoUnsignedWrap(N0)) {
11023 C1 = cast<ConstantSDNode>(N0.getOperand(1));
11024 N0 = N0.getOperand(0);
11025 }
11026 }
11027
11028 if (C1) {
11029 unsigned ImmOffset = C1->getZExtValue();
11030 // If the immediate value is too big for the immoffset field, put only bits
11031 // that would normally fit in the immoffset field. The remaining value that
11032 // is copied/added for the voffset field is a large power of 2, and it
11033 // stands more chance of being CSEd with the copy/add for another similar
11034 // load/store.
11035 // However, do not do that rounding down if that is a negative
11036 // number, as it appears to be illegal to have a negative offset in the
11037 // vgpr, even if adding the immediate offset makes it positive.
11038 unsigned Overflow = ImmOffset & ~MaxImm;
11039 ImmOffset -= Overflow;
11040 if ((int32_t)Overflow < 0) {
11041 Overflow += ImmOffset;
11042 ImmOffset = 0;
11043 }
11044 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
11045 if (Overflow) {
11046 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
11047 if (!N0)
11048 N0 = OverflowVal;
11049 else {
11050 SDValue Ops[] = {N0, OverflowVal};
11051 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
11052 }
11053 }
11054 }
11055 if (!N0)
11056 N0 = DAG.getConstant(0, DL, MVT::i32);
11057 if (!C1)
11058 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
11059 return {N0, SDValue(C1, 0)};
11060}
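// A standalone sketch of the split performed above, assuming the 12-bit
// MUBUF immediate field of pre-GFX12 targets (MaxImm == 4095); the helper is
// illustrative only and is not used by the lowering.
static void exampleSplitImmOffset(uint32_t CombinedOffset, uint32_t &VOffsetPart,
                                  uint32_t &ImmOffsetPart) {
  const uint32_t MaxImm = 4095;
  uint32_t Overflow = CombinedOffset & ~MaxImm; // bits that do not fit in immoffset
  uint32_t ImmOffset = CombinedOffset - Overflow;
  if ((int32_t)Overflow < 0) {
    // A "negative" 32-bit voffset is not allowed, so put everything in the
    // register operand instead of rounding down.
    Overflow += ImmOffset;
    ImmOffset = 0;
  }
  VOffsetPart = Overflow;
  ImmOffsetPart = ImmOffset;
}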
11061
11062// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
11063// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
11064// pointed to by Offsets.
11065void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
11066 SelectionDAG &DAG, SDValue *Offsets,
11067 Align Alignment) const {
11068 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11069 SDLoc DL(CombinedOffset);
11070 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
11071 uint32_t Imm = C->getZExtValue();
11072 uint32_t SOffset, ImmOffset;
11073 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11074 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
11075 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11076 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11077 return;
11078 }
11079 }
11080 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
11081 SDValue N0 = CombinedOffset.getOperand(0);
11082 SDValue N1 = CombinedOffset.getOperand(1);
11083 uint32_t SOffset, ImmOffset;
11084 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
11085 if (Offset >= 0 &&
11086 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
11087 Offsets[0] = N0;
11088 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11089 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11090 return;
11091 }
11092 }
11093
11094 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11095 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
11096 : DAG.getConstant(0, DL, MVT::i32);
11097
11098 Offsets[0] = CombinedOffset;
11099 Offsets[1] = SOffsetZero;
11100 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
11101}
11102
11103SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11104 SelectionDAG &DAG) const {
11105 if (!MaybePointer.getValueType().isScalarInteger())
11106 return MaybePointer;
11107
11108 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
11109 return Rsrc;
11110}
11111
11112// Wrap a global or flat pointer into a buffer intrinsic using the flags
11113// specified in the intrinsic.
11114SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
11115 SelectionDAG &DAG) const {
11116 SDLoc Loc(Op);
11117
11118 SDValue Pointer = Op->getOperand(1);
11119 SDValue Stride = Op->getOperand(2);
11120 SDValue NumRecords = Op->getOperand(3);
11121 SDValue Flags = Op->getOperand(4);
11122
11123 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11124 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
11125 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
11126 std::optional<uint32_t> ConstStride = std::nullopt;
11127 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
11128 ConstStride = ConstNode->getZExtValue();
11129
11130 SDValue NewHighHalf = Masked;
11131 if (!ConstStride || *ConstStride != 0) {
11132 SDValue ShiftedStride;
11133 if (ConstStride) {
11134 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
11135 } else {
11136 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
11137 ShiftedStride =
11138 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11139 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
11140 }
11141 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
11142 }
11143
11144 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
11145 NewHighHalf, NumRecords, Flags);
11146 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
11147 return RsrcPtr;
11148}
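// A rough sketch (hypothetical helper) of how the second resource word is
// assembled above: the low 16 bits keep the upper pointer bits and the 16-bit
// stride lands in bits [31:16]; NumRecords and Flags occupy the remaining two
// words of the v4i32 descriptor unchanged.
static uint32_t examplePackRsrcWord1(uint32_t PointerHi, uint32_t Stride) {
  return (PointerHi & 0x0000ffff) | (Stride << 16);
}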
11149
11150// Handle 8 bit and 16 bit buffer loads
11151SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
11152 EVT LoadVT, SDLoc DL,
11153 ArrayRef<SDValue> Ops,
11154 MachineMemOperand *MMO,
11155 bool IsTFE) const {
11156 EVT IntVT = LoadVT.changeTypeToInteger();
11157
11158 if (IsTFE) {
11159 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
11160 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
11161 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
11162 MachineFunction &MF = DAG.getMachineFunction();
11163 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
11164 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
11165 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
11166 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11167 DAG.getConstant(1, DL, MVT::i32));
11168 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11169 DAG.getConstant(0, DL, MVT::i32));
11170 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
11171 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
11172 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
11173 }
11174
11175 unsigned Opc = LoadVT.getScalarType() == MVT::i8
11176 ? AMDGPUISD::BUFFER_LOAD_UBYTE
11177 : AMDGPUISD::BUFFER_LOAD_USHORT;
11178
11179 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
11180 SDValue BufferLoad =
11181 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
11182 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
11183 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
11184
11185 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
11186}
11187
11188// Handle 8 bit and 16 bit buffer stores
11189SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
11190 EVT VDataType, SDLoc DL,
11191 SDValue Ops[],
11192 MemSDNode *M) const {
11193 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11194 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
11195
11196 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
11197 Ops[1] = BufferStoreExt;
11198 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
11199 : AMDGPUISD::BUFFER_STORE_SHORT;
11200 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
11201 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
11202 M->getMemOperand());
11203}
11204
11205static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
11206 SDValue Op, const SDLoc &SL, EVT VT) {
11207 if (VT.bitsLT(Op.getValueType()))
11208 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
11209
11210 switch (ExtType) {
11211 case ISD::SEXTLOAD:
11212 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
11213 case ISD::ZEXTLOAD:
11214 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
11215 case ISD::EXTLOAD:
11216 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
11217 case ISD::NON_EXTLOAD:
11218 return Op;
11219 }
11220
11221 llvm_unreachable("invalid ext type");
11222}
11223
11224// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
11225// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
11226SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
11227 DAGCombinerInfo &DCI) const {
11228 SelectionDAG &DAG = DCI.DAG;
11229 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
11230 return SDValue();
11231
11232 // FIXME: Constant loads should all be marked invariant.
11233 unsigned AS = Ld->getAddressSpace();
11234 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
11235 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
11236 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
11237 return SDValue();
11238
11239 // Don't do this early, since it may interfere with adjacent load merging for
11240 // illegal types. We can avoid losing alignment information for exotic types
11241 // pre-legalize.
11242 EVT MemVT = Ld->getMemoryVT();
11243 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
11244 MemVT.getSizeInBits() >= 32)
11245 return SDValue();
11246
11247 SDLoc SL(Ld);
11248
11249 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
11250 "unexpected vector extload");
11251
11252 // TODO: Drop only high part of range.
11253 SDValue Ptr = Ld->getBasePtr();
11254 SDValue NewLoad = DAG.getLoad(
11255 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
11256 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
11257 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
11258 nullptr); // Drop ranges
11259
11260 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
11261 if (MemVT.isFloatingPoint()) {
11262 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
11263 "unexpected fp extload");
11264 TruncVT = MemVT.changeTypeToInteger();
11265 }
11266
11267 SDValue Cvt = NewLoad;
11268 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
11269 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
11270 DAG.getValueType(TruncVT));
11271 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
11272 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
11273 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
11274 } else {
11275 assert(Ld->getExtensionType() == ISD::EXTLOAD);
11276 }
11277
11278 EVT VT = Ld->getValueType(0);
11279 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
11280
11281 DCI.AddToWorklist(Cvt.getNode());
11282
11283 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
11284 // the appropriate extension from the 32-bit load.
11285 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
11286 DCI.AddToWorklist(Cvt.getNode());
11287
11288 // Handle conversion back to floating point if necessary.
11289 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
11290
11291 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
11292}
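// The SIGN_EXTEND_INREG step above is the 32-bit equivalent of a narrow
// sign-extending load; as a plain scalar sketch (illustrative helper only):
static int32_t exampleSignExtendInReg(uint32_t Loaded, unsigned NarrowBits) {
  unsigned Shift = 32 - NarrowBits;
  // Shift the narrow value to the top, then arithmetic-shift back down so the
  // upper bits replicate the sign bit of the NarrowBits-wide value.
  return (int32_t)(Loaded << Shift) >> Shift;
}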
11293
11294static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
11295 const SIMachineFunctionInfo &Info) {
11296 // TODO: Should check if the address can definitely not access stack.
11297 if (Info.isEntryFunction())
11298 return Info.getUserSGPRInfo().hasFlatScratchInit();
11299 return true;
11300}
11301
11302SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
11303 SDLoc DL(Op);
11304 LoadSDNode *Load = cast<LoadSDNode>(Op);
11305 ISD::LoadExtType ExtType = Load->getExtensionType();
11306 EVT MemVT = Load->getMemoryVT();
11307 MachineMemOperand *MMO = Load->getMemOperand();
11308
11309 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
11310 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
11311 return SDValue();
11312
11313 // FIXME: Copied from PPC
11314 // First, load into 32 bits, then truncate to 1 bit.
11315
11316 SDValue Chain = Load->getChain();
11317 SDValue BasePtr = Load->getBasePtr();
11318
11319 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
11320
11321 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
11322 RealMemVT, MMO);
11323
11324 if (!MemVT.isVector()) {
11325 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
11326 NewLD.getValue(1)};
11327
11328 return DAG.getMergeValues(Ops, DL);
11329 }
11330
11331 SmallVector<SDValue, 4> Elts;
11332 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
11333 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
11334 DAG.getConstant(I, DL, MVT::i32));
11335
11336 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
11337 }
11338
11339 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
11340
11341 return DAG.getMergeValues(Ops, DL);
11342 }
11343
11344 if (!MemVT.isVector())
11345 return SDValue();
11346
11347 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
11348 "Custom lowering for non-i32 vectors hasn't been implemented.");
11349
11350 Align Alignment = Load->getAlign();
11351 unsigned AS = Load->getAddressSpace();
11352 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
11353 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
11354 return SplitVectorLoad(Op, DAG);
11355 }
11356
11357 MachineFunction &MF = DAG.getMachineFunction();
11358 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11359 // If there is a possibility that a flat instruction accesses scratch memory,
11360 // then we need to use the same legalization rules we use for private.
11361 if (AS == AMDGPUAS::FLAT_ADDRESS &&
11362 !Subtarget->hasMultiDwordFlatScratchAddressing())
11363 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
11364 ? AMDGPUAS::PRIVATE_ADDRESS
11365 : AMDGPUAS::GLOBAL_ADDRESS;
11366
11367 unsigned NumElements = MemVT.getVectorNumElements();
11368
11369 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11370 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
11371 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
11372 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
11373 isMemOpHasNoClobberedMemOperand(Load))) {
11374 if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
11375 Alignment >= Align(4) && NumElements < 32) {
11376 if (MemVT.isPow2VectorType() ||
11377 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
11378 return SDValue();
11379 return WidenOrSplitVectorLoad(Op, DAG);
11380 }
11381 // Non-uniform loads will be selected to MUBUF instructions, so they
11382 // have the same legalization requirements as global and private
11383 // loads.
11384 //
11385 }
11386 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11387 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
11388 AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
11389 if (NumElements > 4)
11390 return SplitVectorLoad(Op, DAG);
11391 // v3 loads not supported on SI.
11392 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11393 return WidenOrSplitVectorLoad(Op, DAG);
11394
11395 // v3 and v4 loads are supported for private and global memory.
11396 return SDValue();
11397 }
11398 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11399 // Depending on the setting of the private_element_size field in the
11400 // resource descriptor, we can only make private accesses up to a certain
11401 // size.
11402 switch (Subtarget->getMaxPrivateElementSize()) {
11403 case 4: {
11404 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
11405 return DAG.getMergeValues({Op0, Op1}, DL);
11406 }
11407 case 8:
11408 if (NumElements > 2)
11409 return SplitVectorLoad(Op, DAG);
11410 return SDValue();
11411 case 16:
11412 // Same as global/flat
11413 if (NumElements > 4)
11414 return SplitVectorLoad(Op, DAG);
11415 // v3 loads not supported on SI.
11416 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11417 return WidenOrSplitVectorLoad(Op, DAG);
11418
11419 return SDValue();
11420 default:
11421 llvm_unreachable("unsupported private_element_size");
11422 }
11423 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11424 unsigned Fast = 0;
11425 auto Flags = Load->getMemOperand()->getFlags();
11426 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
11427 Load->getAlign(), Flags, &Fast) &&
11428 Fast > 1)
11429 return SDValue();
11430
11431 if (MemVT.isVector())
11432 return SplitVectorLoad(Op, DAG);
11433 }
11434
11435 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
11436 MemVT, *Load->getMemOperand())) {
11437 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
11438 return DAG.getMergeValues({Op0, Op1}, DL);
11439 }
11440
11441 return SDValue();
11442}
11443
11444SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
11445 EVT VT = Op.getValueType();
11446 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
11447 VT.getSizeInBits() == 512)
11448 return splitTernaryVectorOp(Op, DAG);
11449
11450 assert(VT.getSizeInBits() == 64);
11451
11452 SDLoc DL(Op);
11453 SDValue Cond = DAG.getFreeze(Op.getOperand(0));
11454
11455 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
11456 SDValue One = DAG.getConstant(1, DL, MVT::i32);
11457
11458 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
11459 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
11460
11461 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
11462 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
11463
11464 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
11465
11466 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
11467 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
11468
11469 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
11470
11471 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
11472 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
11473}
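// [Editorial sketch, not part of the lowering above] The same split expressed
// on plain integers: a 64-bit select becomes two independent 32-bit selects on
// the low and high halves, which map onto 32-bit operations on the condition.
[[maybe_unused]] static uint64_t select64ViaHalvesModel(bool Cond, uint64_t A, uint64_t B) {
  uint32_t Lo = Cond ? uint32_t(A) : uint32_t(B);             // low-half select
  uint32_t Hi = Cond ? uint32_t(A >> 32) : uint32_t(B >> 32); // high-half select
  return (uint64_t(Hi) << 32) | Lo;                           // rebuild the i64
}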
11474
11475// Catch division cases where we can use shortcuts with rcp and rsq
11476// instructions.
11477SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
11478 SelectionDAG &DAG) const {
11479 SDLoc SL(Op);
11480 SDValue LHS = Op.getOperand(0);
11481 SDValue RHS = Op.getOperand(1);
11482 EVT VT = Op.getValueType();
11483 const SDNodeFlags Flags = Op->getFlags();
11484
11485 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
11486
11487 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
11488 // Without !fpmath accuracy information, we can't do more because we don't
11489 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
11490 // f16 is always accurate enough
11491 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
11492 return SDValue();
11493
11494 if (CLHS->isExactlyValue(1.0)) {
11495 // v_rcp_f32 and v_rsq_f32 do not support denormals and, according to
11496 // the CI documentation, have a worst-case error of 1 ulp.
11497 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
11498 // use it as long as we aren't trying to use denormals.
11499 //
11500 // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
11501
11502 // 1.0 / sqrt(x) -> rsq(x)
11503
11504 // XXX - Is afn sufficient to do this for f64? The maximum ULP
11505 // error seems really high at 2^29 ULP.
11506 // 1.0 / x -> rcp(x)
11507 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
11508 }
11509
11510 // Same as for 1.0, but expand the sign out of the constant.
11511 if (CLHS->isExactlyValue(-1.0)) {
11512 // -1.0 / x -> rcp (fneg x)
11513 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
11514 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
11515 }
11516 }
11517
11518 // For f16 and bf16 require afn or arcp.
11519 // For f32 require afn.
11520 if (!AllowInaccurateRcp &&
11521 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
11522 return SDValue();
11523
11524 // Turn into multiply by the reciprocal.
11525 // x / y -> x * (1.0 / y)
11526 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
11527 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
11528}
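// [Editorial sketch, not part of the lowering above] A scalar model of the fast
// path, with 1.0f / y standing in for the V_RCP_F32 approximation (the real
// instruction is ~1 ulp and flushes denormals, hence the afn/arcp requirement).
[[maybe_unused]] static float fastFDivModel(float X, float Y) {
  float Rcp = 1.0f / Y; // placeholder for v_rcp_f32
  if (X == 1.0f)
    return Rcp;         // 1.0 / y  -> rcp(y)
  if (X == -1.0f)
    return -Rcp;        // -1.0 / y -> rcp(fneg y), i.e. the negated reciprocal
  return X * Rcp;       // x / y    -> x * rcp(y)
}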
11529
11530SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
11531 SelectionDAG &DAG) const {
11532 SDLoc SL(Op);
11533 SDValue X = Op.getOperand(0);
11534 SDValue Y = Op.getOperand(1);
11535 EVT VT = Op.getValueType();
11536 const SDNodeFlags Flags = Op->getFlags();
11537
11538 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
11539 if (!AllowInaccurateDiv)
11540 return SDValue();
11541
11542 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
11543 SDValue One = DAG.getConstantFP(1.0, SL, VT);
11544
11545 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
11546 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
11547
11548 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
11549 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
11550 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
11551 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
11552 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
11553 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
11554}
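// [Editorial sketch, not part of the lowering above] The Newton-Raphson
// refinement above as plain double arithmetic, assuming std::fma from <cmath>
// and using 1.0 / Y as a stand-in for V_RCP_F64. It mirrors the FMA chain
// node for node.
[[maybe_unused]] static double fastFDiv64Model(double X, double Y) {
  double NegY = -Y;
  double R = 1.0 / Y;                   // initial reciprocal estimate
  double Tmp0 = std::fma(NegY, R, 1.0); // error term: 1 - y*r
  R = std::fma(Tmp0, R, R);             // first refinement
  double Tmp1 = std::fma(NegY, R, 1.0);
  R = std::fma(Tmp1, R, R);             // second refinement
  double Ret = X * R;                   // quotient estimate
  double Tmp2 = std::fma(NegY, Ret, X); // residual: x - y*ret
  return std::fma(Tmp2, R, Ret);        // corrected quotient
}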
11555
11556static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
11557 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
11558 SDNodeFlags Flags) {
11559 if (GlueChain->getNumValues() <= 1) {
11560 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
11561 }
11562
11563 assert(GlueChain->getNumValues() == 3);
11564
11565 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
11566 switch (Opcode) {
11567 default:
11568 llvm_unreachable("no chain equivalent for opcode");
11569 case ISD::FMUL:
11570 Opcode = AMDGPUISD::FMUL_W_CHAIN;
11571 break;
11572 }
11573
11574 return DAG.getNode(Opcode, SL, VTList,
11575 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
11576 Flags);
11577}
11578
11579static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
11580 EVT VT, SDValue A, SDValue B, SDValue C,
11581 SDValue GlueChain, SDNodeFlags Flags) {
11582 if (GlueChain->getNumValues() <= 1) {
11583 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
11584 }
11585
11586 assert(GlueChain->getNumValues() == 3);
11587
11588 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
11589 switch (Opcode) {
11590 default:
11591 llvm_unreachable("no chain equivalent for opcode");
11592 case ISD::FMA:
11593 Opcode = AMDGPUISD::FMA_W_CHAIN;
11594 break;
11595 }
11596
11597 return DAG.getNode(Opcode, SL, VTList,
11598 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
11599 Flags);
11600}
11601
11602SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
11603 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
11604 return FastLowered;
11605
11606 SDLoc SL(Op);
11607 EVT VT = Op.getValueType();
11608 SDValue LHS = Op.getOperand(0);
11609 SDValue RHS = Op.getOperand(1);
11610
11611 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
11612 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
11613
11614 if (VT == MVT::bf16) {
11615 SDValue ExtDiv =
11616 DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
11617 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
11618 DAG.getTargetConstant(0, SL, MVT::i32));
11619 }
11620
11621 assert(VT == MVT::f16);
11622
11623 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
11624 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
11625 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
11626 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
11627 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
11628 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
11629 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
11630 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
11631 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
11632 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
11633 // q16.u = opx(V_CVT_F16_F32, q32.u);
11634 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
11635
11636 // We will use ISD::FMA on targets that don't support ISD::FMAD.
11637 unsigned FMADOpCode =
11638 isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
11639 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
11640 SDValue Rcp =
11641 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
11642 SDValue Quot =
11643 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
11644 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
11645 Op->getFlags());
11646 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
11647 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
11648 Op->getFlags());
11649 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
11650 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
11651 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
11652 DAG.getConstant(0xff800000, SL, MVT::i32));
11653 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
11654 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
11655 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
11656 DAG.getTargetConstant(0, SL, MVT::i32));
11657 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
11658 Op->getFlags());
11659}
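// [Editorial sketch, not part of the lowering above] The q32/e32 steps from the
// comment transcribed into scalar float code, assuming std::fma and std::memcpy
// and using 1.0f / b as a stand-in for V_RCP_F32. The f16 conversions and the
// final V_DIV_FIXUP_F16 are intentionally not modeled.
[[maybe_unused]] static float fdiv16CoreModel(float A32, float B32) {
  float Rcp = 1.0f / B32;                // r32 = rcp(b32)
  float Quot = A32 * Rcp;                // q32 = a32 * r32
  float Err = std::fma(-B32, Quot, A32); // e32 = -b32*q32 + a32
  Quot = std::fma(Err, Rcp, Quot);       // refine the quotient
  Err = std::fma(-B32, Quot, A32);       // recompute the error
  float Tmp = Err * Rcp;
  uint32_t TmpBits;
  std::memcpy(&TmpBits, &Tmp, sizeof(Tmp));
  TmpBits &= 0xff800000u;                // keep only sign and exponent bits
  std::memcpy(&Tmp, &TmpBits, sizeof(Tmp));
  return Tmp + Quot;                     // q32 before CVT/DIV_FIXUP
}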
11660
11661// Faster 2.5 ULP division that does not support denormals.
11662SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
11663 SDNodeFlags Flags = Op->getFlags();
11664 SDLoc SL(Op);
11665 SDValue LHS = Op.getOperand(1);
11666 SDValue RHS = Op.getOperand(2);
11667
11668 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
11669
11670 const APFloat K0Val(0x1p+96f);
11671 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
11672
11673 const APFloat K1Val(0x1p-32f);
11674 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
11675
11676 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
11677
11678 EVT SetCCVT =
11679 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
11680
11681 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
11682
11683 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
11684
11685 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
11686
11687 // rcp does not support denormals.
11688 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
11689
11690 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
11691
11692 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
11693}
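// [Editorial sketch, not part of the lowering above] The scaling trick in
// scalar form: if |rhs| is larger than 2^96, pre-scale it by 2^-32 so the
// reciprocal does not underflow, then fold the same factor back into the
// result, since x/y == s * (x * rcp(y*s)). Assumes <cmath>; 1.0f / v stands in
// for the 2.5 ulp V_RCP_F32.
[[maybe_unused]] static float fdivFastModel(float LHS, float RHS) {
  float S = (std::fabs(RHS) > 0x1.0p+96f) ? 0x1.0p-32f : 1.0f; // r3
  float Rcp = 1.0f / (RHS * S);                                // r0 = rcp(r1)
  return S * (LHS * Rcp);                                      // r3 * (lhs * r0)
}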
11694
11695// Returns immediate value for setting the F32 denorm mode when using the
11696// S_DENORM_MODE instruction.
11697static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
11698 const SIMachineFunctionInfo *Info,
11699 const GCNSubtarget *ST) {
11700 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
11701 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
11702 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
11703 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
11704}
11705
11706SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
11707 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
11708 return FastLowered;
11709
11710 // The selection matcher assumes anything with a chain selects to a
11711 // mayRaiseFPException machine instruction. Since we're introducing a chain
11712 // here, we need to explicitly report nofpexcept for the regular fdiv
11713 // lowering.
11714 SDNodeFlags Flags = Op->getFlags();
11715 Flags.setNoFPExcept(true);
11716
11717 SDLoc SL(Op);
11718 SDValue LHS = Op.getOperand(0);
11719 SDValue RHS = Op.getOperand(1);
11720
11721 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
11722
11723 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
11724
11725 SDValue DenominatorScaled =
11726 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
11727 SDValue NumeratorScaled =
11728 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
11729
11730 // Denominator is scaled to not be denormal, so using rcp is ok.
11731 SDValue ApproxRcp =
11732 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
11733 SDValue NegDivScale0 =
11734 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
11735
11736 using namespace AMDGPU::Hwreg;
11737 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
11738 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
11739
11740 const MachineFunction &MF = DAG.getMachineFunction();
11741 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
11742 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
11743
11744 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
11745 const bool HasDynamicDenormals =
11746 (DenormMode.Input == DenormalMode::Dynamic) ||
11747 (DenormMode.Output == DenormalMode::Dynamic);
11748
11749 SDValue SavedDenormMode;
11750
11751 if (!PreservesDenormals) {
11752 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
11753 // lowering. The chain dependence is insufficient, and we need glue. We do
11754 // not need the glue variants in a strictfp function.
11755
11756 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
11757
11758 SDValue Glue = DAG.getEntryNode();
11759 if (HasDynamicDenormals) {
11760 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
11761 DAG.getVTList(MVT::i32, MVT::Glue),
11762 {BitField, Glue});
11763 SavedDenormMode = SDValue(GetReg, 0);
11764
11765 Glue = DAG.getMergeValues(
11766 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
11767 }
11768
11769 SDNode *EnableDenorm;
11770 if (Subtarget->hasDenormModeInst()) {
11771 const SDValue EnableDenormValue =
11772 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
11773
11774 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
11775 EnableDenormValue)
11776 .getNode();
11777 } else {
11778 const SDValue EnableDenormValue =
11779 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
11780 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
11781 {EnableDenormValue, BitField, Glue});
11782 }
11783
11784 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
11785 SDValue(EnableDenorm, 1)};
11786
11787 NegDivScale0 = DAG.getMergeValues(Ops, SL);
11788 }
11789
11790 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
11791 ApproxRcp, One, NegDivScale0, Flags);
11792
11793 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
11794 ApproxRcp, Fma0, Flags);
11795
11796 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
11797 Fma1, Flags);
11798
11799 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
11800 NumeratorScaled, Mul, Flags);
11801
11802 SDValue Fma3 =
11803 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
11804
11805 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
11806 NumeratorScaled, Fma3, Flags);
11807
11808 if (!PreservesDenormals) {
11809 SDNode *DisableDenorm;
11810 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
11811 const SDValue DisableDenormValue = getSPDenormModeValue(
11812 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
11813
11814 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
11815 DisableDenorm =
11816 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
11817 Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
11818 .getNode();
11819 } else {
11820 assert(HasDynamicDenormals == (bool)SavedDenormMode);
11821 const SDValue DisableDenormValue =
11822 HasDynamicDenormals
11823 ? SavedDenormMode
11824 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
11825
11826 DisableDenorm = DAG.getMachineNode(
11827 AMDGPU::S_SETREG_B32, SL, MVT::Other,
11828 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
11829 }
11830
11831 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
11832 SDValue(DisableDenorm, 0), DAG.getRoot());
11833 DAG.setRoot(OutputChain);
11834 }
11835
11836 SDValue Scale = NumeratorScaled.getValue(1);
11837 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
11838 {Fma4, Fma1, Fma3, Scale}, Flags);
11839
11840 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
11841}
11842
11843SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
11844 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
11845 return FastLowered;
11846
11847 SDLoc SL(Op);
11848 SDValue X = Op.getOperand(0);
11849 SDValue Y = Op.getOperand(1);
11850
11851 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
11852
11853 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
11854
11855 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
11856
11857 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
11858
11859 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
11860
11861 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
11862
11863 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
11864
11865 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
11866
11867 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
11868
11869 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
11870 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
11871
11872 SDValue Fma4 =
11873 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
11874
11875 SDValue Scale;
11876
11877 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
11878 // Work around a hardware bug on SI where the condition output from div_scale
11879 // is not usable.
11880
11881 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
11882
11883 // Figure out which scale to use for div_fmas.
11884 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
11885 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
11886 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
11887 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
11888
11889 SDValue NumHi =
11890 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
11891 SDValue DenHi =
11892 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
11893
11894 SDValue Scale0Hi =
11895 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
11896 SDValue Scale1Hi =
11897 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
11898
11899 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
11900 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
11901 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
11902 } else {
11903 Scale = DivScale1.getValue(1);
11904 }
11905
11906 SDValue Fmas =
11907 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
11908
11909 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
11910}
11911
11912SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
11913 EVT VT = Op.getValueType();
11914
11915 if (VT == MVT::f32)
11916 return LowerFDIV32(Op, DAG);
11917
11918 if (VT == MVT::f64)
11919 return LowerFDIV64(Op, DAG);
11920
11921 if (VT == MVT::f16 || VT == MVT::bf16)
11922 return LowerFDIV16(Op, DAG);
11923
11924 llvm_unreachable("Unexpected type for fdiv");
11925}
11926
11927SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
11928 SDLoc dl(Op);
11929 SDValue Val = Op.getOperand(0);
11930 EVT VT = Val.getValueType();
11931 EVT ResultExpVT = Op->getValueType(1);
11932 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
11933
11934 SDValue Mant = DAG.getNode(
11935 ISD::INTRINSIC_WO_CHAIN, dl, VT,
11936 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
11937
11938 SDValue Exp = DAG.getNode(
11939 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
11940 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
11941
11942 if (Subtarget->hasFractBug()) {
11943 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
11944 SDValue Inf =
11945 DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT);
11946
11947 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
11948 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
11949 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
11950 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
11951 }
11952
11953 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
11954 return DAG.getMergeValues({Mant, CastExp}, dl);
11955}
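// [Editorial sketch, not part of the lowering above] The contract the frexp
// intrinsics implement for finite non-zero values: val == mant * 2^exp with
// |mant| in [0.5, 1.0). Assumes std::frexp/std::ldexp from <cmath>; the
// fract-bug path above only adjusts the non-finite cases.
[[maybe_unused]] static bool frexpContractHolds(double Val) {
  int Exp = 0;
  double Mant = std::frexp(Val, &Exp); // e.g. frexp(8.0) gives 0.5 and 4
  return Val == std::ldexp(Mant, Exp); // reassembles to the original value
}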
11956
11957SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
11958 SDLoc DL(Op);
11959 StoreSDNode *Store = cast<StoreSDNode>(Op);
11960 EVT VT = Store->getMemoryVT();
11961
11962 if (VT == MVT::i1) {
11963 return DAG.getTruncStore(
11964 Store->getChain(), DL,
11965 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
11966 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
11967 }
11968
11969 assert(VT.isVector() &&
11970 Store->getValue().getValueType().getScalarType() == MVT::i32);
11971
11972 unsigned AS = Store->getAddressSpace();
11973 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
11974 Store->getAlign().value() < VT.getStoreSize() &&
11975 VT.getSizeInBits() > 32) {
11976 return SplitVectorStore(Op, DAG);
11977 }
11978
11979 MachineFunction &MF = DAG.getMachineFunction();
11980 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11981 // If there is a possibility that a flat instruction accesses scratch memory,
11982 // then we need to use the same legalization rules we use for private.
11983 if (AS == AMDGPUAS::FLAT_ADDRESS &&
11984 !Subtarget->hasMultiDwordFlatScratchAddressing())
11985 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
11986 ? AMDGPUAS::PRIVATE_ADDRESS
11987 : AMDGPUAS::GLOBAL_ADDRESS;
11988
11989 unsigned NumElements = VT.getVectorNumElements();
11990 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
11991 if (NumElements > 4)
11992 return SplitVectorStore(Op, DAG);
11993 // v3 stores not supported on SI.
11994 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11995 return SplitVectorStore(Op, DAG);
11996
11997 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
11998 VT, *Store->getMemOperand()))
11999 return expandUnalignedStore(Store, DAG);
12000
12001 return SDValue();
12002 }
12003 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
12004 switch (Subtarget->getMaxPrivateElementSize()) {
12005 case 4:
12006 return scalarizeVectorStore(Store, DAG);
12007 case 8:
12008 if (NumElements > 2)
12009 return SplitVectorStore(Op, DAG);
12010 return SDValue();
12011 case 16:
12012 if (NumElements > 4 ||
12013 (NumElements == 3 && !Subtarget->enableFlatScratch()))
12014 return SplitVectorStore(Op, DAG);
12015 return SDValue();
12016 default:
12017 llvm_unreachable("unsupported private_element_size");
12018 }
12019 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
12020 unsigned Fast = 0;
12021 auto Flags = Store->getMemOperand()->getFlags();
12022 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
12023 Store->getAlign(), Flags, &Fast) &&
12024 Fast > 1)
12025 return SDValue();
12026
12027 if (VT.isVector())
12028 return SplitVectorStore(Op, DAG);
12029
12030 return expandUnalignedStore(Store, DAG);
12031 }
12032
12033 // Probably an invalid store. If so we'll end up emitting a selection error.
12034 return SDValue();
12035}
12036
12037// Avoid the full correct expansion for f32 sqrt when promoting from f16.
12038SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
12039 SDLoc SL(Op);
12040 assert(!Subtarget->has16BitInsts());
12041 SDNodeFlags Flags = Op->getFlags();
12042 SDValue Ext =
12043 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
12044
12045 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
12046 SDValue Sqrt =
12047 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
12048
12049 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
12050 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
12051}
12052
12053SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
12054 SDLoc DL(Op);
12055 SDNodeFlags Flags = Op->getFlags();
12056 MVT VT = Op.getValueType().getSimpleVT();
12057 const SDValue X = Op.getOperand(0);
12058
12059 if (allowApproxFunc(DAG, Flags)) {
12060 // Instruction is 1ulp but ignores denormals.
12061 return DAG.getNode(
12062 ISD::INTRINSIC_WO_CHAIN, DL, VT,
12063 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
12064 }
12065
12066 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
12067 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
12068
12069 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
12070
12071 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
12072
12073 SDValue SqrtX =
12074 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
12075
12076 SDValue SqrtS;
12077 if (needsDenormHandlingF32(DAG, X, Flags)) {
12078 SDValue SqrtID =
12079 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
12080 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
12081
12082 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
12083 SDValue SqrtSNextDownInt =
12084 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12085 DAG.getAllOnesConstant(DL, MVT::i32));
12086 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
12087
12088 SDValue NegSqrtSNextDown =
12089 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
12090
12091 SDValue SqrtVP =
12092 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
12093
12094 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12095 DAG.getConstant(1, DL, MVT::i32));
12096 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
12097
12098 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
12099 SDValue SqrtVS =
12100 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
12101
12102 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
12103 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
12104
12105 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
12106 Flags);
12107
12108 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
12109 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
12110 Flags);
12111 } else {
12112 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
12113
12114 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
12115
12116 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
12117 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
12118 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
12119
12120 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
12121 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
12122 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
12123
12124 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
12125 SDValue SqrtD =
12126 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
12127 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
12128 }
12129
12130 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
12131
12132 SDValue ScaledDown =
12133 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
12134
12135 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
12136 SDValue IsZeroOrInf =
12137 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12138 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12139
12140 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
12141}
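// [Editorial sketch, not part of the lowering above] Why the two scale factors
// pair up: tiny inputs (< 2^-96) are multiplied by 2^32 before the
// approximation, and since sqrt(x * 2^32) == sqrt(x) * 2^16 the result only
// needs to be scaled back down by 2^-16. Assumes <cmath>; std::sqrt stands in
// for the refined hardware sequence.
[[maybe_unused]] static float sqrtScalingModel(float X) {
  bool NeedScale = X < 0x1.0p-96f;
  float ScaledX = NeedScale ? X * 0x1.0p+32f : X; // keep the sqrt input in range
  float S = std::sqrt(ScaledX);
  return NeedScale ? S * 0x1.0p-16f : S;          // undo half of the 2^32 factor
}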
12142
12143SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
12144 // For the double type, the SQRT and RSQ instructions don't have the required
12145 // precision, so we apply Goldschmidt's algorithm to improve the result:
12146 //
12147 // y0 = rsq(x)
12148 // g0 = x * y0
12149 // h0 = 0.5 * y0
12150 //
12151 // r0 = 0.5 - h0 * g0
12152 // g1 = g0 * r0 + g0
12153 // h1 = h0 * r0 + h0
12154 //
12155 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
12156 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
12157 // h2 = h1 * r1 + h1
12158 //
12159 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
12160 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
12161 //
12162 // sqrt(x) = g3
12163
12164 SDNodeFlags Flags = Op->getFlags();
12165
12166 SDLoc DL(Op);
12167
12168 SDValue X = Op.getOperand(0);
12169 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
12170
12171 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
12172
12173 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
12174
12175 // Scale up input if it is too small.
12176 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
12177 SDValue ScaleUp =
12178 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
12179 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
12180
12181 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
12182
12183 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
12184
12185 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
12186 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
12187
12188 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
12189 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
12190
12191 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
12192
12193 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
12194
12195 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
12196 SDValue SqrtD0 =
12197 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
12198
12199 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
12200
12201 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
12202 SDValue SqrtD1 =
12203 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
12204
12205 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
12206
12207 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
12208 SDValue ScaleDown =
12209 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
12210 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
12211
12212 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
12213 // with finite only or nsz because rsq(+/-0) = +/-inf
12214
12215 // TODO: Check for DAZ and expand to subnormals
12216 SDValue IsZeroOrInf =
12217 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12218 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12219
12220 // If x is +INF, +0, or -0, use its original value
12221 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
12222 Flags);
12223}
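// [Editorial sketch, not part of the lowering above] The Goldschmidt iteration
// from the comment as plain double arithmetic, assuming std::fma and std::sqrt
// from <cmath>, with 1.0 / std::sqrt(x) standing in for V_RSQ_F64. The exponent
// pre/post scaling and the zero/inf fixup are omitted.
[[maybe_unused]] static double goldschmidtSqrtModel(double X) {
  double Y0 = 1.0 / std::sqrt(X);  // y0 = rsq(x)
  double G = X * Y0;               // g0 ~= sqrt(x)
  double H = 0.5 * Y0;             // h0 ~= 1 / (2*sqrt(x))
  double R = std::fma(-H, G, 0.5); // r0 = 0.5 - h0*g0
  G = std::fma(G, R, G);           // g1 = g0*r0 + g0
  H = std::fma(H, R, H);           // h1 = h0*r0 + h0
  double D0 = std::fma(-G, G, X);  // d0 = x - g1*g1
  G = std::fma(D0, H, G);          // g2 = d0*h1 + g1
  double D1 = std::fma(-G, G, X);  // d1 = x - g2*g2
  return std::fma(D1, H, G);       // g3 = d1*h1 + g2 ~= sqrt(x)
}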
12224
12225SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
12226 SDLoc DL(Op);
12227 EVT VT = Op.getValueType();
12228 SDValue Arg = Op.getOperand(0);
12229 SDValue TrigVal;
12230
12231 // Propagate fast-math flags so that the multiply we introduce can be folded
12232 // if Arg is already the result of a multiply by constant.
12233 auto Flags = Op->getFlags();
12234
12235 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
12236
12237 if (Subtarget->hasTrigReducedRange()) {
12238 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12239 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
12240 } else {
12241 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12242 }
12243
12244 switch (Op.getOpcode()) {
12245 case ISD::FCOS:
12246 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
12247 case ISD::FSIN:
12248 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
12249 default:
12250 llvm_unreachable("Wrong trig opcode");
12251 }
12252}
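// [Editorial sketch, not part of the lowering above] The hardware trig units
// take their operand in revolutions rather than radians, so the argument is
// pre-multiplied by 1/(2*pi); on subtargets with a reduced input range the
// fractional part is taken first. A scalar model assuming <cmath>:
[[maybe_unused]] static double sinLoweringModel(double RadianArg) {
  const double TwoPi = 6.283185307179586476925286766559;
  double Turns = RadianArg / TwoPi;           // radians -> revolutions
  double Reduced = Turns - std::floor(Turns); // FRACT: wrap into [0, 1)
  return std::sin(Reduced * TwoPi);           // stand-in for SIN_HW
}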
12253
12254SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
12255 SelectionDAG &DAG) const {
12256 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
12257 assert(AtomicNode->isCompareAndSwap());
12258 unsigned AS = AtomicNode->getAddressSpace();
12259
12260 // No custom lowering required for local address space
12261 if (AS == AMDGPUAS::LOCAL_ADDRESS)
12262 return Op;
12263
12264 // Non-local address space requires custom lowering for atomic compare
12265 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
12266 SDLoc DL(Op);
12267 SDValue ChainIn = Op.getOperand(0);
12268 SDValue Addr = Op.getOperand(1);
12269 SDValue Old = Op.getOperand(2);
12270 SDValue New = Op.getOperand(3);
12271 EVT VT = Op.getValueType();
12272 MVT SimpleVT = VT.getSimpleVT();
12273 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
12274
12275 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
12276 SDValue Ops[] = {ChainIn, Addr, NewOld};
12277
12278 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
12279 Op->getVTList(), Ops, VT,
12280 AtomicNode->getMemOperand());
12281}
12282
12283//===----------------------------------------------------------------------===//
12284// Custom DAG optimizations
12285//===----------------------------------------------------------------------===//
12286
12287SDValue
12288SITargetLowering::performUCharToFloatCombine(SDNode *N,
12289 DAGCombinerInfo &DCI) const {
12290 EVT VT = N->getValueType(0);
12291 EVT ScalarVT = VT.getScalarType();
12292 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
12293 return SDValue();
12294
12295 SelectionDAG &DAG = DCI.DAG;
12296 SDLoc DL(N);
12297
12298 SDValue Src = N->getOperand(0);
12299 EVT SrcVT = Src.getValueType();
12300
12301 // TODO: We could try to match extracting the higher bytes, which would be
12302 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
12303 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
12304 // about in practice.
12305 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
12306 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
12307 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
12308 DCI.AddToWorklist(Cvt.getNode());
12309
12310 // For the f16 case, fold to a cast to f32 and then cast back to f16.
12311 if (ScalarVT != MVT::f32) {
12312 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
12313 DAG.getTargetConstant(0, DL, MVT::i32));
12314 }
12315 return Cvt;
12316 }
12317 }
12318
12319 return SDValue();
12320}
12321
12322SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
12323 DAGCombinerInfo &DCI) const {
12324 SDValue MagnitudeOp = N->getOperand(0);
12325 SDValue SignOp = N->getOperand(1);
12326
12327 // The generic combine for fcopysign + fp cast is too conservative with
12328 // vectors, and also gets confused by the splitting we will perform here, so
12329 // peek through FP casts.
12330 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
12331 SignOp.getOpcode() == ISD::FP_ROUND)
12332 SignOp = SignOp.getOperand(0);
12333
12334 SelectionDAG &DAG = DCI.DAG;
12335 SDLoc DL(N);
12336 EVT SignVT = SignOp.getValueType();
12337
12338 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
12339 // lower half with a copy.
12340 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
12341 EVT MagVT = MagnitudeOp.getValueType();
12342
12343 unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
12344
12345 if (MagVT.getScalarType() == MVT::f64) {
12346 EVT F32VT = MagVT.isVector()
12347 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12348 : MVT::v2f32;
12349
12350 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
12351
12353 for (unsigned I = 0; I != NumElts; ++I) {
12354 SDValue MagLo =
12355 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
12356 DAG.getConstant(2 * I, DL, MVT::i32));
12357 SDValue MagHi =
12358 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
12359 DAG.getConstant(2 * I + 1, DL, MVT::i32));
12360
12361 SDValue SignOpElt =
12362 MagVT.isVector()
12363 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(),
12364 SignOp, DAG.getConstant(I, DL, MVT::i32))
12365 : SignOp;
12366
12367 SDValue HiOp =
12368 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
12369
12370 SDValue Vector =
12371 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
12372
12373 SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
12374 NewElts.push_back(NewElt);
12375 }
12376
12377 if (NewElts.size() == 1)
12378 return NewElts[0];
12379
12380 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
12381 }
12382
12383 if (SignVT.getScalarType() != MVT::f64)
12384 return SDValue();
12385
12386 // Reduce width of sign operand, we only need the highest bit.
12387 //
12388 // fcopysign f64:x, f64:y ->
12389 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
12390 // TODO: In some cases it might make sense to go all the way to f16.
12391
12392 EVT F32VT = MagVT.isVector()
12393 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12394 : MVT::v2f32;
12395
12396 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
12397
12398 SmallVector<SDValue, 8> F32Signs;
12399 for (unsigned I = 0; I != NumElts; ++I) {
12400 // Take sign from odd elements of cast vector
12401 SDValue SignAsF32 =
12402 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
12403 DAG.getConstant(2 * I + 1, DL, MVT::i32));
12404 F32Signs.push_back(SignAsF32);
12405 }
12406
12407 SDValue NewSign =
12408 NumElts == 1
12409 ? F32Signs.back()
12410 : DAG.getNode(ISD::BUILD_VECTOR, DL,
12411 EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
12412 F32Signs);
12413
12414 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
12415 NewSign);
12416}
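// [Editorial sketch, not part of the combine above] An f64 copysign only has to
// touch the sign bit in the high dword, which is what the split into
// (x.lo32, fcopysign(f32 x.hi32, y)) relies on. Assumes <cstdint> and <cstring>.
[[maybe_unused]] static double copySign64ViaHighWordModel(double Mag, double Sign) {
  uint64_t MagBits, SignBits;
  std::memcpy(&MagBits, &Mag, sizeof(double));
  std::memcpy(&SignBits, &Sign, sizeof(double));
  uint64_t Res = (MagBits & 0x7fffffffffffffffull) | // low dword + magnitude
                 (SignBits & 0x8000000000000000ull); // sign bit of the high dword
  double Out;
  std::memcpy(&Out, &Res, sizeof(double));
  return Out;
}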
12417
12418// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
12419// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
12420// bits
12421
12422// This is a variant of
12423// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
12424//
12425// The normal DAG combiner will do this, but only if the add has one use since
12426// that would increase the number of instructions.
12427//
12428// This prevents us from seeing a constant offset that can be folded into a
12429// memory instruction's addressing mode. If we know the resulting add offset of
12430// a pointer can be folded into an addressing offset, we can replace the pointer
12431// operand with the add of new constant offset. This eliminates one of the uses,
12432// and may allow the remaining use to also be simplified.
12433//
12434SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
12435 EVT MemVT,
12436 DAGCombinerInfo &DCI) const {
12437 SDValue N0 = N->getOperand(0);
12438 SDValue N1 = N->getOperand(1);
12439
12440 // We only do this to handle cases where it's profitable when there are
12441 // multiple uses of the add, so defer to the standard combine.
12442 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
12443 N0->hasOneUse())
12444 return SDValue();
12445
12446 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
12447 if (!CN1)
12448 return SDValue();
12449
12450 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
12451 if (!CAdd)
12452 return SDValue();
12453
12454 SelectionDAG &DAG = DCI.DAG;
12455
12456 if (N0->getOpcode() == ISD::OR &&
12457 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
12458 return SDValue();
12459
12460 // If the resulting offset is too large, we can't fold it into the
12461 // addressing mode offset.
12462 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
12463 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
12464
12465 AddrMode AM;
12466 AM.HasBaseReg = true;
12467 AM.BaseOffs = Offset.getSExtValue();
12468 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
12469 return SDValue();
12470
12471 SDLoc SL(N);
12472 EVT VT = N->getValueType(0);
12473
12474 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
12475 SDValue COffset = DAG.getConstant(Offset, SL, VT);
12476
12477 SDNodeFlags Flags;
12478 Flags.setNoUnsignedWrap(
12479 N->getFlags().hasNoUnsignedWrap() &&
12480 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
12481
12482 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
12483}
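// [Editorial sketch, not part of the combine above] The integer identity being
// exploited: shifting the sum equals summing the shifted operands (modulo the
// word size), so the shifted constant can migrate into an addressing-mode
// offset while the shift stays on the variable part.
[[maybe_unused]] static bool shlOfAddIdentityHolds(uint64_t X, uint64_t C1, unsigned C2) {
  C2 &= 63; // keep the shift amount defined for the sketch
  return ((X + C1) << C2) == ((X << C2) + (C1 << C2));
}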
12484
12485/// MemSDNode::getBasePtr() does not work for intrinsics, whose pointer operand
12486/// is offset by the chain and intrinsic ID. Theoretically we would also need to check the
12487/// specific intrinsic, but they all place the pointer operand first.
12488static unsigned getBasePtrIndex(const MemSDNode *N) {
12489 switch (N->getOpcode()) {
12490 case ISD::STORE:
12491 case ISD::INTRINSIC_W_CHAIN:
12492 case ISD::INTRINSIC_VOID:
12493 return 2;
12494 default:
12495 return 1;
12496 }
12497}
12498
12499SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
12500 DAGCombinerInfo &DCI) const {
12501 SelectionDAG &DAG = DCI.DAG;
12502
12503 unsigned PtrIdx = getBasePtrIndex(N);
12504 SDValue Ptr = N->getOperand(PtrIdx);
12505
12506 // TODO: We could also do this for multiplies.
12507 if (Ptr.getOpcode() == ISD::SHL) {
12508 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
12509 N->getMemoryVT(), DCI);
12510 if (NewPtr) {
12511 SmallVector<SDValue, 8> NewOps(N->ops());
12512
12513 NewOps[PtrIdx] = NewPtr;
12514 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
12515 }
12516 }
12517
12518 return SDValue();
12519}
12520
12521static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
12522 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
12523 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
12524 (Opc == ISD::XOR && Val == 0);
12525}
12526
12527// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
12528// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
12529// integer combine opportunities since most 64-bit operations are decomposed
12530// this way. TODO: We won't want this for SALU especially if it is an inline
12531// immediate.
12532SDValue SITargetLowering::splitBinaryBitConstantOp(
12533 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
12534 const ConstantSDNode *CRHS) const {
12535 uint64_t Val = CRHS->getZExtValue();
12536 uint32_t ValLo = Lo_32(Val);
12537 uint32_t ValHi = Hi_32(Val);
12538 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12539
12540 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
12541 bitOpWithConstantIsReducible(Opc, ValHi)) ||
12542 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
12543 // We have 64-bit scalar and/or/xor, but do not have vector forms.
12544 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
12545 !CRHS->user_begin()->isDivergent())
12546 return SDValue();
12547
12548 // If we need to materialize a 64-bit immediate, it will be split up later
12549 // anyway. Avoid creating the harder to understand 64-bit immediate
12550 // materialization.
12551 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
12552 }
12553
12554 return SDValue();
12555}
12556
12557bool llvm::isBoolSGPR(SDValue V) {
12558 if (V.getValueType() != MVT::i1)
12559 return false;
12560 switch (V.getOpcode()) {
12561 default:
12562 break;
12563 case ISD::SETCC:
12564 case ISD::IS_FPCLASS:
12565 case AMDGPUISD::FP_CLASS:
12566 return true;
12567 case ISD::AND:
12568 case ISD::OR:
12569 case ISD::XOR:
12570 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
12571 case ISD::SADDO:
12572 case ISD::UADDO:
12573 case ISD::SSUBO:
12574 case ISD::USUBO:
12575 case ISD::SMULO:
12576 case ISD::UMULO:
12577 return V.getResNo() == 1;
12578 case ISD::INTRINSIC_WO_CHAIN: {
12579 unsigned IntrinsicID = V.getConstantOperandVal(0);
12580 switch (IntrinsicID) {
12581 case Intrinsic::amdgcn_is_shared:
12582 case Intrinsic::amdgcn_is_private:
12583 return true;
12584 default:
12585 return false;
12586 }
12587
12588 return false;
12589 }
12590 }
12591 return false;
12592}
12593
12594// If a constant has all zeroes or all ones within each byte return it.
12595// Otherwise return 0.
12596static uint32_t getConstantPermuteMask(uint32_t C) {
12597 // 0xff for any zero byte in the mask
12598 uint32_t ZeroByteMask = 0;
12599 if (!(C & 0x000000ff))
12600 ZeroByteMask |= 0x000000ff;
12601 if (!(C & 0x0000ff00))
12602 ZeroByteMask |= 0x0000ff00;
12603 if (!(C & 0x00ff0000))
12604 ZeroByteMask |= 0x00ff0000;
12605 if (!(C & 0xff000000))
12606 ZeroByteMask |= 0xff000000;
12607 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
12608 if ((NonZeroByteMask & C) != NonZeroByteMask)
12609 return 0; // Partial bytes selected.
12610 return C;
12611}
12612
12613// Check if a node selects whole bytes from its operand 0 starting at a byte
12614 // boundary while masking the rest. Returns the select mask as used by
12615 // v_perm_b32, or ~0 if it does not.
12616// Note byte select encoding:
12617// value 0-3 selects corresponding source byte;
12618// value 0xc selects zero;
12619// value 0xff selects 0xff.
12620static uint32_t getPermuteMask(SDValue V) {
12621 assert(V.getValueSizeInBits() == 32);
12622
12623 if (V.getNumOperands() != 2)
12624 return ~0;
12625
12626 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
12627 if (!N1)
12628 return ~0;
12629
12630 uint32_t C = N1->getZExtValue();
12631
12632 switch (V.getOpcode()) {
12633 default:
12634 break;
12635 case ISD::AND:
12636 if (uint32_t ConstMask = getConstantPermuteMask(C))
12637 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
12638 break;
12639
12640 case ISD::OR:
12641 if (uint32_t ConstMask = getConstantPermuteMask(C))
12642 return (0x03020100 & ~ConstMask) | ConstMask;
12643 break;
12644
12645 case ISD::SHL:
12646 if (C % 8)
12647 return ~0;
12648
12649 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
12650
12651 case ISD::SRL:
12652 if (C % 8)
12653 return ~0;
12654
12655 return uint32_t(0x0c0c0c0c03020100ull >> C);
12656 }
12657
12658 return ~0;
12659}
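// [Editorial worked example, not part of the helper above] For a left shift by
// one whole byte (C == 8) the SHL formula selects zero (0x0c) into result byte
// 0 and source bytes 0..2 into result bytes 1..3, matching what (x << 8) does
// to a 32-bit value:
static_assert(uint32_t((0x030201000c0c0c0cull << 8) >> 32) == 0x0201000c,
              "v_perm mask for a one-byte left shift");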
12660
12661SDValue SITargetLowering::performAndCombine(SDNode *N,
12662 DAGCombinerInfo &DCI) const {
12663 if (DCI.isBeforeLegalize())
12664 return SDValue();
12665
12666 SelectionDAG &DAG = DCI.DAG;
12667 EVT VT = N->getValueType(0);
12668 SDValue LHS = N->getOperand(0);
12669 SDValue RHS = N->getOperand(1);
12670
12671 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12672 if (VT == MVT::i64 && CRHS) {
12673 if (SDValue Split =
12674 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
12675 return Split;
12676 }
12677
12678 if (CRHS && VT == MVT::i32) {
12679 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
12680 // nb = number of trailing zeroes in mask
12681 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
12682 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
12683 uint64_t Mask = CRHS->getZExtValue();
12684 unsigned Bits = llvm::popcount(Mask);
12685 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
12686 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
12687 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
12688 unsigned Shift = CShift->getZExtValue();
12689 unsigned NB = CRHS->getAPIntValue().countr_zero();
12690 unsigned Offset = NB + Shift;
12691 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
12692 SDLoc SL(N);
12693 SDValue BFE =
12694 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
12695 DAG.getConstant(Offset, SL, MVT::i32),
12696 DAG.getConstant(Bits, SL, MVT::i32));
12697 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
12698 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
12699 DAG.getValueType(NarrowVT));
12700 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
12701 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
12702 return Shl;
12703 }
12704 }
12705 }
12706
12707 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12708 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
12709 isa<ConstantSDNode>(LHS.getOperand(2))) {
12710 uint32_t Sel = getConstantPermuteMask(Mask);
12711 if (!Sel)
12712 return SDValue();
12713
12714 // Select 0xc for all zero bytes
12715 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
12716 SDLoc DL(N);
12717 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12718 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
12719 }
12720 }
12721
12722 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
12723 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
12724 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
12725 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
12726 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
12727
12728 SDValue X = LHS.getOperand(0);
12729 SDValue Y = RHS.getOperand(0);
12730 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
12731 !isTypeLegal(X.getValueType()))
12732 return SDValue();
12733
12734 if (LCC == ISD::SETO) {
12735 if (X != LHS.getOperand(1))
12736 return SDValue();
12737
12738 if (RCC == ISD::SETUNE) {
12739 const ConstantFPSDNode *C1 =
12740 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
12741 if (!C1 || !C1->isInfinity() || C1->isNegative())
12742 return SDValue();
12743
12744 const uint32_t Mask = SIInstrFlags::N_NORMAL |
12745 SIInstrFlags::P_NORMAL |
12746 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO |
12747 SIInstrFlags::N_SUBNORMAL | SIInstrFlags::P_SUBNORMAL;
12748
12749 static_assert(
12750 ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
12751 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
12752 0x3ff) == Mask,
12753 "mask not equal");
12754
12755 SDLoc DL(N);
12756 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
12757 DAG.getConstant(Mask, DL, MVT::i32));
12758 }
12759 }
12760 }
12761
12762 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
12763 std::swap(LHS, RHS);
12764
12765 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12766 RHS.hasOneUse()) {
12767 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
12768 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
12769 // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
12770 // | n_nan)
12771 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
12772 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
12773 (RHS.getOperand(0) == LHS.getOperand(0) &&
12774 LHS.getOperand(0) == LHS.getOperand(1))) {
12775 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
12776 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
12777 : Mask->getZExtValue() & OrdMask;
12778
12779 SDLoc DL(N);
12780 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
12781 DAG.getConstant(NewMask, DL, MVT::i32));
12782 }
12783 }
12784
12785 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
12786 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
12787 // and x, (sext cc from i1) => select cc, x, 0
12788 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
12789 std::swap(LHS, RHS);
12790 if (isBoolSGPR(RHS.getOperand(0)))
12791 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
12792 DAG.getConstant(0, SDLoc(N), MVT::i32));
12793 }
12794
12795 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12796 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12797 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12798 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12799 uint32_t LHSMask = getPermuteMask(LHS);
12800 uint32_t RHSMask = getPermuteMask(RHS);
12801 if (LHSMask != ~0u && RHSMask != ~0u) {
12802 // Canonicalize the expression in an attempt to have fewer unique masks
12803 // and therefore fewer registers used to hold the masks.
12804 if (LHSMask > RHSMask) {
12805 std::swap(LHSMask, RHSMask);
12806 std::swap(LHS, RHS);
12807 }
12808
12809 // Select 0xc for each lane used from a source operand. Zero has the 0xc mask
12810 // set, 0xff has 0xff in the mask, and actual lanes are in the 0-3 range.
12811 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12812 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12813
12814 // Check if we need to combine values from two sources within a byte.
12815 if (!(LHSUsedLanes & RHSUsedLanes) &&
12816 // If we select high and lower word keep it for SDWA.
12817 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12818 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12819 // Each byte in each mask is either a selector mask 0-3, or has higher
12820 // bits set in either of the masks, which can be 0xff for 0xff or 0x0c for
12821 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise the
12822 // mask which is not 0xff wins. By anding both masks we have a correct
12823 // result, except that 0x0c shall be corrected to give 0x0c only.
12824 uint32_t Mask = LHSMask & RHSMask;
12825 for (unsigned I = 0; I < 32; I += 8) {
12826 uint32_t ByteSel = 0xff << I;
12827 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
12828 Mask &= (0x0c << I) & 0xffffffff;
12829 }
12830
12831 // Add 4 to each active LHS lane. It will not affect any existing 0xff
12832 // or 0x0c.
12833 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
12834 SDLoc DL(N);
12835
12836 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12837 RHS.getOperand(0),
12838 DAG.getConstant(Sel, DL, MVT::i32));
12839 }
12840 }
12841 }
12842
12843 return SDValue();
12844}
12845
12846 // A key component of v_perm is a mapping between the byte positions of the src
12847 // operands and the byte positions of the dest. To provide such a mapping, we
12848 // need: 1. the node that provides byte x of the dest of the OR, and 2. the byte
12849 // of that node used to provide byte x. calculateByteProvider finds which node
12850 // provides a certain byte of the dest of the OR, and calculateSrcByte takes
12851 // that node and finds the ultimate src and byte position. For example, the
12852 // supported LoadCombine pattern for vector loads is as follows:
12853// t1
12854// or
12855// / \
12856// t2 t3
12857// zext shl
12858// | | \
12859// t4 t5 16
12860// or anyext
12861// / \ |
12862// t6 t7 t8
12863// srl shl or
12864// / | / \ / \
12865// t9 t10 t11 t12 t13 t14
12866// trunc* 8 trunc* 8 and and
12867// | | / | | \
12868// t15 t16 t17 t18 t19 t20
12869// trunc* 255 srl -256
12870// | / \
12871// t15 t15 16
12872//
12873// *In this example, the truncs are from i32->i16
12874//
12875// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
12876// respectively. calculateSrcByte would find (given node) -> ultimate src &
12877 // byte position: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
12878// After finding the mapping, we can combine the tree into vperm t15, t16,
12879// 0x05000407
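// In the resulting v_perm mask each selector byte is in the 0-7 range: 0-3
// selects a byte of the second source operand and 4-7 selects a byte of the
// first (see the Src1:Src2 offset note in matchPERM below).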
12880
12881// Find the source and byte position from a node.
12882// \p DestByte is the byte position of the dest of the or that the src
12883// ultimately provides. \p SrcIndex is the byte of the src that maps to this
12884 // byte of the dest of the or. \p Depth tracks how many recursive iterations we
12885 // have performed.
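// For example (illustrative), for Op = (srl X, 16) and SrcIndex = 0, the SRL
// case below adds 16 / 8 = 2 to SrcIndex and recurses into X asking for its
// byte 2.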
12886static const std::optional<ByteProvider<SDValue>>
12887calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
12888 unsigned Depth = 0) {
12889 // We may need to recursively traverse a series of SRLs
12890 if (Depth >= 6)
12891 return std::nullopt;
12892
12893 if (Op.getValueSizeInBits() < 8)
12894 return std::nullopt;
12895
12896 if (Op.getValueType().isVector())
12897 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
12898
12899 switch (Op->getOpcode()) {
12900 case ISD::TRUNCATE: {
12901 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
12902 }
12903
12904 case ISD::SIGN_EXTEND:
12905 case ISD::ZERO_EXTEND:
12906 case ISD::SIGN_EXTEND_INREG: {
12907 SDValue NarrowOp = Op->getOperand(0);
12908 auto NarrowVT = NarrowOp.getValueType();
12909 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
12910 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
12911 NarrowVT = VTSign->getVT();
12912 }
12913 if (!NarrowVT.isByteSized())
12914 return std::nullopt;
12915 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
12916
12917 if (SrcIndex >= NarrowByteWidth)
12918 return std::nullopt;
12919 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
12920 }
12921
12922 case ISD::SRA:
12923 case ISD::SRL: {
12924 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12925 if (!ShiftOp)
12926 return std::nullopt;
12927
12928 uint64_t BitShift = ShiftOp->getZExtValue();
12929
12930 if (BitShift % 8 != 0)
12931 return std::nullopt;
12932
12933 SrcIndex += BitShift / 8;
12934
12935 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
12936 }
12937
12938 default: {
12939 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
12940 }
12941 }
12942 llvm_unreachable("fully handled switch");
12943}
12944
12945// For a byte position in the result of an Or, traverse the tree and find the
12946// node (and the byte of the node) which ultimately provides this {Or,
12947// BytePosition}. \p Op is the operand we are currently examining. \p Index is
12948// the byte position of the Op that corresponds with the originally requested
12949 // byte of the Or. \p Depth tracks how many recursive iterations we have
12950 // performed. \p StartingIndex is the originally requested byte of the Or.
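// For example (illustrative), for Op = (or (zero_extend i16 X), (shl
// (CopyFromReg Y), 16)) and Index = 2: the zero_extend side provides a
// constant zero for byte 2, so the byte is provided by byte 0 of Y on the shl
// side (Index - ByteShift = 2 - 2 = 0).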
12951static const std::optional<ByteProvider<SDValue>>
12952calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
12953 unsigned StartingIndex = 0) {
12954 // Finding Src tree of RHS of or typically requires at least 1 additional
12955 // depth
12956 if (Depth > 6)
12957 return std::nullopt;
12958
12959 unsigned BitWidth = Op.getScalarValueSizeInBits();
12960 if (BitWidth % 8 != 0)
12961 return std::nullopt;
12962 if (Index > BitWidth / 8 - 1)
12963 return std::nullopt;
12964
12965 bool IsVec = Op.getValueType().isVector();
12966 switch (Op.getOpcode()) {
12967 case ISD::OR: {
12968 if (IsVec)
12969 return std::nullopt;
12970
12971 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
12972 StartingIndex);
12973 if (!RHS)
12974 return std::nullopt;
12975 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
12976 StartingIndex);
12977 if (!LHS)
12978 return std::nullopt;
12979 // A well formed Or will have two ByteProviders for each byte, one of which
12980 // is constant zero
12981 if (!LHS->isConstantZero() && !RHS->isConstantZero())
12982 return std::nullopt;
12983 if (!LHS || LHS->isConstantZero())
12984 return RHS;
12985 if (!RHS || RHS->isConstantZero())
12986 return LHS;
12987 return std::nullopt;
12988 }
12989
12990 case ISD::AND: {
12991 if (IsVec)
12992 return std::nullopt;
12993
12994 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12995 if (!BitMaskOp)
12996 return std::nullopt;
12997
12998 uint32_t BitMask = BitMaskOp->getZExtValue();
12999 // Bits we expect for our StartingIndex
13000 uint32_t IndexMask = 0xFF << (Index * 8);
13001
13002 if ((IndexMask & BitMask) != IndexMask) {
13003 // If the result of the and partially provides the byte, then it
13004 // is not well formatted
13005 if (IndexMask & BitMask)
13006 return std::nullopt;
13007 return ByteProvider<SDValue>::getConstantZero();
13008 }
13009
13010 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
13011 }
13012
13013 case ISD::FSHR: {
13014 if (IsVec)
13015 return std::nullopt;
13016
13017 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
13018 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13019 if (!ShiftOp || Op.getValueType().isVector())
13020 return std::nullopt;
13021
13022 uint64_t BitsProvided = Op.getValueSizeInBits();
13023 if (BitsProvided % 8 != 0)
13024 return std::nullopt;
13025
13026 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13027 if (BitShift % 8)
13028 return std::nullopt;
13029
13030 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13031 uint64_t ByteShift = BitShift / 8;
13032
13033 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13034 uint64_t BytesProvided = BitsProvided / 8;
13035 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13036 NewIndex %= BytesProvided;
13037 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
13038 }
13039
13040 case ISD::SRA:
13041 case ISD::SRL: {
13042 if (IsVec)
13043 return std::nullopt;
13044
13045 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13046 if (!ShiftOp)
13047 return std::nullopt;
13048
13049 uint64_t BitShift = ShiftOp->getZExtValue();
13050 if (BitShift % 8)
13051 return std::nullopt;
13052
13053 auto BitsProvided = Op.getScalarValueSizeInBits();
13054 if (BitsProvided % 8 != 0)
13055 return std::nullopt;
13056
13057 uint64_t BytesProvided = BitsProvided / 8;
13058 uint64_t ByteShift = BitShift / 8;
13059 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
13060 // If the byte we are trying to provide (as tracked by index) falls in this
13061 // range, then the SRL provides the byte. The byte of interest of the src of
13062 // the SRL is Index + ByteShift
13063 return BytesProvided - ByteShift > Index
13064 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
13065 Index + ByteShift)
13066 : ByteProvider<SDValue>::getConstantZero();
13067 }
13068
13069 case ISD::SHL: {
13070 if (IsVec)
13071 return std::nullopt;
13072
13073 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13074 if (!ShiftOp)
13075 return std::nullopt;
13076
13077 uint64_t BitShift = ShiftOp->getZExtValue();
13078 if (BitShift % 8 != 0)
13079 return std::nullopt;
13080 uint64_t ByteShift = BitShift / 8;
13081
13082 // If we are shifting by an amount greater than (or equal to)
13083 // the index we are trying to provide, then it provides 0s. If not,
13084 // then these bytes are not definitively 0s, and the corresponding byte
13085 // of interest is Index - ByteShift of the src
13086 return Index < ByteShift
13087 ? ByteProvider<SDValue>::getConstantZero()
13088 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
13089 Depth + 1, StartingIndex);
13090 }
13091 case ISD::ANY_EXTEND:
13092 case ISD::SIGN_EXTEND:
13093 case ISD::ZERO_EXTEND:
13094 case ISD::SIGN_EXTEND_INREG:
13095 case ISD::AssertZext:
13096 case ISD::AssertSext: {
13097 if (IsVec)
13098 return std::nullopt;
13099
13100 SDValue NarrowOp = Op->getOperand(0);
13101 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
13102 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
13103 Op->getOpcode() == ISD::AssertZext ||
13104 Op->getOpcode() == ISD::AssertSext) {
13105 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13106 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13107 }
13108 if (NarrowBitWidth % 8 != 0)
13109 return std::nullopt;
13110 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13111
13112 if (Index >= NarrowByteWidth)
13113 return Op.getOpcode() == ISD::ZERO_EXTEND
13114 ? std::optional<ByteProvider<SDValue>>(
13115 ByteProvider<SDValue>::getConstantZero())
13116 : std::nullopt;
13117 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
13118 }
13119
13120 case ISD::TRUNCATE: {
13121 if (IsVec)
13122 return std::nullopt;
13123
13124 uint64_t NarrowByteWidth = BitWidth / 8;
13125
13126 if (NarrowByteWidth >= Index) {
13127 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13128 StartingIndex);
13129 }
13130
13131 return std::nullopt;
13132 }
13133
13134 case ISD::CopyFromReg: {
13135 if (BitWidth / 8 > Index)
13136 return calculateSrcByte(Op, StartingIndex, Index);
13137
13138 return std::nullopt;
13139 }
13140
13141 case ISD::LOAD: {
13142 auto *L = cast<LoadSDNode>(Op.getNode());
13143
13144 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13145 if (NarrowBitWidth % 8 != 0)
13146 return std::nullopt;
13147 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13148
13149 // If the width of the load does not reach the byte we are trying to provide
13150 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
13151 // question.
13152 if (Index >= NarrowByteWidth) {
13153 return L->getExtensionType() == ISD::ZEXTLOAD
13154 ? std::optional<ByteProvider<SDValue>>(
13155 ByteProvider<SDValue>::getConstantZero())
13156 : std::nullopt;
13157 }
13158
13159 if (NarrowByteWidth > Index) {
13160 return calculateSrcByte(Op, StartingIndex, Index);
13161 }
13162
13163 return std::nullopt;
13164 }
13165
13166 case ISD::BSWAP: {
13167 if (IsVec)
13168 return std::nullopt;
13169
13170 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
13171 Depth + 1, StartingIndex);
13172 }
13173
13174 case ISD::EXTRACT_VECTOR_ELT: {
13175 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13176 if (!IdxOp)
13177 return std::nullopt;
13178 auto VecIdx = IdxOp->getZExtValue();
13179 auto ScalarSize = Op.getScalarValueSizeInBits();
13180 if (ScalarSize < 32)
13181 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13182 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
13183 StartingIndex, Index);
13184 }
13185
13186 case AMDGPUISD::PERM: {
13187 if (IsVec)
13188 return std::nullopt;
13189
13190 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13191 if (!PermMask)
13192 return std::nullopt;
13193
13194 auto IdxMask =
13195 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13196 if (IdxMask > 0x07 && IdxMask != 0x0c)
13197 return std::nullopt;
13198
13199 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13200 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13201
13202 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
13203 : ByteProvider<SDValue>::getConstantZero();
13204
13205 }
13206
13207 default: {
13208 return std::nullopt;
13209 }
13210 }
13211
13212 llvm_unreachable("fully handled switch");
13213}
13214
13215 // Returns true if \p Operand is a scalar that is (or is extended from) a 16-bit value.
13216static bool isExtendedFrom16Bits(SDValue &Operand) {
13217
13218 switch (Operand.getOpcode()) {
13219 case ISD::ANY_EXTEND:
13220 case ISD::SIGN_EXTEND:
13221 case ISD::ZERO_EXTEND: {
13222 auto OpVT = Operand.getOperand(0).getValueType();
13223 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
13224 }
13225 case ISD::LOAD: {
13226 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
13227 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
13228 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
13229 ExtType == ISD::EXTLOAD) {
13230 auto MemVT = L->getMemoryVT();
13231 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
13232 }
13233 return L->getMemoryVT().getSizeInBits() == 16;
13234 }
13235 default:
13236 return false;
13237 }
13238}
13239
13240 // Returns true if the mask selects consecutive bytes and the first byte
13241 // begins at an even (16-bit aligned) offset from the 0th byte.
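// For example (illustrative): 0x0504 -> true (consecutive, starts at an even
// selector), 0x0201 -> false (starts at an odd selector), 0x0300 -> false
// (not consecutive).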
13242static bool addresses16Bits(int Mask) {
13243 int Low8 = Mask & 0xff;
13244 int Hi8 = (Mask & 0xff00) >> 8;
13245
13246 assert(Low8 < 8 && Hi8 < 8);
13247 // Are the bytes contiguous in the order of increasing addresses.
13248 bool IsConsecutive = (Hi8 - Low8 == 1);
13249 // Is the first byte at location that is aligned for 16 bit instructions.
13250 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
13251 // In this case, we still need code to extract the 16 bit operand, so it
13252 // is better to use i8 v_perm
13253 bool Is16Aligned = !(Low8 % 2);
13254
13255 return IsConsecutive && Is16Aligned;
13256}
13257
13258// Do not lower into v_perm if the operands are actually 16 bit
13259// and the selected bits (based on PermMask) correspond with two
13260// easily addressable 16 bit operands.
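// For example (illustrative), with two 16-bit operands a PermMask of
// 0x05040100 selects both full 16-bit halves (0x0100 and 0x0504 both pass
// addresses16Bits), so this returns false and the v_perm is skipped.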
13261static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
13262 SDValue &OtherOp) {
13263 int Low16 = PermMask & 0xffff;
13264 int Hi16 = (PermMask & 0xffff0000) >> 16;
13265
13266 auto TempOp = peekThroughBitcasts(Op);
13267 auto TempOtherOp = peekThroughBitcasts(OtherOp);
13268
13269 auto OpIs16Bit =
13270 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
13271 if (!OpIs16Bit)
13272 return true;
13273
13274 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13275 isExtendedFrom16Bits(TempOtherOp);
13276 if (!OtherOpIs16Bit)
13277 return true;
13278
13279 // Do we cleanly address both
13280 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
13281}
13282
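// Extract the 32-bit dword at index \p DWordOffset of \p Src and return it as
// an i32, handling scalar sources as well as vectors with sub-dword,
// dword-sized, and wider-than-dword elements.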
13283static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
13284 unsigned DWordOffset) {
13285 SDValue Ret;
13286
13287 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
13288 // ByteProvider must be at least 8 bits
13289 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
13290
13291 if (TypeSize <= 32)
13292 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
13293
13294 if (Src.getValueType().isVector()) {
13295 auto ScalarTySize = Src.getScalarValueSizeInBits();
13296 auto ScalarTy = Src.getValueType().getScalarType();
13297 if (ScalarTySize == 32) {
13298 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
13299 DAG.getConstant(DWordOffset, SL, MVT::i32));
13300 }
13301 if (ScalarTySize > 32) {
13302 Ret = DAG.getNode(
13303 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
13304 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
13305 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
13306 if (ShiftVal)
13307 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
13308 DAG.getConstant(ShiftVal, SL, MVT::i32));
13309 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13310 }
13311
13312 assert(ScalarTySize < 32);
13313 auto NumElements = TypeSize / ScalarTySize;
13314 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
13315 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
13316 auto NumElementsIn32 = 32 / ScalarTySize;
13317 auto NumAvailElements = DWordOffset < Trunc32Elements
13318 ? NumElementsIn32
13319 : NumElements - NormalizedTrunc;
13320
13321 SmallVector<SDValue, 4> VecSrcs;
13322 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
13323 NumAvailElements);
13324
13325 Ret = DAG.getBuildVector(
13326 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
13327 VecSrcs);
13328 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13329 }
13330
13331 /// Scalar Type
13332 auto ShiftVal = 32 * DWordOffset;
13333 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
13334 DAG.getConstant(ShiftVal, SL, MVT::i32));
13335 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13336}
13337
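// Try to fold the byte-providing tree rooted at \p N (an i32 OR) into a single
// AMDGPUISD::PERM of at most two 32-bit sources, using calculateByteProvider
// to find the provider of each of the four result bytes.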
13338static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
13339 SelectionDAG &DAG = DCI.DAG;
13340 [[maybe_unused]] EVT VT = N->getValueType(0);
13341 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
13342
13343 // VT is known to be MVT::i32, so we need to provide 4 bytes.
13344 assert(VT == MVT::i32);
13345 for (int i = 0; i < 4; i++) {
13346 // Find the ByteProvider that provides the ith byte of the result of OR
13347 std::optional<ByteProvider<SDValue>> P =
13348 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
13349 // TODO support constantZero
13350 if (!P || P->isConstantZero())
13351 return SDValue();
13352
13353 PermNodes.push_back(*P);
13354 }
13355 if (PermNodes.size() != 4)
13356 return SDValue();
13357
13358 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
13359 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
13360 uint64_t PermMask = 0x00000000;
13361 for (size_t i = 0; i < PermNodes.size(); i++) {
13362 auto PermOp = PermNodes[i];
13363 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
13364 // by sizeof(Src2) = 4
13365 int SrcByteAdjust = 4;
13366
13367 // If the Src uses a byte from a different DWORD, then it corresponds
13368 // with a different source.
13369 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
13370 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
13371 if (SecondSrc)
13372 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
13373 ((PermOp.SrcOffset / 4) != SecondSrc->second))
13374 return SDValue();
13375
13376 // Set the index of the second distinct Src node
13377 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
13378 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
13379 SrcByteAdjust = 0;
13380 }
13381 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
13383 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
13384 }
13385 SDLoc DL(N);
13386 SDValue Op = *PermNodes[FirstSrc.first].Src;
13387 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
13388 assert(Op.getValueSizeInBits() == 32);
13389
13390 // Check that we are not just extracting the bytes in order from an op
13391 if (!SecondSrc) {
13392 int Low16 = PermMask & 0xffff;
13393 int Hi16 = (PermMask & 0xffff0000) >> 16;
13394
13395 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
13396 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
13397
13398 // The perm op would really just produce Op. So combine into Op
13399 if (WellFormedLow && WellFormedHi)
13400 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
13401 }
13402
13403 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
13404
13405 if (SecondSrc) {
13406 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
13407 assert(OtherOp.getValueSizeInBits() == 32);
13408 }
13409
13410 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
13411
13412 assert(Op.getValueType().isByteSized() &&
13413 OtherOp.getValueType().isByteSized());
13414
13415 // If the ultimate src is less than 32 bits, then we will only be
13416 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
13417 // CalculateByteProvider would not have returned Op as source if we
13418 // used a byte that is outside its ValueType. Thus, we are free to
13419 // ANY_EXTEND as the extended bits are dont-cares.
13420 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
13421 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
13422
13423 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
13424 DAG.getConstant(PermMask, DL, MVT::i32));
13425 }
13426 return SDValue();
13427}
13428
13429SDValue SITargetLowering::performOrCombine(SDNode *N,
13430 DAGCombinerInfo &DCI) const {
13431 SelectionDAG &DAG = DCI.DAG;
13432 SDValue LHS = N->getOperand(0);
13433 SDValue RHS = N->getOperand(1);
13434
13435 EVT VT = N->getValueType(0);
13436 if (VT == MVT::i1) {
13437 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
13438 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13439 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
13440 SDValue Src = LHS.getOperand(0);
13441 if (Src != RHS.getOperand(0))
13442 return SDValue();
13443
13444 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
13445 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13446 if (!CLHS || !CRHS)
13447 return SDValue();
13448
13449 // Only 10 bits are used.
13450 static const uint32_t MaxMask = 0x3ff;
13451
13452 uint32_t NewMask =
13453 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
13454 SDLoc DL(N);
13455 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
13456 DAG.getConstant(NewMask, DL, MVT::i32));
13457 }
13458
13459 return SDValue();
13460 }
13461
13462 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13463 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
13464 LHS.getOpcode() == AMDGPUISD::PERM &&
13465 isa<ConstantSDNode>(LHS.getOperand(2))) {
13466 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
13467 if (!Sel)
13468 return SDValue();
13469
13470 Sel |= LHS.getConstantOperandVal(2);
13471 SDLoc DL(N);
13472 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13473 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13474 }
13475
13476 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13477 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13478 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13479 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13480
13481 // If all the uses of an or need to extract the individual elements, do not
13482 // attempt to lower into v_perm
13483 auto usesCombinedOperand = [](SDNode *OrUse) {
13484 // If we have any non-vectorized use, then it is a candidate for v_perm
13485 if (OrUse->getOpcode() != ISD::BITCAST ||
13486 !OrUse->getValueType(0).isVector())
13487 return true;
13488
13489 // If we have any non-vectorized use, then it is a candidate for v_perm
13490 for (auto *VUser : OrUse->users()) {
13491 if (!VUser->getValueType(0).isVector())
13492 return true;
13493
13494 // If the use of a vector is a store, then combining via a v_perm
13495 // is beneficial.
13496 // TODO -- whitelist more uses
13497 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
13498 if (VUser->getOpcode() == VectorwiseOp)
13499 return true;
13500 }
13501 return false;
13502 };
13503
13504 if (!any_of(N->users(), usesCombinedOperand))
13505 return SDValue();
13506
13507 uint32_t LHSMask = getPermuteMask(LHS);
13508 uint32_t RHSMask = getPermuteMask(RHS);
13509
13510 if (LHSMask != ~0u && RHSMask != ~0u) {
13511 // Canonicalize the expression in an attempt to have fewer unique masks
13512 // and therefore fewer registers used to hold the masks.
13513 if (LHSMask > RHSMask) {
13514 std::swap(LHSMask, RHSMask);
13515 std::swap(LHS, RHS);
13516 }
13517
13518 // Select 0xc for each lane used from the source operand. In the mask, zero is
13519 // encoded as 0x0c, constant 0xff as 0xff, and actual lanes use selectors 0-3.
13520 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13521 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13522
13523 // Check if we need to combine values from two sources within a byte.
13524 if (!(LHSUsedLanes & RHSUsedLanes) &&
13525 // If one source provides the high word and the other the low word, keep it for SDWA.
13526 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
13527 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13528 // Kill zero bytes selected by other mask. Zero value is 0xc.
13529 LHSMask &= ~RHSUsedLanes;
13530 RHSMask &= ~LHSUsedLanes;
13531 // Add 4 to each active LHS lane
13532 LHSMask |= LHSUsedLanes & 0x04040404;
13533 // Combine masks
13534 uint32_t Sel = LHSMask | RHSMask;
13535 SDLoc DL(N);
13536
13537 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13538 RHS.getOperand(0),
13539 DAG.getConstant(Sel, DL, MVT::i32));
13540 }
13541 }
13542 if (LHSMask == ~0u || RHSMask == ~0u) {
13543 if (SDValue Perm = matchPERM(N, DCI))
13544 return Perm;
13545 }
13546 }
13547
13548 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
13549 return SDValue();
13550
13551 // TODO: This could be a generic combine with a predicate for extracting the
13552 // high half of an integer being free.
13553
13554 // (or i64:x, (zero_extend i32:y)) ->
13555 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
13556 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
13557 RHS.getOpcode() != ISD::ZERO_EXTEND)
13558 std::swap(LHS, RHS);
13559
13560 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
13561 SDValue ExtSrc = RHS.getOperand(0);
13562 EVT SrcVT = ExtSrc.getValueType();
13563 if (SrcVT == MVT::i32) {
13564 SDLoc SL(N);
13565 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
13566 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
13567
13568 DCI.AddToWorklist(LowOr.getNode());
13569 DCI.AddToWorklist(HiBits.getNode());
13570
13571 SDValue Vec =
13572 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
13573 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
13574 }
13575 }
13576
13577 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
13578 if (CRHS) {
13579 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
13580 N->getOperand(0), CRHS))
13581 return Split;
13582 }
13583
13584 return SDValue();
13585}
13586
13587SDValue SITargetLowering::performXorCombine(SDNode *N,
13588 DAGCombinerInfo &DCI) const {
13589 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
13590 return RV;
13591
13592 SDValue LHS = N->getOperand(0);
13593 SDValue RHS = N->getOperand(1);
13594
13595 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
13596 SelectionDAG &DAG = DCI.DAG;
13597
13598 EVT VT = N->getValueType(0);
13599 if (CRHS && VT == MVT::i64) {
13600 if (SDValue Split =
13601 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
13602 return Split;
13603 }
13604
13605 // Make sure to apply the 64-bit constant splitting fold before trying to fold
13606 // fneg-like xors into 64-bit select.
13607 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
13608 // This looks like an fneg, try to fold as a source modifier.
13609 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
13610 shouldFoldFNegIntoSrc(N, LHS)) {
13611 // xor (select c, a, b), 0x80000000 ->
13612 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
13613 SDLoc DL(N);
13614 SDValue CastLHS =
13615 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
13616 SDValue CastRHS =
13617 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
13618 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
13619 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
13620 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
13621 LHS->getOperand(0), FNegLHS, FNegRHS);
13622 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
13623 }
13624 }
13625
13626 return SDValue();
13627}
13628
13629SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
13630 DAGCombinerInfo &DCI) const {
13631 if (!Subtarget->has16BitInsts() ||
13632 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
13633 return SDValue();
13634
13635 EVT VT = N->getValueType(0);
13636 if (VT != MVT::i32)
13637 return SDValue();
13638
13639 SDValue Src = N->getOperand(0);
13640 if (Src.getValueType() != MVT::i16)
13641 return SDValue();
13642
13643 return SDValue();
13644}
13645
13646SDValue
13647SITargetLowering::performSignExtendInRegCombine(SDNode *N,
13648 DAGCombinerInfo &DCI) const {
13649 SDValue Src = N->getOperand(0);
13650 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
13651
13652 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
13653 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
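// For example (illustrative), (sign_extend_inreg (SBUFFER_LOAD_UBYTE ...), i8)
// is rebuilt below as an SBUFFER_LOAD_BYTE with the same operands, truncated
// back to the original value type.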
13654 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
13655 VTSign->getVT() == MVT::i8) ||
13656 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
13657 VTSign->getVT() == MVT::i16))) {
13658 assert(Subtarget->hasScalarSubwordLoads() &&
13659 "s_buffer_load_{u8, i8} are supported "
13660 "in GFX12 (or newer) architectures.");
13661 EVT VT = Src.getValueType();
13662 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
13663 ? AMDGPUISD::SBUFFER_LOAD_BYTE
13664 : AMDGPUISD::SBUFFER_LOAD_SHORT;
13665 SDLoc DL(N);
13666 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
13667 SDValue Ops[] = {
13668 Src.getOperand(0), // source register
13669 Src.getOperand(1), // offset
13670 Src.getOperand(2) // cachePolicy
13671 };
13672 auto *M = cast<MemSDNode>(Src);
13673 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
13674 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
13675 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
13676 return LoadVal;
13677 }
13678 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
13679 VTSign->getVT() == MVT::i8) ||
13680 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
13681 VTSign->getVT() == MVT::i16)) &&
13682 Src.hasOneUse()) {
13683 auto *M = cast<MemSDNode>(Src);
13684 SDValue Ops[] = {Src.getOperand(0), // Chain
13685 Src.getOperand(1), // rsrc
13686 Src.getOperand(2), // vindex
13687 Src.getOperand(3), // voffset
13688 Src.getOperand(4), // soffset
13689 Src.getOperand(5), // offset
13690 Src.getOperand(6), Src.getOperand(7)};
13691 // replace with BUFFER_LOAD_BYTE/SHORT
13692 SDVTList ResList =
13693 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
13694 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
13695 ? AMDGPUISD::BUFFER_LOAD_BYTE
13696 : AMDGPUISD::BUFFER_LOAD_SHORT;
13697 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
13698 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
13699 return DCI.DAG.getMergeValues(
13700 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
13701 }
13702 return SDValue();
13703}
13704
13705SDValue SITargetLowering::performClassCombine(SDNode *N,
13706 DAGCombinerInfo &DCI) const {
13707 SelectionDAG &DAG = DCI.DAG;
13708 SDValue Mask = N->getOperand(1);
13709
13710 // fp_class x, 0 -> false
13711 if (isNullConstant(Mask))
13712 return DAG.getConstant(0, SDLoc(N), MVT::i1);
13713
13714 if (N->getOperand(0).isUndef())
13715 return DAG.getUNDEF(MVT::i1);
13716
13717 return SDValue();
13718}
13719
13720SDValue SITargetLowering::performRcpCombine(SDNode *N,
13721 DAGCombinerInfo &DCI) const {
13722 EVT VT = N->getValueType(0);
13723 SDValue N0 = N->getOperand(0);
13724
13725 if (N0.isUndef()) {
13726 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
13727 SDLoc(N), VT);
13728 }
13729
13730 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
13731 N0.getOpcode() == ISD::SINT_TO_FP)) {
13732 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
13733 N->getFlags());
13734 }
13735
13736 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
13737 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
13738 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
13739 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
13740 N->getFlags());
13741 }
13742
13743 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
13744}
13745
13746bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
13747 unsigned MaxDepth) const {
13748 unsigned Opcode = Op.getOpcode();
13749 if (Opcode == ISD::FCANONICALIZE)
13750 return true;
13751
13752 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
13753 const auto &F = CFP->getValueAPF();
13754 if (F.isNaN() && F.isSignaling())
13755 return false;
13756 if (!F.isDenormal())
13757 return true;
13758
13759 DenormalMode Mode =
13760 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
13761 return Mode == DenormalMode::getIEEE();
13762 }
13763
13764 // If source is a result of another standard FP operation it is already in
13765 // canonical form.
13766 if (MaxDepth == 0)
13767 return false;
13768
13769 switch (Opcode) {
13770 // These will flush denorms if required.
13771 case ISD::FADD:
13772 case ISD::FSUB:
13773 case ISD::FMUL:
13774 case ISD::FCEIL:
13775 case ISD::FFLOOR:
13776 case ISD::FMA:
13777 case ISD::FMAD:
13778 case ISD::FSQRT:
13779 case ISD::FDIV:
13780 case ISD::FREM:
13781 case ISD::FP_ROUND:
13782 case ISD::FP_EXTEND:
13783 case ISD::FP16_TO_FP:
13784 case ISD::FP_TO_FP16:
13785 case ISD::BF16_TO_FP:
13786 case ISD::FP_TO_BF16:
13787 case ISD::FLDEXP:
13788 case AMDGPUISD::FMUL_LEGACY:
13789 case AMDGPUISD::FMAD_FTZ:
13790 case AMDGPUISD::RCP:
13791 case AMDGPUISD::RSQ:
13792 case AMDGPUISD::RSQ_CLAMP:
13793 case AMDGPUISD::RCP_LEGACY:
13794 case AMDGPUISD::RCP_IFLAG:
13795 case AMDGPUISD::LOG:
13796 case AMDGPUISD::EXP:
13797 case AMDGPUISD::DIV_SCALE:
13798 case AMDGPUISD::DIV_FMAS:
13799 case AMDGPUISD::DIV_FIXUP:
13800 case AMDGPUISD::FRACT:
13801 case AMDGPUISD::CVT_PKRTZ_F16_F32:
13802 case AMDGPUISD::CVT_F32_UBYTE0:
13803 case AMDGPUISD::CVT_F32_UBYTE1:
13804 case AMDGPUISD::CVT_F32_UBYTE2:
13805 case AMDGPUISD::CVT_F32_UBYTE3:
13806 case AMDGPUISD::FP_TO_FP16:
13807 case AMDGPUISD::SIN_HW:
13808 case AMDGPUISD::COS_HW:
13809 return true;
13810
13811 // It can/will be lowered or combined as a bit operation.
13812 // Need to check their input recursively to handle.
13813 case ISD::FNEG:
13814 case ISD::FABS:
13815 case ISD::FCOPYSIGN:
13816 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
13817
13818 case ISD::AND:
13819 if (Op.getValueType() == MVT::i32) {
13820 // Be careful as we only know it is a bitcast floating point type. It
13821 // could be f32, v2f16, we have no way of knowing. Luckily the constant
13822 // value that we optimize for, which comes up in fp32 to bf16 conversions,
13823 // is valid to optimize for all types.
13824 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
13825 if (RHS->getZExtValue() == 0xffff0000) {
13826 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
13827 }
13828 }
13829 }
13830 break;
13831
13832 case ISD::FSIN:
13833 case ISD::FCOS:
13834 case ISD::FSINCOS:
13835 return Op.getValueType().getScalarType() != MVT::f16;
13836
13837 case ISD::FMINNUM:
13838 case ISD::FMAXNUM:
13839 case ISD::FMINNUM_IEEE:
13840 case ISD::FMAXNUM_IEEE:
13841 case ISD::FMINIMUM:
13842 case ISD::FMAXIMUM:
13843 case ISD::FMINIMUMNUM:
13844 case ISD::FMAXIMUMNUM:
13845 case AMDGPUISD::CLAMP:
13846 case AMDGPUISD::FMED3:
13847 case AMDGPUISD::FMAX3:
13848 case AMDGPUISD::FMIN3:
13849 case AMDGPUISD::FMAXIMUM3:
13850 case AMDGPUISD::FMINIMUM3: {
13851 // FIXME: Shouldn't treat the generic operations differently based on these.
13852 // However, we aren't really required to flush the result from
13853 // minnum/maxnum.
13854
13855 // snans will be quieted, so we only need to worry about denormals.
13856 if (Subtarget->supportsMinMaxDenormModes() ||
13857 // FIXME: denormalsEnabledForType is broken for dynamic
13858 denormalsEnabledForType(DAG, Op.getValueType()))
13859 return true;
13860
13861 // Flushing may be required.
13862 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
13863 // targets we need to check their inputs recursively.
13864
13865 // FIXME: Does this apply with clamp? It's implemented with max.
13866 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
13867 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
13868 return false;
13869 }
13870
13871 return true;
13872 }
13873 case ISD::SELECT: {
13874 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
13875 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
13876 }
13877 case ISD::BUILD_VECTOR: {
13878 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
13879 SDValue SrcOp = Op.getOperand(i);
13880 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
13881 return false;
13882 }
13883
13884 return true;
13885 }
13886 case ISD::EXTRACT_VECTOR_ELT:
13887 case ISD::EXTRACT_SUBVECTOR: {
13888 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
13889 }
13890 case ISD::INSERT_VECTOR_ELT: {
13891 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
13892 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
13893 }
13894 case ISD::UNDEF:
13895 // Could be anything.
13896 return false;
13897
13898 case ISD::BITCAST:
13899 // TODO: This is incorrect as it loses track of the operand's type. We may
13900 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
13901 // same bits that are canonicalized in one type need not be in the other.
13902 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
13903 case ISD::TRUNCATE: {
13904 // Hack round the mess we make when legalizing extract_vector_elt
13905 if (Op.getValueType() == MVT::i16) {
13906 SDValue TruncSrc = Op.getOperand(0);
13907 if (TruncSrc.getValueType() == MVT::i32 &&
13908 TruncSrc.getOpcode() == ISD::BITCAST &&
13909 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
13910 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
13911 }
13912 }
13913 return false;
13914 }
13915 case ISD::INTRINSIC_WO_CHAIN: {
13916 unsigned IntrinsicID = Op.getConstantOperandVal(0);
13917 // TODO: Handle more intrinsics
13918 switch (IntrinsicID) {
13919 case Intrinsic::amdgcn_cvt_pkrtz:
13920 case Intrinsic::amdgcn_cubeid:
13921 case Intrinsic::amdgcn_frexp_mant:
13922 case Intrinsic::amdgcn_fdot2:
13923 case Intrinsic::amdgcn_rcp:
13924 case Intrinsic::amdgcn_rsq:
13925 case Intrinsic::amdgcn_rsq_clamp:
13926 case Intrinsic::amdgcn_rcp_legacy:
13927 case Intrinsic::amdgcn_rsq_legacy:
13928 case Intrinsic::amdgcn_trig_preop:
13929 case Intrinsic::amdgcn_tanh:
13930 case Intrinsic::amdgcn_log:
13931 case Intrinsic::amdgcn_exp2:
13932 case Intrinsic::amdgcn_sqrt:
13933 return true;
13934 default:
13935 break;
13936 }
13937
13938 break;
13939 }
13940 default:
13941 break;
13942 }
13943
13944 // FIXME: denormalsEnabledForType is broken for dynamic
13945 return denormalsEnabledForType(DAG, Op.getValueType()) &&
13946 DAG.isKnownNeverSNaN(Op);
13947}
13948
13949bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
13950 unsigned MaxDepth) const {
13951 const MachineRegisterInfo &MRI = MF.getRegInfo();
13952 MachineInstr *MI = MRI.getVRegDef(Reg);
13953 unsigned Opcode = MI->getOpcode();
13954
13955 if (Opcode == AMDGPU::G_FCANONICALIZE)
13956 return true;
13957
13958 std::optional<FPValueAndVReg> FCR;
13959 // Constant splat (can be padded with undef) or scalar constant.
13960 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
13961 if (FCR->Value.isSignaling())
13962 return false;
13963 if (!FCR->Value.isDenormal())
13964 return true;
13965
13966 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
13967 return Mode == DenormalMode::getIEEE();
13968 }
13969
13970 if (MaxDepth == 0)
13971 return false;
13972
13973 switch (Opcode) {
13974 case AMDGPU::G_FADD:
13975 case AMDGPU::G_FSUB:
13976 case AMDGPU::G_FMUL:
13977 case AMDGPU::G_FCEIL:
13978 case AMDGPU::G_FFLOOR:
13979 case AMDGPU::G_FRINT:
13980 case AMDGPU::G_FNEARBYINT:
13981 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
13982 case AMDGPU::G_INTRINSIC_TRUNC:
13983 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
13984 case AMDGPU::G_FMA:
13985 case AMDGPU::G_FMAD:
13986 case AMDGPU::G_FSQRT:
13987 case AMDGPU::G_FDIV:
13988 case AMDGPU::G_FREM:
13989 case AMDGPU::G_FPOW:
13990 case AMDGPU::G_FPEXT:
13991 case AMDGPU::G_FLOG:
13992 case AMDGPU::G_FLOG2:
13993 case AMDGPU::G_FLOG10:
13994 case AMDGPU::G_FPTRUNC:
13995 case AMDGPU::G_AMDGPU_RCP_IFLAG:
13996 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
13997 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
13998 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
13999 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14000 return true;
14001 case AMDGPU::G_FNEG:
14002 case AMDGPU::G_FABS:
14003 case AMDGPU::G_FCOPYSIGN:
14004 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
14005 case AMDGPU::G_FMINNUM:
14006 case AMDGPU::G_FMAXNUM:
14007 case AMDGPU::G_FMINNUM_IEEE:
14008 case AMDGPU::G_FMAXNUM_IEEE:
14009 case AMDGPU::G_FMINIMUM:
14010 case AMDGPU::G_FMAXIMUM:
14011 case AMDGPU::G_FMINIMUMNUM:
14012 case AMDGPU::G_FMAXIMUMNUM: {
14013 if (Subtarget->supportsMinMaxDenormModes() ||
14014 // FIXME: denormalsEnabledForType is broken for dynamic
14015 denormalsEnabledForType(MRI.getType(Reg), MF))
14016 return true;
14017
14018 [[fallthrough]];
14019 }
14020 case AMDGPU::G_BUILD_VECTOR:
14021 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
14022 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
14023 return false;
14024 return true;
14025 case AMDGPU::G_INTRINSIC:
14026 case AMDGPU::G_INTRINSIC_CONVERGENT:
14027 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
14028 case Intrinsic::amdgcn_fmul_legacy:
14029 case Intrinsic::amdgcn_fmad_ftz:
14030 case Intrinsic::amdgcn_sqrt:
14031 case Intrinsic::amdgcn_fmed3:
14032 case Intrinsic::amdgcn_sin:
14033 case Intrinsic::amdgcn_cos:
14034 case Intrinsic::amdgcn_log:
14035 case Intrinsic::amdgcn_exp2:
14036 case Intrinsic::amdgcn_log_clamp:
14037 case Intrinsic::amdgcn_rcp:
14038 case Intrinsic::amdgcn_rcp_legacy:
14039 case Intrinsic::amdgcn_rsq:
14040 case Intrinsic::amdgcn_rsq_clamp:
14041 case Intrinsic::amdgcn_rsq_legacy:
14042 case Intrinsic::amdgcn_div_scale:
14043 case Intrinsic::amdgcn_div_fmas:
14044 case Intrinsic::amdgcn_div_fixup:
14045 case Intrinsic::amdgcn_fract:
14046 case Intrinsic::amdgcn_cvt_pkrtz:
14047 case Intrinsic::amdgcn_cubeid:
14048 case Intrinsic::amdgcn_cubema:
14049 case Intrinsic::amdgcn_cubesc:
14050 case Intrinsic::amdgcn_cubetc:
14051 case Intrinsic::amdgcn_frexp_mant:
14052 case Intrinsic::amdgcn_fdot2:
14053 case Intrinsic::amdgcn_trig_preop:
14054 case Intrinsic::amdgcn_tanh:
14055 return true;
14056 default:
14057 break;
14058 }
14059
14060 [[fallthrough]];
14061 default:
14062 return false;
14063 }
14064
14065 llvm_unreachable("invalid operation");
14066}
14067
14068// Constant fold canonicalize.
14069SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
14070 const SDLoc &SL, EVT VT,
14071 const APFloat &C) const {
14072 // Flush denormals to 0 if not enabled.
14073 if (C.isDenormal()) {
14074 DenormalMode Mode =
14075 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
14076 if (Mode == DenormalMode::getPreserveSign()) {
14077 return DAG.getConstantFP(
14078 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
14079 }
14080
14081 if (Mode != DenormalMode::getIEEE())
14082 return SDValue();
14083 }
14084
14085 if (C.isNaN()) {
14086 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
14087 if (C.isSignaling()) {
14088 // Quiet a signaling NaN.
14089 // FIXME: Is this supposed to preserve payload bits?
14090 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14091 }
14092
14093 // Make sure it is the canonical NaN bitpattern.
14094 //
14095 // TODO: Can we use -1 as the canonical NaN value since it's an inline
14096 // immediate?
14097 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
14098 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14099 }
14100
14101 // Already canonical.
14102 return DAG.getConstantFP(C, SL, VT);
14103}
14104
14105static bool vectorEltWillFoldAway(SDValue Op) {
14106 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
14107}
14108
14109SDValue
14110SITargetLowering::performFCanonicalizeCombine(SDNode *N,
14111 DAGCombinerInfo &DCI) const {
14112 SelectionDAG &DAG = DCI.DAG;
14113 SDValue N0 = N->getOperand(0);
14114 EVT VT = N->getValueType(0);
14115
14116 // fcanonicalize undef -> qnan
14117 if (N0.isUndef()) {
14118 APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics());
14119 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
14120 }
14121
14122 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
14123 EVT VT = N->getValueType(0);
14124 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
14125 }
14126
14127 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
14128 // (fcanonicalize k)
14129 //
14130 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
14131
14132 // TODO: This could be better with wider vectors that will be split to v2f16,
14133 // and to consider uses since there aren't that many packed operations.
14134 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
14135 isTypeLegal(MVT::v2f16)) {
14136 SDLoc SL(N);
14137 SDValue NewElts[2];
14138 SDValue Lo = N0.getOperand(0);
14139 SDValue Hi = N0.getOperand(1);
14140 EVT EltVT = Lo.getValueType();
14141
14143 for (unsigned I = 0; I != 2; ++I) {
14144 SDValue Op = N0.getOperand(I);
14145 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14146 NewElts[I] =
14147 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14148 } else if (Op.isUndef()) {
14149 // Handled below based on what the other operand is.
14150 NewElts[I] = Op;
14151 } else {
14152 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
14153 }
14154 }
14155
14156 // If one half is undef, and one is constant, prefer a splat vector rather
14157 // than the normal qNaN. If it's a register, prefer 0.0 since that's
14158 // cheaper to use and may be free with a packed operation.
14159 if (NewElts[0].isUndef()) {
14160 if (isa<ConstantFPSDNode>(NewElts[1]))
14161 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
14162 ? NewElts[1]
14163 : DAG.getConstantFP(0.0f, SL, EltVT);
14164 }
14165
14166 if (NewElts[1].isUndef()) {
14167 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
14168 ? NewElts[0]
14169 : DAG.getConstantFP(0.0f, SL, EltVT);
14170 }
14171
14172 return DAG.getBuildVector(VT, SL, NewElts);
14173 }
14174 }
14175
14176 return SDValue();
14177}
14178
14179static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
14180 switch (Opc) {
14181 case ISD::FMAXNUM:
14182 case ISD::FMAXNUM_IEEE:
14183 case ISD::FMAXIMUMNUM:
14184 return AMDGPUISD::FMAX3;
14185 case ISD::FMAXIMUM:
14186 return AMDGPUISD::FMAXIMUM3;
14187 case ISD::SMAX:
14188 return AMDGPUISD::SMAX3;
14189 case ISD::UMAX:
14190 return AMDGPUISD::UMAX3;
14191 case ISD::FMINNUM:
14192 case ISD::FMINNUM_IEEE:
14193 case ISD::FMINIMUMNUM:
14194 return AMDGPUISD::FMIN3;
14195 case ISD::FMINIMUM:
14196 return AMDGPUISD::FMINIMUM3;
14197 case ISD::SMIN:
14198 return AMDGPUISD::SMIN3;
14199 case ISD::UMIN:
14200 return AMDGPUISD::UMIN3;
14201 default:
14202 llvm_unreachable("Not a min/max opcode");
14203 }
14204}
14205
14206SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
14207 const SDLoc &SL, SDValue Src,
14208 SDValue MinVal,
14209 SDValue MaxVal,
14210 bool Signed) const {
14211
14212 // med3 comes from
14213 // min(max(x, K0), K1), K0 < K1
14214 // max(min(x, K0), K1), K1 < K0
14215 //
14216 // "MinVal" and "MaxVal" respectively refer to the rhs of the
14217 // min/max op.
14218 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
14219 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
14220
14221 if (!MinK || !MaxK)
14222 return SDValue();
14223
14224 if (Signed) {
14225 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
14226 return SDValue();
14227 } else {
14228 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
14229 return SDValue();
14230 }
14231
14232 EVT VT = MinK->getValueType(0);
14233 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
14234 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
14235 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
14236
14237 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
14238 // not available, but this is unlikely to be profitable as constants
14239 // will often need to be materialized & extended, especially on
14240 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
14241 return SDValue();
14242}
14243
14244static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
14245 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
14246 return C;
14247
14248 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
14249 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
14250 return C;
14251 }
14252
14253 return nullptr;
14254}
14255
14256SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
14257 const SDLoc &SL, SDValue Op0,
14258 SDValue Op1) const {
14259 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
14260 if (!K1)
14261 return SDValue();
14262
14263 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
14264 if (!K0)
14265 return SDValue();
14266
14267 // Ordered >= (although NaN inputs should have folded away by now).
14268 if (K0->getValueAPF() > K1->getValueAPF())
14269 return SDValue();
14270
14271 // med3 with a nan input acts like
14272 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
14273 //
14274 // So the result depends on whether the IEEE mode bit is enabled or not with a
14275 // signaling nan input.
14276 // ieee=1
14277 // s0 snan: yields s2
14278 // s1 snan: yields s2
14279 // s2 snan: qnan
14280
14281 // s0 qnan: min(s1, s2)
14282 // s1 qnan: min(s0, s2)
14283 // s2 qnan: min(s0, s1)
14284
14285 // ieee=0
14286 // s0 snan: min(s1, s2)
14287 // s1 snan: min(s0, s2)
14288 // s2 snan: qnan
14289
14290 // s0 qnan: min(s1, s2)
14291 // s1 qnan: min(s0, s2)
14292 // s2 qnan: min(s0, s1)
14293 const MachineFunction &MF = DAG.getMachineFunction();
14294 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
14295
14296 // TODO: Check IEEE bit enabled. We can form fmed3 with IEEE=0 regardless of
14297 // whether the input is a signaling nan if op0 is fmaximum or fmaximumnum. We
14298 // can only form if op0 is fmaxnum_ieee if IEEE=1.
14299 EVT VT = Op0.getValueType();
14300 if (Info->getMode().DX10Clamp) {
14301 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
14302 // hardware fmed3 behavior converting to a min.
14303 // FIXME: Should this be allowing -0.0?
14304 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
14305 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
14306 }
14307
14308 // med3 for f16 is only available on gfx9+, and not available for v2f16.
14309 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
14310 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
14311 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
14312 // then give the other result, which is different from med3 with a NaN
14313 // input.
14314 SDValue Var = Op0.getOperand(0);
14315 if (!DAG.isKnownNeverSNaN(Var))
14316 return SDValue();
14317
14318 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14319
14320 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
14321 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
14322 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
14323 SDValue(K0, 0), SDValue(K1, 0));
14324 }
14325 }
14326
14327 return SDValue();
14328}
14329
14330/// \return true if the subtarget supports minimum3 and maximum3 with the given
14331/// base min/max opcode \p Opc for type \p VT.
14332static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
14333 EVT VT) {
14334 switch (Opc) {
14335 case ISD::FMINNUM:
14336 case ISD::FMAXNUM:
14337 case ISD::FMINNUM_IEEE:
14338 case ISD::FMAXNUM_IEEE:
14339 case ISD::FMINIMUMNUM:
14340 case ISD::FMAXIMUMNUM:
14343 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
14344 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
14345 case ISD::FMINIMUM:
14346 case ISD::FMAXIMUM:
14347 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
14348 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
14349 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
14350 case ISD::SMAX:
14351 case ISD::SMIN:
14352 case ISD::UMAX:
14353 case ISD::UMIN:
14354 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
14355 default:
14356 return false;
14357 }
14358
14359 llvm_unreachable("not a min/max opcode");
14360}
14361
14362SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
14363 DAGCombinerInfo &DCI) const {
14364 SelectionDAG &DAG = DCI.DAG;
14365
14366 EVT VT = N->getValueType(0);
14367 unsigned Opc = N->getOpcode();
14368 SDValue Op0 = N->getOperand(0);
14369 SDValue Op1 = N->getOperand(1);
14370
14371 // Only do this if the inner op has one use since this will just increase
14372 // register pressure for no benefit.
14373
14374 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
14375 // max(max(a, b), c) -> max3(a, b, c)
14376 // min(min(a, b), c) -> min3(a, b, c)
14377 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
14378 SDLoc DL(N);
14379 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14380 Op0.getOperand(0), Op0.getOperand(1), Op1);
14381 }
14382
14383 // Try commuted.
14384 // max(a, max(b, c)) -> max3(a, b, c)
14385 // min(a, min(b, c)) -> min3(a, b, c)
14386 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
14387 SDLoc DL(N);
14388 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14389 Op0, Op1.getOperand(0), Op1.getOperand(1));
14390 }
14391 }
14392
14393 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
14394 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
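// For example (illustrative), smin(smax(x, -5), 7) becomes smed3(x, -5, 7).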
14395 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
14396 if (SDValue Med3 = performIntMed3ImmCombine(
14397 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
14398 return Med3;
14399 }
14400 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
14401 if (SDValue Med3 = performIntMed3ImmCombine(
14402 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
14403 return Med3;
14404 }
14405
14406 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
14407 if (SDValue Med3 = performIntMed3ImmCombine(
14408 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
14409 return Med3;
14410 }
14411 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
14412 if (SDValue Med3 = performIntMed3ImmCombine(
14413 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
14414 return Med3;
14415 }
14416
14417 // if !is_snan(x):
14418 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14419 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14420 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14421 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14422 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
14423 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
14424 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
14425 (Opc == AMDGPUISD::FMIN_LEGACY &&
14426 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
14427 (VT == MVT::f32 || VT == MVT::f64 ||
14428 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
14429 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
14430 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
14431 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
14432 Op0.hasOneUse()) {
14433 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
14434 return Res;
14435 }
14436
14437 // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
14438 // for some types, but at a higher cost since it's implemented with a 3
14439 // operand form.
14440 const SDNodeFlags Flags = N->getFlags();
14441 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
14442 !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
14443 unsigned NewOpc =
14444 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
14445 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
14446 }
14447
14448 return SDValue();
14449}
14450
14451static bool isClampZeroToOne(SDValue A, SDValue B) {
14452 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
14453 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
14454 // FIXME: Should this be allowing -0.0?
14455 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
14456 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
14457 }
14458 }
14459
14460 return false;
14461}
14462
14463// FIXME: Should only worry about snans for version with chain.
14464SDValue SITargetLowering::performFMed3Combine(SDNode *N,
14465 DAGCombinerInfo &DCI) const {
14466 EVT VT = N->getValueType(0);
14467 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
14468 // NaNs. With a NaN input, the order of the operands may change the result.
14469
14470 SelectionDAG &DAG = DCI.DAG;
14471 SDLoc SL(N);
14472
14473 SDValue Src0 = N->getOperand(0);
14474 SDValue Src1 = N->getOperand(1);
14475 SDValue Src2 = N->getOperand(2);
14476
14477 if (isClampZeroToOne(Src0, Src1)) {
14478 // const_a, const_b, x -> clamp is safe in all cases including signaling
14479 // nans.
14480 // FIXME: Should this be allowing -0.0?
14481 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
14482 }
14483
14484 const MachineFunction &MF = DAG.getMachineFunction();
14485 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
14486
14487 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
14488 // handling no dx10-clamp?
14489 if (Info->getMode().DX10Clamp) {
14490 // If NaNs are clamped to 0, we are free to reorder the inputs.
14491
14492 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
14493 std::swap(Src0, Src1);
14494
14495 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
14496 std::swap(Src1, Src2);
14497
14498 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
14499 std::swap(Src0, Src1);
14500
14501 if (isClampZeroToOne(Src1, Src2))
14502 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
14503 }
14504
14505 return SDValue();
14506}
14507
14508SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
14509 DAGCombinerInfo &DCI) const {
14510 SDValue Src0 = N->getOperand(0);
14511 SDValue Src1 = N->getOperand(1);
14512 if (Src0.isUndef() && Src1.isUndef())
14513 return DCI.DAG.getUNDEF(N->getValueType(0));
14514 return SDValue();
14515}
14516
14517// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
14518// expanded into a set of cmp/select instructions.
14519 bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
14520 unsigned NumElem,
14521 bool IsDivergentIdx,
14522 const GCNSubtarget *Subtarget) {
14523 if (UseDivergentRegisterIndexing)
14524 return false;
14525
14526 unsigned VecSize = EltSize * NumElem;
14527
14528 // Sub-dword vectors of two dwords or less have a better implementation.
14529 if (VecSize <= 64 && EltSize < 32)
14530 return false;
14531
14532 // Always expand the remaining sub-dword vectors, otherwise they will be
14533 // lowered via memory.
14534 if (EltSize < 32)
14535 return true;
14536
14537 // Always do this if var-idx is divergent, otherwise it will become a loop.
14538 if (IsDivergentIdx)
14539 return true;
14540
14541 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
14542 unsigned NumInsts = NumElem /* Number of compares */ +
14543 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
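// For example (illustrative case), a uniform index into <8 x i64> gives
// NumElem = 8 and EltSize = 64, so NumInsts = 8 + 2 * 8 = 24, which exceeds
// both limits below and the expansion is skipped in favor of indexed access.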
14544
14545 // On some architectures (GFX9) movrel is not available and it's better
14546 // to expand.
14547 if (Subtarget->useVGPRIndexMode())
14548 return NumInsts <= 16;
14549
14550 // If movrel is available, use it instead of expanding for vector of 8
14551 // elements.
14552 if (Subtarget->hasMovrel())
14553 return NumInsts <= 15;
14554
14555 return true;
14556}
14557
14558 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
14559 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
14560 if (isa<ConstantSDNode>(Idx))
14561 return false;
14562
14563 SDValue Vec = N->getOperand(0);
14564 EVT VecVT = Vec.getValueType();
14565 EVT EltVT = VecVT.getVectorElementType();
14566 unsigned EltSize = EltVT.getSizeInBits();
14567 unsigned NumElem = VecVT.getVectorNumElements();
14568
14569 return SITargetLowering::shouldExpandVectorDynExt(
14570 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
14571}
14572
14573SDValue
14574SITargetLowering::performExtractVectorEltCombine(SDNode *N,
14575 DAGCombinerInfo &DCI) const {
14576 SDValue Vec = N->getOperand(0);
14577 SelectionDAG &DAG = DCI.DAG;
14578
14579 EVT VecVT = Vec.getValueType();
14580 EVT VecEltVT = VecVT.getVectorElementType();
14581 EVT ResVT = N->getValueType(0);
14582
14583 unsigned VecSize = VecVT.getSizeInBits();
14584 unsigned VecEltSize = VecEltVT.getSizeInBits();
14585
14586 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
14587 allUsesHaveSourceMods(N)) {
14588 SDLoc SL(N);
14589 SDValue Idx = N->getOperand(1);
14590 SDValue Elt =
14591 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
14592 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
14593 }
14594
14595 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
14596 // =>
14597 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
14598 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
14599 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
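// For example (illustrative types):
//   extract_vector_elt (fadd v2f32:a, v2f32:b), 1
//     -> fadd (extract_vector_elt a, 1), (extract_vector_elt b, 1)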
14600 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
14601 SDLoc SL(N);
14602 SDValue Idx = N->getOperand(1);
14603 unsigned Opc = Vec.getOpcode();
14604
14605 switch (Opc) {
14606 default:
14607 break;
14608 // TODO: Support other binary operations.
14609 case ISD::FADD:
14610 case ISD::FSUB:
14611 case ISD::FMUL:
14612 case ISD::ADD:
14613 case ISD::UMIN:
14614 case ISD::UMAX:
14615 case ISD::SMIN:
14616 case ISD::SMAX:
14617 case ISD::FMAXNUM:
14618 case ISD::FMINNUM:
14619 case ISD::FMAXNUM_IEEE:
14620 case ISD::FMINNUM_IEEE:
14621 case ISD::FMAXIMUM:
14622 case ISD::FMINIMUM: {
14623 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
14624 Vec.getOperand(0), Idx);
14625 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
14626 Vec.getOperand(1), Idx);
14627
14628 DCI.AddToWorklist(Elt0.getNode());
14629 DCI.AddToWorklist(Elt1.getNode());
14630 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
14631 }
14632 }
14633 }
14634
14635 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
14636 if (shouldExpandVectorDynExt(N)) {
14637 SDLoc SL(N);
14638 SDValue Idx = N->getOperand(1);
14639 SDValue V;
14640 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
14641 SDValue IC = DAG.getVectorIdxConstant(I, SL);
14642 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
14643 if (I == 0)
14644 V = Elt;
14645 else
14646 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
14647 }
14648 return V;
14649 }
14650
14651 if (!DCI.isBeforeLegalize())
14652 return SDValue();
14653
14654 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
14655 // elements. This exposes more load reduction opportunities by replacing
14656 // multiple small extract_vector_elements with a single 32-bit extract.
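// For example (illustrative case), extracting element 5 of a loaded v8i8:
// BitIndex = 40, so EltIdx = 1 and LeftoverBitIdx = 8; the vector is bitcast
// to v2i32, dword 1 is extracted, shifted right by 8 and truncated to i8.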
14657 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
14658 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
14659 VecSize > 32 && VecSize % 32 == 0 && Idx) {
14660 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
14661
14662 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
14663 unsigned EltIdx = BitIndex / 32;
14664 unsigned LeftoverBitIdx = BitIndex % 32;
14665 SDLoc SL(N);
14666
14667 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
14668 DCI.AddToWorklist(Cast.getNode());
14669
14670 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
14671 DAG.getConstant(EltIdx, SL, MVT::i32));
14672 DCI.AddToWorklist(Elt.getNode());
14673 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
14674 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
14675 DCI.AddToWorklist(Srl.getNode());
14676
14677 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
14678 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
14679 DCI.AddToWorklist(Trunc.getNode());
14680
14681 if (VecEltVT == ResVT) {
14682 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
14683 }
14684
14685 assert(ResVT.isScalarInteger());
14686 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
14687 }
14688
14689 return SDValue();
14690}
14691
14692SDValue
14693SITargetLowering::performInsertVectorEltCombine(SDNode *N,
14694 DAGCombinerInfo &DCI) const {
14695 SDValue Vec = N->getOperand(0);
14696 SDValue Idx = N->getOperand(2);
14697 EVT VecVT = Vec.getValueType();
14698 EVT EltVT = VecVT.getVectorElementType();
14699
14700 // INSERT_VECTOR_ELT (<n x e>, var-idx)
14701 // => BUILD_VECTOR n x select (e, const-idx)
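// For example (illustrative type), inserting into v4i32 with a variable index
// builds a vector of four (select (idx == i), ins, vec[i]) values, i = 0..3.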
14702 if (!shouldExpandVectorDynExt(N))
14703 return SDValue();
14704
14705 SelectionDAG &DAG = DCI.DAG;
14706 SDLoc SL(N);
14707 SDValue Ins = N->getOperand(1);
14708 EVT IdxVT = Idx.getValueType();
14709
14710 SmallVector<SDValue, 16> Ops;
14711 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
14712 SDValue IC = DAG.getConstant(I, SL, IdxVT);
14713 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
14714 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
14715 Ops.push_back(V);
14716 }
14717
14718 return DAG.getBuildVector(VecVT, SL, Ops);
14719}
14720
14721/// Return the source of an fp_extend from f16 to f32, or a converted FP
14722/// constant.
14723 static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
14724 if (Src.getOpcode() == ISD::FP_EXTEND &&
14725 Src.getOperand(0).getValueType() == MVT::f16) {
14726 return Src.getOperand(0);
14727 }
14728
14729 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
14730 APFloat Val = CFP->getValueAPF();
14731 bool LosesInfo = true;
14732 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
14733 if (!LosesInfo)
14734 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
14735 }
14736
14737 return SDValue();
14738}
14739
14740SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
14741 DAGCombinerInfo &DCI) const {
14742 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
14743 "combine only useful on gfx8");
14744
14745 SDValue TruncSrc = N->getOperand(0);
14746 EVT VT = N->getValueType(0);
14747 if (VT != MVT::f16)
14748 return SDValue();
14749
14750 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
14751 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
14752 return SDValue();
14753
14754 SelectionDAG &DAG = DCI.DAG;
14755 SDLoc SL(N);
14756
14757 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
14758 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
14759 // casting back.
14760
14761 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
14762 // fmin(fmax(a, b), fmax(fmin(a, b), c))
14763 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
14764 if (!A)
14765 return SDValue();
14766
14767 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
14768 if (!B)
14769 return SDValue();
14770
14771 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
14772 if (!C)
14773 return SDValue();
14774
14775 // This changes signaling nan behavior. If an input is a signaling nan, it
14776 // would have been quieted by the fpext originally. We don't care because
14777 // these are unconstrained ops. If we needed to insert quieting canonicalizes
14778 // we would be worse off than just doing the promotion.
14779 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
14780 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
14781 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
14782 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
14783}
14784
14785unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
14786 const SDNode *N0,
14787 const SDNode *N1) const {
14788 EVT VT = N0->getValueType(0);
14789
14790 // Only do this if we are not trying to support denormals. v_mad_f32 does not
14791 // support denormals ever.
14792 if (((VT == MVT::f32 &&
14793 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
14794 (VT == MVT::f16 && Subtarget->hasMadF16() &&
14795 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
14796 isOperationLegal(ISD::FMAD, VT))
14797 return ISD::FMAD;
14798
14799 const TargetOptions &Options = DAG.getTarget().Options;
14800 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
14801 (N0->getFlags().hasAllowContract() &&
14802 N1->getFlags().hasAllowContract())) &&
14803 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
14804 return ISD::FMA;
14805 }
14806
14807 return 0;
14808}
14809
14810// For a reassociatable opcode perform:
14811// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
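// For example, with x and z uniform and y divergent:
//   add x, (add y, z) -> add (add x, z), y
// so the inner add of the two uniform values can be selected to a scalar op.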
14812SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
14813 SelectionDAG &DAG) const {
14814 EVT VT = N->getValueType(0);
14815 if (VT != MVT::i32 && VT != MVT::i64)
14816 return SDValue();
14817
14818 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
14819 return SDValue();
14820
14821 unsigned Opc = N->getOpcode();
14822 SDValue Op0 = N->getOperand(0);
14823 SDValue Op1 = N->getOperand(1);
14824
14825 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
14826 return SDValue();
14827
14828 if (Op0->isDivergent())
14829 std::swap(Op0, Op1);
14830
14831 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
14832 return SDValue();
14833
14834 SDValue Op2 = Op1.getOperand(1);
14835 Op1 = Op1.getOperand(0);
14836 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
14837 return SDValue();
14838
14839 if (Op1->isDivergent())
14840 std::swap(Op1, Op2);
14841
14842 SDLoc SL(N);
14843 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
14844 return DAG.getNode(Opc, SL, VT, Add1, Op2);
14845}
14846
14847static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
14848 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
14849 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
14850 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
14851 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
14852 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
14853}
14854
14855// Fold
14856// y = lshr i64 x, 32
14857// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
14858// with Const.hi == -1
14859// To
14860// res = mad_u64_u32 y.lo ,Const.lo, x.lo
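// Why the fold holds (mod 2^64): y.hi == 0 and Const.hi == -1, so
//   mul i64 y, Const == y.lo * Const.lo - (y.lo << 32), and
//   x == (y.lo << 32) + x.lo since y.lo == x.hi,
// so the shifted terms cancel in the add, leaving y.lo * Const.lo + x.lo.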
14861 static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
14862 SDValue MulLHS, SDValue MulRHS,
14863 SDValue AddRHS) {
14864 if (MulRHS.getOpcode() == ISD::SRL)
14865 std::swap(MulLHS, MulRHS);
14866
14867 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
14868 return SDValue();
14869
14870 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
14871 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
14872 MulLHS.getOperand(0) != AddRHS)
14873 return SDValue();
14874
14875 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(MulRHS.getNode());
14876 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
14877 return SDValue();
14878
14879 SDValue ConstMul =
14880 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
14881 return getMad64_32(DAG, SL, MVT::i64,
14882 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
14883 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
14884}
14885
14886// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
14887// multiplies, if any.
14888//
14889// Full 64-bit multiplies that feed into an addition are lowered here instead
14890// of using the generic expansion. The generic expansion ends up with
14891// a tree of ADD nodes that prevents us from using the "add" part of the
14892// MAD instruction. The expansion produced here results in a chain of ADDs
14893// instead of a tree.
14894SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
14895 DAGCombinerInfo &DCI) const {
14896 assert(N->isAnyAdd());
14897
14898 SelectionDAG &DAG = DCI.DAG;
14899 EVT VT = N->getValueType(0);
14900 SDLoc SL(N);
14901 SDValue LHS = N->getOperand(0);
14902 SDValue RHS = N->getOperand(1);
14903
14904 if (VT.isVector())
14905 return SDValue();
14906
14907 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
14908 // result in scalar registers for uniform values.
14909 if (!N->isDivergent() && Subtarget->hasSMulHi())
14910 return SDValue();
14911
14912 unsigned NumBits = VT.getScalarSizeInBits();
14913 if (NumBits <= 32 || NumBits > 64)
14914 return SDValue();
14915
14916 if (LHS.getOpcode() != ISD::MUL) {
14917 assert(RHS.getOpcode() == ISD::MUL);
14918 std::swap(LHS, RHS);
14919 }
14920
14921 // Avoid the fold if it would unduly increase the number of multiplies due to
14922 // multiple uses, except on hardware with full-rate multiply-add (which is
14923 // part of full-rate 64-bit ops).
14924 if (!Subtarget->hasFullRate64Ops()) {
14925 unsigned NumUsers = 0;
14926 for (SDNode *User : LHS->users()) {
14927 // There is a use that does not feed into addition, so the multiply can't
14928 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
14929 if (!User->isAnyAdd())
14930 return SDValue();
14931
14932 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
14933 // MUL + 3xADD + 3xADDC over 3xMAD.
14934 ++NumUsers;
14935 if (NumUsers >= 3)
14936 return SDValue();
14937 }
14938 }
14939
14940 SDValue MulLHS = LHS.getOperand(0);
14941 SDValue MulRHS = LHS.getOperand(1);
14942 SDValue AddRHS = RHS;
14943
14944 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
14945 return FoldedMAD;
14946
14947 // Always check whether operands are small unsigned values, since that
14948 // knowledge is useful in more cases. Check for small signed values only if
14949 // doing so can unlock a shorter code sequence.
14950 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
14951 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
14952
14953 bool MulSignedLo = false;
14954 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
14955 MulSignedLo =
14956 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
14957 }
14958
14959 // The operands and final result all have the same number of bits. If
14960 // operands need to be extended, they can be extended with garbage. The
14961 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
14962 // truncated away in the end.
14963 if (VT != MVT::i64) {
14964 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
14965 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
14966 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
14967 }
14968
14969 // The basic code generated is conceptually straightforward. Pseudo code:
14970 //
14971 // accum = mad_64_32 lhs.lo, rhs.lo, accum
14972 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
14973 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
14974 //
14975 // The second and third lines are optional, depending on whether the factors
14976 // are {sign,zero}-extended or not.
14977 //
14978 // The actual DAG is noisier than the pseudo code, but only due to
14979 // instructions that disassemble values into low and high parts, and
14980 // assemble the final result.
14981 SDValue One = DAG.getConstant(1, SL, MVT::i32);
14982
14983 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
14984 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
14985 SDValue Accum =
14986 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
14987
14988 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
14989 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
14990
14991 if (!MulLHSUnsigned32) {
14992 auto MulLHSHi =
14993 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
14994 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
14995 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
14996 }
14997
14998 if (!MulRHSUnsigned32) {
14999 auto MulRHSHi =
15000 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
15001 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
15002 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15003 }
15004
15005 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
15006 Accum = DAG.getBitcast(MVT::i64, Accum);
15007 }
15008
15009 if (VT != MVT::i64)
15010 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
15011 return Accum;
15012}
15013
15014SDValue
15015SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
15016 DAGCombinerInfo &DCI) const {
15017 SDValue RHS = N->getOperand(1);
15018 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
15019 if (!CRHS)
15020 return SDValue();
15021
15022 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
15023 // common.
15024 uint64_t Val = CRHS->getZExtValue();
15025 if (countr_zero(Val) >= 32) {
15026 SelectionDAG &DAG = DCI.DAG;
15027 SDLoc SL(N);
15028 SDValue LHS = N->getOperand(0);
15029
15030 // Avoid carry machinery if we know the low half of the add does not
15031 // contribute to the final result.
15032 //
15033 // add i64:x, K if computeTrailingZeros(K) >= 32
15034 // => build_pair (add x.hi, K.hi), x.lo
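// For example (illustrative constant), with K = 0x500000000: K.lo == 0, so no
// carry out of the low half is possible and the result is simply
// build_pair (add x.hi, 5), x.lo.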
15035
15036 // Breaking the 64-bit add here with this strange constant is unlikely
15037 // to interfere with addressing mode patterns.
15038
15039 SDValue Hi = getHiHalf64(LHS, DAG);
15040 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
15041 unsigned Opcode = N->getOpcode();
15042 if (Opcode == ISD::PTRADD)
15043 Opcode = ISD::ADD;
15044 SDValue AddHi =
15045 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
15046
15047 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
15048 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
15049 }
15050
15051 return SDValue();
15052}
15053
15054 // Collect the ultimate src of each of the mul node's operands, and confirm
15055 // each operand is no wider than 8 bits.
15056static std::optional<ByteProvider<SDValue>>
15057handleMulOperand(const SDValue &MulOperand) {
15058 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
15059 if (!Byte0 || Byte0->isConstantZero()) {
15060 return std::nullopt;
15061 }
15062 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
15063 if (Byte1 && !Byte1->isConstantZero()) {
15064 return std::nullopt;
15065 }
15066 return Byte0;
15067}
15068
15069static unsigned addPermMasks(unsigned First, unsigned Second) {
15070 unsigned FirstCs = First & 0x0c0c0c0c;
15071 unsigned SecondCs = Second & 0x0c0c0c0c;
15072 unsigned FirstNoCs = First & ~0x0c0c0c0c;
15073 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
15074
15075 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
15076 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
15077 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
15078 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
15079
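// For example (illustrative masks): combining 0x0c0c0c01 (byte 1 into result
// lane 0) with 0x0c0c020c (byte 2 into result lane 1) yields 0x0c0c0201; a
// lane stays 0x0c only if both inputs mark it as constant zero.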
15080 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
15081}
15082
15083struct DotSrc {
15084 SDValue SrcOp;
15085 int64_t PermMask;
15086 int64_t DWordOffset;
15087};
15088
15089 static void placeSources(ByteProvider<SDValue> &Src0,
15090 ByteProvider<SDValue> &Src1,
15091 SmallVectorImpl<DotSrc> &Src0s,
15092 SmallVectorImpl<DotSrc> &Src1s, int Step) {
15093
15094 assert(Src0.Src.has_value() && Src1.Src.has_value());
15095 // Src0s and Src1s are empty, just place arbitrarily.
15096 if (Step == 0) {
15097 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
15098 Src0.SrcOffset / 4});
15099 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
15100 Src1.SrcOffset / 4});
15101 return;
15102 }
15103
15104 for (int BPI = 0; BPI < 2; BPI++) {
15105 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
15106 if (BPI == 1) {
15107 BPP = {Src1, Src0};
15108 }
15109 unsigned ZeroMask = 0x0c0c0c0c;
15110 unsigned FMask = 0xFF << (8 * (3 - Step));
15111
15112 unsigned FirstMask =
15113 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15114 unsigned SecondMask =
15115 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15116 // Attempt to find Src vector which contains our SDValue, if so, add our
15117 // perm mask to the existing one. If we are unable to find a match for the
15118 // first SDValue, attempt to find a match for the second.
15119 int FirstGroup = -1;
15120 for (int I = 0; I < 2; I++) {
15121 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
15122 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
15123 return IterElt.SrcOp == *BPP.first.Src &&
15124 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15125 };
15126
15127 auto *Match = llvm::find_if(Srcs, MatchesFirst);
15128 if (Match != Srcs.end()) {
15129 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
15130 FirstGroup = I;
15131 break;
15132 }
15133 }
15134 if (FirstGroup != -1) {
15135 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
15136 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
15137 return IterElt.SrcOp == *BPP.second.Src &&
15138 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15139 };
15140 auto *Match = llvm::find_if(Srcs, MatchesSecond);
15141 if (Match != Srcs.end()) {
15142 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
15143 } else
15144 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15145 return;
15146 }
15147 }
15148
15149 // If we have made it here, then we could not find a match in Src0s or Src1s
15150 // for either Src0 or Src1, so just place them arbitrarily.
15151
15152 unsigned ZeroMask = 0x0c0c0c0c;
15153 unsigned FMask = 0xFF << (8 * (3 - Step));
15154
15155 Src0s.push_back(
15156 {*Src0.Src,
15157 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15158 Src0.SrcOffset / 4});
15159 Src1s.push_back(
15160 {*Src1.Src,
15161 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15162 Src1.SrcOffset / 4});
15163}
15164
15166 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
15167 bool IsAny) {
15168
15169 // If we just have one source, just permute it accordingly.
15170 if (Srcs.size() == 1) {
15171 auto *Elt = Srcs.begin();
15172 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
15173
15174 // v_perm will produce the original value
15175 if (Elt->PermMask == 0x3020100)
15176 return EltOp;
15177
15178 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15179 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
15180 }
15181
15182 auto *FirstElt = Srcs.begin();
15183 auto *SecondElt = std::next(FirstElt);
15184
15185 SmallVector<SDValue, 3> Perms;
15186
15187 // If we have multiple sources in the chain, combine them via perms (using
15188 // calculated perm mask) and Ors.
15189 while (true) {
15190 auto FirstMask = FirstElt->PermMask;
15191 auto SecondMask = SecondElt->PermMask;
15192
15193 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15194 unsigned FirstPlusFour = FirstMask | 0x04040404;
15195 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
15196 // original 0x0C.
15197 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
15198
15199 auto PermMask = addPermMasks(FirstMask, SecondMask);
15200 auto FirstVal =
15201 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15202 auto SecondVal =
15203 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
15204
15205 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
15206 SecondVal,
15207 DAG.getConstant(PermMask, SL, MVT::i32)));
15208
15209 FirstElt = std::next(SecondElt);
15210 if (FirstElt == Srcs.end())
15211 break;
15212
15213 SecondElt = std::next(FirstElt);
15214 // If we only have a FirstElt, then just combine that into the cumulative
15215 // source node.
15216 if (SecondElt == Srcs.end()) {
15217 auto EltOp =
15218 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15219
15220 Perms.push_back(
15221 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15222 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
15223 break;
15224 }
15225 }
15226
15227 assert(Perms.size() == 1 || Perms.size() == 2);
15228 return Perms.size() == 2
15229 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
15230 : Perms[0];
15231}
15232
15233static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
15234 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15235 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15236 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15237 EntryMask += ZeroMask;
15238 }
15239}
15240
15241static bool isMul(const SDValue Op) {
15242 auto Opcode = Op.getOpcode();
15243
15244 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
15245 Opcode == AMDGPUISD::MUL_I24);
15246}
15247
15248static std::optional<bool>
15249 checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
15250 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
15251 const SDValue &S1Op, const SelectionDAG &DAG) {
15252 // If both ops are i8s (pre legalize-dag), then the signedness semantics
15253 // of the dot4 are irrelevant.
15254 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
15255 return false;
15256
15257 auto Known0 = DAG.computeKnownBits(S0Op, 0);
15258 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
15259 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
15260 auto Known1 = DAG.computeKnownBits(S1Op, 0);
15261 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
15262 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
15263
15264 assert(!(S0IsUnsigned && S0IsSigned));
15265 assert(!(S1IsUnsigned && S1IsSigned));
15266
15267 // There are 9 possible permutations of
15268 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
15269
15270 // In two permutations, the sign bits are known to be the same for both Ops,
15271 // so simply return Signed / Unsigned corresponding to the MSB
15272
15273 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
15274 return S0IsSigned;
15275
15276 // In another two permutations, the sign bits are known to be opposite. In
15277 // this case return std::nullopt to indicate a bad match.
15278
15279 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
15280 return std::nullopt;
15281
15282 // In the remaining five permutations, we don't know the value of the sign
15283 // bit for at least one Op. Since we have a valid ByteProvider, we know that
15284 // the upper bits must be extension bits. Thus, the only way for the sign
15285 // bit to be unknown is if it was sign extended from an unknown value, or if
15286 // it was any extended. In either case, it is correct to use the signed
15287 // version of the signedness semantics of dot4.
15288
15289 // In two such permutations, we know the sign bit is set for
15290 // one op and unknown for the other. It is okay to use the signed version of
15291 // dot4.
15292 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
15293 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
15294 return true;
15295
15296 // In one such permutation, we don't know either of the sign bits. It is okay
15297 // to use the signed version of dot4.
15298 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
15299 return true;
15300
15301 // In two such permutations, we know the sign bit is unset for
15302 // one op and unknown for the other. Return std::nullopt to indicate a
15303 // bad match.
15304 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
15305 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
15306 return std::nullopt;
15307
15308 llvm_unreachable("Fully covered condition");
15309}
15310
15311SDValue SITargetLowering::performAddCombine(SDNode *N,
15312 DAGCombinerInfo &DCI) const {
15313 SelectionDAG &DAG = DCI.DAG;
15314 EVT VT = N->getValueType(0);
15315 SDLoc SL(N);
15316 SDValue LHS = N->getOperand(0);
15317 SDValue RHS = N->getOperand(1);
15318
15319 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
15320 if (Subtarget->hasMad64_32()) {
15321 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
15322 return Folded;
15323 }
15324 }
15325
15326 if (SDValue V = reassociateScalarOps(N, DAG)) {
15327 return V;
15328 }
15329
15330 if (VT == MVT::i64) {
15331 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15332 return Folded;
15333 }
15334
15335 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
15336 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
15337 SDValue TempNode(N, 0);
15338 std::optional<bool> IsSigned;
15339 SmallVector<DotSrc, 4> Src0s;
15340 SmallVector<DotSrc, 4> Src1s;
15341 SmallVector<SDValue, 4> Src2s;
15342
15343 // Match the v_dot4 tree, while collecting src nodes.
15344 int ChainLength = 0;
15345 for (int I = 0; I < 4; I++) {
15346 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
15347 if (MulIdx == -1)
15348 break;
15349 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
15350 if (!Src0)
15351 break;
15352 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
15353 if (!Src1)
15354 break;
15355
15356 auto IterIsSigned = checkDot4MulSignedness(
15357 TempNode->getOperand(MulIdx), *Src0, *Src1,
15358 TempNode->getOperand(MulIdx)->getOperand(0),
15359 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
15360 if (!IterIsSigned)
15361 break;
15362 if (!IsSigned)
15363 IsSigned = *IterIsSigned;
15364 if (*IterIsSigned != *IsSigned)
15365 break;
15366 placeSources(*Src0, *Src1, Src0s, Src1s, I);
15367 auto AddIdx = 1 - MulIdx;
15368 // Allow the special case where add (add (mul24, 0), mul24) has become
15369 // add (mul24, mul24).
15370 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
15371 Src2s.push_back(TempNode->getOperand(AddIdx));
15372 auto Src0 =
15373 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
15374 if (!Src0)
15375 break;
15376 auto Src1 =
15377 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
15378 if (!Src1)
15379 break;
15380 auto IterIsSigned = checkDot4MulSignedness(
15381 TempNode->getOperand(AddIdx), *Src0, *Src1,
15382 TempNode->getOperand(AddIdx)->getOperand(0),
15383 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
15384 if (!IterIsSigned)
15385 break;
15386 assert(IsSigned);
15387 if (*IterIsSigned != *IsSigned)
15388 break;
15389 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
15390 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
15391 ChainLength = I + 2;
15392 break;
15393 }
15394
15395 TempNode = TempNode->getOperand(AddIdx);
15396 Src2s.push_back(TempNode);
15397 ChainLength = I + 1;
15398 if (TempNode->getNumOperands() < 2)
15399 break;
15400 LHS = TempNode->getOperand(0);
15401 RHS = TempNode->getOperand(1);
15402 }
15403
15404 if (ChainLength < 2)
15405 return SDValue();
15406
15407 // Masks were constructed with the assumption that we would find a chain of
15408 // length 4. If not, then we need to 0 out the MSB bits (via perm mask of
15409 // 0x0c) so they do not affect dot calculation.
15410 if (ChainLength < 4) {
15411 fixMasks(Src0s, ChainLength);
15412 fixMasks(Src1s, ChainLength);
15413 }
15414
15415 SDValue Src0, Src1;
15416
15417 // If we are just using a single source for both, and have permuted the
15418 // bytes consistently, we can just use the sources without permuting
15419 // (commutation).
15420 bool UseOriginalSrc = false;
15421 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
15422 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
15423 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
15424 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
15425 SmallVector<unsigned, 4> SrcBytes;
15426 auto Src0Mask = Src0s.begin()->PermMask;
15427 SrcBytes.push_back(Src0Mask & 0xFF000000);
15428 bool UniqueEntries = true;
15429 for (auto I = 1; I < 4; I++) {
15430 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
15431
15432 if (is_contained(SrcBytes, NextByte)) {
15433 UniqueEntries = false;
15434 break;
15435 }
15436 SrcBytes.push_back(NextByte);
15437 }
15438
15439 if (UniqueEntries) {
15440 UseOriginalSrc = true;
15441
15442 auto *FirstElt = Src0s.begin();
15443 auto FirstEltOp =
15444 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15445
15446 auto *SecondElt = Src1s.begin();
15447 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
15448 SecondElt->DWordOffset);
15449
15450 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
15451 MVT::getIntegerVT(32));
15452 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
15453 MVT::getIntegerVT(32));
15454 }
15455 }
15456
15457 if (!UseOriginalSrc) {
15458 Src0 = resolveSources(DAG, SL, Src0s, false, true);
15459 Src1 = resolveSources(DAG, SL, Src1s, false, true);
15460 }
15461
15462 assert(IsSigned);
15463 SDValue Src2 =
15464 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
15465
15466 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
15467 : Intrinsic::amdgcn_udot4,
15468 SL, MVT::i64);
15469
15470 assert(!VT.isVector());
15471 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
15472 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
15473
15474 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
15475 }
15476
15477 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
15478 return SDValue();
15479
15480 // add x, zext (setcc) => uaddo_carry x, 0, setcc
15481 // add x, sext (setcc) => usubo_carry x, 0, setcc
15482 unsigned Opc = LHS.getOpcode();
15483 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
15484 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
15485 std::swap(RHS, LHS);
15486
15487 Opc = RHS.getOpcode();
15488 switch (Opc) {
15489 default:
15490 break;
15491 case ISD::ZERO_EXTEND:
15492 case ISD::SIGN_EXTEND:
15493 case ISD::ANY_EXTEND: {
15494 auto Cond = RHS.getOperand(0);
15495 // If this won't be a real VOPC output, we would still need to insert an
15496 // extra instruction anyway.
15497 if (!isBoolSGPR(Cond))
15498 break;
15499 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
15500 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
15501 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
15502 return DAG.getNode(Opc, SL, VTList, Args);
15503 }
15504 case ISD::UADDO_CARRY: {
15505 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
15506 if (!isNullConstant(RHS.getOperand(1)))
15507 break;
15508 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
15509 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
15510 }
15511 }
15512 return SDValue();
15513}
15514
15515SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
15516 DAGCombinerInfo &DCI) const {
15517 SelectionDAG &DAG = DCI.DAG;
15518 SDLoc DL(N);
15519 EVT VT = N->getValueType(0);
15520 SDValue N0 = N->getOperand(0);
15521 SDValue N1 = N->getOperand(1);
15522
15523 // The following folds transform PTRADDs into regular arithmetic in cases
15524 // where the PTRADD wouldn't be folded as an immediate offset into memory
15525 // instructions anyway. They are target-specific in that other targets might
15526 // prefer to not lose information about the pointer arithmetic.
15527
15528 // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
15529 // Adapted from DAGCombiner::visitADDLikeCommutative.
15530 SDValue V, K;
15531 if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
15532 SDNodeFlags ShlFlags = N1->getFlags();
15533 // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
15534 // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
15535 // preserved.
15536 SDNodeFlags NewShlFlags =
15537 ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
15538 ? SDNodeFlags(SDNodeFlags::NoSignedWrap)
15539 : SDNodeFlags();
15540 SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
15541 DCI.AddToWorklist(Inner.getNode());
15542 return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
15543 }
15544
15545 // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
15546 // performAddCombine.
15547 if (N1.getOpcode() == ISD::MUL) {
15548 if (Subtarget->hasMad64_32()) {
15549 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
15550 return Folded;
15551 }
15552 }
15553
15554 // If the 32 low bits of the constant are all zero, there is nothing to fold
15555 // into an immediate offset, so it's better to eliminate the unnecessary
15556 // addition for the lower 32 bits than to preserve the PTRADD.
15557 // Analogous to a fold in performAddCombine.
15558 if (VT == MVT::i64) {
15559 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15560 return Folded;
15561 }
15562
15563 if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) {
15564 // Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c) v) with
15565 // global address GA and constant c, such that c can be folded into GA.
15566 SDValue GAValue = N0.getOperand(0);
15567 if (const GlobalAddressSDNode *GA =
15568 dyn_cast<GlobalAddressSDNode>(GAValue)) {
15569 if (DCI.isBeforeLegalizeOps() && isOffsetFoldingLegal(GA)) {
15570 // If both additions in the original were NUW, reassociation preserves
15571 // that.
15572 SDNodeFlags Flags =
15573 (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
15574 SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags);
15575 DCI.AddToWorklist(Inner.getNode());
15576 return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags);
15577 }
15578 }
15579 }
15580
15581 if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
15582 return SDValue();
15583
15584 // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
15585 // y is not, and (add y, z) is used only once.
15586 // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
15587 // z is not, and (add y, z) is used only once.
15588 // The goal is to move constant offsets to the outermost ptradd, to create
15589 // more opportunities to fold offsets into memory instructions.
15590 // Together with the generic combines in DAGCombiner.cpp, this also
15591 // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
15592 //
15593 // This transform is here instead of in the general DAGCombiner as it can
15594 // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
15595 // AArch64's CPA.
15596 SDValue X = N0;
15597 SDValue Y = N1.getOperand(0);
15598 SDValue Z = N1.getOperand(1);
15599 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
15600 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
15601
15602 // If both additions in the original were NUW, reassociation preserves that.
15603 SDNodeFlags ReassocFlags =
15604 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
15605
15606 if (ZIsConstant != YIsConstant) {
15607 if (YIsConstant)
15608 std::swap(Y, Z);
15609 SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
15610 DCI.AddToWorklist(Inner.getNode());
15611 return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags);
15612 }
15613
15614 // If one of Y and Z is constant, they have been handled above. If both were
15615 // constant, the addition would have been folded in SelectionDAG::getNode
15616 // already. This ensures that the generic DAG combines won't undo the
15617 // following reassociation.
15618 assert(!YIsConstant && !ZIsConstant);
15619
15620 if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) {
15621 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
15622 // y are uniform and z isn't.
15623 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
15624 // z are uniform and y isn't.
15625 // The goal is to push uniform operands up in the computation, so that they
15626 // can be handled with scalar operations. We can't use reassociateScalarOps
15627 // for this since it requires two identical commutative operations to
15628 // reassociate.
15629 if (Y->isDivergent())
15630 std::swap(Y, Z);
15631 SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
15632 DCI.AddToWorklist(UniformInner.getNode());
15633 return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
15634 }
15635
15636 return SDValue();
15637}
15638
15639SDValue SITargetLowering::performSubCombine(SDNode *N,
15640 DAGCombinerInfo &DCI) const {
15641 SelectionDAG &DAG = DCI.DAG;
15642 EVT VT = N->getValueType(0);
15643
15644 if (VT == MVT::i64) {
15645 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15646 return Folded;
15647 }
15648
15649 if (VT != MVT::i32)
15650 return SDValue();
15651
15652 SDLoc SL(N);
15653 SDValue LHS = N->getOperand(0);
15654 SDValue RHS = N->getOperand(1);
15655
15656 // sub x, zext (setcc) => usubo_carry x, 0, setcc
15657 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
15658 unsigned Opc = RHS.getOpcode();
15659 switch (Opc) {
15660 default:
15661 break;
15662 case ISD::ZERO_EXTEND:
15663 case ISD::SIGN_EXTEND:
15664 case ISD::ANY_EXTEND: {
15665 auto Cond = RHS.getOperand(0);
15666 // If this won't be a real VOPC output, we would still need to insert an
15667 // extra instruction anyway.
15668 if (!isBoolSGPR(Cond))
15669 break;
15670 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
15671 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
15672 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
15673 return DAG.getNode(Opc, SL, VTList, Args);
15674 }
15675 }
15676
15677 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
15678 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
15679 if (!isNullConstant(LHS.getOperand(1)))
15680 return SDValue();
15681 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
15682 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
15683 }
15684 return SDValue();
15685}
15686
15687SDValue
15688SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
15689 DAGCombinerInfo &DCI) const {
15690
15691 if (N->getValueType(0) != MVT::i32)
15692 return SDValue();
15693
15694 if (!isNullConstant(N->getOperand(1)))
15695 return SDValue();
15696
15697 SelectionDAG &DAG = DCI.DAG;
15698 SDValue LHS = N->getOperand(0);
15699
15700 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
15701 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
15702 unsigned LHSOpc = LHS.getOpcode();
15703 unsigned Opc = N->getOpcode();
15704 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
15705 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
15706 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
15707 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
15708 }
15709 return SDValue();
15710}
15711
15712SDValue SITargetLowering::performFAddCombine(SDNode *N,
15713 DAGCombinerInfo &DCI) const {
15714 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
15715 return SDValue();
15716
15717 SelectionDAG &DAG = DCI.DAG;
15718 EVT VT = N->getValueType(0);
15719
15720 SDLoc SL(N);
15721 SDValue LHS = N->getOperand(0);
15722 SDValue RHS = N->getOperand(1);
15723
15724 // These should really be instruction patterns, but writing patterns with
15725 // source modifiers is a pain.
15726
15727 // fadd (fadd (a, a), b) -> mad 2.0, a, b
15728 if (LHS.getOpcode() == ISD::FADD) {
15729 SDValue A = LHS.getOperand(0);
15730 if (A == LHS.getOperand(1)) {
15731 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
15732 if (FusedOp != 0) {
15733 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
15734 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
15735 }
15736 }
15737 }
15738
15739 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
15740 if (RHS.getOpcode() == ISD::FADD) {
15741 SDValue A = RHS.getOperand(0);
15742 if (A == RHS.getOperand(1)) {
15743 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
15744 if (FusedOp != 0) {
15745 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
15746 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
15747 }
15748 }
15749 }
15750
15751 return SDValue();
15752}
15753
15754SDValue SITargetLowering::performFSubCombine(SDNode *N,
15755 DAGCombinerInfo &DCI) const {
15756 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
15757 return SDValue();
15758
15759 SelectionDAG &DAG = DCI.DAG;
15760 SDLoc SL(N);
15761 EVT VT = N->getValueType(0);
15762 assert(!VT.isVector());
15763
15764 // Try to get the fneg to fold into the source modifier. This undoes generic
15765 // DAG combines and folds them into the mad.
15766 //
15767 // Only do this if we are not trying to support denormals. v_mad_f32 does
15768 // not support denormals ever.
15769 SDValue LHS = N->getOperand(0);
15770 SDValue RHS = N->getOperand(1);
15771 if (LHS.getOpcode() == ISD::FADD) {
15772 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
15773 SDValue A = LHS.getOperand(0);
15774 if (A == LHS.getOperand(1)) {
15775 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
15776 if (FusedOp != 0) {
15777 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
15778 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
15779
15780 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
15781 }
15782 }
15783 }
15784
15785 if (RHS.getOpcode() == ISD::FADD) {
15786 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
15787
15788 SDValue A = RHS.getOperand(0);
15789 if (A == RHS.getOperand(1)) {
15790 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
15791 if (FusedOp != 0) {
15792 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
15793 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
15794 }
15795 }
15796 }
15797
15798 return SDValue();
15799}
15800
15801SDValue SITargetLowering::performFDivCombine(SDNode *N,
15802 DAGCombinerInfo &DCI) const {
15803 SelectionDAG &DAG = DCI.DAG;
15804 SDLoc SL(N);
15805 EVT VT = N->getValueType(0);
15806 if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts())
15807 return SDValue();
15808
15809 SDValue LHS = N->getOperand(0);
15810 SDValue RHS = N->getOperand(1);
15811
15812 SDNodeFlags Flags = N->getFlags();
15813 SDNodeFlags RHSFlags = RHS->getFlags();
15814 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
15815 !RHS->hasOneUse())
15816 return SDValue();
15817
15818 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
15819 bool IsNegative = false;
15820 if (CLHS->isExactlyValue(1.0) ||
15821 (IsNegative = CLHS->isExactlyValue(-1.0))) {
15822 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
15823 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
15824 if (RHS.getOpcode() == ISD::FSQRT) {
15825 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
15826 SDValue Rsq =
15827 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
15828 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
15829 }
15830 }
15831 }
15832
15833 return SDValue();
15834}
15835
15836SDValue SITargetLowering::performFMulCombine(SDNode *N,
15837 DAGCombinerInfo &DCI) const {
15838 SelectionDAG &DAG = DCI.DAG;
15839 EVT VT = N->getValueType(0);
15840 EVT ScalarVT = VT.getScalarType();
15841 EVT IntVT = VT.changeElementType(MVT::i32);
15842
15843 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
15844 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
15845 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
15846 return SDValue();
15847 }
15848
15849 SDValue LHS = N->getOperand(0);
15850 SDValue RHS = N->getOperand(1);
15851
15852 // It is cheaper to realize i32 inline constants than to materialize
15853 // f16 or f64 (or even non-inline f32) values; this is
15854 // possible via ldexp usage, as shown below:
15855 //
15856 // Given : A = 2^a & B = 2^b ; where a and b are integers.
15857 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
15858 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
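// For example (illustrative constants), with A = 8.0 = 2^3 and B = 0.5 = 2^-1:
//   fmul x, (select y, 8.0, 0.5) -> ldexp( x, (select i32 y, 3, -1) )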
15859 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
15860 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
15861 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
15862 if (!TrueNode)
15863 return SDValue();
15864 const ConstantFPSDNode *FalseNode =
15865 isConstOrConstSplatFP(RHS.getOperand(2));
15866 if (!FalseNode)
15867 return SDValue();
15868
15869 if (TrueNode->isNegative() != FalseNode->isNegative())
15870 return SDValue();
15871
15872 // For f32, only non-inline constants should be transformed.
15873 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15874 if (ScalarVT == MVT::f32 &&
15875 TII->isInlineConstant(TrueNode->getValueAPF()) &&
15876 TII->isInlineConstant(FalseNode->getValueAPF()))
15877 return SDValue();
15878
15879 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
15880 if (TrueNodeExpVal == INT_MIN)
15881 return SDValue();
15882 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
15883 if (FalseNodeExpVal == INT_MIN)
15884 return SDValue();
15885
15886 SDLoc SL(N);
15887 SDValue SelectNode =
15888 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
15889 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
15890 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
15891
15892 LHS = TrueNode->isNegative()
15893 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
15894 : LHS;
15895
15896 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
15897 }
15898
15899 return SDValue();
15900}
15901
15902SDValue SITargetLowering::performFMACombine(SDNode *N,
15903 DAGCombinerInfo &DCI) const {
15904 SelectionDAG &DAG = DCI.DAG;
15905 EVT VT = N->getValueType(0);
15906 SDLoc SL(N);
15907
15908 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
15909 return SDValue();
15910
15911 // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
15912 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
15913 SDValue Op1 = N->getOperand(0);
15914 SDValue Op2 = N->getOperand(1);
15915 SDValue FMA = N->getOperand(2);
15916
15917 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
15918 Op2.getOpcode() != ISD::FP_EXTEND)
15919 return SDValue();
15920
15921 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
15922 // regardless of the denorm mode setting. Therefore,
15923 // fp-contract is sufficient to allow generating fdot2.
15924 const TargetOptions &Options = DAG.getTarget().Options;
15925 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
15926 (N->getFlags().hasAllowContract() &&
15927 FMA->getFlags().hasAllowContract())) {
15928 Op1 = Op1.getOperand(0);
15929 Op2 = Op2.getOperand(0);
15930 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15931 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15932 return SDValue();
15933
15934 SDValue Vec1 = Op1.getOperand(0);
15935 SDValue Idx1 = Op1.getOperand(1);
15936 SDValue Vec2 = Op2.getOperand(0);
15937
15938 SDValue FMAOp1 = FMA.getOperand(0);
15939 SDValue FMAOp2 = FMA.getOperand(1);
15940 SDValue FMAAcc = FMA.getOperand(2);
15941
15942 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
15943 FMAOp2.getOpcode() != ISD::FP_EXTEND)
15944 return SDValue();
15945
15946 FMAOp1 = FMAOp1.getOperand(0);
15947 FMAOp2 = FMAOp2.getOperand(0);
15948 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15949 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15950 return SDValue();
15951
15952 SDValue Vec3 = FMAOp1.getOperand(0);
15953 SDValue Vec4 = FMAOp2.getOperand(0);
15954 SDValue Idx2 = FMAOp1.getOperand(1);
15955
15956 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
15957 // Idx1 and Idx2 cannot be the same.
15958 Idx1 == Idx2)
15959 return SDValue();
15960
15961 if (Vec1 == Vec2 || Vec3 == Vec4)
15962 return SDValue();
15963
15964 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
15965 return SDValue();
15966
15967 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
15968 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
15969 DAG.getTargetConstant(0, SL, MVT::i1));
15970 }
15971 }
15972 return SDValue();
15973}
15974
15975SDValue SITargetLowering::performSetCCCombine(SDNode *N,
15976 DAGCombinerInfo &DCI) const {
15977 SelectionDAG &DAG = DCI.DAG;
15978 SDLoc SL(N);
15979
15980 SDValue LHS = N->getOperand(0);
15981 SDValue RHS = N->getOperand(1);
15982 EVT VT = LHS.getValueType();
15983 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
15984
15985 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
15986 if (!CRHS) {
15987 CRHS = dyn_cast<ConstantSDNode>(LHS);
15988 if (CRHS) {
15989 std::swap(LHS, RHS);
15990 CC = getSetCCSwappedOperands(CC);
15991 }
15992 }
15993
15994 if (CRHS) {
15995 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
15996 isBoolSGPR(LHS.getOperand(0))) {
15997 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
15998 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
15999 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
16000 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
16001 if ((CRHS->isAllOnes() &&
16002 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
16003 (CRHS->isZero() &&
16004 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
16005 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16006 DAG.getAllOnesConstant(SL, MVT::i1));
16007 if ((CRHS->isAllOnes() &&
16008 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
16009 (CRHS->isZero() &&
16010 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
16011 return LHS.getOperand(0);
16012 }
16013
16014 const APInt &CRHSVal = CRHS->getAPIntValue();
16015 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
16016 LHS.getOpcode() == ISD::SELECT &&
16017 isa<ConstantSDNode>(LHS.getOperand(1)) &&
16018 isa<ConstantSDNode>(LHS.getOperand(2)) &&
16019 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
16020 isBoolSGPR(LHS.getOperand(0))) {
16021 // Given CT != FT:
16022 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
16023 // setcc (select cc, CT, CF), CF, ne => cc
16024 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
16025 // setcc (select cc, CT, CF), CT, eq => cc
16026 const APInt &CT = LHS.getConstantOperandAPInt(1);
16027 const APInt &CF = LHS.getConstantOperandAPInt(2);
16028
16029 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
16030 (CT == CRHSVal && CC == ISD::SETNE))
16031 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16032 DAG.getAllOnesConstant(SL, MVT::i1));
16033 if ((CF == CRHSVal && CC == ISD::SETNE) ||
16034 (CT == CRHSVal && CC == ISD::SETEQ))
16035 return LHS.getOperand(0);
16036 }
16037 }
16038
16039 if (VT != MVT::f32 && VT != MVT::f64 &&
16040 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16041 return SDValue();
16042
16043 // Match isinf/isfinite pattern
16044 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
16045 // (fcmp one (fabs x), inf) -> (fp_class x,
16046 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
16047 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
16048 LHS.getOpcode() == ISD::FABS) {
16049 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
16050 if (!CRHS)
16051 return SDValue();
16052
16053 const APFloat &APF = CRHS->getValueAPF();
16054 if (APF.isInfinity() && !APF.isNegative()) {
16055 const unsigned IsInfMask =
16056 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
16057 const unsigned IsFiniteMask =
16058 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
16059 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
16060 SIInstrFlags::P_SUBNORMAL;
16061 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
16062 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
16063 DAG.getConstant(Mask, SL, MVT::i32));
16064 }
16065 }
16066
16067 return SDValue();
16068}
16069
16070SDValue
16071SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
16072 DAGCombinerInfo &DCI) const {
16073 SelectionDAG &DAG = DCI.DAG;
16074 SDLoc SL(N);
16075 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
16076
16077 SDValue Src = N->getOperand(0);
16078 SDValue Shift = N->getOperand(0);
16079
16080 // TODO: Extend type shouldn't matter (assuming legal types).
16081 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
16082 Shift = Shift.getOperand(0);
16083
16084 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
16085 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
16086 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
16087 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
16088 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
16089 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
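// Worked example (annotation, not part of the original source): for
// cvt_f32_ubyte1 (srl x, 16), Offset = 1 and the right shift adds 16 bits,
// so ShiftOffset = 8 * 1 + 16 = 24 and the node becomes cvt_f32_ubyte3 x.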
16090 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
16091 SDValue Shifted = DAG.getZExtOrTrunc(
16092 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
16093
16094 unsigned ShiftOffset = 8 * Offset;
16095 if (Shift.getOpcode() == ISD::SHL)
16096 ShiftOffset -= C->getZExtValue();
16097 else
16098 ShiftOffset += C->getZExtValue();
16099
16100 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16101 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
16102 MVT::f32, Shifted);
16103 }
16104 }
16105 }
16106
16107 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16108 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
16109 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
16110 // We simplified Src. If this node is not dead, visit it again so it is
16111 // folded properly.
16112 if (N->getOpcode() != ISD::DELETED_NODE)
16113 DCI.AddToWorklist(N);
16114 return SDValue(N, 0);
16115 }
16116
16117 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
16118 if (SDValue DemandedSrc =
16119 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
16120 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
16121
16122 return SDValue();
16123}
16124
16125SDValue SITargetLowering::performClampCombine(SDNode *N,
16126 DAGCombinerInfo &DCI) const {
16127 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
16128 if (!CSrc)
16129 return SDValue();
16130
16131 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16132 const APFloat &F = CSrc->getValueAPF();
16133 APFloat Zero = APFloat::getZero(F.getSemantics());
16134 if (F < Zero ||
16135 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16136 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
16137 }
16138
16139 APFloat One(F.getSemantics(), "1.0");
16140 if (F > One)
16141 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
16142
16143 return SDValue(CSrc, 0);
16144}
16145
16146SDValue SITargetLowering::performSelectCombine(SDNode *N,
16147 DAGCombinerInfo &DCI) const {
16148
16149 // Try to fold CMP + SELECT patterns with shared constants (both FP and
16150 // integer).
16151 // Detect when CMP and SELECT use the same constant and fold them to avoid
16152 // loading the constant twice. Specifically handles patterns like:
16153 // %cmp = icmp eq i32 %val, 4242
16154 // %sel = select i1 %cmp, i32 4242, i32 %other
16155 // It can be optimized to reuse %val instead of 4242 in select.
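// After the fold (annotation, not part of the original source) the select
// becomes %sel = select i1 %cmp, i32 %val, i32 %other, so the constant 4242
// only needs to be materialized once, for the compare.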
16156 SDValue Cond = N->getOperand(0);
16157 SDValue TrueVal = N->getOperand(1);
16158 SDValue FalseVal = N->getOperand(2);
16159
16160 // Check if condition is a comparison.
16161 if (Cond.getOpcode() != ISD::SETCC)
16162 return SDValue();
16163
16164 SDValue LHS = Cond.getOperand(0);
16165 SDValue RHS = Cond.getOperand(1);
16166 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
16167
16168 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
16169 bool isInteger = LHS.getValueType().isInteger();
16170
16171 // Handle simple floating-point and integer types only.
16172 if (!isFloatingPoint && !isInteger)
16173 return SDValue();
16174
16175 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
16176 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
16177 if (!isEquality && !isNonEquality)
16178 return SDValue();
16179
16180 SDValue ArgVal, ConstVal;
16181 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
16182 (isInteger && isa<ConstantSDNode>(RHS))) {
16183 ConstVal = RHS;
16184 ArgVal = LHS;
16185 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
16186 (isInteger && isa<ConstantSDNode>(LHS))) {
16187 ConstVal = LHS;
16188 ArgVal = RHS;
16189 } else {
16190 return SDValue();
16191 }
16192
16193 // Skip optimization for inlinable immediates.
16194 if (isFloatingPoint) {
16195 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
16196 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
16197 return SDValue();
16198 } else {
16199 if (AMDGPU::isInlinableIntLiteral(
16200 cast<ConstantSDNode>(ConstVal)->getSExtValue()))
16201 return SDValue();
16202 }
16203
16204 // For equality and non-equality comparisons, patterns:
16205 // select (setcc x, const), const, y -> select (setcc x, const), x, y
16206 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
16207 if (!(isEquality && TrueVal == ConstVal) &&
16208 !(isNonEquality && FalseVal == ConstVal))
16209 return SDValue();
16210
16211 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
16212 SDValue SelectRHS =
16213 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
16214 return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
16215 SelectLHS, SelectRHS);
16216}
16217
16219 DAGCombinerInfo &DCI) const {
16220 switch (N->getOpcode()) {
16221 case ISD::ADD:
16222 case ISD::SUB:
16223 case ISD::SHL:
16224 case ISD::SRL:
16225 case ISD::SRA:
16226 case ISD::AND:
16227 case ISD::OR:
16228 case ISD::XOR:
16229 case ISD::MUL:
16230 case ISD::SETCC:
16231 case ISD::SELECT:
16232 case ISD::SMIN:
16233 case ISD::SMAX:
16234 case ISD::UMIN:
16235 case ISD::UMAX:
16236 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
16237 return Res;
16238 break;
16239 default:
16240 break;
16241 }
16242
16243 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
16244 return SDValue();
16245
16246 switch (N->getOpcode()) {
16247 case ISD::ADD:
16248 return performAddCombine(N, DCI);
16249 case ISD::PTRADD:
16250 return performPtrAddCombine(N, DCI);
16251 case ISD::SUB:
16252 return performSubCombine(N, DCI);
16253 case ISD::UADDO_CARRY:
16254 case ISD::USUBO_CARRY:
16255 return performAddCarrySubCarryCombine(N, DCI);
16256 case ISD::FADD:
16257 return performFAddCombine(N, DCI);
16258 case ISD::FSUB:
16259 return performFSubCombine(N, DCI);
16260 case ISD::FDIV:
16261 return performFDivCombine(N, DCI);
16262 case ISD::FMUL:
16263 return performFMulCombine(N, DCI);
16264 case ISD::SETCC:
16265 return performSetCCCombine(N, DCI);
16266 case ISD::SELECT:
16267 if (auto Res = performSelectCombine(N, DCI))
16268 return Res;
16269 break;
16270 case ISD::FMAXNUM:
16271 case ISD::FMINNUM:
16272 case ISD::FMAXNUM_IEEE:
16273 case ISD::FMINNUM_IEEE:
16274 case ISD::FMAXIMUM:
16275 case ISD::FMINIMUM:
16276 case ISD::FMAXIMUMNUM:
16277 case ISD::FMINIMUMNUM:
16278 case ISD::SMAX:
16279 case ISD::SMIN:
16280 case ISD::UMAX:
16281 case ISD::UMIN:
16282 case AMDGPUISD::FMIN_LEGACY:
16283 case AMDGPUISD::FMAX_LEGACY:
16284 return performMinMaxCombine(N, DCI);
16285 case ISD::FMA:
16286 return performFMACombine(N, DCI);
16287 case ISD::AND:
16288 return performAndCombine(N, DCI);
16289 case ISD::OR:
16290 return performOrCombine(N, DCI);
16291 case ISD::FSHR: {
16292 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16293 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
16294 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
16295 return matchPERM(N, DCI);
16296 }
16297 break;
16298 }
16299 case ISD::XOR:
16300 return performXorCombine(N, DCI);
16301 case ISD::ZERO_EXTEND:
16302 return performZeroExtendCombine(N, DCI);
16303 case ISD::SIGN_EXTEND_INREG:
16304 return performSignExtendInRegCombine(N, DCI);
16305 case AMDGPUISD::FP_CLASS:
16306 return performClassCombine(N, DCI);
16307 case ISD::FCANONICALIZE:
16308 return performFCanonicalizeCombine(N, DCI);
16309 case AMDGPUISD::RCP:
16310 return performRcpCombine(N, DCI);
16311 case ISD::FLDEXP:
16312 case AMDGPUISD::FRACT:
16313 case AMDGPUISD::RSQ:
16314 case AMDGPUISD::RCP_LEGACY:
16315 case AMDGPUISD::RCP_IFLAG:
16316 case AMDGPUISD::RSQ_CLAMP: {
16317 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
16318 SDValue Src = N->getOperand(0);
16319 if (Src.isUndef())
16320 return Src;
16321 break;
16322 }
16323 case ISD::SINT_TO_FP:
16324 case ISD::UINT_TO_FP:
16325 return performUCharToFloatCombine(N, DCI);
16326 case ISD::FCOPYSIGN:
16327 return performFCopySignCombine(N, DCI);
16328 case AMDGPUISD::CVT_F32_UBYTE0:
16329 case AMDGPUISD::CVT_F32_UBYTE1:
16330 case AMDGPUISD::CVT_F32_UBYTE2:
16331 case AMDGPUISD::CVT_F32_UBYTE3:
16332 return performCvtF32UByteNCombine(N, DCI);
16333 case AMDGPUISD::FMED3:
16334 return performFMed3Combine(N, DCI);
16335 case AMDGPUISD::CVT_PKRTZ_F16_F32:
16336 return performCvtPkRTZCombine(N, DCI);
16337 case AMDGPUISD::CLAMP:
16338 return performClampCombine(N, DCI);
16339 case ISD::SCALAR_TO_VECTOR: {
16340 SelectionDAG &DAG = DCI.DAG;
16341 EVT VT = N->getValueType(0);
16342
16343 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
16344 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
16345 SDLoc SL(N);
16346 SDValue Src = N->getOperand(0);
16347 EVT EltVT = Src.getValueType();
16348 if (EltVT != MVT::i16)
16349 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
16350
16351 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
16352 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
16353 }
16354
16355 break;
16356 }
16357 case ISD::EXTRACT_VECTOR_ELT:
16358 return performExtractVectorEltCombine(N, DCI);
16359 case ISD::INSERT_VECTOR_ELT:
16360 return performInsertVectorEltCombine(N, DCI);
16361 case ISD::FP_ROUND:
16362 return performFPRoundCombine(N, DCI);
16363 case ISD::LOAD: {
16364 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
16365 return Widened;
16366 [[fallthrough]];
16367 }
16368 default: {
16369 if (!DCI.isBeforeLegalize()) {
16370 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
16371 return performMemSDNodeCombine(MemNode, DCI);
16372 }
16373
16374 break;
16375 }
16376 }
16377
16378 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
16379}
16380
16381/// Helper function for adjustWritemask
16382static unsigned SubIdx2Lane(unsigned Idx) {
16383 switch (Idx) {
16384 default:
16385 return ~0u;
16386 case AMDGPU::sub0:
16387 return 0;
16388 case AMDGPU::sub1:
16389 return 1;
16390 case AMDGPU::sub2:
16391 return 2;
16392 case AMDGPU::sub3:
16393 return 3;
16394 case AMDGPU::sub4:
16395 return 4; // Possible with TFE/LWE
16396 }
16397}
16398
16399/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
16400SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
16401 SelectionDAG &DAG) const {
16402 unsigned Opcode = Node->getMachineOpcode();
16403
16404 // Subtract 1 because the vdata output is not a MachineSDNode operand.
16405 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
16406 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
16407 return Node; // not implemented for D16
16408
16409 SDNode *Users[5] = {nullptr};
16410 unsigned Lane = 0;
16411 unsigned DmaskIdx =
16412 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
16413 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
16414 unsigned NewDmask = 0;
16415 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
16416 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
16417 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
16418 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
16419 unsigned TFCLane = 0;
16420 bool HasChain = Node->getNumValues() > 1;
16421
16422 if (OldDmask == 0) {
16423 // These are folded out, but on the chance it happens don't assert.
16424 return Node;
16425 }
16426
16427 unsigned OldBitsSet = llvm::popcount(OldDmask);
16428 // Work out which is the TFE/LWE lane if that is enabled.
16429 if (UsesTFC) {
16430 TFCLane = OldBitsSet;
16431 }
16432
16433 // Try to figure out the used register components
16434 for (SDUse &Use : Node->uses()) {
16435
16436 // Don't look at users of the chain.
16437 if (Use.getResNo() != 0)
16438 continue;
16439
16440 SDNode *User = Use.getUser();
16441
16442 // Abort if we can't understand the usage
16443 if (!User->isMachineOpcode() ||
16444 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
16445 return Node;
16446
16447 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
16448 // Note that subregs are packed, i.e. Lane==0 is the first bit set
16449 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
16450 // set, etc.
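// Worked example (annotation, not part of the original source): with
// OldDmask = 0b1010 (components Y and W enabled), Lane 0 maps to component
// 1 (Y) and Lane 1 maps to component 3 (W).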
16451 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
16452 if (Lane == ~0u)
16453 return Node;
16454
16455 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
16456 if (UsesTFC && Lane == TFCLane) {
16457 Users[Lane] = User;
16458 } else {
16459 // Set which texture component corresponds to the lane.
16460 unsigned Comp;
16461 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
16462 Comp = llvm::countr_zero(Dmask);
16463 Dmask &= ~(1 << Comp);
16464 }
16465
16466 // Abort if we have more than one user per component.
16467 if (Users[Lane])
16468 return Node;
16469
16470 Users[Lane] = User;
16471 NewDmask |= 1 << Comp;
16472 }
16473 }
16474
16475 // Don't allow 0 dmask, as hardware assumes one channel enabled.
16476 bool NoChannels = !NewDmask;
16477 if (NoChannels) {
16478 if (!UsesTFC) {
16479 // No uses of the result and not using TFC. Then do nothing.
16480 return Node;
16481 }
16482 // If the original dmask has one channel - then nothing to do
16483 if (OldBitsSet == 1)
16484 return Node;
16485 // Use an arbitrary dmask - required for the instruction to work
16486 NewDmask = 1;
16487 }
16488 // Abort if there's no change
16489 if (NewDmask == OldDmask)
16490 return Node;
16491
16492 unsigned BitsSet = llvm::popcount(NewDmask);
16493
16494 // Check for TFE or LWE - increase the number of channels by one to account
16495 // for the extra return value
16496 // This will need adjustment for D16 if this is also included in
16497 // adjustWriteMask (this function) but at present D16 are excluded.
16498 unsigned NewChannels = BitsSet + UsesTFC;
16499
16500 int NewOpcode =
16501 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
16502 assert(NewOpcode != -1 &&
16503 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
16504 "failed to find equivalent MIMG op");
16505
16506 // Adjust the writemask in the node
16507 SmallVector<SDValue, 12> Ops;
16508 llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
16509 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
16510 llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));
16511
16512 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
16513
16514 MVT ResultVT = NewChannels == 1
16515 ? SVT
16516 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
16517 : NewChannels == 5 ? 8
16518 : NewChannels);
16519 SDVTList NewVTList =
16520 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
16521
16522 MachineSDNode *NewNode =
16523 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
16524
16525 if (HasChain) {
16526 // Update chain.
16527 DAG.setNodeMemRefs(NewNode, Node->memoperands());
16528 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
16529 }
16530
16531 if (NewChannels == 1) {
16532 assert(Node->hasNUsesOfValue(1, 0));
16533 SDNode *Copy =
16534 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
16535 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
16536 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
16537 return nullptr;
16538 }
16539
16540 // Update the users of the node with the new indices
16541 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
16542 SDNode *User = Users[i];
16543 if (!User) {
16544 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
16545 // Users[0] is still nullptr because channel 0 doesn't really have a use.
16546 if (i || !NoChannels)
16547 continue;
16548 } else {
16549 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
16550 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
16551 if (NewUser != User) {
16552 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
16553 DAG.RemoveDeadNode(User);
16554 }
16555 }
16556
16557 switch (Idx) {
16558 default:
16559 break;
16560 case AMDGPU::sub0:
16561 Idx = AMDGPU::sub1;
16562 break;
16563 case AMDGPU::sub1:
16564 Idx = AMDGPU::sub2;
16565 break;
16566 case AMDGPU::sub2:
16567 Idx = AMDGPU::sub3;
16568 break;
16569 case AMDGPU::sub3:
16570 Idx = AMDGPU::sub4;
16571 break;
16572 }
16573 }
16574
16575 DAG.RemoveDeadNode(Node);
16576 return nullptr;
16577}
16578
16579static bool isFrameIndexOp(SDValue Op) {
16580 if (Op.getOpcode() == ISD::AssertZext)
16581 Op = Op.getOperand(0);
16582
16583 return isa<FrameIndexSDNode>(Op);
16584}
16585
16586/// Legalize target independent instructions (e.g. INSERT_SUBREG)
16587/// with frame index operands.
16588/// LLVM assumes that inputs to these instructions are registers.
16589SDNode *
16590SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
16591 SelectionDAG &DAG) const {
16592 if (Node->getOpcode() == ISD::CopyToReg) {
16593 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
16594 SDValue SrcVal = Node->getOperand(2);
16595
16596 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
16597 // to try understanding copies to physical registers.
16598 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
16599 SDLoc SL(Node);
16600 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
16601 SDValue VReg = DAG.getRegister(
16602 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
16603
16604 SDNode *Glued = Node->getGluedNode();
16605 SDValue ToVReg = DAG.getCopyToReg(
16606 Node->getOperand(0), SL, VReg, SrcVal,
16607 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
16608 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
16609 VReg, ToVReg.getValue(1));
16610 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
16611 DAG.RemoveDeadNode(Node);
16612 return ToResultReg.getNode();
16613 }
16614 }
16615
16616 SmallVector<SDValue, 8> Ops;
16617 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
16618 if (!isFrameIndexOp(Node->getOperand(i))) {
16619 Ops.push_back(Node->getOperand(i));
16620 continue;
16621 }
16622
16623 SDLoc DL(Node);
16624 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
16625 Node->getOperand(i).getValueType(),
16626 Node->getOperand(i)),
16627 0));
16628 }
16629
16630 return DAG.UpdateNodeOperands(Node, Ops);
16631}
16632
16633/// Fold the instructions after selecting them.
16634/// Returns null if users were already updated.
16635SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
16636 SelectionDAG &DAG) const {
16637 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16638 unsigned Opcode = Node->getMachineOpcode();
16639
16640 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
16641 !TII->isGather4(Opcode) &&
16642 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
16643 return adjustWritemask(Node, DAG);
16644 }
16645
16646 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
16647 legalizeTargetIndependentNode(Node, DAG);
16648 return Node;
16649 }
16650
16651 switch (Opcode) {
16652 case AMDGPU::V_DIV_SCALE_F32_e64:
16653 case AMDGPU::V_DIV_SCALE_F64_e64: {
16654 // Satisfy the operand register constraint when one of the inputs is
16655 // undefined. Ordinarily each undef value will have its own implicit_def of
16656 // a vreg, so force these to use a single register.
16657 SDValue Src0 = Node->getOperand(1);
16658 SDValue Src1 = Node->getOperand(3);
16659 SDValue Src2 = Node->getOperand(5);
16660
16661 if ((Src0.isMachineOpcode() &&
16662 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
16663 (Src0 == Src1 || Src0 == Src2))
16664 break;
16665
16666 MVT VT = Src0.getValueType().getSimpleVT();
16667 const TargetRegisterClass *RC =
16668 getRegClassFor(VT, Src0.getNode()->isDivergent());
16669
16670 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
16671 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
16672
16673 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
16674 Src0, SDValue());
16675
16676 // src0 must be the same register as src1 or src2, even if the value is
16677 // undefined, so make sure we don't violate this constraint.
16678 if (Src0.isMachineOpcode() &&
16679 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
16680 if (Src1.isMachineOpcode() &&
16681 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
16682 Src0 = Src1;
16683 else if (Src2.isMachineOpcode() &&
16684 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
16685 Src0 = Src2;
16686 else {
16687 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
16688 Src0 = UndefReg;
16689 Src1 = UndefReg;
16690 }
16691 } else
16692 break;
16693
16694 SmallVector<SDValue, 9> Ops(Node->ops());
16695 Ops[1] = Src0;
16696 Ops[3] = Src1;
16697 Ops[5] = Src2;
16698 Ops.push_back(ImpDef.getValue(1));
16699 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
16700 }
16701 default:
16702 break;
16703 }
16704
16705 return Node;
16706}
16707
16708// Any MIMG instructions that use tfe or lwe require an initialization of the
16709// result register that will be written in the case of a memory access failure.
16710// The required code is also added to tie this init code to the result of the
16711// img instruction.
16712void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
16713 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16714 const SIRegisterInfo &TRI = TII->getRegisterInfo();
16715 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
16716 MachineBasicBlock &MBB = *MI.getParent();
16717
16718 int DstIdx =
16719 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
16720 unsigned InitIdx = 0;
16721
16722 if (TII->isImage(MI)) {
16723 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
16724 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
16725 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
16726
16727 if (!TFE && !LWE) // intersect_ray
16728 return;
16729
16730 unsigned TFEVal = TFE ? TFE->getImm() : 0;
16731 unsigned LWEVal = LWE ? LWE->getImm() : 0;
16732 unsigned D16Val = D16 ? D16->getImm() : 0;
16733
16734 if (!TFEVal && !LWEVal)
16735 return;
16736
16737 // At least one of TFE or LWE is non-zero
16738 // We have to insert a suitable initialization of the result value and
16739 // tie this to the dest of the image instruction.
16740
16741 // Calculate which dword we have to initialize to 0.
16742 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
16743
16744 // check that dmask operand is found.
16745 assert(MO_Dmask && "Expected dmask operand in instruction");
16746
16747 unsigned dmask = MO_Dmask->getImm();
16748 // Determine the number of active lanes taking into account the
16749 // Gather4 special case
16750 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
16751
16752 bool Packed = !Subtarget->hasUnpackedD16VMem();
16753
16754 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
16755
16756 // Abandon attempt if the dst size isn't large enough
16757 // - this is in fact an error but this is picked up elsewhere and
16758 // reported correctly.
16759 uint32_t DstSize =
16760 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
16761 if (DstSize < InitIdx)
16762 return;
16763 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
16764 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
16765 } else {
16766 return;
16767 }
16768
16769 const DebugLoc &DL = MI.getDebugLoc();
16770
16771 // Create a register for the initialization value.
16772 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
16773 unsigned NewDst = 0; // Final initialized value will be in here
16774
16775 // If PRTStrictNull feature is enabled (the default) then initialize
16776 // all the result registers to 0, otherwise just the error indication
16777 // register (VGPRn+1)
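// Illustrative example (annotation, not part of the original source): with
// InitIdx = 4 and PRTStrictNull enabled, SizeLeft = 4 and CurrIdx = 0, so all
// four result dwords are zero-initialized; without PRTStrictNull only the
// final dword, the TFE/LWE error word, is initialized.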
16778 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
16779 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
16780
16781 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
16782 for (; SizeLeft; SizeLeft--, CurrIdx++) {
16783 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
16784 // Initialize dword
16785 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
16786 // clang-format off
16787 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
16788 .addImm(0);
16789 // clang-format on
16790 // Insert into the super-reg
16791 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
16792 .addReg(PrevDst)
16793 .addReg(SubReg)
16794 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
16795
16796 PrevDst = NewDst;
16797 }
16798
16799 // Add as an implicit operand
16800 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
16801
16802 // Tie the just added implicit operand to the dst
16803 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
16804}
16805
16806/// Assign the register class depending on the number of
16807/// bits set in the writemask
16808void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
16809 SDNode *Node) const {
16810 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16811
16812 MachineFunction *MF = MI.getParent()->getParent();
16813 MachineRegisterInfo &MRI = MF->getRegInfo();
16814 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
16815
16816 if (TII->isVOP3(MI.getOpcode())) {
16817 // Make sure constant bus requirements are respected.
16818 TII->legalizeOperandsVOP3(MRI, MI);
16819
16820 // Prefer VGPRs over AGPRs in mAI instructions where possible.
16821 // This saves a chain-copy of registers and better balance register
16822 // use between vgpr and agpr as agpr tuples tend to be big.
16823 if (!MI.getDesc().operands().empty()) {
16824 unsigned Opc = MI.getOpcode();
16825 bool HasAGPRs = Info->mayNeedAGPRs();
16826 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16827 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
16828 for (auto I :
16829 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
16830 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
16831 if (I == -1)
16832 break;
16833 if ((I == Src2Idx) && (HasAGPRs))
16834 break;
16835 MachineOperand &Op = MI.getOperand(I);
16836 if (!Op.isReg() || !Op.getReg().isVirtual())
16837 continue;
16838 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
16839 if (!TRI->hasAGPRs(RC))
16840 continue;
16841 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
16842 if (!Src || !Src->isCopy() ||
16843 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
16844 continue;
16845 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
16846 // All uses of agpr64 and agpr32 can also accept vgpr except for
16847 // v_accvgpr_read, but we do not produce agpr reads during selection,
16848 // so no use checks are needed.
16849 MRI.setRegClass(Op.getReg(), NewRC);
16850 }
16851
16852 if (TII->isMAI(MI)) {
16853 // The ordinary src0, src1, src2 were legalized above.
16854 //
16855 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
16856 // as a separate instruction.
16857 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
16858 AMDGPU::OpName::scale_src0);
16859 if (Src0Idx != -1) {
16860 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
16861 AMDGPU::OpName::scale_src1);
16862 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
16863 TII->usesConstantBus(MRI, MI, Src1Idx))
16864 TII->legalizeOpWithMove(MI, Src1Idx);
16865 }
16866 }
16867
16868 if (!HasAGPRs)
16869 return;
16870
16871 // Resolve the rest of AV operands to AGPRs.
16872 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
16873 if (Src2->isReg() && Src2->getReg().isVirtual()) {
16874 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
16875 if (TRI->isVectorSuperClass(RC)) {
16876 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
16877 MRI.setRegClass(Src2->getReg(), NewRC);
16878 if (Src2->isTied())
16879 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
16880 }
16881 }
16882 }
16883 }
16884
16885 return;
16886 }
16887
16888 if (TII->isImage(MI))
16889 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
16890}
16891
16892static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
16893 uint64_t Val) {
16894 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
16895 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
16896}
16897
16898MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
16899 const SDLoc &DL,
16900 SDValue Ptr) const {
16901 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16902
16903 // Build the half of the subregister with the constants before building the
16904 // full 128-bit register. If we are building multiple resource descriptors,
16905 // this will allow CSEing of the 2-component register.
16906 const SDValue Ops0[] = {
16907 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
16908 buildSMovImm32(DAG, DL, 0),
16909 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
16910 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
16911 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
16912
16913 SDValue SubRegHi = SDValue(
16914 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
16915
16916 // Combine the constants and the pointer.
16917 const SDValue Ops1[] = {
16918 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
16919 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
16920 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
16921
16922 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
16923}
16924
16925/// Return a resource descriptor with the 'Add TID' bit enabled
16926/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
16927/// of the resource descriptor) to create an offset, which is added to
16928/// the resource pointer.
16929MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
16930 SDValue Ptr, uint32_t RsrcDword1,
16931 uint64_t RsrcDword2And3) const {
16932 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
16933 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
16934 if (RsrcDword1) {
16935 PtrHi =
16936 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
16937 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
16938 0);
16939 }
16940
16941 SDValue DataLo =
16942 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
16943 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
16944
16945 const SDValue Ops[] = {
16946 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
16947 PtrLo,
16948 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
16949 PtrHi,
16950 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
16951 DataLo,
16952 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
16953 DataHi,
16954 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
16955
16956 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
16957}
16958
16959//===----------------------------------------------------------------------===//
16960// SI Inline Assembly Support
16961//===----------------------------------------------------------------------===//
16962
16963std::pair<unsigned, const TargetRegisterClass *>
16964SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
16965 StringRef Constraint,
16966 MVT VT) const {
16967 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
16968
16969 const TargetRegisterClass *RC = nullptr;
16970 if (Constraint.size() == 1) {
16971 // Check if we cannot determine the bit size of the given value type. This
16972 // can happen, for example, in this situation where we have an empty struct
16973 // (size 0): `call void asm "", "v"({} poison)`.
16974 if (VT == MVT::Other)
16975 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
16976 const unsigned BitWidth = VT.getSizeInBits();
16977 switch (Constraint[0]) {
16978 default:
16979 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
16980 case 's':
16981 case 'r':
16982 switch (BitWidth) {
16983 case 16:
16984 RC = &AMDGPU::SReg_32RegClass;
16985 break;
16986 case 64:
16987 RC = &AMDGPU::SGPR_64RegClass;
16988 break;
16989 default:
16990 RC = TRI->getSGPRClassForBitWidth(BitWidth);
16991 if (!RC)
16992 return std::pair(0U, nullptr);
16993 break;
16994 }
16995 break;
16996 case 'v':
16997 switch (BitWidth) {
16998 case 16:
16999 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17000 : &AMDGPU::VGPR_32_Lo256RegClass;
17001 break;
17002 default:
17003 RC = Subtarget->has1024AddressableVGPRs()
17004 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17005 : TRI->getVGPRClassForBitWidth(BitWidth);
17006 if (!RC)
17007 return std::pair(0U, nullptr);
17008 break;
17009 }
17010 break;
17011 case 'a':
17012 if (!Subtarget->hasMAIInsts())
17013 break;
17014 switch (BitWidth) {
17015 case 16:
17016 RC = &AMDGPU::AGPR_32RegClass;
17017 break;
17018 default:
17019 RC = TRI->getAGPRClassForBitWidth(BitWidth);
17020 if (!RC)
17021 return std::pair(0U, nullptr);
17022 break;
17023 }
17024 break;
17025 }
17026 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17027 const unsigned BitWidth = VT.getSizeInBits();
17028 switch (BitWidth) {
17029 case 16:
17030 RC = &AMDGPU::AV_32RegClass;
17031 break;
17032 default:
17033 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17034 if (!RC)
17035 return std::pair(0U, nullptr);
17036 break;
17037 }
17038 }
17039
17040 // We actually support i128, i16 and f16 as inline parameters
17041 // even if they are not reported as legal
17042 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
17043 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
17044 return std::pair(0U, RC);
17045
17046 auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
17047 if (Kind != '\0') {
17048 if (Kind == 'v') {
17049 RC = &AMDGPU::VGPR_32_Lo256RegClass;
17050 } else if (Kind == 's') {
17051 RC = &AMDGPU::SGPR_32RegClass;
17052 } else if (Kind == 'a') {
17053 RC = &AMDGPU::AGPR_32RegClass;
17054 }
17055
17056 if (RC) {
17057 if (NumRegs > 1) {
17058 if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
17059 return std::pair(0U, nullptr);
17060
17061 uint32_t Width = NumRegs * 32;
17062 // Prohibit constraints for register ranges with a width that does not
17063 // match the required type.
17064 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
17065 return std::pair(0U, nullptr);
17066
17067 MCRegister Reg = RC->getRegister(Idx);
17068 if (SIRegisterInfo::isVGPRClass(RC))
17069 RC = TRI->getVGPRClassForBitWidth(Width);
17070 else if (SIRegisterInfo::isSGPRClass(RC))
17071 RC = TRI->getSGPRClassForBitWidth(Width);
17072 else if (SIRegisterInfo::isAGPRClass(RC))
17073 RC = TRI->getAGPRClassForBitWidth(Width);
17074 if (RC) {
17075 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17076 if (!Reg) {
17077 // The register class does not contain the requested register,
17078 // e.g., because it is an SGPR pair that would violate alignment
17079 // requirements.
17080 return std::pair(0U, nullptr);
17081 }
17082 return std::pair(Reg, RC);
17083 }
17084 }
17085
17086 // Check for lossy scalar/vector conversions.
17087 if (VT.isVector() && VT.getSizeInBits() != 32)
17088 return std::pair(0U, nullptr);
17089 if (Idx < RC->getNumRegs())
17090 return std::pair(RC->getRegister(Idx), RC);
17091 return std::pair(0U, nullptr);
17092 }
17093 }
17094
17095 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17096 if (Ret.first)
17097 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
17098
17099 return Ret;
17100}
17101
17102static bool isImmConstraint(StringRef Constraint) {
17103 if (Constraint.size() == 1) {
17104 switch (Constraint[0]) {
17105 default:
17106 break;
17107 case 'I':
17108 case 'J':
17109 case 'A':
17110 case 'B':
17111 case 'C':
17112 return true;
17113 }
17114 } else if (Constraint == "DA" || Constraint == "DB") {
17115 return true;
17116 }
17117 return false;
17118}
17119
17120SITargetLowering::ConstraintType
17121SITargetLowering::getConstraintType(StringRef Constraint) const {
17122 if (Constraint.size() == 1) {
17123 switch (Constraint[0]) {
17124 default:
17125 break;
17126 case 's':
17127 case 'v':
17128 case 'a':
17129 return C_RegisterClass;
17130 }
17131 } else if (Constraint.size() == 2) {
17132 if (Constraint == "VA")
17133 return C_RegisterClass;
17134 }
17135 if (isImmConstraint(Constraint)) {
17136 return C_Other;
17137 }
17138 return TargetLowering::getConstraintType(Constraint);
17139}
17140
17141static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
17143 Val = Val & maskTrailingOnes<uint64_t>(Size);
17144 }
17145 return Val;
17146}
17147
17148void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
17149 StringRef Constraint,
17150 std::vector<SDValue> &Ops,
17151 SelectionDAG &DAG) const {
17152 if (isImmConstraint(Constraint)) {
17153 uint64_t Val;
17154 if (getAsmOperandConstVal(Op, Val) &&
17155 checkAsmConstraintVal(Op, Constraint, Val)) {
17156 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
17157 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
17158 }
17159 } else {
17160 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
17161 }
17162}
17163
17164bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
17165 unsigned Size = Op.getScalarValueSizeInBits();
17166 if (Size > 64)
17167 return false;
17168
17169 if (Size == 16 && !Subtarget->has16BitInsts())
17170 return false;
17171
17172 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
17173 Val = C->getSExtValue();
17174 return true;
17175 }
17176 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
17177 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17178 return true;
17179 }
17180 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
17181 if (Size != 16 || Op.getNumOperands() != 2)
17182 return false;
17183 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17184 return false;
17185 if (ConstantSDNode *C = V->getConstantSplatNode()) {
17186 Val = C->getSExtValue();
17187 return true;
17188 }
17189 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
17190 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17191 return true;
17192 }
17193 }
17194
17195 return false;
17196}
17197
17198bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
17199 uint64_t Val) const {
17200 if (Constraint.size() == 1) {
17201 switch (Constraint[0]) {
17202 case 'I':
17203 return AMDGPU::isInlinableIntLiteral(Val);
17204 case 'J':
17205 return isInt<16>(Val);
17206 case 'A':
17207 return checkAsmConstraintValA(Op, Val);
17208 case 'B':
17209 return isInt<32>(Val);
17210 case 'C':
17211 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
17212 AMDGPU::isInlinableIntLiteral(Val);
17213 default:
17214 break;
17215 }
17216 } else if (Constraint.size() == 2) {
17217 if (Constraint == "DA") {
17218 int64_t HiBits = static_cast<int32_t>(Val >> 32);
17219 int64_t LoBits = static_cast<int32_t>(Val);
17220 return checkAsmConstraintValA(Op, HiBits, 32) &&
17221 checkAsmConstraintValA(Op, LoBits, 32);
17222 }
17223 if (Constraint == "DB") {
17224 return true;
17225 }
17226 }
17227 llvm_unreachable("Invalid asm constraint");
17228}
17229
17230bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
17231 unsigned MaxSize) const {
17232 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
17233 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
17234 if (Size == 16) {
17235 MVT VT = Op.getSimpleValueType();
17236 switch (VT.SimpleTy) {
17237 default:
17238 return false;
17239 case MVT::i16:
17240 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
17241 case MVT::f16:
17242 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
17243 case MVT::bf16:
17244 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
17245 case MVT::v2i16:
17246 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
17247 case MVT::v2f16:
17248 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
17249 case MVT::v2bf16:
17250 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
17251 }
17252 }
17253 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
17254 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
17255 return true;
17256 return false;
17257}
17258
17259static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
17260 switch (UnalignedClassID) {
17261 case AMDGPU::VReg_64RegClassID:
17262 return AMDGPU::VReg_64_Align2RegClassID;
17263 case AMDGPU::VReg_96RegClassID:
17264 return AMDGPU::VReg_96_Align2RegClassID;
17265 case AMDGPU::VReg_128RegClassID:
17266 return AMDGPU::VReg_128_Align2RegClassID;
17267 case AMDGPU::VReg_160RegClassID:
17268 return AMDGPU::VReg_160_Align2RegClassID;
17269 case AMDGPU::VReg_192RegClassID:
17270 return AMDGPU::VReg_192_Align2RegClassID;
17271 case AMDGPU::VReg_224RegClassID:
17272 return AMDGPU::VReg_224_Align2RegClassID;
17273 case AMDGPU::VReg_256RegClassID:
17274 return AMDGPU::VReg_256_Align2RegClassID;
17275 case AMDGPU::VReg_288RegClassID:
17276 return AMDGPU::VReg_288_Align2RegClassID;
17277 case AMDGPU::VReg_320RegClassID:
17278 return AMDGPU::VReg_320_Align2RegClassID;
17279 case AMDGPU::VReg_352RegClassID:
17280 return AMDGPU::VReg_352_Align2RegClassID;
17281 case AMDGPU::VReg_384RegClassID:
17282 return AMDGPU::VReg_384_Align2RegClassID;
17283 case AMDGPU::VReg_512RegClassID:
17284 return AMDGPU::VReg_512_Align2RegClassID;
17285 case AMDGPU::VReg_1024RegClassID:
17286 return AMDGPU::VReg_1024_Align2RegClassID;
17287 case AMDGPU::AReg_64RegClassID:
17288 return AMDGPU::AReg_64_Align2RegClassID;
17289 case AMDGPU::AReg_96RegClassID:
17290 return AMDGPU::AReg_96_Align2RegClassID;
17291 case AMDGPU::AReg_128RegClassID:
17292 return AMDGPU::AReg_128_Align2RegClassID;
17293 case AMDGPU::AReg_160RegClassID:
17294 return AMDGPU::AReg_160_Align2RegClassID;
17295 case AMDGPU::AReg_192RegClassID:
17296 return AMDGPU::AReg_192_Align2RegClassID;
17297 case AMDGPU::AReg_256RegClassID:
17298 return AMDGPU::AReg_256_Align2RegClassID;
17299 case AMDGPU::AReg_512RegClassID:
17300 return AMDGPU::AReg_512_Align2RegClassID;
17301 case AMDGPU::AReg_1024RegClassID:
17302 return AMDGPU::AReg_1024_Align2RegClassID;
17303 default:
17304 return -1;
17305 }
17306}
17307
17308// Figure out which registers should be reserved for stack access. Only after
17309// the function is legalized do we know all of the non-spill stack objects or if
17310// calls are present.
17311void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
17312 MachineRegisterInfo &MRI = MF.getRegInfo();
17313 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
17314 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
17315 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17316 const SIInstrInfo *TII = ST.getInstrInfo();
17317
17318 if (Info->isEntryFunction()) {
17319 // Callable functions have fixed registers used for stack access.
17321 }
17322
17323 // TODO: Move this logic to getReservedRegs()
17324 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
17325 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
17326 Register SReg = ST.isWave32()
17327 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
17328 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
17329 &AMDGPU::SGPR_64RegClass);
17330 Info->setSGPRForEXECCopy(SReg);
17331
17332 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
17333 Info->getStackPtrOffsetReg()));
17334 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
17335 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
17336
17337 // We need to worry about replacing the default register with itself in case
17338 // of MIR testcases missing the MFI.
17339 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
17340 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
17341
17342 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
17343 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
17344
17345 Info->limitOccupancy(MF);
17346
17347 if (ST.isWave32() && !MF.empty()) {
17348 for (auto &MBB : MF) {
17349 for (auto &MI : MBB) {
17350 TII->fixImplicitOperands(MI);
17351 }
17352 }
17353 }
17354
17355 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
17356 // classes if required. Ideally the register class constraints would differ
17357 // per-subtarget, but there's no easy way to achieve that right now. This is
17358 // not a problem for VGPRs because the correctly aligned VGPR class is implied
17359 // from using them as the register class for legal types.
17360 if (ST.needsAlignedVGPRs()) {
17361 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
17362 const Register Reg = Register::index2VirtReg(I);
17363 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
17364 if (!RC)
17365 continue;
17366 int NewClassID = getAlignedAGPRClassID(RC->getID());
17367 if (NewClassID != -1)
17368 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
17369 }
17370 }
17371
17371
17372 TargetLoweringBase::finalizeLowering(MF);
17373}
17374
17375void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
17376 KnownBits &Known,
17377 const APInt &DemandedElts,
17378 const SelectionDAG &DAG,
17379 unsigned Depth) const {
17380 Known.resetAll();
17381 unsigned Opc = Op.getOpcode();
17382 switch (Opc) {
17383 case ISD::INTRINSIC_WO_CHAIN: {
17384 unsigned IID = Op.getConstantOperandVal(0);
17385 switch (IID) {
17386 case Intrinsic::amdgcn_mbcnt_lo:
17387 case Intrinsic::amdgcn_mbcnt_hi: {
17388 const GCNSubtarget &ST =
17389 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
17390 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
17391 // most 31 + src1.
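// Illustrative note (annotation, not part of the original source): on a
// wave64 target mbcnt_lo alone is at most 32, which fits in 6 bits, so bits
// from getWavefrontSizeLog2() == 6 upward are known zero before src1's known
// bits are added in.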
17392 Known.Zero.setBitsFrom(
17393 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
17394 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
17395 Known = KnownBits::add(Known, Known2);
17396 return;
17397 }
17398 }
17399 break;
17400 }
17401 }
17402 return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
17403 Op, Known, DemandedElts, DAG, Depth);
17404}
17405
17406void SITargetLowering::computeKnownBitsForFrameIndex(
17407 const int FI, KnownBits &Known, const MachineFunction &MF) const {
17408 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
17409
17410 // Set the high bits to zero based on the maximum allowed scratch size per
17411 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
17412 // calculation won't overflow, so assume the sign bit is never set.
17413 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
17414}
17415
17416static void knownBitsForWorkitemID(const GCNSubtarget &ST,
17417 GISelValueTracking &VT, KnownBits &Known,
17418 unsigned Dim) {
17419 unsigned MaxValue =
17420 ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
17421 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
17422}
17423
17424static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT,
17425 KnownBits &Known, const APInt &DemandedElts,
17426 unsigned BFEWidth, bool SExt, unsigned Depth) {
17427 const MachineRegisterInfo &MRI = VT.getMachineFunction().getRegInfo();
17428 const MachineOperand &Src1 = MI.getOperand(2);
17429
17430 unsigned Src1Cst = 0;
17431 if (Src1.isImm()) {
17432 Src1Cst = Src1.getImm();
17433 } else if (Src1.isReg()) {
17434 auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
17435 if (!Cst)
17436 return;
17437 Src1Cst = Cst->Value.getZExtValue();
17438 } else {
17439 return;
17440 }
17441
17442 // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
17443 // Width is always [22:16].
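// Worked example (annotation, not part of the original source): for
// S_BFE_U32 with Src1 = 0x0008000A, Offset = 0xA & 0x1F = 10 and
// Width = (0x0008000A >> 16) & 0x3F = 8, so bits [17:10] of src0 are
// extracted and zero-extended.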
17444 const unsigned Offset =
17445 Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
17446 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
17447
17448 if (Width >= BFEWidth) // Ill-formed.
17449 return;
17450
17451 VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
17452 Depth + 1);
17453
17454 Known = Known.extractBits(Width, Offset);
17455
17456 if (SExt)
17457 Known = Known.sext(BFEWidth);
17458 else
17459 Known = Known.zext(BFEWidth);
17460}
17461
17462void SITargetLowering::computeKnownBitsForTargetInstr(
17463 GISelValueTracking &VT, Register R, KnownBits &Known,
17464 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
17465 unsigned Depth) const {
17466 Known.resetAll();
17467 const MachineInstr *MI = MRI.getVRegDef(R);
17468 switch (MI->getOpcode()) {
17469 case AMDGPU::S_BFE_I32:
17470 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
17471 /*SExt=*/true, Depth);
17472 case AMDGPU::S_BFE_U32:
17473 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
17474 /*SExt=*/false, Depth);
17475 case AMDGPU::S_BFE_I64:
17476 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
17477 /*SExt=*/true, Depth);
17478 case AMDGPU::S_BFE_U64:
17479 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
17480 /*SExt=*/false, Depth);
17481 case AMDGPU::G_INTRINSIC:
17482 case AMDGPU::G_INTRINSIC_CONVERGENT: {
17483 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
17484 switch (IID) {
17485 case Intrinsic::amdgcn_workitem_id_x:
17486 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
17487 break;
17488 case Intrinsic::amdgcn_workitem_id_y:
17489 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
17490 break;
17491 case Intrinsic::amdgcn_workitem_id_z:
17492 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
17493 break;
17494 case Intrinsic::amdgcn_mbcnt_lo:
17495 case Intrinsic::amdgcn_mbcnt_hi: {
17496 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
17497 // most 31 + src1.
17498 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
17499 ? getSubtarget()->getWavefrontSizeLog2()
17500 : 5);
17501 KnownBits Known2;
17502 VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
17503 Depth + 1);
17504 Known = KnownBits::add(Known, Known2);
17505 break;
17506 }
17507 case Intrinsic::amdgcn_groupstaticsize: {
17508 // We can report everything over the maximum size as 0. We can't report
17509 // based on the actual size because we don't know if it's accurate or not
17510 // at any given point.
17511 Known.Zero.setHighBits(
17512 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
17513 break;
17514 }
17515 }
17516 break;
17517 }
17518 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
17519 Known.Zero.setHighBits(24);
17520 break;
17521 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
17522 Known.Zero.setHighBits(16);
17523 break;
17524 case AMDGPU::G_AMDGPU_SMED3:
17525 case AMDGPU::G_AMDGPU_UMED3: {
17526 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
17527
17528 KnownBits Known2;
17529 VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
17530 if (Known2.isUnknown())
17531 break;
17532
17533 KnownBits Known1;
17534 VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
17535 if (Known1.isUnknown())
17536 break;
17537
17538 KnownBits Known0;
17539 VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
17540 if (Known0.isUnknown())
17541 break;
17542
17543 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
17544 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
17545 Known.One = Known0.One & Known1.One & Known2.One;
17546 break;
17547 }
17548 }
17549}
17550
17551Align SITargetLowering::computeKnownAlignForTargetInstr(
17552 GISelValueTracking &VT, Register R, const MachineRegisterInfo &MRI,
17553 unsigned Depth) const {
17554 const MachineInstr *MI = MRI.getVRegDef(R);
17555 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
17556 // FIXME: Can this move to generic code? What about the case where the call
17557 // site specifies a lower alignment?
17558 Intrinsic::ID IID = GI->getIntrinsicID();
17559 LLVMContext &Ctx = MI->getMF()->getFunction().getContext();
17560 AttributeList Attrs =
17561 Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
17562 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
17563 return *RetAlign;
17564 }
17565 return Align(1);
17566}
17567
17570 const Align CacheLineAlign = Align(64);
17571
17572 // Pre-GFX10 target did not benefit from loop alignment
17573 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
17574 getSubtarget()->hasInstFwdPrefetchBug())
17575 return PrefAlign;
17576
17577 // On GFX10 I$ is 4 x 64 bytes cache lines.
17578 // By default prefetcher keeps one cache line behind and reads two ahead.
17579 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
17580 // behind and one ahead.
17581 // Therefore we can benefit from aligning loop headers if the loop fits in 192 bytes.
17582 // If the loop fits in 64 bytes it always spans no more than two cache lines and
17583 // does not need an alignment.
17584 // Else if the loop is at most 128 bytes we do not need to modify the prefetch.
17585 // Else if the loop is at most 192 bytes we need two lines behind.
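// Illustrative example (annotation, not part of the original source): a
// 100-byte loop returns CacheLineAlign without touching the prefetch mode,
// while a 160-byte loop additionally gets S_INST_PREFETCH emitted in its
// preheader (two lines behind) and after its exit (back to one line behind).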
17586
17587 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17588 const MachineBasicBlock *Header = ML->getHeader();
17589 if (Header->getAlignment() != PrefAlign)
17590 return Header->getAlignment(); // Already processed.
17591
17592 unsigned LoopSize = 0;
17593 for (const MachineBasicBlock *MBB : ML->blocks()) {
17594 // If an inner loop block is aligned, assume on average half of the alignment
17595 // size is added as nops.
17596 if (MBB != Header)
17597 LoopSize += MBB->getAlignment().value() / 2;
17598
17599 for (const MachineInstr &MI : *MBB) {
17600 LoopSize += TII->getInstSizeInBytes(MI);
17601 if (LoopSize > 192)
17602 return PrefAlign;
17603 }
17604 }
17605
17606 if (LoopSize <= 64)
17607 return PrefAlign;
17608
17609 if (LoopSize <= 128)
17610 return CacheLineAlign;
17611
17612 // If any of parent loops is surrounded by prefetch instructions do not
17613 // insert new for inner loop, which would reset parent's settings.
17614 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
17615 if (MachineBasicBlock *Exit = P->getExitBlock()) {
17616 auto I = Exit->getFirstNonDebugInstr();
17617 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
17618 return CacheLineAlign;
17619 }
17620 }
17621
17622 MachineBasicBlock *Pre = ML->getLoopPreheader();
17623 MachineBasicBlock *Exit = ML->getExitBlock();
17624
17625 if (Pre && Exit) {
17626 auto PreTerm = Pre->getFirstTerminator();
17627 if (PreTerm == Pre->begin() ||
17628 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
17629 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
17630 .addImm(1); // prefetch 2 lines behind PC
17631
17632 auto ExitHead = Exit->getFirstNonDebugInstr();
17633 if (ExitHead == Exit->end() ||
17634 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
17635 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
17636 .addImm(2); // prefetch 1 line behind PC
17637 }
17638
17639 return CacheLineAlign;
17640}
17641
17643static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
17644 assert(N->getOpcode() == ISD::CopyFromReg);
17645 do {
17646 // Follow the chain until we find an INLINEASM node.
17647 N = N->getOperand(0).getNode();
17648 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
17649 return true;
17650 } while (N->getOpcode() == ISD::CopyFromReg);
17651 return false;
17652}
17653
17654bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
17655 FunctionLoweringInfo *FLI,
17656 UniformityInfo *UA) const {
17657 switch (N->getOpcode()) {
17658 case ISD::CopyFromReg: {
17659 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
17660 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
17661 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17662 Register Reg = R->getReg();
17663
17664 // FIXME: Why does this need to consider isLiveIn?
17665 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
17666 return !TRI->isSGPRReg(MRI, Reg);
17667
17668 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
17669 return UA->isDivergent(V);
17670
17672 return !TRI->isSGPRReg(MRI, Reg);
17673 }
17674 case ISD::LOAD: {
17675 const LoadSDNode *L = cast<LoadSDNode>(N);
17676 unsigned AS = L->getAddressSpace();
17677 // A flat load may access private memory.
17678 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
17679 }
17680 case ISD::CALLSEQ_END:
17681 return true;
17682 case ISD::INTRINSIC_WO_CHAIN:
17683 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
17684 case ISD::INTRINSIC_W_CHAIN:
17685 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
17704 // Target-specific read-modify-write atomics are sources of divergence.
17705 return true;
17706 default:
17707 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
17708 // Generic read-modify-write atomics are sources of divergence.
17709 return A->readMem() && A->writeMem();
17710 }
17711 return false;
17712 }
17713}
17714
17715bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
17716 EVT VT) const {
17717 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
17718 case MVT::f32:
17719 return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
17720 case MVT::f64:
17721 case MVT::f16:
17722 return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
17723 default:
17724 return false;
17725 }
17726}
17727
17728bool SITargetLowering::denormalsEnabledForType(
17729 LLT Ty, const MachineFunction &MF) const {
17730 switch (Ty.getScalarSizeInBits()) {
17731 case 32:
17732 return !denormalModeIsFlushAllF32(MF);
17733 case 64:
17734 case 16:
17735 return !denormalModeIsFlushAllF64F16(MF);
17736 default:
17737 return false;
17738 }
17739}
17740
17741bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
17742 const APInt &DemandedElts,
17743 const SelectionDAG &DAG,
17744 bool SNaN,
17745 unsigned Depth) const {
17746 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
17747 const MachineFunction &MF = DAG.getMachineFunction();
17748 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
17749
17750 if (Info->getMode().DX10Clamp)
17751 return true; // Clamped to 0.
17752 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
17753 }
17754
17755 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DemandedElts,
17756 DAG, SNaN, Depth);
17757}
17758
17759// On older subtargets, global FP atomic instructions have a hardcoded FP mode
17760// and do not support FP32 denormals, and only support v2f16/f64 denormals.
17762 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
17763 return true;
17764
17765 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
17766 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
17767 if (DenormMode == DenormalMode::getPreserveSign())
17768 return true;
17769
17770 // TODO: Remove this.
17771 return RMW->getFunction()
17772 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
17773 .getValueAsBool();
17774}
17775
17776static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
17777 LLVMContext &Ctx = RMW->getContext();
17778 StringRef MemScope =
17779 Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");
17780
17781 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
17782 << "Hardware instruction generated for atomic "
17783 << RMW->getOperationName(RMW->getOperation())
17784 << " operation at memory scope " << MemScope;
17785}
17786
17787static bool isV2F16OrV2BF16(Type *Ty) {
17788 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
17789 Type *EltTy = VT->getElementType();
17790 return VT->getNumElements() == 2 &&
17791 (EltTy->isHalfTy() || EltTy->isBFloatTy());
17792 }
17793
17794 return false;
17795}
17796
17797static bool isV2F16(Type *Ty) {
17798 auto *VT = dyn_cast<FixedVectorType>(Ty);
17799 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
17800}
17801
17802static bool isV2BF16(Type *Ty) {
17803 auto *VT = dyn_cast<FixedVectorType>(Ty);
17804 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
17805}
17806
17807/// \return true if atomicrmw integer ops work for the type.
17808static bool isAtomicRMWLegalIntTy(Type *Ty) {
17809 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
17810 unsigned BW = IT->getBitWidth();
17811 return BW == 32 || BW == 64;
17812 }
17813
17814 return false;
17815}
17816
17817/// \return true if this atomicrmw xchg type can be selected.
17818static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
17819 Type *Ty = RMW->getType();
17820 if (isAtomicRMWLegalIntTy(Ty))
17821 return true;
17822
17823 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
17824 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
17825 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
17826 return BW == 32 || BW == 64;
17827 }
17828
17829 if (Ty->isFloatTy() || Ty->isDoubleTy())
17830 return true;
17831
17832 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
17833 return VT->getNumElements() == 2 &&
17834 VT->getElementType()->getPrimitiveSizeInBits() == 16;
17835 }
17836
17837 return false;
17838}
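// Editor's sketch (not upstream source): examples of xchg operands accepted by
// the predicate above:
//   atomicrmw xchg ptr addrspace(1) %p, i32 %v monotonic         ; legal int width
//   atomicrmw xchg ptr addrspace(1) %p, ptr %q monotonic         ; 64-bit pointer
//   atomicrmw xchg ptr addrspace(1) %p, <2 x half> %v monotonic  ; 2 x 16-bit
// An i16 or i128 xchg fails these checks and is expanded via cmpxchg instead.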
17839
17840/// \returns true if it's valid to emit a native instruction for \p RMW, based
17841/// on the properties of the target memory.
17842static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
17843 const AtomicRMWInst *RMW,
17844 bool HasSystemScope) {
17845 // The remote/fine-grained access logic is different from the integer
17846 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
17847 // fine-grained access does not work, even for a device local allocation.
17848 //
17849 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
17850 // allocations work.
17851 if (HasSystemScope) {
17852 if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
17853 RMW->hasMetadata("amdgpu.no.remote.memory"))
17854 return true;
17855 if (Subtarget.hasEmulatedSystemScopeAtomics())
17856 return true;
17857 } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
17858 return true;
17859
17860 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
17861}
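// Editor's sketch (not upstream source): frontends opt into native FP atomics by
// attaching the metadata tested above, e.g.
//   %r = atomicrmw fadd ptr addrspace(1) %p, float %v syncscope("agent")
//        monotonic, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
//   !0 = !{}
// Without such annotations (or the relevant subtarget features), the FP atomic is
// expanded to a cmpxchg loop rather than selected as a hardware instruction.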
17862
17863/// \return Action to perform on AtomicRMWInsts for integer operations.
17864static TargetLowering::AtomicExpansionKind
17865atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
17866 return isAtomicRMWLegalIntTy(RMW->getType())
17867 ? TargetLowering::AtomicExpansionKind::None
17868 : TargetLowering::AtomicExpansionKind::CmpXChg;
17869}
17870
17871/// Return if a flat address space atomicrmw can access private memory.
17872static bool flatInstrMayAccessPrivate(const Instruction *I) {
17873 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
17874 return !MD ||
17876}
17877
17885
17886TargetLowering::AtomicExpansionKind
17887SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
17888 unsigned AS = RMW->getPointerAddressSpace();
17889 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
17890 return getPrivateAtomicExpansionKind(*Subtarget);
17891
17892 // 64-bit flat atomics that dynamically reside in private memory will silently
17893 // be dropped.
17894 //
17895 // Note that we will emit a new copy of the original atomic in the expansion,
17896 // which will be incrementally relegalized.
17897 const DataLayout &DL = RMW->getFunction()->getDataLayout();
17898 if (AS == AMDGPUAS::FLAT_ADDRESS &&
17899 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
17900 flatInstrMayAccessPrivate(RMW))
17901 return AtomicExpansionKind::CustomExpand;
17902
17903 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
17904 OptimizationRemarkEmitter ORE(RMW->getFunction());
17905 ORE.emit([=]() {
17906 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
17907 });
17908 return Kind;
17909 };
17910
17911 auto SSID = RMW->getSyncScopeID();
17912 bool HasSystemScope =
17913 SSID == SyncScope::System ||
17914 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
17915
17916 auto Op = RMW->getOperation();
17917 switch (Op) {
17918 case AtomicRMWInst::Xchg:
17919 // PCIe supports add and xchg for system atomics.
17920 return isAtomicRMWLegalXChgTy(RMW)
17921 ? TargetLowering::AtomicExpansionKind::None
17922 : TargetLowering::AtomicExpansionKind::CmpXChg;
17923 case AtomicRMWInst::Add:
17924 // PCIe supports add and xchg for system atomics.
17925 return atomicSupportedIfLegalIntType(RMW);
17926 case AtomicRMWInst::Sub:
17927 case AtomicRMWInst::And:
17928 case AtomicRMWInst::Or:
17929 case AtomicRMWInst::Xor:
17930 case AtomicRMWInst::Max:
17931 case AtomicRMWInst::Min:
17932 case AtomicRMWInst::UMax:
17933 case AtomicRMWInst::UMin:
17934 case AtomicRMWInst::UIncWrap:
17935 case AtomicRMWInst::UDecWrap: {
17936 if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
17937 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
17938 if (Subtarget->hasEmulatedSystemScopeAtomics())
17939 return atomicSupportedIfLegalIntType(RMW);
17940
17941 // On most subtargets, for atomicrmw operations other than add/xchg,
17942 // whether or not the instructions will behave correctly depends on where
17943 // the address physically resides and what interconnect is used in the
17944 // system configuration. On some targets the instruction will nop,
17945 // and in others synchronization will only occur at degraded device scope.
17946 //
17947 // If the allocation is known local to the device, the instructions should
17948 // work correctly.
17949 if (RMW->hasMetadata("amdgpu.no.remote.memory"))
17950 return atomicSupportedIfLegalIntType(RMW);
17951
17952 // If fine-grained remote memory works at device scope, we don't need to
17953 // do anything.
17954 if (!HasSystemScope &&
17955 Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
17956 return atomicSupportedIfLegalIntType(RMW);
17957
17958 // If we are targeting a remote allocated address, it depends what kind of
17959 // allocation the address belongs to.
17960 //
17961 // If the allocation is fine-grained (in host memory, or in PCIe peer
17962 // device memory), the operation will fail depending on the target.
17963 //
17964 // Note fine-grained host memory access does work on APUs or if XGMI is
17965 // used, but we do not know if we are targeting an APU or the system
17966 // configuration from the ISA version/target-cpu.
17967 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
17968 return atomicSupportedIfLegalIntType(RMW);
17969
17970 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
17971 Op == AtomicRMWInst::Xor) {
17972 // Atomic sub/or/xor do not work over PCI express, but atomic add
17973 // does. InstCombine transforms these with 0 to or, so undo that.
17974 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
17975 ConstVal && ConstVal->isNullValue())
17976 return AtomicExpansionKind::Expand;
17977 }
17978
17979 // If the allocation could be in remote, fine-grained memory, the rmw
17980 // instructions may fail. cmpxchg should work, so emit that. On some
17981 // system configurations, PCIe atomics aren't supported so cmpxchg won't
17982 // even work, so you're out of luck anyway.
17983
17984 // In summary:
17985 //
17986 // Cases that may fail:
17987 // - fine-grained pinned host memory
17988 // - fine-grained migratable host memory
17989 // - fine-grained PCIe peer device
17990 //
17991 // Cases that should work, but may be treated overly conservatively.
17992 // - fine-grained host memory on an APU
17993 // - fine-grained XGMI peer device
17995 }
17996
17997 return atomicSupportedIfLegalIntType(RMW);
17998 }
17999 case AtomicRMWInst::FAdd: {
18000 Type *Ty = RMW->getType();
18001
18002 // TODO: Handle REGION_ADDRESS
18003 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18004 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
18005 // is fixed to round-to-nearest-even.
18006 //
18007 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
18008 // round-to-nearest-even.
18009 //
18010 // We ignore the rounding mode problem, even in strictfp. The C++ standard
18011 // suggests it is OK if the floating-point mode may not match the calling
18012 // thread.
18013 if (Ty->isFloatTy()) {
18014 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
18015 : AtomicExpansionKind::CmpXChg;
18016 }
18017
18018 if (Ty->isDoubleTy()) {
18019 // Ignores denormal mode, but we don't consider flushing mandatory.
18020 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
18021 : AtomicExpansionKind::CmpXChg;
18022 }
18023
18024 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18025 return AtomicExpansionKind::None;
18026
18027 return AtomicExpansionKind::CmpXChg;
18028 }
18029
18030 // LDS atomics respect the denormal mode from the mode register.
18031 //
18032 // Traditionally f32 global/buffer memory atomics would unconditionally
18033 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
18034 // flush.
18035 //
18036 // On targets with flat atomic fadd, denormals would flush depending on
18037 // whether the target address resides in LDS or global memory. We consider
18038 // this flat-maybe-flush as will-flush.
18039 if (Ty->isFloatTy() &&
18040 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18041 !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
18042 return AtomicExpansionKind::CmpXChg;
18043
18044 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
18045 // safe. The message phrasing also should be better.
18046 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18047 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18048 // gfx942, gfx12
18049 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18050 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18051 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
18052 // gfx90a, gfx942, gfx12
18053 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18054 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18055
18056 // gfx942, gfx12
18057 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18058 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18059 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
18060 // gfx90a, gfx942, gfx12
18061 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18062 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18063
18064 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
18065 // buffer. gfx12 does have the buffer version.
18066 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18067 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18068 }
18069
18070 // global and flat atomic fadd f64: gfx90a, gfx942.
18071 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18072 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18073
18074 if (AS != AMDGPUAS::FLAT_ADDRESS) {
18075 if (Ty->isFloatTy()) {
18076 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
18077 // gfx11+.
18078 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18079 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18080 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
18081 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18082 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18083 } else {
18084 // gfx908
18085 if (RMW->use_empty() &&
18086 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18087 isV2F16(Ty))
18088 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18089 }
18090 }
18091
18092 // flat atomic fadd f32: gfx942, gfx11+.
18093 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
18094 if (Subtarget->hasFlatAtomicFaddF32Inst())
18095 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18096
18097 // If it is in flat address space, and the type is float, we will try to
18098 // expand it, if the target supports global and lds atomic fadd. The
18099 // reason we need that is, in the expansion, we emit the check of
18100 // address space. If it is in global address space, we emit the global
18101 // atomic fadd; if it is in shared address space, we emit the LDS atomic
18102 // fadd.
18103 if (Subtarget->hasLDSFPAtomicAddF32()) {
18104 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18105 return AtomicExpansionKind::CustomExpand;
18106 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18107 return AtomicExpansionKind::CustomExpand;
18108 }
18109 }
18110 }
18111
18112 return AtomicExpansionKind::CmpXChg;
18113 }
18114 case AtomicRMWInst::FMin:
18115 case AtomicRMWInst::FMax: {
18116 Type *Ty = RMW->getType();
18117
18118 // LDS float and double fmin/fmax were always supported.
18119 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18120 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
18121 : AtomicExpansionKind::CmpXChg;
18122 }
18123
18124 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18125 // For flat and global cases:
18126 // float, double in gfx7. Manual claims denormal support.
18127 // Removed in gfx8.
18128 // float, double restored in gfx10.
18129 // double removed again in gfx11, so only f32 for gfx11/gfx12.
18130 //
18131 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
18132 // no f32.
18133 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18134 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18135 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18136 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18137 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18138 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
18139 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
18140 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18141 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18142 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18143 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18144 }
18145 }
18146
18147 return AtomicExpansionKind::CmpXChg;
18148 }
18149 case AtomicRMWInst::Nand:
18150 case AtomicRMWInst::FSub:
18151 default:
18152 return AtomicExpansionKind::CmpXChg;
18153 }
18154
18155 llvm_unreachable("covered atomicrmw op switch");
18156}
18157
18164
18171
18172TargetLowering::AtomicExpansionKind
18173SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
18174 unsigned AddrSpace = CmpX->getPointerAddressSpace();
18175 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
18176 return getPrivateAtomicExpansionKind(*Subtarget);
18177
18178 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
18179 return AtomicExpansionKind::None;
18180
18181 const DataLayout &DL = CmpX->getDataLayout();
18182
18183 Type *ValTy = CmpX->getNewValOperand()->getType();
18184
18185 // If a 64-bit flat atomic may alias private, we need to avoid using the
18186 // atomic in the private case.
18187 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
18188 : AtomicExpansionKind::None;
18189}
18190
18191const TargetRegisterClass *
18192SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
18193 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
18194 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18195 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18196 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18197 : &AMDGPU::SReg_32RegClass;
18198 if (!TRI->isSGPRClass(RC) && !isDivergent)
18199 return TRI->getEquivalentSGPRClass(RC);
18200 if (TRI->isSGPRClass(RC) && isDivergent)
18201 return TRI->getEquivalentVGPRClass(RC);
18202
18203 return RC;
18204}
18205
18206// FIXME: This is a workaround for DivergenceAnalysis not understanding always
18207// uniform values (as produced by the mask results of control flow intrinsics)
18208// used outside of divergent blocks. The phi users need to also be treated as
18209// always uniform.
18210//
18211// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
18212static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
18213 unsigned WaveSize) {
18214 // FIXME: We assume we never cast the mask results of a control flow
18215 // intrinsic.
18216 // Early exit if the type won't be consistent as a compile time hack.
18217 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
18218 if (!IT || IT->getBitWidth() != WaveSize)
18219 return false;
18220
18221 if (!isa<Instruction>(V))
18222 return false;
18223 if (!Visited.insert(V).second)
18224 return false;
18225 bool Result = false;
18226 for (const auto *U : V->users()) {
18227 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
18228 if (V == U->getOperand(1)) {
18229 switch (Intrinsic->getIntrinsicID()) {
18230 default:
18231 Result = false;
18232 break;
18233 case Intrinsic::amdgcn_if_break:
18234 case Intrinsic::amdgcn_if:
18235 case Intrinsic::amdgcn_else:
18236 Result = true;
18237 break;
18238 }
18239 }
18240 if (V == U->getOperand(0)) {
18241 switch (Intrinsic->getIntrinsicID()) {
18242 default:
18243 Result = false;
18244 break;
18245 case Intrinsic::amdgcn_end_cf:
18246 case Intrinsic::amdgcn_loop:
18247 Result = true;
18248 break;
18249 }
18250 }
18251 } else {
18252 Result = hasCFUser(U, Visited, WaveSize);
18253 }
18254 if (Result)
18255 break;
18256 }
18257 return Result;
18258}
18259
18260bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
18261 const Value *V) const {
18262 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
18263 if (CI->isInlineAsm()) {
18264 // FIXME: This cannot give a correct answer. This should only trigger in
18265 // the case where inline asm returns mixed SGPR and VGPR results, used
18266 // outside the defining block. We don't have a specific result to
18267 // consider, so this assumes if any value is SGPR, the overall register
18268 // also needs to be SGPR.
18269 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
18270 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
18271 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
18272 for (auto &TC : TargetConstraints) {
18273 if (TC.Type == InlineAsm::isOutput) {
18274 ComputeConstraintToUse(TC, SDValue());
18275 const TargetRegisterClass *RC =
18276 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
18277 TC.ConstraintVT)
18278 .second;
18279 if (RC && SIRI->isSGPRClass(RC))
18280 return true;
18281 }
18282 }
18283 }
18284 }
18285 SmallPtrSet<const Value *, 16> Visited;
18286 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
18287}
18288
18289bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
18290 for (SDUse &Use : N->uses()) {
18291 if (MemSDNode *M = dyn_cast<MemSDNode>(Use.getUser())) {
18292 if (getBasePtrIndex(M) == Use.getOperandNo())
18293 return true;
18294 }
18295 }
18296 return false;
18297}
18298
18299bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
18300 SDValue N1) const {
18301 if (!N0.hasOneUse())
18302 return false;
18303 // Take care of the opportunity to keep N0 uniform
18304 if (N0->isDivergent() || !N1->isDivergent())
18305 return true;
18306 // Check if we have a good chance to form the memory access pattern with the
18307 // base and offset
18308 return (DAG.isBaseWithConstantOffset(N0) &&
18309 hasMemSDNodeUser(*N0->user_begin()));
18310}
18311
18312bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
18313 Register N0, Register N1) const {
18314 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
18315}
18316
18317MachineMemOperand::Flags
18318SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
18319 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
18320 MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
18321 if (I.getMetadata("amdgpu.noclobber"))
18322 Flags |= MONoClobber;
18323 if (I.getMetadata("amdgpu.last.use"))
18324 Flags |= MOLastUse;
18325 return Flags;
18326}
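// Editor's sketch (not upstream source): the flags above originate from IR such as
//   %v = load i32, ptr addrspace(1) %p, align 4, !amdgpu.noclobber !0
// which AMDGPUAnnotateUniformValues attaches to loads it proves are uniform and
// not clobbered, allowing them to be selected as scalar (SMEM) loads;
// !amdgpu.last.use is similarly propagated to request last-use cache behavior
// where the subtarget supports it.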
18327
18328bool SITargetLowering::checkForPhysRegDependency(
18329 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
18330 const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const {
18331 if (User->getOpcode() != ISD::CopyToReg)
18332 return false;
18333 if (!Def->isMachineOpcode())
18334 return false;
18335 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
18336 if (!MDef)
18337 return false;
18338
18339 unsigned ResNo = User->getOperand(Op).getResNo();
18340 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
18341 return false;
18342 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
18343 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
18344 PhysReg = AMDGPU::SCC;
18345 const TargetRegisterClass *RC =
18346 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
18347 Cost = RC->getCopyCost();
18348 return true;
18349 }
18350 return false;
18351}
18352
18353void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
18354 Instruction *AI) const {
18355 // Given: atomicrmw fadd ptr %addr, float %val ordering
18356 //
18357 // With this expansion we produce the following code:
18358 // [...]
18359 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
18360 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
18361 //
18362 // atomicrmw.shared:
18363 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
18364 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
18365 // float %val ordering
18366 // br label %atomicrmw.phi
18367 //
18368 // atomicrmw.check.private:
18369 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
18370 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
18371 //
18372 // atomicrmw.private:
18373 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
18374 // %loaded.private = load float, ptr addrspace(5) %cast.private
18375 // %val.new = fadd float %loaded.private, %val
18376 // store float %val.new, ptr addrspace(5) %cast.private
18377 // br label %atomicrmw.phi
18378 //
18379 // atomicrmw.global:
18380 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
18381 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
18382 // float %val ordering
18383 // br label %atomicrmw.phi
18384 //
18385 // atomicrmw.phi:
18386 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
18387 // [ %loaded.private, %atomicrmw.private ],
18388 // [ %loaded.global, %atomicrmw.global ]
18389 // br label %atomicrmw.end
18390 //
18391 // atomicrmw.end:
18392 // [...]
18393 //
18394 //
18395 // For 64-bit atomics which may reside in private memory, we perform a simpler
18396 // version that only inserts the private check, and uses the flat operation.
18397
18398 IRBuilder<> Builder(AI);
18399 LLVMContext &Ctx = Builder.getContext();
18400
18401 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
18402 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
18403 : AtomicCmpXchgInst::getPointerOperandIndex();
18404 Value *Addr = AI->getOperand(PtrOpIdx);
18405
18406 /// TODO: Only need to check private, then emit flat-known-not private (no
18407 /// need for shared block, or cast to global).
18408 auto *CX = dyn_cast<AtomicCmpXchgInst>(AI);
18409
18410 Align Alignment;
18411 if (RMW)
18412 Alignment = RMW->getAlign();
18413 else if (CX)
18414 Alignment = CX->getAlign();
18415 else
18416 llvm_unreachable("unhandled atomic operation");
18417
18418 // FullFlatEmulation is true if we need to issue the private, shared, and
18419 // global cases.
18420 //
18421 // If this is false, we are only dealing with the flat-targeting-private case,
18422 // where we only insert a check for private and still use the flat instruction
18423 // for global and shared.
18424
18425 bool FullFlatEmulation =
18426 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
18427 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
18428 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
18429 RMW->getType()->isDoubleTy()));
18430
18431 // If the return value isn't used, do not introduce a false use in the phi.
18432 bool ReturnValueIsUsed = !AI->use_empty();
18433
18434 BasicBlock *BB = Builder.GetInsertBlock();
18435 Function *F = BB->getParent();
18436 BasicBlock *ExitBB =
18437 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
18438 BasicBlock *SharedBB = nullptr;
18439
18440 BasicBlock *CheckPrivateBB = BB;
18441 if (FullFlatEmulation) {
18442 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
18443 CheckPrivateBB =
18444 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
18445 }
18446
18447 BasicBlock *PrivateBB =
18448 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
18449 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
18450 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
18451
18452 std::prev(BB->end())->eraseFromParent();
18453 Builder.SetInsertPoint(BB);
18454
18455 Value *LoadedShared = nullptr;
18456 if (FullFlatEmulation) {
18457 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
18458 {Addr}, nullptr, "is.shared");
18459 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
18460 Builder.SetInsertPoint(SharedBB);
18461 Value *CastToLocal = Builder.CreateAddrSpaceCast(
18462 Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
18463
18464 Instruction *Clone = AI->clone();
18465 Clone->insertInto(SharedBB, SharedBB->end());
18466 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
18467 LoadedShared = Clone;
18468
18469 Builder.CreateBr(PhiBB);
18470 Builder.SetInsertPoint(CheckPrivateBB);
18471 }
18472
18473 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
18474 {Addr}, nullptr, "is.private");
18475 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
18476
18477 Builder.SetInsertPoint(PrivateBB);
18478
18479 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
18480 Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
18481
18482 Value *LoadedPrivate;
18483 if (RMW) {
18484 LoadedPrivate = Builder.CreateAlignedLoad(
18485 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
18486
18487 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
18488 LoadedPrivate, RMW->getValOperand());
18489
18490 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
18491 } else {
18492 auto [ResultLoad, Equal] =
18493 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
18494 CX->getNewValOperand(), CX->getAlign());
18495
18496 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
18497 ResultLoad, 0);
18498 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
18499 }
18500
18501 Builder.CreateBr(PhiBB);
18502
18503 Builder.SetInsertPoint(GlobalBB);
18504
18505 // Continue using a flat instruction if we only emitted the check for private.
18506 Instruction *LoadedGlobal = AI;
18507 if (FullFlatEmulation) {
18508 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
18509 Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
18510 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
18511 }
18512
18513 AI->removeFromParent();
18514 AI->insertInto(GlobalBB, GlobalBB->end());
18515
18516 // The new atomicrmw may go through another round of legalization later.
18517 if (!FullFlatEmulation) {
18518 // We inserted the runtime check already, make sure we do not try to
18519 // re-expand this.
18520 // TODO: Should union with any existing metadata.
18521 MDBuilder MDB(F->getContext());
18522 MDNode *RangeNotPrivate =
18523 MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
18524 APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
18525 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
18526 RangeNotPrivate);
18527 }
18528
18529 Builder.CreateBr(PhiBB);
18530
18531 Builder.SetInsertPoint(PhiBB);
18532
18533 if (ReturnValueIsUsed) {
18534 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
18535 AI->replaceAllUsesWith(Loaded);
18536 if (FullFlatEmulation)
18537 Loaded->addIncoming(LoadedShared, SharedBB);
18538 Loaded->addIncoming(LoadedPrivate, PrivateBB);
18539 Loaded->addIncoming(LoadedGlobal, GlobalBB);
18540 Loaded->takeName(AI);
18541 }
18542
18543 Builder.CreateBr(ExitBB);
18544}
18545
18546static void convertScratchAtomicToFlatAtomic(Instruction *I,
18547 unsigned PtrOpIdx) {
18548 Value *PtrOp = I->getOperand(PtrOpIdx);
18551
18552 Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
18553 Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
18554 I->getIterator());
18555 I->setOperand(PtrOpIdx, ASCast);
18556}
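// Editor's sketch (not upstream source): the helper above rewrites, for example,
//   %old = atomicrmw add ptr addrspace(5) %p, i32 1 monotonic
// into
//   %scratch.ascast = addrspacecast ptr addrspace(5) %p to ptr
//   %old = atomicrmw add ptr %scratch.ascast, i32 1 monotonic
// so that the operation can be issued as a flat atomic on subtargets where
// scratch is globally addressable.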
18557
18558void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
18559 AtomicRMWInst::BinOp Op = AI->getOperation();
18560
18563
18564 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
18565 Op == AtomicRMWInst::Xor) {
18566 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
18567 ConstVal && ConstVal->isNullValue()) {
18568 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
18569 AI->setOperation(AtomicRMWInst::Add);
18570
18571 // We may still need the private-alias-flat handling below.
18572
18573 // TODO: Skip this for cases where we cannot access remote memory.
18574 }
18575 }
18576
18577 // The non-flat expansions should only perform the de-canonicalization of
18578 // identity values.
18579 if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
18580 return;
18581
18582 emitExpandAtomicAddrSpacePredicate(AI);
18583}
18584
18591
18595
18597 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
18598}
18599
18600void SITargetLowering::emitExpandAtomicStore(StoreInst *SI) const {
18601 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
18602 return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
18603
18605 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
18606}
18607
18608LoadInst *
18609SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
18610 IRBuilder<> Builder(AI);
18611 auto Order = AI->getOrdering();
18612
18613 // The optimization removes the store aspect of the atomicrmw. Therefore, the
18614 // cache must be flushed if the atomic ordering had release semantics. This is
18615 // not necessarily a fence; a release fence just happens to do that flush.
18616 // Avoid replacing an atomicrmw that has release semantics.
18617 if (isReleaseOrStronger(Order))
18618 return nullptr;
18619
18620 LoadInst *LI = Builder.CreateAlignedLoad(
18621 AI->getType(), AI->getPointerOperand(), AI->getAlign());
18622 LI->setAtomic(Order, AI->getSyncScopeID());
18623 LI->copyMetadata(*AI);
18624 LI->takeName(AI);
18625 AI->replaceAllUsesWith(LI);
18626 AI->eraseFromParent();
18627 return LI;
18628}
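// Editor's sketch (not upstream source): this hook rewrites an idempotent RMW,
// for example
//   %old = atomicrmw or ptr addrspace(1) %p, i32 0 acquire
// into the equivalent atomic load
//   %old = load atomic i32, ptr addrspace(1) %p acquire, align 4
// The release-or-stronger bailout above keeps orderings whose store side matters.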
static bool isMul(MachineInstr *MI)
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
return SDValue()
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
constexpr LLT S32
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
#define LLVM_ATTRIBUTE_UNUSED
Definition Compiler.h:298
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
dxil translate DXIL Translate Metadata
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
#define P(N)
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1247
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition SIDefines.h:1244
static cl::opt< bool > UseSelectionDAGPTRADD("amdgpu-use-sdag-ptradd", cl::Hidden, cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the " "SelectionDAG ISel"), cl::init(false))
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad, TargetLoweringBase::IntrinsicInfo &Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static uint32_t getIdentityValueForWaveReduction(unsigned Opc)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:167
#define LLVM_DEBUG(...)
Definition Debug.h:119
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
The Input class is used to parse a yaml document into in-memory structs and vectors.
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1120
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:6057
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1497
bool isNegative() const
Definition APFloat.h:1449
bool isNormal() const
Definition APFloat.h:1453
APInt bitcastToAPInt() const
Definition APFloat.h:1353
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1138
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1098
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1079
bool isInfinity() const
Definition APFloat.h:1446
Class for arbitrary precision integers.
Definition APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1385
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:466
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
Definition APInt.h:366
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1237
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1221
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition Function.cpp:339
const Function * getParent() const
Definition Argument.h:44
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:472
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
BitVector & set()
Definition BitVector.h:351
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ ICMP_NE
not equal
Definition InstrTypes.h:700
bool isSigned() const
Definition InstrTypes.h:932
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:772
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:778
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:214
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isBigEndian() const
Definition DataLayout.h:199
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
iterator_range< arg_iterator > args()
Definition Function.h:890
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition Function.cpp:803
Argument * getArg(unsigned i) const
Definition Function.h:884
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMadF16() const
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasPrivateSegmentBuffer() const
const MachineFunction & getMachineFunction() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2780
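As a small, hedged illustration of the IRBuilder entries above: the sketch below assumes an existing Instruction* InsertPt and two i32 Values A and B are in scope; it is not code from this file.
// Sketch: new instructions are inserted before InsertPt.
IRBuilder<> Builder(InsertPt);
Value *Sum = Builder.CreateAdd(A, B, "sum");
Value *IsZero = Builder.CreateICmpEQ(Sum, Builder.getInt32(0));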
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
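A minimal sketch of the LLT constructors and queries listed above; the address space and bit widths are illustrative assumptions.
LLT S32 = LLT::scalar(32);                            // 32-bit scalar
LLT P1  = LLT::pointer(1, 64);                        // 64-bit pointer in address space 1
uint64_t Bits = P1.getSizeInBits().getFixedValue();   // 64
LLT S16 = S32.changeElementSize(16);                  // scalar of the new element size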
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
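A hedged sketch of LoadInst::setAtomic together with LLVMContext::getOrInsertSyncScopeID, in the spirit of the IR-level atomic expansions described later on this page; LI (a LoadInst*) and Ctx (its LLVMContext&) are assumed to be in scope, and "agent" is one of the AMDGPU sync-scope names.
// Sketch: mark an existing load as a monotonic atomic load at agent scope.
SyncScope::ID AgentSSID = Ctx.getOrInsertSyncScopeID("agent");
LI->setAtomic(AtomicOrdering::Monotonic, AgentSSID);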
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:33
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96
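A sketch of attaching !range metadata built with MDBuilder::createRange; LI (a LoadInst*) and Ctx (an LLVMContext&) are assumed, and the [0, 1024) range is purely illustrative.
MDBuilder MDB(Ctx);
MDNode *Range = MDB.createRange(APInt(32, 0), APInt(32, 1024));
LI->setMetadata(LLVMContext::MD_range, Range);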
Metadata node.
Definition Metadata.h:1077
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1445
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector type has a power-of-2 number of elements.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
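A short sketch of the MVT queries listed above; the chosen vector type is illustrative.
MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);
unsigned NumElts = V4I32.getVectorNumElements();            // 4
MVT Elt = V4I32.getScalarType();                            // MVT::i32
uint64_t StoreBytes = V4I32.getStoreSize().getFixedValue(); // 16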
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
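A hedged sketch of the CFG surgery a custom inserter typically performs with the MachineBasicBlock APIs above; MI (a MachineInstr&) and BB (its MachineBasicBlock*) are assumed, and the single-loop shape is illustrative rather than the actual SI lowering.
MachineFunction *MF = BB->getParent();
MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
MachineFunction::iterator InsertPt = std::next(BB->getIterator());
MF->insert(InsertPt, LoopBB);
MF->insert(InsertPt, RemainderBB);
// Everything after MI moves to the remainder block; successors follow it.
RemainderBB->splice(RemainderBB->begin(), BB,
                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
RemainderBB->transferSuccessorsAndUpdatePHIs(BB);
BB->addSuccessor(LoopBB);
LoopBB->addSuccessor(LoopBB);      // back edge of the emitted loop
LoopBB->addSuccessor(RemainderBB); // loop exit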
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
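A sketch of allocating a MachineMemOperand with the overload shown above, assuming MF is a MachineFunction& in scope; the flags, memory type and alignment are illustrative.
MachineMemOperand *MMO = MF.getMachineMemOperand(
    MachinePointerInfo(),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
        MachineMemOperand::MOInvariant,
    LLT::scalar(32), Align(4));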
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
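A hedged sketch of the common argument-lowering pattern that pairs MachineFunction::addLiveIn with SelectionDAG::getCopyFromReg; MF, DAG and DL are assumed to be in scope, and the physical register, register class and type are illustrative.
Register VReg = MF.addLiveIn(AMDGPU::SGPR4, &AMDGPU::SReg_32RegClass);
SDValue Arg = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i32);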
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
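A sketch of the MachineInstrBuilder chaining idiom built on the addReg/addImm/addMBB helpers above; BB, MI, TII, TmpReg and DstReg are assumed to be in scope, and while the opcodes are real AMDGPU opcodes they are chosen only for illustration.
const DebugLoc &DL = MI.getDebugLoc();
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), TmpReg)
    .addImm(0);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DstReg)
    .addReg(TmpReg)
    .addImm(42);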
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition ModRef.h:221
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:215
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition ModRef.h:218
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition Register.h:67
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:78
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the given node can be combined to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store in a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load in a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if the operand is known to never be any NaN; if SNaN is true, returns whether it is known to never be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
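A minimal sketch of building DAG nodes with getConstant, getNode, getSetCC and getSelect; it assumes DAG, DL, and two i32 SDValues LHS and RHS are in scope, and real code would use getSetCCResultType instead of hard-coding MVT::i1.
EVT VT = LHS.getValueType();
SDValue One  = DAG.getConstant(1, DL, VT);
SDValue Add  = DAG.getNode(ISD::ADD, DL, VT, LHS, One);
SDValue Cond = DAG.getSetCC(DL, MVT::i1, Add, RHS, ISD::SETEQ);
SDValue Sel  = DAG.getSelect(DL, VT, Cond, LHS, RHS);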
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
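A sketch of the known-bits queries above, assuming DAG and a 64-bit SDValue Op are in scope.
KnownBits Known = DAG.computeKnownBits(Op);
if (Known.countMinLeadingZeros() >= 32) {
  // the upper half of Op is provably zero
}
APInt HighMask = APInt::getHighBitsSet(64, 32);
bool UpperZero = DAG.MaskedValueIsZero(Op, HighMask);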
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
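A small sketch of the insert().second idiom used for visited-set checks, assuming N is an SDNode* in scope.
SmallPtrSet<const SDNode *, 16> Visited;
if (Visited.insert(N).second) {
  // first time we see N
}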
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition StringRef.h:862
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:154
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
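A sketch of the StringSwitch idiom used when mapping names (for example register names) to values; RegName (a StringRef) is assumed, and the cases are illustrative.
Register Reg = StringSwitch<Register>(RegName)
                   .Case("m0", AMDGPU::M0)
                   .Case("exec", AMDGPU::EXEC)
                   .Default(Register());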
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target and, if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
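A hedged sketch of how a target's TargetLowering constructor typically uses the configuration hooks above; the opcodes, types and actions are illustrative, not the actual SI configuration, and computeRegisterProperties(TRI) would be called once all register classes are added.
setOperationAction(ISD::SELECT, MVT::i64, Custom);   // routed to LowerOperation
setOperationAction(ISD::FSIN, MVT::f32, Expand);     // expanded or lowered to a call
setOperationAction(ISD::CTPOP, MVT::i16, Promote);
AddPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);   // do i16 ctpop in i32
setTruncStoreAction(MVT::i64, MVT::i16, Expand);
setTargetDAGCombine({ISD::ADD, ISD::FADD});          // sent to PerformDAGCombine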
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:420
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition Type.h:258
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:107
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI void set(Value *Val)
Definition Value.h:905
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
const Use & getOperandUse(unsigned i) const
Definition User.h:245
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1101
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:396
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isZero() const
Definition TypeSize.h:154
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:134
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ GlobalAddress
Definition ISDOpcodes.h:88
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:656
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:773
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:528
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:535
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined; 0 Round to 0; 1 Round to nearest, ties to even; 2 Round to ...
Definition ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
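A minimal sketch (not taken from this file) of how the Intrinsic helpers above fit together; the function name and the use of amdgcn_s_barrier are illustrative assumptions, and M is assumed to be an existing llvm::Module.
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Module.h"
static void queryBarrierIntrinsic(llvm::Module &M) {
  using namespace llvm;
  LLVMContext &Ctx = M.getContext();
  const Intrinsic::ID IID = Intrinsic::amdgcn_s_barrier;
  // Only succeeds if the declaration is already present in the module.
  if (Function *F = Intrinsic::getDeclarationIfExists(&M, IID)) {
    FunctionType *FT = Intrinsic::getType(Ctx, IID);      // canonical type for this ID
    AttributeList AL = Intrinsic::getAttributes(Ctx, IID, FT);
    F->setAttributes(AL); // e.g. re-apply the canonical intrinsic attributes
  }
}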
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
Offsets
Offsets in bytes from the start of the input buffer.
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
Definition MathExtras.h:54
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:330
@ Offset
Definition DWP.cpp:477
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition Analysis.cpp:241
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:307
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:853
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
@ Done
Definition Threading.h:60
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition MathExtras.h:232
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:270
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2138
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:551
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
Definition ModRef.h:296
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:390
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:157
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:282
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1734
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:336
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:203
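Illustrative only: the bit-manipulation helpers listed above (popcount, bit_width, countr_zero, countl_zero, Log2_32) behave like their std::bit counterparts. The function name is a placeholder.
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
static void bitHelpersDemo() {
  assert(llvm::popcount(0xF0u) == 4);      // four set bits
  assert(llvm::countr_zero(0x10u) == 4);   // trailing zeros
  assert(llvm::countl_zero(0x10u) == 27);  // leading zeros in a 32-bit value
  assert(llvm::bit_width(17u) == 5);       // bits needed to represent 17
  assert(llvm::Log2_32(32) == 5);          // floor log base 2
}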
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition Analysis.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:399
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
unsigned getUndefRegState(bool B)
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:155
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition MathExtras.h:241
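A rough usage sketch of the integer range and 64-bit splitting helpers referenced above (isInt, isUInt, minIntN, maxIntN, Hi_32, Lo_32); the values are chosen only to show the semantics.
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
static void rangeHelpersDemo() {
  assert(llvm::isInt<16>(-32768) && !llvm::isInt<16>(32768));   // signed 16-bit range
  assert(llvm::isUInt<12>(4095) && !llvm::isUInt<12>(4096));    // unsigned 12-bit range
  assert(llvm::minIntN(8) == -128 && llvm::maxIntN(8) == 127);
  uint64_t V = 0x1234567890ABCDEFull;
  assert(llvm::Hi_32(V) == 0x12345678u && llvm::Lo_32(V) == 0x90ABCDEFu);
}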
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1760
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1899
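A small sketch of the range-based STLExtras helpers listed above (any_of, find_if, is_contained), applied to a plain std::vector; the function name and data are placeholders.
#include "llvm/ADT/STLExtras.h"
#include <cassert>
#include <vector>
static void rangeUtilsDemo() {
  std::vector<int> Vals = {1, 3, 5, 8};
  assert(llvm::any_of(Vals, [](int V) { return V % 2 == 0; })); // 8 is even
  assert(llvm::is_contained(Vals, 5));
  auto It = llvm::find_if(Vals, [](int V) { return V > 4; });
  assert(It != Vals.end() && *It == 5);
}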
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:212
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:86
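A hedged example of the alignment and rounding helpers above (alignTo, alignDown, divideCeil, PowerOf2Ceil, maskTrailingOnes, commonAlignment); the concrete numbers are only there to show the arithmetic.
#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
static void alignmentDemo() {
  assert(llvm::alignTo(13, llvm::Align(8)) == 16);      // round up to a multiple of 8
  assert(llvm::alignDown(13, 8) == 8);                  // round down
  assert(llvm::divideCeil(13, 8) == 2);                 // ceil(13 / 8)
  assert(llvm::PowerOf2Ceil(13) == 16);                 // next power of two
  assert(llvm::maskTrailingOnes<uint32_t>(4) == 0xFu);  // low 4 bits set
  // Alignment known to hold at (base + 4) when base is 16-byte aligned.
  assert(llvm::commonAlignment(llvm::Align(16), 4).value() == 4);
}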
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853
#define N
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:304
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
Definition APFloat.cpp:264
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:85
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition SCCPSolver.h:42
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:390
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:295
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:368
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:238
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:113
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:380
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:465
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:407
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:311
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:318
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:251
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:323
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:331
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
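A sketch of how the EVT queries above compose, assuming an existing LLVMContext is available; the helper name is hypothetical.
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>
static void evtDemo(llvm::LLVMContext &Ctx) {
  using namespace llvm;
  EVT I32 = EVT::getIntegerVT(Ctx, 32);
  EVT V4F32 = EVT::getVectorVT(Ctx, MVT::f32, 4);
  assert(I32.isScalarInteger() && I32.getSizeInBits().getFixedValue() == 32);
  assert(V4F32.isVector() && V4F32.getVectorNumElements() == 4);
  assert(V4F32.getScalarType() == MVT::f32);
  // Same layout, integer element type: v4f32 -> v4i32.
  EVT V4I32 = V4F32.changeTypeToInteger();
  assert(V4I32.getVectorElementType() == MVT::i32);
}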
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
unsigned getOrigArgIndex() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:165
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:218
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:173
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:340
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:241
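A hedged illustration of the KnownBits interface referenced above (zext, add, countMinLeadingZeros); the bit widths and values are arbitrary.
#include "llvm/Support/KnownBits.h"
#include <cassert>
static void knownBitsDemo() {
  using namespace llvm;
  KnownBits LHS(32), RHS(32);
  LHS.Zero.setHighBits(24); // upper 24 bits known to be zero, i.e. value < 256
  RHS.Zero.setHighBits(24);
  // The sum of two values below 256 is below 512, so at least 23 high bits stay zero.
  KnownBits Sum = KnownBits::add(LHS, RHS);
  assert(Sum.countMinLeadingZeros() >= 23);
  // Zero-extending to 64 bits adds 32 more known-zero high bits.
  assert(LHS.zext(64).countMinLeadingZeros() >= 56);
}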
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
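A minimal sketch of building pointer info for a memory operand with the factories above; makeStackMMO, MF, FI, and the size/alignment are assumptions for illustration, not code from this file.
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/Support/Alignment.h"
static llvm::MachineMemOperand *
makeStackMMO(llvm::MachineFunction &MF, int FI, unsigned Size) {
  using namespace llvm;
  // Describe a fixed-stack slot; the offset within the slot defaults to 0.
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  return MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, Size,
                                 Align(4));
}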
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs