SIISelLowering.cpp
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/IRBuilder.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
42#include "llvm/IR/MDBuilder.h"
45#include "llvm/Support/ModRef.h"
47#include <optional>
48
49using namespace llvm;
50using namespace llvm::SDPatternMatch;
51
52#define DEBUG_TYPE "si-lower"
53
54STATISTIC(NumTailCalls, "Number of tail calls");
55
56static cl::opt<bool>
57 DisableLoopAlignment("amdgpu-disable-loop-alignment",
58 cl::desc("Do not align and prefetch loops"),
59 cl::init(false));
60
62 "amdgpu-use-divergent-register-indexing", cl::Hidden,
63 cl::desc("Use indirect register addressing for divergent indexes"),
64 cl::init(false));
65
66// TODO: This option should be removed once we switch to always using PTRADD in
67// the SelectionDAG.
69 "amdgpu-use-sdag-ptradd", cl::Hidden,
70 cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the "
71 "SelectionDAG ISel"),
72 cl::init(false));
73
74static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
75 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
76 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
77}
78
79static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
80 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
81 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
82}
83
84static unsigned findFirstFreeSGPR(CCState &CCInfo) {
85 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
86 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
87 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
88 return AMDGPU::SGPR0 + Reg;
89 }
90 }
91 llvm_unreachable("Cannot allocate sgpr");
92}
93
94SITargetLowering::SITargetLowering(const TargetMachine &TM,
95 const GCNSubtarget &STI)
96 : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
97 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
98 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
99
100 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
101 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
102
103 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
104
105 const SIRegisterInfo *TRI = STI.getRegisterInfo();
106 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
107
108 addRegisterClass(MVT::f64, V64RegClass);
109 addRegisterClass(MVT::v2f32, V64RegClass);
110 addRegisterClass(MVT::Untyped, V64RegClass);
111
112 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
113 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
114
115 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
116 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
117
118 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
119 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
120
121 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
122 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
123
124 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
125 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
126
127 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
128 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
129
130 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
131 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
132
133 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
134 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
135
136 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
137 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
138
139 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
140 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
141
142 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
143 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
144
145 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
146 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
147
148 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
149 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
150
151 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
152 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
153
154 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
155 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
156
157 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
158 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
159
160 if (Subtarget->has16BitInsts()) {
161 if (Subtarget->useRealTrue16Insts()) {
162 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
163 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
164 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
165 } else {
166 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
167 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
168 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
169 }
170
171 // Unless there are also VOP3P operations, these operations are not really legal.
172 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
173 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
174 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
175 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
176 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
177 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
178 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
179 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
180 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
181 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
182 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
183 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
184 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
185 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
186 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
187 }
188
189 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
190 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
191
192 computeRegisterProperties(Subtarget->getRegisterInfo());
193
194 // The boolean content concept here is too inflexible. Compares only ever
195 // really produce a 1-bit result. Any copy/extend from these will turn into a
196 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
197 // it's what most targets use.
198 setBooleanContents(ZeroOrOneBooleanContent);
199 setBooleanVectorContents(ZeroOrOneBooleanContent);
200
201 // We need to custom lower vector stores from local memory
202 setOperationAction(ISD::LOAD,
203 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
204 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
205 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
206 MVT::i1, MVT::v32i32},
207 Custom);
208
209 setOperationAction(ISD::STORE,
210 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
211 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
212 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
213 MVT::i1, MVT::v32i32},
214 Custom);
215
216 if (isTypeLegal(MVT::bf16)) {
217 for (unsigned Opc :
219 ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
220 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
221 ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
222 ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
223 ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
224 ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
225 ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
226 ISD::SETCC}) {
227 // FIXME: The promoted-to type shouldn't need to be explicit
228 setOperationAction(Opc, MVT::bf16, Promote);
229 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
230 }
231
233
235 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
236
237 setOperationAction(ISD::FABS, MVT::bf16, Legal);
238 setOperationAction(ISD::FNEG, MVT::bf16, Legal);
240
241 // We only need to custom lower because we can't specify an action for bf16
242 // sources.
245 }
246
247 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
248 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
249 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
250 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
251 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
252 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
253 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
254 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
255 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
256 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
257 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
258 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
259 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
260 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
261 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
262 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
263
264 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
265 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
266 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
267 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
268 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
269 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
270 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
271
272 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
273
277 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
278
279 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
280
282 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
283
285 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
286 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
287
289 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
290 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
291 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
292 Expand);
294 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
295 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
296 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
297 Expand);
298
300 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
301 MVT::v3i16, MVT::v4i16, MVT::Other},
302 Custom);
303
304 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
305 setOperationAction(ISD::BR_CC,
306 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
307
309
311
313 Expand);
314
315#if 0
317#endif
318
319 // We only support LOAD/STORE and vector manipulation ops for vectors
320 // with > 4 elements.
321 for (MVT VT :
322 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
323 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
324 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
325 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
326 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
327 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
328 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
329 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
330 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
331 switch (Op) {
332 case ISD::LOAD:
333 case ISD::STORE:
335 case ISD::BITCAST:
336 case ISD::UNDEF:
340 case ISD::IS_FPCLASS:
341 break;
346 break;
347 default:
349 break;
350 }
351 }
352 }
353
354 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
355
356 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
357 // is expanded to avoid having two separate loops in case the index is a VGPR.
358
359 // Most operations are naturally 32-bit vector operations. We only support
360 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
361 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
363 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
364
366 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
367
369 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
370
372 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
373 }
374
375 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
377 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
378
380 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
381
383 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
384
386 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
387 }
388
389 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
391 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
392
394 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
395
397 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
398
400 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
401 }
402
403 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
405 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
406
408 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
409
411 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
412
414 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
415 }
416
417 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
419 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
420
422 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
423
425 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
426
428 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
429 }
430
432 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
433 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
434 Custom);
435
436 if (Subtarget->hasPkMovB32()) {
437 // TODO: 16-bit element vectors should be legal with even aligned elements.
438 // TODO: Can be legal with wider source types than the result with
439 // subregister extracts.
440 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
441 }
442
443 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
444 Custom);
445
446 // Avoid stack access for these.
447 // TODO: Generalize to more vector types.
449 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
450 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
451 Custom);
452
453 // Deal with vec3 vector operations when widened to vec4.
455 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
456
457 // Deal with vec5/6/7 vector operations when widened to vec8.
459 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
460 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
461 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
462 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
463 Custom);
464
465 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
466 // and output demarshalling
467 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
468
469 // We can't return success/failure, only the old value,
470 // so let LLVM add the comparison.
471 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64},
472 Expand);
473
474 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
475
476 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
477
478 // FIXME: This should be narrowed to i32, but that only happens if i64 is
479 // illegal.
480 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
481 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
482
483 // On SI this is s_memtime; on VI it is s_memrealtime.
484 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
485
486 if (Subtarget->hasSMemRealTime() ||
487 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
488 setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
489 setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom);
490
491 if (Subtarget->has16BitInsts()) {
492 setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
493 setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
494 } else {
495 setOperationAction(ISD::FSQRT, MVT::f16, Custom);
496 }
497
498 if (Subtarget->hasMadMacF32Insts())
500
501 if (!Subtarget->hasBFI())
502 // fcopysign can be done in a single instruction with BFI.
503 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
504
505 if (!Subtarget->hasBCNT(32))
507
508 if (!Subtarget->hasBCNT(64))
510
511 if (Subtarget->hasFFBH())
513
514 if (Subtarget->hasFFBL())
516
517 // We only really have 32-bit BFE instructions (and 16-bit on VI).
518 //
519 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
520 // effort to match them now. We want this to be false for i64 cases when the
521 // extraction isn't restricted to the upper or lower half. Ideally we would
522 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
523 // span the midpoint are probably relatively rare, so don't worry about them
524 // for now.
525 if (Subtarget->hasBFE())
527
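  // Illustrative example of the restriction described above: extracting bits
  // [8, 16) or bits [40, 48) of an i64 stays within one 32-bit half and can be
  // matched to a single 32-bit BFE on the low or high dword, whereas
  // extracting bits [28, 36) spans the midpoint and still needs a shift/mask
  // expansion.
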
528 // Clamp modifier on add/sub
529 if (Subtarget->hasIntClamp())
531
532 if (Subtarget->hasAddNoCarry())
533 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
534 Legal);
535
537 {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
538 {MVT::f32, MVT::f64}, Custom);
539
540 // These are really only legal for ieee_mode functions. We should be avoiding
541 // them for functions that don't have ieee_mode enabled, so just say they are
542 // legal.
543 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
544 {MVT::f32, MVT::f64}, Legal);
545
546 if (Subtarget->haveRoundOpsF64())
547 setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64,
548 Legal);
549 else
550 setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
551 MVT::f64, Custom);
552
553 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
554 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
555 Legal);
556 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
557
558 setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
560
561 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
562 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
563
564 // Custom lower these because we can't specify a rule based on an illegal
565 // source bf16.
566 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom);
567 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom);
568
569 if (Subtarget->has16BitInsts()) {
572 MVT::i16, Legal);
573
574 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
575
577 MVT::i16, Expand);
578
582 ISD::CTPOP},
583 MVT::i16, Promote);
584
585 setOperationAction(ISD::LOAD, MVT::i16, Custom);
586
587 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
588
589 setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
590 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
591 setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
592 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
593
597
599
600 // F16 - Constant Actions.
603
604 // F16 - Load/Store Actions.
605 setOperationAction(ISD::LOAD, MVT::f16, Promote);
606 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
607 setOperationAction(ISD::STORE, MVT::f16, Promote);
608 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
609
610 // BF16 - Load/Store Actions.
611 setOperationAction(ISD::LOAD, MVT::bf16, Promote);
612 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
613 setOperationAction(ISD::STORE, MVT::bf16, Promote);
614 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
615
616 // F16 - VOP1 Actions.
618 ISD::FSIN, ISD::FROUND},
619 MVT::f16, Custom);
620
621 // BF16 - VOP1 Actions.
622 if (Subtarget->hasBF16TransInsts())
623 setOperationAction({ISD::FCOS, ISD::FSIN, ISD::FDIV}, MVT::bf16, Custom);
624
627
628 // F16 - VOP2 Actions.
629 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
630 Expand);
631 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);
632 setOperationAction(ISD::FFREXP, MVT::f16, Custom);
634
635 // F16 - VOP3 Actions.
637 if (STI.hasMadF16())
639
640 for (MVT VT :
641 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
642 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
643 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
644 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
645 switch (Op) {
646 case ISD::LOAD:
647 case ISD::STORE:
649 case ISD::BITCAST:
650 case ISD::UNDEF:
655 case ISD::IS_FPCLASS:
656 break;
660 break;
661 default:
663 break;
664 }
665 }
666 }
667
668 // v_perm_b32 can handle either of these.
669 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
671
672 // XXX - Do these do anything? Vector constants turn into build_vector.
673 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
674
675 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
676 Legal);
677
678 setOperationAction(ISD::STORE, MVT::v2i16, Promote);
679 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
680 setOperationAction(ISD::STORE, MVT::v2f16, Promote);
681 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
682
683 setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
684 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
685 setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
686 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
687
688 setOperationAction(ISD::AND, MVT::v2i16, Promote);
689 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
690 setOperationAction(ISD::OR, MVT::v2i16, Promote);
691 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
692 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
693 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
694
695 setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
696 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
697 setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
698 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
699 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
700 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
701
702 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
703 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
704 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
705 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
706 setOperationAction(ISD::STORE, MVT::v4bf16, Promote);
707 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
708
709 setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
710 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
711 setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
712 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
713 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
714 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
715
716 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
717 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
718 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
719 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
720
721 setOperationAction(ISD::STORE, MVT::v8i16, Promote);
722 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
723 setOperationAction(ISD::STORE, MVT::v8f16, Promote);
724 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
725 setOperationAction(ISD::STORE, MVT::v8bf16, Promote);
726 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
727
728 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
729 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
730 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
731 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
732 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
733 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
734
735 setOperationAction(ISD::STORE, MVT::v16i16, Promote);
736 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
737 setOperationAction(ISD::STORE, MVT::v16f16, Promote);
738 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
739 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
740 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
741
742 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
743 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
744 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
745 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
746 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
747 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
748
749 setOperationAction(ISD::STORE, MVT::v32i16, Promote);
750 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
751 setOperationAction(ISD::STORE, MVT::v32f16, Promote);
752 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
753 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
754 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
755
757 MVT::v2i32, Expand);
758 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
759
761 MVT::v4i32, Expand);
762
764 MVT::v8i32, Expand);
765
766 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
767 Subtarget->hasVOP3PInsts() ? Legal : Custom);
768
769 setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
770 // This isn't really legal, but this avoids the legalizer unrolling it (and
771 // allows matching fneg (fabs x) patterns)
772 setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);
773
774 // Can do this in one BFI plus a constant materialize.
776 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
777 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
778 MVT::v32f16, MVT::v32bf16},
779 Custom);
780
782 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
783 MVT::f16, Custom);
784 setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
785
786 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
787 ISD::FMAXIMUMNUM},
788 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
789 Custom);
790
791 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
792 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
793 Expand);
794
795 for (MVT Vec16 :
796 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
797 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
800 Vec16, Custom);
802 }
803 }
804
805 if (Subtarget->hasVOP3PInsts()) {
809 MVT::v2i16, Legal);
810
811 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
812 ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
813 MVT::v2f16, Legal);
814
816 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
817
819 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
820 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
821 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
822 Custom);
823
824 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
825 // Split vector operations.
830 VT, Custom);
831
832 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
833 // Split vector operations.
835 VT, Custom);
836
838 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
839 {MVT::v2f16, MVT::v4f16}, Custom);
840
841 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
842 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
843 Custom);
844
845 if (Subtarget->hasPackedFP32Ops()) {
847 MVT::v2f32, Legal);
849 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
850 Custom);
851 }
852 }
853
854 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);
855
856 if (Subtarget->has16BitInsts()) {
858 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
860 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
861 } else {
862 // Legalization hack.
863 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
864
865 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
866 }
867
869 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
870 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
871 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
872 MVT::v32f16, MVT::v32bf16},
873 Custom);
874
876
877 if (Subtarget->hasVectorMulU64())
879 else if (Subtarget->hasScalarSMulU64())
881
882 if (Subtarget->hasMad64_32())
884
885 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
886 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
887
888 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
889 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
890 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
891 } else {
892 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
893 if (Subtarget->hasMinimum3Maximum3F32())
894 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);
895
896 if (Subtarget->hasMinimum3Maximum3PKF16()) {
897 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal);
898
899 // If only the vector form is available, we need to widen to a vector.
900 if (!Subtarget->hasMinimum3Maximum3F16())
901 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16, Custom);
902 }
903 }
904
905 if (Subtarget->hasVOP3PInsts()) {
906 // We want to break these into v2f16 pieces, not scalarize.
907 setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
908 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
909 Custom);
910 }
911
912 if (Subtarget->hasIntMinMax64())
914 Legal);
915
917 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
918 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
919 MVT::i8},
920 Custom);
921
923 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
924 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
925 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
926 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
927 Custom);
928
930 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
931 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
932 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
933 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
934 Custom);
935
936 setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
938 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
939 setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
940 setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);
941
942 // TODO: Could move this to custom lowering, could benefit from combines on
943 // extract of relevant bits.
944 setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
945
947
948 if (Subtarget->hasBF16ConversionInsts()) {
949 setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
951 }
952
953 if (Subtarget->hasBF16PackedInsts()) {
955 {ISD::FADD, ISD::FMUL, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMA},
956 MVT::v2bf16, Legal);
957 }
958
959 if (Subtarget->hasBF16TransInsts()) {
960 setOperationAction({ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, MVT::bf16, Legal);
961 }
962
963 if (Subtarget->hasCvtPkF16F32Inst()) {
965 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
966 Custom);
967 }
968
970 ISD::PTRADD,
972 ISD::SUB,
974 ISD::MUL,
975 ISD::FADD,
976 ISD::FSUB,
977 ISD::FDIV,
978 ISD::FMUL,
979 ISD::FMINNUM,
980 ISD::FMAXNUM,
981 ISD::FMINNUM_IEEE,
982 ISD::FMAXNUM_IEEE,
983 ISD::FMINIMUM,
984 ISD::FMAXIMUM,
985 ISD::FMINIMUMNUM,
986 ISD::FMAXIMUMNUM,
987 ISD::FMA,
988 ISD::SMIN,
989 ISD::SMAX,
990 ISD::UMIN,
991 ISD::UMAX,
994 ISD::SMIN,
995 ISD::SMAX,
996 ISD::UMIN,
997 ISD::UMAX,
998 ISD::AND,
999 ISD::OR,
1000 ISD::XOR,
1001 ISD::SHL,
1002 ISD::SRL,
1003 ISD::SRA,
1004 ISD::FSHR,
1014
1015 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1017
1018 // All memory operations. Some folding on the pointer operand is done to help
1019 // matching the constant offsets in the addressing modes.
1020 setTargetDAGCombine({ISD::LOAD,
1021 ISD::STORE,
1022 ISD::ATOMIC_LOAD,
1023 ISD::ATOMIC_STORE,
1024 ISD::ATOMIC_CMP_SWAP,
1025 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
1026 ISD::ATOMIC_SWAP,
1027 ISD::ATOMIC_LOAD_ADD,
1028 ISD::ATOMIC_LOAD_SUB,
1029 ISD::ATOMIC_LOAD_AND,
1030 ISD::ATOMIC_LOAD_OR,
1031 ISD::ATOMIC_LOAD_XOR,
1032 ISD::ATOMIC_LOAD_NAND,
1033 ISD::ATOMIC_LOAD_MIN,
1034 ISD::ATOMIC_LOAD_MAX,
1035 ISD::ATOMIC_LOAD_UMIN,
1036 ISD::ATOMIC_LOAD_UMAX,
1037 ISD::ATOMIC_LOAD_FADD,
1038 ISD::ATOMIC_LOAD_FMIN,
1039 ISD::ATOMIC_LOAD_FMAX,
1040 ISD::ATOMIC_LOAD_UINC_WRAP,
1041 ISD::ATOMIC_LOAD_UDEC_WRAP,
1044
1045 // FIXME: In other contexts we pretend this is a per-function property.
1047
1049}
1050
1051const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1052
1053ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
1054 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1055 return RCRegs;
1056}
1057
1058//===----------------------------------------------------------------------===//
1059// TargetLowering queries
1060//===----------------------------------------------------------------------===//
1061
1062// v_mad_mix* support a conversion from f16 to f32.
1063//
1064// There is only one special case, which we don't currently handle, where this
1065// is OK to use when denormals are enabled.
1066bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1067 EVT DestVT, EVT SrcVT) const {
1068 return DestVT.getScalarType() == MVT::f32 &&
1069 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1070 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1071 SrcVT.getScalarType() == MVT::f16) ||
1072 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1073 SrcVT.getScalarType() == MVT::bf16)) &&
1074 // TODO: This probably only requires no input flushing?
1076}
1077
1078bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
1079 LLT DestTy, LLT SrcTy) const {
1080 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1081 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1082 DestTy.getScalarSizeInBits() == 32 &&
1083 SrcTy.getScalarSizeInBits() == 16 &&
1084 // TODO: This probably only requires no input flushing?
1085 denormalModeIsFlushAllF32(*MI.getMF());
1086}
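
// Worked example for the two isFPExtFoldable overloads above (an illustrative
// sketch, not an exhaustive statement of the rule): on a subtarget with
// fma-mix instructions and f32 denormals flushed, IR such as
//
//   %x32 = fpext half %x to float
//   %r   = call float @llvm.fma.f32(float %x32, float %y, float %z)
//
// reports the extension as foldable, so the f16->f32 conversion can be folded
// into a v_fma_mix_f32 operand instead of being emitted as a separate
// v_cvt_f32_f16.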
1087
1088bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
1089 // SI has some legal vector types, but no legal vector operations. Say no
1090 // shuffles are legal in order to prefer scalarizing some vector operations.
1091 return false;
1092}
1093
1094MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
1095 CallingConv::ID CC,
1096 EVT VT) const {
1097 if (CC == CallingConv::AMDGPU_KERNEL)
1098 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1099
1100 if (VT.isVector()) {
1101 EVT ScalarVT = VT.getScalarType();
1102 unsigned Size = ScalarVT.getSizeInBits();
1103 if (Size == 16) {
1104 if (Subtarget->has16BitInsts()) {
1105 if (VT.isInteger())
1106 return MVT::v2i16;
1107 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1108 }
1109 return VT.isInteger() ? MVT::i32 : MVT::f32;
1110 }
1111
1112 if (Size < 16)
1113 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1114 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1115 }
1116
1117 if (VT.getSizeInBits() > 32)
1118 return MVT::i32;
1119
1120 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1121}
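
// Worked examples for the mapping above (illustrative; assumes a non-kernel
// calling convention on a subtarget with 16-bit instructions):
//   v4f16  -> v2f16 (16-bit elements travel in packed 32-bit registers)
//   v4bf16 -> i32   (bf16 vectors are carried as plain 32-bit registers)
//   v3i8   -> i16   (sub-16-bit scalars promote to i16)
//   i64    -> i32   (anything wider than 32 bits is split into i32 pieces)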
1122
1123unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
1124 CallingConv::ID CC,
1125 EVT VT) const {
1126 if (CC == CallingConv::AMDGPU_KERNEL)
1127 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1128
1129 if (VT.isVector()) {
1130 unsigned NumElts = VT.getVectorNumElements();
1131 EVT ScalarVT = VT.getScalarType();
1132 unsigned Size = ScalarVT.getSizeInBits();
1133
1134 // FIXME: Should probably promote 8-bit vectors to i16.
1135 if (Size == 16 && Subtarget->has16BitInsts())
1136 return (NumElts + 1) / 2;
1137
1138 if (Size <= 32)
1139 return NumElts;
1140
1141 if (Size > 32)
1142 return NumElts * ((Size + 31) / 32);
1143 } else if (VT.getSizeInBits() > 32)
1144 return (VT.getSizeInBits() + 31) / 32;
1145
1146 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1147}
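
// Worked examples for the register counts above (illustrative; non-kernel
// calling convention, 16-bit instructions available):
//   v3f16 -> (3 + 1) / 2 = 2 registers
//   v5i32 -> 5 registers
//   v3i64 -> 3 * ((64 + 31) / 32) = 6 registers
//   f64   -> (64 + 31) / 32 = 2 registers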
1148
1149unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
1150 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1151 unsigned &NumIntermediates, MVT &RegisterVT) const {
1152 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1153 unsigned NumElts = VT.getVectorNumElements();
1154 EVT ScalarVT = VT.getScalarType();
1155 unsigned Size = ScalarVT.getSizeInBits();
1156 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1157 // support, but unless we can properly handle 3-vectors, it will still be
1158 // inconsistent.
1159 if (Size == 16 && Subtarget->has16BitInsts()) {
1160 if (ScalarVT == MVT::bf16) {
1161 RegisterVT = MVT::i32;
1162 IntermediateVT = MVT::v2bf16;
1163 } else {
1164 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1165 IntermediateVT = RegisterVT;
1166 }
1167 NumIntermediates = (NumElts + 1) / 2;
1168 return NumIntermediates;
1169 }
1170
1171 if (Size == 32) {
1172 RegisterVT = ScalarVT.getSimpleVT();
1173 IntermediateVT = RegisterVT;
1174 NumIntermediates = NumElts;
1175 return NumIntermediates;
1176 }
1177
1178 if (Size < 16 && Subtarget->has16BitInsts()) {
1179 // FIXME: Should probably form v2i16 pieces
1180 RegisterVT = MVT::i16;
1181 IntermediateVT = ScalarVT;
1182 NumIntermediates = NumElts;
1183 return NumIntermediates;
1184 }
1185
1186 if (Size != 16 && Size <= 32) {
1187 RegisterVT = MVT::i32;
1188 IntermediateVT = ScalarVT;
1189 NumIntermediates = NumElts;
1190 return NumIntermediates;
1191 }
1192
1193 if (Size > 32) {
1194 RegisterVT = MVT::i32;
1195 IntermediateVT = RegisterVT;
1196 NumIntermediates = NumElts * ((Size + 31) / 32);
1197 return NumIntermediates;
1198 }
1199 }
1200
1201 return TargetLowering::getVectorTypeBreakdownForCallingConv(
1202 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1203}
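
// Worked example for the breakdown above (illustrative; non-kernel calling
// convention with 16-bit instructions): a v5f16 argument yields
//   RegisterVT = v2f16, IntermediateVT = v2f16,
//   NumIntermediates = (5 + 1) / 2 = 3,
// i.e. three packed-half registers with the last half undefined. A v5bf16
// argument instead gets RegisterVT = i32 with IntermediateVT = v2bf16.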
1204
1205static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
1206 const DataLayout &DL, Type *Ty,
1207 unsigned MaxNumLanes) {
1208 assert(MaxNumLanes != 0);
1209
1210 LLVMContext &Ctx = Ty->getContext();
1211 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1212 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1213 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1214 NumElts);
1215 }
1216
1217 return TLI.getValueType(DL, Ty);
1218}
1219
1220// Peek through TFE struct returns to only use the data size.
1221static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
1222 const DataLayout &DL, Type *Ty,
1223 unsigned MaxNumLanes) {
1224 auto *ST = dyn_cast<StructType>(Ty);
1225 if (!ST)
1226 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1227
1228 // TFE intrinsics return an aggregate type.
1229 assert(ST->getNumContainedTypes() == 2 &&
1230 ST->getContainedType(1)->isIntegerTy(32));
1231 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1232}
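
// Worked example (illustrative): a TFE image load declared to return
// { <4 x float>, i32 } only contributes its data member, so the memory VT is
// derived from <4 x float>; if the dmask requests two lanes, MaxNumLanes = 2
// clamps that further to v2f32.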
1233
1234/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1235/// in-memory representation. This return value is a custom type because there
1236/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1237/// could cause issues during codegen, these address space 7 pointers will be
1238/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1239/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1240/// for cost modeling, to work. (This also sets us up decently for doing the
1241/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1242MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
1243 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1244 return MVT::amdgpuBufferFatPointer;
1245 if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1246 DL.getPointerSizeInBits(AS) == 192)
1247 return MVT::amdgpuBufferStridedPointer;
1248 return AMDGPUTargetLowering::getPointerTy(DL, AS);
1249}
1250/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1251/// v8i32 when padding is added.
1252/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1253/// also v8i32 with padding.
1254MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
1255 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1256 DL.getPointerSizeInBits(AS) == 160) ||
1257 (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1258 DL.getPointerSizeInBits(AS) == 192))
1259 return MVT::v8i32;
1260 return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
1261}
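
// Worked example (illustrative): with the AMDGPU data layout, a
// ptr addrspace(7) value is 160 bits wide, so its value type is
// MVT::amdgpuBufferFatPointer while its loads and stores use the padded
// in-memory form v8i32 (256 bits). A 192-bit ptr addrspace(9) is handled the
// same way via MVT::amdgpuBufferStridedPointer.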
1262
1263static unsigned getIntrMemWidth(unsigned IntrID) {
1264 switch (IntrID) {
1265 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1266 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1267 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1268 return 8;
1269 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1270 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1271 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1272 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1273 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1274 return 32;
1275 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1276 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1277 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1278 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1279 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1280 return 64;
1281 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1282 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1283 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1284 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1285 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1286 return 128;
1287 default:
1288 llvm_unreachable("Unknown width");
1289 }
1290}
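
// Minimal usage sketch (hypothetical helper, not used elsewhere): it mirrors
// how the callers below combine getIntrMemWidth with EVT::getIntegerVT, e.g.
// the ...b128 intrinsics produce an i128 memory type and the ...b32 ones i32.
static EVT intrMemVTForWidth(LLVMContext &Ctx, unsigned IntrID) {
  // The advertised bit width becomes the integer EVT on the MachineMemOperand.
  return EVT::getIntegerVT(Ctx, getIntrMemWidth(IntrID));
}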
1291
1292static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad,
1293 TargetLowering::IntrinsicInfo &Info) {
1294 Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2);
1295 unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
1296 switch (AtomicOrderingCABI(Ord)) {
1299 break;
1302 break;
1305 break;
1306 default:
1308 break;
1309 }
1310
1311 Info.flags =
1313 Info.flags |= MOCooperative;
1314
1315 MDNode *ScopeMD = cast<MDNode>(
1316 cast<MetadataAsValue>(CI.getArgOperand(IsLoad ? 2 : 3))->getMetadata());
1317 StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
1318 Info.ssid = CI.getContext().getOrInsertSyncScopeID(Scope);
1319}
1320
1321bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
1322 const CallInst &CI,
1323 MachineFunction &MF,
1324 unsigned IntrID) const {
1325 Info.flags = MachineMemOperand::MONone;
1326 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1327 Info.flags |= MachineMemOperand::MOInvariant;
1328 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1330 Info.flags |= getTargetMMOFlags(CI);
1331
1332 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1334 AttributeSet Attr =
1336 MemoryEffects ME = Attr.getMemoryEffects();
1337 if (ME.doesNotAccessMemory())
1338 return false;
1339
1340 // TODO: Should images get their own address space?
1341 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1342
1343 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1344 if (RsrcIntr->IsImage) {
1345 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1347 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1348 Info.align.reset();
1349 }
1350
1351 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1352 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1353 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1354 // We conservatively set the memory operand of a buffer intrinsic to the
1355 // base resource pointer, so that we can access alias information about
1356 // those pointers. Cases like "this points at the same value
1357 // but with a different offset" are handled in
1358 // areMemAccessesTriviallyDisjoint.
1359 Info.ptrVal = RsrcArg;
1360 }
1361
1362 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1363 if (!IsSPrefetch) {
1364 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1365 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1366 Info.flags |= MachineMemOperand::MOVolatile;
1367 }
1368
1370 if (ME.onlyReadsMemory()) {
1371 if (RsrcIntr->IsImage) {
1372 unsigned MaxNumLanes = 4;
1373
1374 if (!BaseOpcode->Gather4) {
1375 // If this isn't a gather, we may have excess loaded elements in the
1376 // IR type. Check the dmask for the real number of elements loaded.
1377 unsigned DMask =
1378 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1379 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1380 }
1381
1382 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1383 CI.getType(), MaxNumLanes);
1384 } else {
1385 Info.memVT =
1387 std::numeric_limits<unsigned>::max());
1388 }
1389
1390 // FIXME: What does alignment mean for an image?
1391 Info.opc = ISD::INTRINSIC_W_CHAIN;
1392 Info.flags |= MachineMemOperand::MOLoad;
1393 } else if (ME.onlyWritesMemory()) {
1394 Info.opc = ISD::INTRINSIC_VOID;
1395
1396 Type *DataTy = CI.getArgOperand(0)->getType();
1397 if (RsrcIntr->IsImage) {
1398 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1399 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1400 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1401 DMaskLanes);
1402 } else
1403 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1404
1405 Info.flags |= MachineMemOperand::MOStore;
1406 } else {
1407 // Atomic, NoReturn Sampler or prefetch
1408 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1410 Info.flags |=
1412
1413 if (!IsSPrefetch)
1414 Info.flags |= MachineMemOperand::MOStore;
1415
1416 switch (IntrID) {
1417 default:
1418 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1419 // Fake memory access type for no return sampler intrinsics
1420 Info.memVT = MVT::i32;
1421 } else {
1422 // XXX - Should this be volatile without known ordering?
1423 Info.flags |= MachineMemOperand::MOVolatile;
1424 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1425 }
1426 break;
1427 case Intrinsic::amdgcn_raw_buffer_load_lds:
1428 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1429 case Intrinsic::amdgcn_struct_buffer_load_lds:
1430 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1431 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1432 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1433 Info.ptrVal = CI.getArgOperand(1);
1434 return true;
1435 }
1436 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1437 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1438 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1439 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1440 Info.memVT =
1442 std::numeric_limits<unsigned>::max());
1443 Info.flags &= ~MachineMemOperand::MOStore;
1444 return true;
1445 }
1446 }
1447 }
1448 return true;
1449 }
1450
1451 switch (IntrID) {
1452 case Intrinsic::amdgcn_ds_ordered_add:
1453 case Intrinsic::amdgcn_ds_ordered_swap: {
1454 Info.opc = ISD::INTRINSIC_W_CHAIN;
1455 Info.memVT = MVT::getVT(CI.getType());
1456 Info.ptrVal = CI.getOperand(0);
1457 Info.align.reset();
1459
1460 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1461 if (!Vol->isZero())
1462 Info.flags |= MachineMemOperand::MOVolatile;
1463
1464 return true;
1465 }
1466 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1467 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1468 Info.opc = ISD::INTRINSIC_W_CHAIN;
1469 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1470 Info.ptrVal = nullptr;
1471 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1473 return true;
1474 }
1475 case Intrinsic::amdgcn_ds_append:
1476 case Intrinsic::amdgcn_ds_consume: {
1477 Info.opc = ISD::INTRINSIC_W_CHAIN;
1478 Info.memVT = MVT::getVT(CI.getType());
1479 Info.ptrVal = CI.getOperand(0);
1480 Info.align.reset();
1482
1483 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1484 if (!Vol->isZero())
1485 Info.flags |= MachineMemOperand::MOVolatile;
1486
1487 return true;
1488 }
1489 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1490 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1491 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1494 Info.memVT = MVT::getVT(CI.getType());
1495 Info.ptrVal = CI.getOperand(0);
1496 Info.memVT = MVT::i64;
1497 Info.size = 8;
1498 Info.align.reset();
1500 return true;
1501 }
1502 case Intrinsic::amdgcn_global_atomic_csub: {
1503 Info.opc = ISD::INTRINSIC_W_CHAIN;
1504 Info.memVT = MVT::getVT(CI.getType());
1505 Info.ptrVal = CI.getOperand(0);
1506 Info.align.reset();
1509 return true;
1510 }
1511 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1512 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1513 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1514 Info.opc = ISD::INTRINSIC_W_CHAIN;
1515 Info.memVT =
1516 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1517 ? CI.getType()
1519 ->getElementType(0)); // XXX: what is correct VT?
1520
1521 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1522 Info.align.reset();
1523 Info.flags |=
1525 return true;
1526 }
1527 case Intrinsic::amdgcn_global_atomic_fmin_num:
1528 case Intrinsic::amdgcn_global_atomic_fmax_num:
1529 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1530 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1531 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1532 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1533 Info.opc = ISD::INTRINSIC_W_CHAIN;
1534 Info.memVT = MVT::getVT(CI.getType());
1535 Info.ptrVal = CI.getOperand(0);
1536 Info.align.reset();
1540 return true;
1541 }
1542 case Intrinsic::amdgcn_flat_load_monitor_b32:
1543 case Intrinsic::amdgcn_flat_load_monitor_b64:
1544 case Intrinsic::amdgcn_flat_load_monitor_b128:
1545 case Intrinsic::amdgcn_global_load_monitor_b32:
1546 case Intrinsic::amdgcn_global_load_monitor_b64:
1547 case Intrinsic::amdgcn_global_load_monitor_b128:
1548 case Intrinsic::amdgcn_cluster_load_b32:
1549 case Intrinsic::amdgcn_cluster_load_b64:
1550 case Intrinsic::amdgcn_cluster_load_b128:
1551 case Intrinsic::amdgcn_ds_load_tr6_b96:
1552 case Intrinsic::amdgcn_ds_load_tr4_b64:
1553 case Intrinsic::amdgcn_ds_load_tr8_b64:
1554 case Intrinsic::amdgcn_ds_load_tr16_b128:
1555 case Intrinsic::amdgcn_global_load_tr6_b96:
1556 case Intrinsic::amdgcn_global_load_tr4_b64:
1557 case Intrinsic::amdgcn_global_load_tr_b64:
1558 case Intrinsic::amdgcn_global_load_tr_b128:
1559 case Intrinsic::amdgcn_ds_read_tr4_b64:
1560 case Intrinsic::amdgcn_ds_read_tr6_b96:
1561 case Intrinsic::amdgcn_ds_read_tr8_b64:
1562 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1563 Info.opc = ISD::INTRINSIC_W_CHAIN;
1564 Info.memVT = MVT::getVT(CI.getType());
1565 Info.ptrVal = CI.getOperand(0);
1566 Info.align.reset();
1567 Info.flags |= MachineMemOperand::MOLoad;
1568 return true;
1569 }
1570 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1571 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1572 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1573 Info.opc = ISD::INTRINSIC_W_CHAIN;
1574 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1575 Info.ptrVal = CI.getOperand(0);
1576 Info.align.reset();
1577 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/true, Info);
1578 return true;
1579 }
1580 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1581 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1582 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1583 Info.opc = ISD::INTRINSIC_VOID;
1584 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1585 Info.ptrVal = CI.getArgOperand(0);
1586 Info.align.reset();
1587 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/false, Info);
1588 return true;
1589 }
1590 case Intrinsic::amdgcn_ds_gws_init:
1591 case Intrinsic::amdgcn_ds_gws_barrier:
1592 case Intrinsic::amdgcn_ds_gws_sema_v:
1593 case Intrinsic::amdgcn_ds_gws_sema_br:
1594 case Intrinsic::amdgcn_ds_gws_sema_p:
1595 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1596 Info.opc = ISD::INTRINSIC_VOID;
1597
1598 const GCNTargetMachine &TM =
1599 static_cast<const GCNTargetMachine &>(getTargetMachine());
1600
1601 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1602 Info.ptrVal = MFI->getGWSPSV(TM);
1603
1604 // This is an abstract access, but we need to specify a type and size.
1605 Info.memVT = MVT::i32;
1606 Info.size = 4;
1607 Info.align = Align(4);
1608
1609 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1610 Info.flags |= MachineMemOperand::MOLoad;
1611 else
1612 Info.flags |= MachineMemOperand::MOStore;
1613 return true;
1614 }
1615 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1616 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1617 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1618 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1619 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1620 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1621 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1622 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1623 Info.opc = ISD::INTRINSIC_VOID;
1624 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1625 Info.ptrVal = CI.getArgOperand(1);
1627 return true;
1628 }
1629 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1630 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1631 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1632 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1633 Info.opc = ISD::INTRINSIC_VOID;
1634 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1635 Info.ptrVal = CI.getArgOperand(0);
1637 return true;
1638 }
1639 case Intrinsic::amdgcn_load_to_lds:
1640 case Intrinsic::amdgcn_global_load_lds: {
1641 Info.opc = ISD::INTRINSIC_VOID;
1642 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1643 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1644 Info.ptrVal = CI.getArgOperand(1);
1646 return true;
1647 }
1648 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1649 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1650 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1651 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1652 Info.opc = ISD::INTRINSIC_W_CHAIN;
1653
1654 const GCNTargetMachine &TM =
1655 static_cast<const GCNTargetMachine &>(getTargetMachine());
1656
1657 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1658 Info.ptrVal = MFI->getGWSPSV(TM);
1659
1660 // This is an abstract access, but we need to specify a type and size.
1661 Info.memVT = MVT::i32;
1662 Info.size = 4;
1663 Info.align = Align(4);
1664
1666 return true;
1667 }
1668 case Intrinsic::amdgcn_s_prefetch_data:
1669 case Intrinsic::amdgcn_flat_prefetch:
1670 case Intrinsic::amdgcn_global_prefetch: {
1671 Info.opc = ISD::INTRINSIC_VOID;
1672 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1673 Info.ptrVal = CI.getArgOperand(0);
1674 Info.flags |= MachineMemOperand::MOLoad;
1675 return true;
1676 }
1677 default:
1678 return false;
1679 }
1680}
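
// Worked example (illustrative): for a call such as
// llvm.amdgcn.ds.append(ptr addrspace(3) %p, i1 true), the code above fills
// Info.opc = ISD::INTRINSIC_W_CHAIN, Info.memVT = MVT::i32 (the result type),
// Info.ptrVal = %p, and adds MOVolatile because the volatile operand is
// nonzero.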
1681
1682void SITargetLowering::CollectTargetIntrinsicOperands(
1683 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1684 switch (I.getIntrinsicID()) {
1685 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1686 // The DAG's ValueType loses the addrspaces.
1687 // Add them as 2 extra Constant operands "from" and "to".
1688 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1689 unsigned DstAS = I.getType()->getPointerAddressSpace();
1690 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1691 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1692 break;
1693 }
1694 default:
1695 break;
1696 }
1697}
1698
1699bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
1700 SmallVectorImpl<Value *> &Ops,
1701 Type *&AccessTy) const {
1702 Value *Ptr = nullptr;
1703 switch (II->getIntrinsicID()) {
1704 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1705 case Intrinsic::amdgcn_cluster_load_b128:
1706 case Intrinsic::amdgcn_cluster_load_b64:
1707 case Intrinsic::amdgcn_cluster_load_b32:
1708 case Intrinsic::amdgcn_ds_append:
1709 case Intrinsic::amdgcn_ds_consume:
1710 case Intrinsic::amdgcn_ds_load_tr8_b64:
1711 case Intrinsic::amdgcn_ds_load_tr16_b128:
1712 case Intrinsic::amdgcn_ds_load_tr4_b64:
1713 case Intrinsic::amdgcn_ds_load_tr6_b96:
1714 case Intrinsic::amdgcn_ds_read_tr4_b64:
1715 case Intrinsic::amdgcn_ds_read_tr6_b96:
1716 case Intrinsic::amdgcn_ds_read_tr8_b64:
1717 case Intrinsic::amdgcn_ds_read_tr16_b64:
1718 case Intrinsic::amdgcn_ds_ordered_add:
1719 case Intrinsic::amdgcn_ds_ordered_swap:
1720 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1721 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1722 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1723 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1724 case Intrinsic::amdgcn_flat_load_monitor_b128:
1725 case Intrinsic::amdgcn_flat_load_monitor_b32:
1726 case Intrinsic::amdgcn_flat_load_monitor_b64:
1727 case Intrinsic::amdgcn_global_atomic_csub:
1728 case Intrinsic::amdgcn_global_atomic_fmax_num:
1729 case Intrinsic::amdgcn_global_atomic_fmin_num:
1730 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1731 case Intrinsic::amdgcn_global_load_monitor_b128:
1732 case Intrinsic::amdgcn_global_load_monitor_b32:
1733 case Intrinsic::amdgcn_global_load_monitor_b64:
1734 case Intrinsic::amdgcn_global_load_tr_b64:
1735 case Intrinsic::amdgcn_global_load_tr_b128:
1736 case Intrinsic::amdgcn_global_load_tr4_b64:
1737 case Intrinsic::amdgcn_global_load_tr6_b96:
1738 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1739 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1740 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1741 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1742 Ptr = II->getArgOperand(0);
1743 break;
1744 case Intrinsic::amdgcn_load_to_lds:
1745 case Intrinsic::amdgcn_global_load_lds:
1746 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1747 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1748 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1749 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1750 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1751 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1752 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1753 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1754 Ptr = II->getArgOperand(1);
1755 break;
1756 default:
1757 return false;
1758 }
1759 AccessTy = II->getType();
1760 Ops.push_back(Ptr);
1761 return true;
1762}
1763
1764bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
1765 unsigned AddrSpace) const {
1766 if (!Subtarget->hasFlatInstOffsets()) {
1767 // Flat instructions do not have offsets, and only have the register
1768 // address.
1769 return AM.BaseOffs == 0 && AM.Scale == 0;
1770 }
1771
1772 decltype(SIInstrFlags::FLAT) FlatVariant =
1773 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ? SIInstrFlags::FlatGlobal
1774 : AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ? SIInstrFlags::FlatScratch
1775 : SIInstrFlags::FLAT;
1776
1777 return AM.Scale == 0 &&
1778 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1779 AM.BaseOffs, AddrSpace, FlatVariant));
1780}
1781
1782bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
1783 if (Subtarget->hasFlatGlobalInsts())
1784 return isLegalFlatAddressingMode(AM, AMDGPUAS::GLOBAL_ADDRESS);
1786 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1787 // Assume that we will use FLAT for all global memory accesses
1788 // on VI.
1789 // FIXME: This assumption is currently wrong. On VI we still use
1790 // MUBUF instructions for the r + i addressing mode. As currently
1791 // implemented, the MUBUF instructions only work on buffer < 4GB.
1792 // It may be possible to support > 4GB buffers with MUBUF instructions,
1793 // by setting the stride value in the resource descriptor which would
1794 // increase the size limit to (stride * 4GB). However, this is risky,
1795 // because it has never been validated.
1796 return isLegalFlatAddressingMode(AM, AMDGPUAS::GLOBAL_ADDRESS);
1797 }
1798
1799 return isLegalMUBUFAddressingMode(AM);
1800}
1801
1802bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1803 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1804 // additionally can do r + r + i with addr64. 32-bit has more addressing
1805 // mode options. Depending on the resource constant, it can also do
1806 // (i64 r0) + (i32 r1) * (i14 i).
1807 //
1808 // Private arrays end up using a scratch buffer most of the time, so also
1809 // assume those use MUBUF instructions. Scratch loads / stores are currently
1810 // implemented as mubuf instructions with the offen bit set, so they are
1811 // slightly different from the normal addr64.
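  // Concretely, the switch below accepts immediate-only (i), r + i, r + r,
  // and r + r + i forms, plus 2 * r and 2 * r + i when there is no extra base
  // register (those are re-expressed as r + r and r + r + i); any other
  // scaled index is rejected.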
1812 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1813 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1814 return false;
1815
1816 // FIXME: Since we can split immediate into soffset and immediate offset,
1817 // would it make sense to allow any immediate?
1818
1819 switch (AM.Scale) {
1820 case 0: // r + i or just i, depending on HasBaseReg.
1821 return true;
1822 case 1:
1823 return true; // We have r + r or r + i.
1824 case 2:
1825 if (AM.HasBaseReg) {
1826 // Reject 2 * r + r.
1827 return false;
1828 }
1829
1830 // Allow 2 * r as r + r
1831 // Or 2 * r + i is allowed as r + r + i.
1832 return true;
1833 default: // Don't allow n * r
1834 return false;
1835 }
1836}
1837
1838bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1839 const AddrMode &AM, Type *Ty,
1840 unsigned AS,
1841 Instruction *I) const {
1842 // No global is ever allowed as a base.
1843 if (AM.BaseGV)
1844 return false;
1845
1846 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1847 return isLegalGlobalAddressingMode(AM);
1848
1849 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1850 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
1851 AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
1852 AS == AMDGPUAS::BUFFER_STRIDED_POINTER) {
1853 // If the offset isn't a multiple of 4, it probably isn't going to be
1854 // correctly aligned.
1855 // FIXME: Can we get the real alignment here?
1856 if (AM.BaseOffs % 4 != 0)
1857 return isLegalMUBUFAddressingMode(AM);
1858
1859 if (!Subtarget->hasScalarSubwordLoads()) {
1860 // There are no SMRD extloads, so if we have to do a small type access we
1861 // will use a MUBUF load.
1862 // FIXME?: We also need to do this if unaligned, but we don't know the
1863 // alignment here.
1864 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1865 return isLegalGlobalAddressingMode(AM);
1866 }
1867
1868 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1869 // SMRD instructions have an 8-bit, dword offset on SI.
1870 if (!isUInt<8>(AM.BaseOffs / 4))
1871 return false;
1872 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1873 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1874 // in 8-bits, it can use a smaller encoding.
1875 if (!isUInt<32>(AM.BaseOffs / 4))
1876 return false;
1877 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1878 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1879 if (!isUInt<20>(AM.BaseOffs))
1880 return false;
1881 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1882 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1883 // for S_BUFFER_* instructions).
1884 if (!isInt<21>(AM.BaseOffs))
1885 return false;
1886 } else {
1887 // On GFX12, all offsets are signed 24-bit in bytes.
1888 if (!isInt<24>(AM.BaseOffs))
1889 return false;
1890 }
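  // Summary of the scalar (SMRD/SMEM) immediate offset ranges checked above:
  //   SI:       unsigned 8-bit, in dwords
  //   CI:       unsigned 32-bit literal, in dwords
  //   VI/GFX8:  unsigned 20-bit, in bytes
  //   GFX9-11:  signed 21-bit, in bytes (non-negative for S_BUFFER_*)
  //   GFX12+:   signed 24-bit, in bytes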
1891
1892 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1893 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1894 AM.BaseOffs < 0) {
1895 // Scalar (non-buffer) loads can only use a negative offset if
1896 // soffset+offset is non-negative. Since the compiler can only prove that
1897 // in a few special cases, it is safer to claim that negative offsets are
1898 // not supported.
1899 return false;
1900 }
1901
1902 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1903 return true;
1904
1905 if (AM.Scale == 1 && AM.HasBaseReg)
1906 return true;
1907
1908 return false;
1909 }
1910
1911 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1912 return Subtarget->enableFlatScratch()
1913 ? isLegalFlatAddressingMode(AM, AMDGPUAS::PRIVATE_ADDRESS)
1914 : isLegalMUBUFAddressingMode(AM);
1915
1916 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1917 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1918 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1919 // field.
1920 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1921 // an 8-bit dword offset but we don't know the alignment here.
1922 if (!isUInt<16>(AM.BaseOffs))
1923 return false;
1924
1925 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1926 return true;
1927
1928 if (AM.Scale == 1 && AM.HasBaseReg)
1929 return true;
1930
1931 return false;
1932 }
1933
1934 if (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
1935 // For an unknown address space, this usually means that this is for some
1936 // reason being used for pure arithmetic, and not based on some addressing
1937 // computation. We don't have instructions that compute pointers with any
1938 // addressing modes, so treat them as having no offset like flat
1939 // instructions.
1940 return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS);
1941 }
1942
1943 // Assume a user alias of global for unknown address spaces.
1944 return isLegalGlobalAddressingMode(AM);
1945}
1946
1947bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1948 const MachineFunction &MF) const {
1949 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1950 return (MemVT.getSizeInBits() <= 4 * 32);
1951 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1952 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1953 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1954 }
1955 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
1956 return (MemVT.getSizeInBits() <= 2 * 32);
1957 return true;
1958}
1959
1960bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
1961 unsigned Size, unsigned AddrSpace, Align Alignment,
1962 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1963 if (IsFast)
1964 *IsFast = 0;
1965
1966 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1967 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1968 // Check if alignment requirements for ds_read/write instructions are
1969 // disabled.
1970 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1971 return false;
1972
1973 Align RequiredAlignment(
1974 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
1975 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1976 Alignment < RequiredAlignment)
1977 return false;
1978
1979 // Either the alignment requirements are "enabled", or there is an
1980 // unaligned-LDS-access-related hardware bug even though the alignment
1981 // requirements are "disabled". In either case, we need to check for proper
1982 // alignment requirements.
1983 //
1984 switch (Size) {
1985 case 64:
1986 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1987 // address is negative, then the instruction is incorrectly treated as
1988 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1989 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1990 // load later in the SILoadStoreOptimizer.
1991 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1992 return false;
1993
1994 // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
1995 // can do a 4-byte aligned, 8-byte access in a single operation using
1996 // ds_read2/write2_b32 with adjacent offsets.
1997 RequiredAlignment = Align(4);
1998
1999 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2000 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
2001 // ds_write2_b32 depending on the alignment. In either case with either
2002 // alignment there is no faster way of doing this.
2003
2004 // The numbers returned here and below are not additive, they form a 'speed
2005 // rank'. They are just meant to be compared to decide if a certain way
2006 // of lowering an operation is faster than another. For that purpose a
2007 // naturally aligned operation gets its bitsize to indicate that "it
2008 // operates with a speed comparable to an N-bit wide load". With the full
2009 // alignment ds128 is slower than ds96, for example. If underaligned it
2010 // is comparable to the speed of a single dword access, which would then
2011 // mean 32 < 128 and it is faster to issue a wide load regardless.
2012 // 1 simply means "slow, don't do it": when comparing an aligned load to a
2013 // wider load that will no longer be aligned, the latter is slower.
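  // For example, with Size == 64 and RequiredAlignment == Align(4), an
  // alignment of 4 or more reports a rank of 64 (ds_read_b64 or
  // ds_read2_b32), while a smaller alignment reports 32, so the wide
  // access is still preferred over splitting it into dword accesses.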
2014 if (IsFast)
2015 *IsFast = (Alignment >= RequiredAlignment) ? 64
2016 : (Alignment < Align(4)) ? 32
2017 : 1;
2018 return true;
2019 }
2020
2021 break;
2022 case 96:
2023 if (!Subtarget->hasDS96AndDS128())
2024 return false;
2025
2026 // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
2027 // gfx8 and older.
2028
2029 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2030 // Naturally aligned access is fastest. However, also report it as fast
2031 // if the memory is aligned to less than a dword. A narrow load or store
2032 // will be equally slow as a single ds_read_b96/ds_write_b96, but there
2033 // will be more of them, so overall we will pay less of a penalty by
2034 // issuing a single instruction.
2035
2036 // See comment on the values above.
2037 if (IsFast)
2038 *IsFast = (Alignment >= RequiredAlignment) ? 96
2039 : (Alignment < Align(4)) ? 32
2040 : 1;
2041 return true;
2042 }
2043
2044 break;
2045 case 128:
2046 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2047 return false;
2048
2049 // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
2050 // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
2051 // single operation using ds_read2/write2_b64.
2052 RequiredAlignment = Align(8);
2053
2054 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2055 // Naturally aligned access is fastest. However, also report it as fast
2056 // if the memory is aligned to less than a dword. A narrow load or store
2057 // will be equally slow as a single ds_read_b128/ds_write_b128, but there
2058 // will be more of them, so overall we will pay less of a penalty by
2059 // issuing a single instruction.
2060
2061 // See comment on the values above.
2062 if (IsFast)
2063 *IsFast = (Alignment >= RequiredAlignment) ? 128
2064 : (Alignment < Align(4)) ? 32
2065 : 1;
2066 return true;
2067 }
2068
2069 break;
2070 default:
2071 if (Size > 32)
2072 return false;
2073
2074 break;
2075 }
2076
2077 // See comment on the values above.
2078 // Note that we have a single-dword or sub-dword here, so if underaligned
2079 // it is a slowest possible access, hence returned value is 0.
2080 if (IsFast)
2081 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2082
2083 return Alignment >= RequiredAlignment ||
2084 Subtarget->hasUnalignedDSAccessEnabled();
2085 }
2086
2087 // FIXME: We have to be conservative here and assume that flat operations
2088 // will access scratch. If we had access to the IR function, then we
2089 // could determine if any private memory was used in the function.
2090 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2091 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2092 bool AlignedBy4 = Alignment >= Align(4);
2093 if (IsFast)
2094 *IsFast = AlignedBy4;
2095
2096 return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
2097 }
2098
2099 // So long as they are correct, wide global memory operations perform better
2100 // than multiple smaller memory ops -- even when misaligned
2101 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
2102 if (IsFast)
2103 *IsFast = Size;
2104
2105 return Alignment >= Align(4) ||
2106 Subtarget->hasUnalignedBufferAccessEnabled();
2107 }
2108
2109 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
2110 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
2111 // out-of-bounds behavior, but in the edge case where an access starts
2112 // out-of-bounds and then enters in-bounds, the entire access would be treated
2113 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2114 // natural alignment of buffer accesses.
2115 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2116 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2117 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2118 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2119 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
2120 return false;
2121 }
2122
2123 // Smaller than dword value must be aligned.
2124 if (Size < 32)
2125 return false;
2126
2127 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2128 // byte-address are ignored, thus forcing Dword alignment.
2129 // This applies to private, global, and constant memory.
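  // For example, a dword access at byte address 0x1002 is treated by the
  // hardware as if it were at address 0x1000.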
2130 if (IsFast)
2131 *IsFast = 1;
2132
2133 return Size >= 32 && Alignment >= Align(4);
2134}
2135
2136bool SITargetLowering::allowsMisalignedMemoryAccesses(
2137 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2138 unsigned *IsFast) const {
2139 return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
2140 Alignment, Flags, IsFast);
2141}
2142
2143EVT SITargetLowering::getOptimalMemOpType(
2144 LLVMContext &Context, const MemOp &Op,
2145 const AttributeList &FuncAttributes) const {
2146 // FIXME: Should account for address space here.
2147
2148 // The default fallback uses the private pointer size as a guess for a type to
2149 // use. Make sure we switch these to 64-bit accesses.
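  // For example, a 16-byte or larger copy whose destination is known to be
  // 4-byte aligned is given type v4i32 here, which typically lowers to
  // dwordx4 memory operations; an 8-byte copy gets v2i32 (dwordx2).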
2150
2151 if (Op.size() >= 16 &&
2152 Op.isDstAligned(Align(4))) // XXX: Should only do for global
2153 return MVT::v4i32;
2154
2155 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2156 return MVT::v2i32;
2157
2158 // Use the default.
2159 return MVT::Other;
2160}
2161
2162bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
2163 const MemSDNode *MemNode = cast<MemSDNode>(N);
2164 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2165}
2166
2171
2172bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
2173 unsigned DestAS) const {
2174 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2175 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2176 Subtarget->hasGloballyAddressableScratch()) {
2177 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2178 return false;
2179 }
2180
2181 // Flat -> private/local is a simple truncate.
2182 // Flat -> global is no-op
2183 return true;
2184 }
2185
2186 const GCNTargetMachine &TM =
2187 static_cast<const GCNTargetMachine &>(getTargetMachine());
2188 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2189}
2190
2198
2199bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
2200 Type *Ty) const {
2201 // FIXME: Could be smarter if called for vector constants.
2202 return true;
2203}
2204
2205bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
2206 unsigned Index) const {
2207 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
2208 return false;
2209
2210 // TODO: Add more cases that are cheap.
2211 return Index == 0;
2212}
2213
2214bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2215 // TODO: This should be more aggressive, particular for 16-bit element
2216 // vectors. However there are some mixed improvements and regressions.
2217 EVT EltTy = VT.getVectorElementType();
2218 return EltTy.getSizeInBits() % 32 == 0;
2219}
2220
2221bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
2222 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2223 switch (Op) {
2224 case ISD::LOAD:
2225 case ISD::STORE:
2226 return true;
2227 default:
2228 return false;
2229 }
2230 }
2231
2232 // SimplifySetCC uses this function to determine whether or not it should
2233 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2234 if (VT == MVT::i1 && Op == ISD::SETCC)
2235 return false;
2236
2237 return TargetLowering::isTypeDesirableForOp(Op, VT);
2238}
2239
2240SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2241 const SDLoc &SL,
2242 SDValue Chain,
2243 uint64_t Offset) const {
2244 const DataLayout &DL = DAG.getDataLayout();
2248
2249 auto [InputPtrReg, RC, ArgTy] =
2250 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2251
2252 // We may not have the kernarg segment argument if we have no kernel
2253 // arguments.
2254 if (!InputPtrReg)
2255 return DAG.getConstant(Offset, SL, PtrVT);
2256
2258 SDValue BasePtr = DAG.getCopyFromReg(
2259 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2260
2261 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2262}
2263
2264SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2265 const SDLoc &SL) const {
2268 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2269}
2270
2271SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2272 const SDLoc &SL) const {
2273
2275 std::optional<uint32_t> KnownSize =
2277 if (KnownSize.has_value())
2278 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2279 return SDValue();
2280}
2281
2282SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2283 const SDLoc &SL, SDValue Val,
2284 bool Signed,
2285 const ISD::InputArg *Arg) const {
2286 // First, if it is a widened vector, narrow it.
2287 if (VT.isVector() &&
2289 EVT NarrowedVT =
2292 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2293 DAG.getConstant(0, SL, MVT::i32));
2294 }
2295
2296 // Then convert the vector elements or scalar value.
2297 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2298 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2299 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2300 }
2301
2302 if (MemVT.isFloatingPoint())
2303 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2304 else if (Signed)
2305 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2306 else
2307 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2308
2309 return Val;
2310}
2311
2312SDValue SITargetLowering::lowerKernargMemParameter(
2313 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2314 uint64_t Offset, Align Alignment, bool Signed,
2315 const ISD::InputArg *Arg) const {
2316 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2317
2318 // Try to avoid using an extload by loading earlier than the argument address,
2319 // and extracting the relevant bits. The load should hopefully be merged with
2320 // the previous argument.
2321 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2322 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2323 int64_t AlignDownOffset = alignDown(Offset, 4);
2324 int64_t OffsetDiff = Offset - AlignDownOffset;
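    // For example, a 2-byte argument at Offset == 6 gives AlignDownOffset == 4
    // and OffsetDiff == 2: the dword at offset 4 is loaded and shifted right
    // by 16 bits below to extract the argument value.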
2325
2326 EVT IntVT = MemVT.changeTypeToInteger();
2327
2328 // TODO: If we passed in the base kernel offset we could have a better
2329 // alignment than 4, but we don't really need it.
2330 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2331 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2334
2335 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2336 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2337
2338 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2339 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2340 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2341
2342 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2343 }
2344
2345 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2346 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2349
2350 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2351 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2352}
2353
2354SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2355 CCValAssign &VA, const SDLoc &SL,
2356 SDValue Chain,
2357 const ISD::InputArg &Arg) const {
2358 MachineFunction &MF = DAG.getMachineFunction();
2359 MachineFrameInfo &MFI = MF.getFrameInfo();
2360
2361 if (Arg.Flags.isByVal()) {
2362 unsigned Size = Arg.Flags.getByValSize();
2363 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2364 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2365 }
2366
2367 unsigned ArgOffset = VA.getLocMemOffset();
2368 unsigned ArgSize = VA.getValVT().getStoreSize();
2369
2370 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2371
2372 // Create load nodes to retrieve arguments from the stack.
2373 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2374 SDValue ArgValue;
2375
2376 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2377 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
2378 MVT MemVT = VA.getValVT();
2379
2380 switch (VA.getLocInfo()) {
2381 default:
2382 break;
2383 case CCValAssign::BCvt:
2384 MemVT = VA.getLocVT();
2385 break;
2386 case CCValAssign::SExt:
2387 ExtType = ISD::SEXTLOAD;
2388 break;
2389 case CCValAssign::ZExt:
2390 ExtType = ISD::ZEXTLOAD;
2391 break;
2392 case CCValAssign::AExt:
2393 ExtType = ISD::EXTLOAD;
2394 break;
2395 }
2396
2397 ArgValue = DAG.getExtLoad(
2398 ExtType, SL, VA.getLocVT(), Chain, FIN,
2399 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2400 return ArgValue;
2401}
2402
2403SDValue SITargetLowering::getPreloadedValue(
2404 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2405 AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
2406 const ArgDescriptor *Reg = nullptr;
2407 const TargetRegisterClass *RC;
2408 LLT Ty;
2409
2410 CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv();
2411 const ArgDescriptor WorkGroupIDX =
2412 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2413 // If GridZ is not programmed in an entry function then the hardware will set
2414 // it to all zeros, so there is no need to mask the GridY value in the low
2415 // order bits.
2416 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2417 AMDGPU::TTMP7,
2418 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2419 const ArgDescriptor WorkGroupIDZ =
2420 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
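  // With architected SGPRs, TTMP9 holds workgroup ID X, while TTMP7 packs
  // workgroup ID Y in bits [15:0] and workgroup ID Z in bits [31:16], which
  // is what the masks above encode.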
2421 if (Subtarget->hasArchitectedSGPRs() &&
2424 switch (PVID) {
2425 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
2426 Reg = &WorkGroupIDX;
2427 RC = &AMDGPU::SReg_32RegClass;
2428 Ty = LLT::scalar(32);
2429 break;
2430 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
2431 Reg = &WorkGroupIDY;
2432 RC = &AMDGPU::SReg_32RegClass;
2433 Ty = LLT::scalar(32);
2434 break;
2435 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
2436 Reg = &WorkGroupIDZ;
2437 RC = &AMDGPU::SReg_32RegClass;
2438 Ty = LLT::scalar(32);
2439 break;
2440 default:
2441 break;
2442 }
2443 }
2444
2445 if (!Reg)
2446 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2447 if (!Reg) {
2448 if (PVID == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
2449 // It's possible for a kernarg intrinsic call to appear in a kernel with
2450 // no allocated segment, in which case we do not add the user sgpr
2451 // argument, so just return null.
2452 return DAG.getConstant(0, SDLoc(), VT);
2453 }
2454
2455 // It's undefined behavior if a function marked with the amdgpu-no-*
2456 // attributes uses the corresponding intrinsic.
2457 return DAG.getPOISON(VT);
2458 }
2459
2460 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2461}
2462
2463static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
2464 CallingConv::ID CallConv,
2465 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2466 FunctionType *FType,
2467 SIMachineFunctionInfo *Info) {
2468 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2469 const ISD::InputArg *Arg = &Ins[I];
2470
2471 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2472 "vector type argument should have been split");
2473
2474 // First check if it's a PS input addr.
2475 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2476 PSInputNum <= 15) {
2477 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2478
2479 // Inconveniently only the first part of the split is marked as isSplit,
2480 // so skip to the end. We only want to increment PSInputNum once for the
2481 // entire split argument.
2482 if (Arg->Flags.isSplit()) {
2483 while (!Arg->Flags.isSplitEnd()) {
2484 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2485 "unexpected vector split in ps argument type");
2486 if (!SkipArg)
2487 Splits.push_back(*Arg);
2488 Arg = &Ins[++I];
2489 }
2490 }
2491
2492 if (SkipArg) {
2493 // We can safely skip PS inputs.
2494 Skipped.set(Arg->getOrigArgIndex());
2495 ++PSInputNum;
2496 continue;
2497 }
2498
2499 Info->markPSInputAllocated(PSInputNum);
2500 if (Arg->Used)
2501 Info->markPSInputEnabled(PSInputNum);
2502
2503 ++PSInputNum;
2504 }
2505
2506 Splits.push_back(*Arg);
2507 }
2508}
2509
2510// Allocate special inputs passed in VGPRs.
2511void SITargetLowering::allocateSpecialEntryInputVGPRs(
2512 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2513 SIMachineFunctionInfo &Info) const {
2514 const LLT S32 = LLT::scalar(32);
2515 MachineRegisterInfo &MRI = MF.getRegInfo();
2516
2517 if (Info.hasWorkItemIDX()) {
2518 Register Reg = AMDGPU::VGPR0;
2519 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2520
2521 CCInfo.AllocateReg(Reg);
2522 unsigned Mask =
2523 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2524 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2525 }
2526
2527 if (Info.hasWorkItemIDY()) {
2528 assert(Info.hasWorkItemIDX());
2529 if (Subtarget->hasPackedTID()) {
2530 Info.setWorkItemIDY(
2531 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2532 } else {
2533 unsigned Reg = AMDGPU::VGPR1;
2534 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2535
2536 CCInfo.AllocateReg(Reg);
2537 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2538 }
2539 }
2540
2541 if (Info.hasWorkItemIDZ()) {
2542 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2543 if (Subtarget->hasPackedTID()) {
2544 Info.setWorkItemIDZ(
2545 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2546 } else {
2547 unsigned Reg = AMDGPU::VGPR2;
2548 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2549
2550 CCInfo.AllocateReg(Reg);
2551 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2552 }
2553 }
2554}
2555
2556// Try to allocate a VGPR at the end of the argument list, or if no argument
2557// VGPRs are left, allocate a stack slot instead.
2558// If \p Mask is given, it indicates the bitfield position in the register.
2559// If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2560static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2561 ArgDescriptor Arg = ArgDescriptor()) {
2562 if (Arg.isSet())
2563 return ArgDescriptor::createArg(Arg, Mask);
2564
2565 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2566 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2567 if (RegIdx == ArgVGPRs.size()) {
2568 // Spill to stack required.
2569 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2570
2571 return ArgDescriptor::createStack(Offset, Mask);
2572 }
2573
2574 unsigned Reg = ArgVGPRs[RegIdx];
2575 Reg = CCInfo.AllocateReg(Reg);
2576 assert(Reg != AMDGPU::NoRegister);
2577
2578 MachineFunction &MF = CCInfo.getMachineFunction();
2579 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2580 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2581 return ArgDescriptor::createRegister(Reg, Mask);
2582}
2583
2584static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
2585 const TargetRegisterClass *RC,
2586 unsigned NumArgRegs) {
2587 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2588 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2589 if (RegIdx == ArgSGPRs.size())
2590 report_fatal_error("ran out of SGPRs for arguments");
2591
2592 unsigned Reg = ArgSGPRs[RegIdx];
2593 Reg = CCInfo.AllocateReg(Reg);
2594 assert(Reg != AMDGPU::NoRegister);
2595
2596 MachineFunction &MF = CCInfo.getMachineFunction();
2597 MF.addLiveIn(Reg, RC);
2598 return ArgDescriptor::createRegister(Reg);
2599}
2600
2601// If this has a fixed position, we still should allocate the register in the
2602// CCInfo state. Technically we could get away with this for values passed
2603// outside of the normal argument range.
2604static void allocateFixedSGPRInputImpl(CCState &CCInfo,
2605 const TargetRegisterClass *RC,
2606 MCRegister Reg) {
2607 Reg = CCInfo.AllocateReg(Reg);
2608 assert(Reg != AMDGPU::NoRegister);
2609 MachineFunction &MF = CCInfo.getMachineFunction();
2610 MF.addLiveIn(Reg, RC);
2611}
2612
2613static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2614 if (Arg) {
2615 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2616 Arg.getRegister());
2617 } else
2618 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2619}
2620
2621static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2622 if (Arg) {
2623 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2624 Arg.getRegister());
2625 } else
2626 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2627}
2628
2629/// Allocate implicit function VGPR arguments at the end of allocated user
2630/// arguments.
2631void SITargetLowering::allocateSpecialInputVGPRs(
2632 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2633 SIMachineFunctionInfo &Info) const {
2634 const unsigned Mask = 0x3ff;
2635 ArgDescriptor Arg;
2636
2637 if (Info.hasWorkItemIDX()) {
2638 Arg = allocateVGPR32Input(CCInfo, Mask);
2639 Info.setWorkItemIDX(Arg);
2640 }
2641
2642 if (Info.hasWorkItemIDY()) {
2643 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2644 Info.setWorkItemIDY(Arg);
2645 }
2646
2647 if (Info.hasWorkItemIDZ())
2648 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2649}
2650
2651/// Allocate implicit function VGPR arguments in fixed registers.
2652void SITargetLowering::allocateSpecialInputVGPRsFixed(
2653 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2654 SIMachineFunctionInfo &Info) const {
2655 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2656 if (!Reg)
2657 report_fatal_error("failed to allocate VGPR for implicit arguments");
2658
2659 const unsigned Mask = 0x3ff;
2660 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2661 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2662 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
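  // All three workitem IDs are packed into this single VGPR: X in bits [9:0],
  // Y in bits [19:10], and Z in bits [29:20].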
2663}
2664
2665void SITargetLowering::allocateSpecialInputSGPRs(
2666 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2667 SIMachineFunctionInfo &Info) const {
2668 auto &ArgInfo = Info.getArgInfo();
2669 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2670
2671 // TODO: Unify handling with private memory pointers.
2672 if (UserSGPRInfo.hasDispatchPtr())
2673 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2674
2675 if (UserSGPRInfo.hasQueuePtr())
2676 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2677
2678 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2679 // constant offset from the kernarg segment.
2680 if (Info.hasImplicitArgPtr())
2681 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2682
2683 if (UserSGPRInfo.hasDispatchID())
2684 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2685
2686 // flat_scratch_init is not applicable for non-kernel functions.
2687
2688 if (Info.hasWorkGroupIDX())
2689 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2690
2691 if (Info.hasWorkGroupIDY())
2692 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2693
2694 if (Info.hasWorkGroupIDZ())
2695 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2696
2697 if (Info.hasLDSKernelId())
2698 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2699}
2700
2701// Allocate special inputs passed in user SGPRs.
2702void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
2703 MachineFunction &MF,
2704 const SIRegisterInfo &TRI,
2705 SIMachineFunctionInfo &Info) const {
2706 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2707 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2708 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2709 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2710 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2711 }
2712
2713 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2714 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2715 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2716 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2717 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2718 }
2719
2720 if (UserSGPRInfo.hasDispatchPtr()) {
2721 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2722 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2723 CCInfo.AllocateReg(DispatchPtrReg);
2724 }
2725
2726 if (UserSGPRInfo.hasQueuePtr()) {
2727 Register QueuePtrReg = Info.addQueuePtr(TRI);
2728 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2729 CCInfo.AllocateReg(QueuePtrReg);
2730 }
2731
2732 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2733 MachineRegisterInfo &MRI = MF.getRegInfo();
2734 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2735 CCInfo.AllocateReg(InputPtrReg);
2736
2737 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2738 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2739 }
2740
2741 if (UserSGPRInfo.hasDispatchID()) {
2742 Register DispatchIDReg = Info.addDispatchID(TRI);
2743 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2744 CCInfo.AllocateReg(DispatchIDReg);
2745 }
2746
2747 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2748 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2749 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2750 CCInfo.AllocateReg(FlatScratchInitReg);
2751 }
2752
2753 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2754 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2755 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2756 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2757 }
2758
2759 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2760 // these from the dispatch pointer.
2761}
2762
2763// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2764// sequential, starting from the first argument.
2765void SITargetLowering::allocatePreloadKernArgSGPRs(
2766 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2767 const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
2768 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2769 Function &F = MF.getFunction();
2770 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2771 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2772 bool InPreloadSequence = true;
2773 unsigned InIdx = 0;
2774 bool AlignedForImplictArgs = false;
2775 unsigned ImplicitArgOffset = 0;
2776 for (auto &Arg : F.args()) {
2777 if (!InPreloadSequence || !Arg.hasInRegAttr())
2778 break;
2779
2780 unsigned ArgIdx = Arg.getArgNo();
2781 // Don't preload non-original args or parts not in the current preload
2782 // sequence.
2783 if (InIdx < Ins.size() &&
2784 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2785 break;
2786
2787 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2788 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2789 InIdx++) {
2790 assert(ArgLocs[ArgIdx].isMemLoc());
2791 auto &ArgLoc = ArgLocs[InIdx];
2792 const Align KernelArgBaseAlign = Align(16);
2793 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2794 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2795 unsigned NumAllocSGPRs =
2796 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2797
2798 // Fix alignment for hidden arguments.
2799 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2800 if (!AlignedForImplictArgs) {
2801 ImplicitArgOffset =
2802 alignTo(LastExplicitArgOffset,
2803 Subtarget->getAlignmentForImplicitArgPtr()) -
2804 LastExplicitArgOffset;
2805 AlignedForImplictArgs = true;
2806 }
2807 ArgOffset += ImplicitArgOffset;
2808 }
2809
2810 // Arg is preloaded into the previous SGPR.
2811 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2812 assert(InIdx >= 1 && "No previous SGPR");
2813 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2814 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2815 continue;
2816 }
2817
2818 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2819 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2820 // Check for free user SGPRs for preloading.
2821 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2822 InPreloadSequence = false;
2823 break;
2824 }
2825
2826 // Preload this argument.
2827 const TargetRegisterClass *RC =
2828 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2829 SmallVectorImpl<MCRegister> *PreloadRegs =
2830 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2831
2832 if (PreloadRegs->size() > 1)
2833 RC = &AMDGPU::SGPR_32RegClass;
2834 for (auto &Reg : *PreloadRegs) {
2835 assert(Reg);
2836 MF.addLiveIn(Reg, RC);
2837 CCInfo.AllocateReg(Reg);
2838 }
2839
2840 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2841 }
2842 }
2843}
2844
2845void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
2846 const SIRegisterInfo &TRI,
2847 SIMachineFunctionInfo &Info) const {
2848 // Always allocate this last since it is a synthetic preload.
2849 if (Info.hasLDSKernelId()) {
2850 Register Reg = Info.addLDSKernelId();
2851 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2852 CCInfo.AllocateReg(Reg);
2853 }
2854}
2855
2856// Allocate special input registers that are initialized per-wave.
2857void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF,
2858 SIMachineFunctionInfo &Info,
2859 CallingConv::ID CallConv,
2860 bool IsShader) const {
2861 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2862 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2863 // Note: user SGPRs are handled by the front-end for graphics shaders
2864 // Pad up the used user SGPRs with dead inputs.
2865
2866 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2867 // before enabling architected SGPRs for workgroup IDs.
2868 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2869
2870 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2871 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2872 // rely on it to reach 16 since if we end up having no stack usage, it will
2873 // not really be added.
2874 unsigned NumRequiredSystemSGPRs =
2875 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
2876 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
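    // For example, with 12 user SGPRs already in use and 2 required system
    // SGPRs, the loop below adds 2 dead user SGPRs so that a total of 16
    // SGPRs ends up being initialized.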
2877 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2878 Register Reg = Info.addReservedUserSGPR();
2879 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2880 CCInfo.AllocateReg(Reg);
2881 }
2882 }
2883
2884 if (!HasArchitectedSGPRs) {
2885 if (Info.hasWorkGroupIDX()) {
2886 Register Reg = Info.addWorkGroupIDX();
2887 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2888 CCInfo.AllocateReg(Reg);
2889 }
2890
2891 if (Info.hasWorkGroupIDY()) {
2892 Register Reg = Info.addWorkGroupIDY();
2893 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2894 CCInfo.AllocateReg(Reg);
2895 }
2896
2897 if (Info.hasWorkGroupIDZ()) {
2898 Register Reg = Info.addWorkGroupIDZ();
2899 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2900 CCInfo.AllocateReg(Reg);
2901 }
2902 }
2903
2904 if (Info.hasWorkGroupInfo()) {
2905 Register Reg = Info.addWorkGroupInfo();
2906 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2907 CCInfo.AllocateReg(Reg);
2908 }
2909
2910 if (Info.hasPrivateSegmentWaveByteOffset()) {
2911 // Scratch wave offset passed in system SGPR.
2912 unsigned PrivateSegmentWaveByteOffsetReg;
2913
2914 if (IsShader) {
2915 PrivateSegmentWaveByteOffsetReg =
2916 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2917
2918 // This is true if the scratch wave byte offset doesn't have a fixed
2919 // location.
2920 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2921 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2922 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2923 }
2924 } else
2925 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2926
2927 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2928 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2929 }
2930
2931 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2932 Info.getNumPreloadedSGPRs() >= 16);
2933}
2934
2935static void reservePrivateMemoryRegs(const TargetMachine &TM,
2936 MachineFunction &MF,
2937 const SIRegisterInfo &TRI,
2938 SIMachineFunctionInfo &Info) {
2939 // Now that we've figured out where the scratch register inputs are, see if
2940 // we should reserve the arguments and use them directly.
2941 MachineFrameInfo &MFI = MF.getFrameInfo();
2942 bool HasStackObjects = MFI.hasStackObjects();
2943 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2944
2945 // Record that we know we have non-spill stack objects so we don't need to
2946 // check all stack objects later.
2947 if (HasStackObjects)
2948 Info.setHasNonSpillStackObjects(true);
2949
2950 // Everything live out of a block is spilled with fast regalloc, so it's
2951 // almost certain that spilling will be required.
2952 if (TM.getOptLevel() == CodeGenOptLevel::None)
2953 HasStackObjects = true;
2954
2955 // For now assume stack access is needed in any callee functions, so we need
2956 // the scratch registers to pass in.
2957 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2958
2959 if (!ST.enableFlatScratch()) {
2960 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2961 // If we have stack objects, we unquestionably need the private buffer
2962 // resource. For the Code Object V2 ABI, this will be the first 4 user
2963 // SGPR inputs. We can reserve those and use them directly.
2964
2965 Register PrivateSegmentBufferReg =
2966 Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
2967 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2968 } else {
2969 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2970 // We tentatively reserve the last registers (skipping those which may
2971 // contain VCC, FLAT_SCR, and XNACK). After register allocation,
2972 // we'll replace these with the ones immediately after those which were
2973 // really allocated. In the prologue copies will be inserted from the
2974 // argument to these reserved registers.
2975
2976 // Without HSA, relocations are used for the scratch pointer and the
2977 // buffer resource setup is always inserted in the prologue. Scratch wave
2978 // offset is still in an input SGPR.
2979 Info.setScratchRSrcReg(ReservedBufferReg);
2980 }
2981 }
2982
2983 MachineRegisterInfo &MRI = MF.getRegInfo();
2984
2985 // For entry functions we have to set up the stack pointer if we use it,
2986 // whereas non-entry functions get this "for free". This means there is no
2987 // intrinsic advantage to using S32 over S34 in cases where we do not have
2988 // calls but do need a frame pointer (i.e. if we are requested to have one
2989 // because frame pointer elimination is disabled). To keep things simple we
2990 // only ever use S32 as the call ABI stack pointer, and so using it does not
2991 // imply we need a separate frame pointer.
2992 //
2993 // Try to use s32 as the SP, but move it if it would interfere with input
2994 // arguments. This won't work with calls though.
2995 //
2996 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2997 // registers.
2998 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2999 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3000 } else {
3002
3003 if (MFI.hasCalls())
3004 report_fatal_error("call in graphics shader with too many input SGPRs");
3005
3006 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3007 if (!MRI.isLiveIn(Reg)) {
3008 Info.setStackPtrOffsetReg(Reg);
3009 break;
3010 }
3011 }
3012
3013 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3014 report_fatal_error("failed to find register for SP");
3015 }
3016
3017 // hasFP should be accurate for entry functions even before the frame is
3018 // finalized, because it does not rely on the known stack size, only
3019 // properties like whether variable sized objects are present.
3020 if (ST.getFrameLowering()->hasFP(MF)) {
3021 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3022 }
3023}
3024
3025bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
3026 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3027 return !Info->isEntryFunction();
3028}
3029
3031
3032void SITargetLowering::insertCopiesSplitCSR(
3033 MachineBasicBlock *Entry,
3034 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
3036
3037 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3038 if (!IStart)
3039 return;
3040
3041 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3042 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
3043 MachineBasicBlock::iterator MBBI = Entry->begin();
3044 for (const MCPhysReg *I = IStart; *I; ++I) {
3045 const TargetRegisterClass *RC = nullptr;
3046 if (AMDGPU::SReg_64RegClass.contains(*I))
3047 RC = &AMDGPU::SGPR_64RegClass;
3048 else if (AMDGPU::SReg_32RegClass.contains(*I))
3049 RC = &AMDGPU::SGPR_32RegClass;
3050 else
3051 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3052
3053 Register NewVR = MRI->createVirtualRegister(RC);
3054 // Create copy from CSR to a virtual register.
3055 Entry->addLiveIn(*I);
3056 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
3057 .addReg(*I);
3058
3059 // Insert the copy-back instructions right before the terminator.
3060 for (auto *Exit : Exits)
3061 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
3062 TII->get(TargetOpcode::COPY), *I)
3063 .addReg(NewVR);
3064 }
3065}
3066
3067SDValue SITargetLowering::LowerFormalArguments(
3068 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3069 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3070 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3072
3073 MachineFunction &MF = DAG.getMachineFunction();
3074 const Function &Fn = MF.getFunction();
3075 FunctionType *FType = MF.getFunction().getFunctionType();
3076 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3077 bool IsError = false;
3078
3079 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
3080 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
3081 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3082 IsError = true;
3083 }
3084
3085 SmallVector<ISD::InputArg, 16> Splits;
3086 SmallVector<CCValAssign, 16> ArgLocs;
3087 BitVector Skipped(Ins.size());
3088 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3089 *DAG.getContext());
3090
3091 bool IsGraphics = AMDGPU::isGraphics(CallConv);
3092 bool IsKernel = AMDGPU::isKernel(CallConv);
3093 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
3094
3095 if (IsGraphics) {
3096 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3097 assert(!UserSGPRInfo.hasDispatchPtr() &&
3098 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3099 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3100 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3101 (void)UserSGPRInfo;
3102 if (!Subtarget->enableFlatScratch())
3103 assert(!UserSGPRInfo.hasFlatScratchInit());
3104 if ((CallConv != CallingConv::AMDGPU_CS &&
3105 CallConv != CallingConv::AMDGPU_Gfx &&
3106 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3107 !Subtarget->hasArchitectedSGPRs())
3108 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3109 !Info->hasWorkGroupIDZ());
3110 }
3111
3112 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3113
3114 if (CallConv == CallingConv::AMDGPU_PS) {
3115 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3116
3117 // At least one interpolation mode must be enabled or else the GPU will
3118 // hang.
3119 //
3120 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3121 // set PSInputAddr, the user wants to enable some bits after the compilation
3122 // based on run-time states. Since we can't know what the final PSInputEna
3123 // will look like, we shouldn't do anything here; the user should take
3124 // responsibility for the correct programming.
3125 //
3126 // Otherwise, the following restrictions apply:
3127 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3128 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3129 // enabled too.
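    // For example, if the shader enables neither PERSP_* nor LINEAR_* inputs,
    // the code below forces bit 0 (PERSP_SAMPLE) on and allocates VGPR0/VGPR1
    // for it so that the hardware requirement is satisfied.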
3130 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3131 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3132 CCInfo.AllocateReg(AMDGPU::VGPR0);
3133 CCInfo.AllocateReg(AMDGPU::VGPR1);
3134 Info->markPSInputAllocated(0);
3135 Info->markPSInputEnabled(0);
3136 }
3137 if (Subtarget->isAmdPalOS()) {
3138 // For isAmdPalOS, the user does not enable some bits after compilation
3139 // based on run-time states; the register values being generated here are
3140 // the final ones set in hardware. Therefore we need to apply the
3141 // workaround to PSInputAddr and PSInputEnable together. (The case where
3142 // a bit is set in PSInputAddr but not PSInputEnable is where the
3143 // frontend set up an input arg for a particular interpolation mode, but
3144 // nothing uses that input arg. Really we should have an earlier pass
3145 // that removes such an arg.)
3146 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3147 if ((PsInputBits & 0x7F) == 0 ||
3148 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3149 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
3150 }
3151 } else if (IsKernel) {
3152 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3153 } else {
3154 Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3155 Ins.end());
3156 }
3157
3158 if (IsKernel)
3159 analyzeFormalArgumentsCompute(CCInfo, Ins);
3160
3161 if (IsEntryFunc) {
3162 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
3163 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
3164 if (IsKernel && Subtarget->hasKernargPreload())
3165 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
3166
3167 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
3168 } else if (!IsGraphics) {
3169 // For the fixed ABI, pass workitem IDs in the last argument register.
3170 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3171
3172 // FIXME: Sink this into allocateSpecialInputSGPRs
3173 if (!Subtarget->enableFlatScratch())
3174 CCInfo.AllocateReg(Info->getScratchRSrcReg());
3175
3176 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3177 }
3178
3179 if (!IsKernel) {
3180 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3181 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3182
3183 // This assumes the registers are allocated by CCInfo in ascending order
3184 // with no gaps.
3185 Info->setNumWaveDispatchSGPRs(
3186 CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3187 Info->setNumWaveDispatchVGPRs(
3188 CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3189 } else if (Info->getNumKernargPreloadedSGPRs()) {
3190 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3191 }
3192
3193 SmallVector<SDValue, 16> Chains;
3194
3195 if (IsWholeWaveFunc) {
3197 {MVT::i1, MVT::Other}, Chain);
3198 InVals.push_back(Setup.getValue(0));
3199 Chains.push_back(Setup.getValue(1));
3200 }
3201
3202 // FIXME: This is the minimum kernel argument alignment. We should improve
3203 // this to the maximum alignment of the arguments.
3204 //
3205 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3206 // kern arg offset.
3207 const Align KernelArgBaseAlign = Align(16);
3208
3209 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3210 ++i) {
3211 const ISD::InputArg &Arg = Ins[i];
3212 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3213 InVals.push_back(DAG.getPOISON(Arg.VT));
3214 continue;
3215 }
3216
3217 CCValAssign &VA = ArgLocs[ArgIdx++];
3218 MVT VT = VA.getLocVT();
3219
3220 if (IsEntryFunc && VA.isMemLoc()) {
3221 VT = Ins[i].VT;
3222 EVT MemVT = VA.getLocVT();
3223
3224 const uint64_t Offset = VA.getLocMemOffset();
3225 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
3226
3227 if (Arg.Flags.isByRef()) {
3228 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3229
3230 const GCNTargetMachine &TM =
3231 static_cast<const GCNTargetMachine &>(getTargetMachine());
3232 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3233 Arg.Flags.getPointerAddrSpace())) {
3236 }
3237
3238 InVals.push_back(Ptr);
3239 continue;
3240 }
3241
3242 SDValue NewArg;
3243 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3244 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3245 // In this case the argument is packed into the previous preload SGPR.
3246 int64_t AlignDownOffset = alignDown(Offset, 4);
3247 int64_t OffsetDiff = Offset - AlignDownOffset;
3248 EVT IntVT = MemVT.changeTypeToInteger();
3249
3250 const SIMachineFunctionInfo *Info =
3253 Register Reg =
3254 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3255
3256 assert(Reg);
3257 Register VReg = MRI.getLiveInVirtReg(Reg);
3258 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3259
3260 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3261 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3262
3263 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3264 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3265 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3266 Ins[i].Flags.isSExt(), &Ins[i]);
3267
3268 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3269 } else {
3270 const SIMachineFunctionInfo *Info =
3273 const SmallVectorImpl<MCRegister> &PreloadRegs =
3274 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3275
3276 SDValue Copy;
3277 if (PreloadRegs.size() == 1) {
3278 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3279 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3280 NewArg = DAG.getCopyFromReg(
3281 Chain, DL, VReg,
3283 TRI->getRegSizeInBits(*RC)));
3284
3285 } else {
3286 // If the kernarg alignment does not match the alignment of the SGPR
3287 // tuple RC that can accommodate this argument, it will be built up
3288 // via copies from the individual SGPRs that the argument was
3289 // preloaded to.
3290 SmallVector<SDValue, 4> Elts;
3291 for (auto Reg : PreloadRegs) {
3292 Register VReg = MRI.getLiveInVirtReg(Reg);
3293 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3294 Elts.push_back(Copy);
3295 }
3296 NewArg =
3297 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3298 PreloadRegs.size()),
3299 DL, Elts);
3300 }
3301
3302 // If the argument was preloaded to multiple consecutive 32-bit
3303 // registers because of misalignment between addressable SGPR tuples
3304 // and the argument size, we can still assume that, because of kernarg
3305 // segment alignment restrictions, NewArg's size is the same as
3306 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
3307 // truncate since we cannot preload to less than a single SGPR and the
3308 // MemVT may be smaller.
3309 EVT MemVTInt =
3310 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
3311 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3312 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3313
3314 NewArg = DAG.getBitcast(MemVT, NewArg);
3315 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3316 Ins[i].Flags.isSExt(), &Ins[i]);
3317 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3318 }
3319 } else {
3320 // Hidden arguments that are in the kernel signature must be preloaded
3321 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3322 // the argument list and is not preloaded.
3323 if (Arg.isOrigArg()) {
3324 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3325 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3327 *OrigArg->getParent(),
3328 "hidden argument in kernel signature was not preloaded",
3329 DL.getDebugLoc()));
3330 }
3331 }
3332
3333 NewArg =
3334 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3335 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3336 }
3337 Chains.push_back(NewArg.getValue(1));
3338
3339 auto *ParamTy =
3340 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3341 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3342 ParamTy &&
3343 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3344 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3345 // On SI local pointers are just offsets into LDS, so they are always
3346 // less than 16-bits. On CI and newer they could potentially be
3347 // real pointers, so we can't guarantee their size.
3348 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3349 DAG.getValueType(MVT::i16));
3350 }
3351
3352 InVals.push_back(NewArg);
3353 continue;
3354 }
3355 if (!IsEntryFunc && VA.isMemLoc()) {
3356 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3357 InVals.push_back(Val);
3358 if (!Arg.Flags.isByVal())
3359 Chains.push_back(Val.getValue(1));
3360 continue;
3361 }
3362
3363 assert(VA.isRegLoc() && "Parameter must be in a register!");
3364
3365 Register Reg = VA.getLocReg();
3366 const TargetRegisterClass *RC = nullptr;
3367 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3368 RC = &AMDGPU::VGPR_32RegClass;
3369 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3370 RC = &AMDGPU::SGPR_32RegClass;
3371 else
3372 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3373 EVT ValVT = VA.getValVT();
3374
3375 Reg = MF.addLiveIn(Reg, RC);
3376 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3377
3378 if (Arg.Flags.isSRet()) {
3379 // The return object should be reasonably addressable.
3380
3381 // FIXME: This helps when the return is a real sret. If it is an
3382 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3383 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3384 unsigned NumBits =
3386 Val = DAG.getNode(
3387 ISD::AssertZext, DL, VT, Val,
3388 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3389 }
3390
3391 // If this is an 8 or 16-bit value, it is really passed promoted
3392 // to 32 bits. Insert an assert[sz]ext to capture this, then
3393 // truncate to the right size.
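// For example, an i8 value passed with ZExt arrives here as an i32 whose high
// bits are known to be zero: the AssertZext records that fact for later
// combines, and the TRUNCATE recovers the original i8.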
3394 switch (VA.getLocInfo()) {
3395 case CCValAssign::Full:
3396 break;
3397 case CCValAssign::BCvt:
3398 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3399 break;
3400 case CCValAssign::SExt:
3401 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, DAG.getValueType(ValVT));
3402 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3403 break;
3404 case CCValAssign::ZExt:
3405 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, DAG.getValueType(ValVT));
3406 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3407 break;
3408 case CCValAssign::AExt:
3409 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3410 break;
3411 default:
3412 llvm_unreachable("Unknown loc info!");
3413 }
3414
3415 InVals.push_back(Val);
3416 }
3417
3418 // Start adding system SGPRs.
3419 if (IsEntryFunc)
3420 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3421
3422 // DAG.getPass() returns nullptr when using new pass manager.
3423 // TODO: Use DAG.getMFAM() to access analysis result.
3424 if (DAG.getPass()) {
3425 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3426 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3427 }
3428
3429 unsigned StackArgSize = CCInfo.getStackSize();
3430 Info->setBytesInStackArgArea(StackArgSize);
3431
3432 return Chains.empty() ? Chain
3433 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3434}
3435
3436// TODO: If return values can't fit in registers, we should return as many as
3437// possible in registers before passing the rest on the stack.
3439 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3440 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3441 const Type *RetTy) const {
3442 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3443 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3444 // for shaders. Vector types should be explicitly handled by CC.
3445 if (AMDGPU::isEntryFunctionCC(CallConv))
3446 return true;
3447
3449 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3450 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3451 return false;
3452
3453 // We must use the stack if return would require unavailable registers.
3454 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3455 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3456 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3457 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3458 return false;
3459
3460 return true;
3461}
3462
3463SDValue
3465 bool isVarArg,
3467 const SmallVectorImpl<SDValue> &OutVals,
3468 const SDLoc &DL, SelectionDAG &DAG) const {
3472
3473 if (AMDGPU::isKernel(CallConv)) {
3474 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3475 OutVals, DL, DAG);
3476 }
3477
3478 bool IsShader = AMDGPU::isShader(CallConv);
3479
3480 Info->setIfReturnsVoid(Outs.empty());
3481 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3482
3483 // CCValAssign - represent the assignment of the return value to a location.
3485
3486 // CCState - Info about the registers and stack slots.
3487 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3488 *DAG.getContext());
3489
3490 // Analyze outgoing return values.
3491 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3492
3493 SDValue Glue;
3495 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3496
3497 SDValue ReadFirstLane =
3498 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3499 // Copy the result values into the output registers.
3500 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3501 ++I, ++RealRVLocIdx) {
3502 CCValAssign &VA = RVLocs[I];
3503 assert(VA.isRegLoc() && "Can only return in registers!");
3504 // TODO: Partially return in registers if return values don't fit.
3505 SDValue Arg = OutVals[RealRVLocIdx];
3506
3507 // Copied from other backends.
3508 switch (VA.getLocInfo()) {
3509 case CCValAssign::Full:
3510 break;
3511 case CCValAssign::BCvt:
3512 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3513 break;
3514 case CCValAssign::SExt:
3515 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3516 break;
3517 case CCValAssign::ZExt:
3518 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3519 break;
3520 case CCValAssign::AExt:
3521 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3522 break;
3523 default:
3524 llvm_unreachable("Unknown loc info!");
3525 }
3526 if (TRI->isSGPRPhysReg(VA.getLocReg()))
3528 ReadFirstLane, Arg);
3529 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3530 Glue = Chain.getValue(1);
3531 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3532 }
3533
3534 // FIXME: Does sret work properly?
3535 if (!Info->isEntryFunction()) {
3536 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3537 const MCPhysReg *I =
3538 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3539 if (I) {
3540 for (; *I; ++I) {
3541 if (AMDGPU::SReg_64RegClass.contains(*I))
3542 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3543 else if (AMDGPU::SReg_32RegClass.contains(*I))
3544 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3545 else
3546 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3547 }
3548 }
3549 }
3550
3551 // Update chain and glue.
3552 RetOps[0] = Chain;
3553 if (Glue.getNode())
3554 RetOps.push_back(Glue);
3555
3556 unsigned Opc = AMDGPUISD::ENDPGM;
3557 if (!IsWaveEnd)
3558 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3559 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3561 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3562}
3563
3565 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3566 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3567 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3568 SDValue ThisVal) const {
3569 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3570
3571 // Assign locations to each value returned by this call.
3573 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3574 *DAG.getContext());
3575 CCInfo.AnalyzeCallResult(Ins, RetCC);
3576
3577 // Copy all of the result registers out of their specified physreg.
3578 for (CCValAssign VA : RVLocs) {
3579 SDValue Val;
3580
3581 if (VA.isRegLoc()) {
3582 Val =
3583 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3584 Chain = Val.getValue(1);
3585 InGlue = Val.getValue(2);
3586 } else if (VA.isMemLoc()) {
3587 report_fatal_error("TODO: return values in memory");
3588 } else
3589 llvm_unreachable("unknown argument location type");
3590
3591 switch (VA.getLocInfo()) {
3592 case CCValAssign::Full:
3593 break;
3594 case CCValAssign::BCvt:
3595 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3596 break;
3597 case CCValAssign::ZExt:
3598 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3599 DAG.getValueType(VA.getValVT()));
3600 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3601 break;
3602 case CCValAssign::SExt:
3603 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3604 DAG.getValueType(VA.getValVT()));
3605 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3606 break;
3607 case CCValAssign::AExt:
3608 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3609 break;
3610 default:
3611 llvm_unreachable("Unknown loc info!");
3612 }
3613
3614 InVals.push_back(Val);
3615 }
3616
3617 return Chain;
3618}
3619
3620// Add code to pass the special inputs required by the features in use,
3621// separate from the explicit user arguments present in the IR.
3623 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3624 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3625 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3626 // If we don't have a call site, this was a call inserted by
3627 // legalization. These can never use special inputs.
3628 if (!CLI.CB)
3629 return;
3630
3631 SelectionDAG &DAG = CLI.DAG;
3632 const SDLoc &DL = CLI.DL;
3633 const Function &F = DAG.getMachineFunction().getFunction();
3634
3635 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3636 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3637
3638 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3640 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3641 // DAG.getPass() returns nullptr when using new pass manager.
3642 // TODO: Use DAG.getMFAM() to access analysis result.
3643 if (DAG.getPass()) {
3644 auto &ArgUsageInfo =
3646 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3647 }
3648 }
3649
3650 // TODO: Unify with private memory register handling. This is complicated by
3651 // the fact that at least in kernels, the input argument is not necessarily
3652 // in the same location as the input.
3653 // clang-format off
3654 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3656 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3657 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3658 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3659 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3660 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3661 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
3662 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
3663 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
3664 };
3665 // clang-format on
3666
3667 for (auto [InputID, Attr] : ImplicitAttrs) {
3668 // If the callee does not use the attribute value, skip copying the value.
3669 if (CLI.CB->hasFnAttr(Attr))
3670 continue;
3671
3672 const auto [OutgoingArg, ArgRC, ArgTy] =
3673 CalleeArgInfo->getPreloadedValue(InputID);
3674 if (!OutgoingArg)
3675 continue;
3676
3677 const auto [IncomingArg, IncomingArgRC, Ty] =
3678 CallerArgInfo.getPreloadedValue(InputID);
3679 assert(IncomingArgRC == ArgRC);
3680
3681 // All special arguments are ints for now.
3682 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3683 SDValue InputReg;
3684
3685 if (IncomingArg) {
3686 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3687 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3688 // The implicit arg ptr is special because it doesn't have a corresponding
3689 // input for kernels, and is computed from the kernarg segment pointer.
3690 InputReg = getImplicitArgPtr(DAG, DL);
3691 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3692 std::optional<uint32_t> Id =
3694 if (Id.has_value()) {
3695 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3696 } else {
3697 InputReg = DAG.getPOISON(ArgVT);
3698 }
3699 } else {
3700 // We may have proven the input wasn't needed, although the ABI is
3701 // requiring it. We just need to allocate the register appropriately.
3702 InputReg = DAG.getPOISON(ArgVT);
3703 }
3704
3705 if (OutgoingArg->isRegister()) {
3706 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3707 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3708 report_fatal_error("failed to allocate implicit input argument");
3709 } else {
3710 unsigned SpecialArgOffset =
3711 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3712 SDValue ArgStore =
3713 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3714 MemOpChains.push_back(ArgStore);
3715 }
3716 }
3717
3718 // Pack workitem IDs into a single register, or pass them as is if already
3719 // packed.
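// The packed layout ORs the three IDs into one i32: bits [9:0] = X,
// bits [19:10] = Y, bits [29:20] = Z, matching the shifts by 10 and 20
// applied to Y and Z below.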
3720
3721 auto [OutgoingArg, ArgRC, Ty] =
3723 if (!OutgoingArg)
3724 std::tie(OutgoingArg, ArgRC, Ty) =
3726 if (!OutgoingArg)
3727 std::tie(OutgoingArg, ArgRC, Ty) =
3729 if (!OutgoingArg)
3730 return;
3731
3732 const ArgDescriptor *IncomingArgX = std::get<0>(
3734 const ArgDescriptor *IncomingArgY = std::get<0>(
3736 const ArgDescriptor *IncomingArgZ = std::get<0>(
3738
3739 SDValue InputReg;
3740 SDLoc SL;
3741
3742 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3743 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3744 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3745
3746 // If incoming ids are not packed we need to pack them.
3747 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3748 NeedWorkItemIDX) {
3749 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3750 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3751 } else {
3752 InputReg = DAG.getConstant(0, DL, MVT::i32);
3753 }
3754 }
3755
3756 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3757 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3758 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3759 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3760 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3761 InputReg = InputReg.getNode()
3762 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3763 : Y;
3764 }
3765
3766 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3767 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3768 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3769 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3770 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3771 InputReg = InputReg.getNode()
3772 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3773 : Z;
3774 }
3775
3776 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3777 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3778 // We're in a situation where the outgoing function requires the workitem
3779 // ID, but the calling function does not have it (e.g. a graphics function
3780 // calling a C calling convention function). This is illegal, but we need
3781 // to produce something.
3782 InputReg = DAG.getPOISON(MVT::i32);
3783 } else {
3784 // Workitem ids are already packed; any of the present incoming arguments
3785 // will carry all required fields.
3786 ArgDescriptor IncomingArg =
3787 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3788 : IncomingArgY ? *IncomingArgY
3789 : *IncomingArgZ,
3790 ~0u);
3791 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3792 }
3793 }
3794
3795 if (OutgoingArg->isRegister()) {
3796 if (InputReg)
3797 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3798
3799 CCInfo.AllocateReg(OutgoingArg->getRegister());
3800 } else {
3801 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3802 if (InputReg) {
3803 SDValue ArgStore =
3804 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3805 MemOpChains.push_back(ArgStore);
3806 }
3807 }
3808}
3809
3811 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3813 const SmallVectorImpl<SDValue> &OutVals,
3814 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3815 if (AMDGPU::isChainCC(CalleeCC))
3816 return true;
3817
3818 if (!AMDGPU::mayTailCallThisCC(CalleeCC))
3819 return false;
3820
3821 // For a divergent call target, we need to do a waterfall loop over the
3822 // possible callees which precludes us from using a simple jump.
3823 if (Callee->isDivergent())
3824 return false;
3825
3827 const Function &CallerF = MF.getFunction();
3828 CallingConv::ID CallerCC = CallerF.getCallingConv();
3830 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3831
3832 // Kernels aren't callable, and don't have a live-in return address, so it
3833 // doesn't make sense to do a tail call with entry functions.
3834 if (!CallerPreserved)
3835 return false;
3836
3837 bool CCMatch = CallerCC == CalleeCC;
3838
3840 if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
3841 return true;
3842 return false;
3843 }
3844
3845 // TODO: Can we handle var args?
3846 if (IsVarArg)
3847 return false;
3848
3849 for (const Argument &Arg : CallerF.args()) {
3850 if (Arg.hasByValAttr())
3851 return false;
3852 }
3853
3854 LLVMContext &Ctx = *DAG.getContext();
3855
3856 // Check that the call results are passed in the same way.
3857 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3858 CCAssignFnForCall(CalleeCC, IsVarArg),
3859 CCAssignFnForCall(CallerCC, IsVarArg)))
3860 return false;
3861
3862 // The callee has to preserve all registers the caller needs to preserve.
3863 if (!CCMatch) {
3864 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3865 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3866 return false;
3867 }
3868
3869 // Nothing more to check if the callee is taking no arguments.
3870 if (Outs.empty())
3871 return true;
3872
3874 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3875
3876 // FIXME: We are not allocating special input registers, so we will be
3877 // deciding based on incorrect register assignments.
3878 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3879
3880 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3881 // If the stack arguments for this call do not fit into our own save area then
3882 // the call cannot be made tail.
3883 // TODO: Is this really necessary?
3884 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3885 return false;
3886
3887 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
3888 // FIXME: What about inreg arguments that end up passed in memory?
3889 if (!CCVA.isRegLoc())
3890 continue;
3891
3892 // If we are passing an argument in an SGPR, and the value is divergent,
3893 // this call requires a waterfall loop.
3894 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
3895 LLVM_DEBUG(
3896 dbgs() << "Cannot tail call due to divergent outgoing argument in "
3897 << printReg(CCVA.getLocReg(), TRI) << '\n');
3898 return false;
3899 }
3900 }
3901
3902 const MachineRegisterInfo &MRI = MF.getRegInfo();
3903 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3904}
3905
3907 if (!CI->isTailCall())
3908 return false;
3909
3910 const Function *ParentFn = CI->getParent()->getParent();
3912 return false;
3913 return true;
3914}
3915
3916namespace {
3917// Chain calls have special arguments that we need to handle. These are
3918// tagging along at the end of the arguments list(s), after the SGPR and VGPR
3919// arguments (index 0 and 1 respectively).
3920enum ChainCallArgIdx {
3921 Exec = 2,
3922 Flags,
3923 NumVGPRs,
3924 FallbackExec,
3925 FallbackCallee
3926};
3927} // anonymous namespace
3928
3929// The wave scratch offset register is used as the global base pointer.
3931 SmallVectorImpl<SDValue> &InVals) const {
3932 CallingConv::ID CallConv = CLI.CallConv;
3933 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3934
3935 SelectionDAG &DAG = CLI.DAG;
3936
3937 const SDLoc &DL = CLI.DL;
3938 SDValue Chain = CLI.Chain;
3939 SDValue Callee = CLI.Callee;
3940
3941 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
3942 bool UsesDynamicVGPRs = false;
3943 if (IsChainCallConv) {
3944 // The last arguments should be the value that we need to put in EXEC,
3945 // followed by the flags and any other arguments with special meanings.
3946 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
3947 // we don't treat them like the "real" arguments.
3948 auto RequestedExecIt =
3949 llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
3950 return Arg.OrigArgIndex == 2;
3951 });
3952 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
3953
3954 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
3955 CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
3956 CLI.OutVals.end());
3957 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
3958
3959 assert(CLI.Outs.back().OrigArgIndex < 2 &&
3960 "Haven't popped all the special args");
3961
3962 TargetLowering::ArgListEntry RequestedExecArg =
3963 CLI.Args[ChainCallArgIdx::Exec];
3964 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3965 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3966
3967 // Convert constants into TargetConstants, so they become immediate operands
3968 // instead of being selected into S_MOV.
3969 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
3970 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
3971 ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
3972 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
3973 } else
3974 ChainCallSpecialArgs.push_back(Arg.Node);
3975 };
3976
3977 PushNodeOrTargetConstant(RequestedExecArg);
3978
3979 // Process any other special arguments depending on the value of the flags.
3980 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
3981
3982 const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
3983 if (FlagsValue.isZero()) {
3984 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
3985 return lowerUnhandledCall(CLI, InVals,
3986 "no additional args allowed if flags == 0");
3987 } else if (FlagsValue.isOneBitSet(0)) {
3988 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
3989 return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
3990 }
3991
3992 if (!Subtarget->isWave32()) {
3993 return lowerUnhandledCall(
3994 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
3995 }
3996
3997 UsesDynamicVGPRs = true;
3998 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
3999 CLI.Args.end(), PushNodeOrTargetConstant);
4000 }
4001 }
4002
4004 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
4006 bool &IsTailCall = CLI.IsTailCall;
4007 bool IsVarArg = CLI.IsVarArg;
4008 bool IsSibCall = false;
4010
4011 if (Callee.isUndef() || isNullConstant(Callee)) {
4012 if (!CLI.IsTailCall) {
4013 for (ISD::InputArg &Arg : CLI.Ins)
4014 InVals.push_back(DAG.getPOISON(Arg.VT));
4015 }
4016
4017 return Chain;
4018 }
4019
4020 if (IsVarArg) {
4021 return lowerUnhandledCall(CLI, InVals,
4022 "unsupported call to variadic function ");
4023 }
4024
4025 if (!CLI.CB)
4026 return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
4027
4028 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
4029 return lowerUnhandledCall(CLI, InVals,
4030 "unsupported required tail call to function ");
4031 }
4032
4033 if (IsTailCall) {
4034 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
4035 Outs, OutVals, Ins, DAG);
4036 if (!IsTailCall &&
4037 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
4038 report_fatal_error("failed to perform tail call elimination on a call "
4039 "site marked musttail or on llvm.amdgcn.cs.chain");
4040 }
4041
4042 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4043
4044 // A sibling call is one where we're under the usual C ABI and not planning
4045 // to change that but can still do a tail call:
4046 if (!TailCallOpt && IsTailCall)
4047 IsSibCall = true;
4048
4049 if (IsTailCall)
4050 ++NumTailCalls;
4051 }
4052
4055 SmallVector<SDValue, 8> MemOpChains;
4056
4057 // Analyze operands of the call, assigning locations to each operand.
4059 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4060 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
4061
4062 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
4064 // With a fixed ABI, allocate fixed registers before user arguments.
4065 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
4066 }
4067
4068 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
4069
4070 // Get a count of how many bytes are to be pushed on the stack.
4071 unsigned NumBytes = CCInfo.getStackSize();
4072
4073 if (IsSibCall) {
4074 // Since we're not changing the ABI to make this a tail call, the memory
4075 // operands are already available in the caller's incoming argument space.
4076 NumBytes = 0;
4077 }
4078
4079 // FPDiff is the byte offset of the call's argument area from the callee's.
4080 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4081 // by this amount for a tail call. In a sibling call it must be 0 because the
4082 // caller will deallocate the entire stack and the callee still expects its
4083 // arguments to begin at SP+0. Completely unused for non-tail calls.
4084 int32_t FPDiff = 0;
4085 MachineFrameInfo &MFI = MF.getFrameInfo();
4086 auto *TRI = Subtarget->getRegisterInfo();
4087
4088 // Adjust the stack pointer for the new arguments...
4089 // These operations are automatically eliminated by the prolog/epilog pass
4090 if (!IsSibCall)
4091 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
4092
4093 if (!IsSibCall || IsChainCallConv) {
4094 if (!Subtarget->enableFlatScratch()) {
4095 SmallVector<SDValue, 4> CopyFromChains;
4096
4097 // In the HSA case, this should be an identity copy.
4098 SDValue ScratchRSrcReg =
4099 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
4100 RegsToPass.emplace_back(IsChainCallConv
4101 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4102 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4103 ScratchRSrcReg);
4104 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
4105 Chain = DAG.getTokenFactor(DL, CopyFromChains);
4106 }
4107 }
4108
4109 const unsigned NumSpecialInputs = RegsToPass.size();
4110
4111 MVT PtrVT = MVT::i32;
4112
4113 // Walk the register/memloc assignments, inserting copies/loads.
4114 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4115 CCValAssign &VA = ArgLocs[i];
4116 SDValue Arg = OutVals[i];
4117
4118 // Promote the value if needed.
4119 switch (VA.getLocInfo()) {
4120 case CCValAssign::Full:
4121 break;
4122 case CCValAssign::BCvt:
4123 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4124 break;
4125 case CCValAssign::ZExt:
4126 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4127 break;
4128 case CCValAssign::SExt:
4129 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
4130 break;
4131 case CCValAssign::AExt:
4132 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
4133 break;
4134 case CCValAssign::FPExt:
4135 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
4136 break;
4137 default:
4138 llvm_unreachable("Unknown loc info!");
4139 }
4140
4141 if (VA.isRegLoc()) {
4142 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
4143 } else {
4144 assert(VA.isMemLoc());
4145
4146 SDValue DstAddr;
4147 MachinePointerInfo DstInfo;
4148
4149 unsigned LocMemOffset = VA.getLocMemOffset();
4150 int32_t Offset = LocMemOffset;
4151
4152 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
4153 MaybeAlign Alignment;
4154
4155 if (IsTailCall) {
4156 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4157 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4158 : VA.getValVT().getStoreSize();
4159
4160 // FIXME: We can have better than the minimum byval required alignment.
4161 Alignment =
4162 Flags.isByVal()
4163 ? Flags.getNonZeroByValAlign()
4164 : commonAlignment(Subtarget->getStackAlignment(), Offset);
4165
4166 Offset = Offset + FPDiff;
4167 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
4168
4169 DstAddr = DAG.getFrameIndex(FI, PtrVT);
4170 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4171
4172 // Make sure any stack arguments overlapping with where we're storing
4173 // are loaded before this eventual operation. Otherwise they'll be
4174 // clobbered.
4175
4176 // FIXME: Why is this really necessary? This seems to just result in a
4177 // lot of code to copy the stack and write them back to the same
4178 // locations, which are supposed to be immutable?
4179 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
4180 } else {
4181 // Stores to the argument stack area are relative to the stack pointer.
4182 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
4183 MVT::i32);
4184 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
4185 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
4186 Alignment =
4187 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
4188 }
4189
4190 if (Outs[i].Flags.isByVal()) {
4191 SDValue SizeNode =
4192 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4193 SDValue Cpy =
4194 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
4195 Outs[i].Flags.getNonZeroByValAlign(),
4196 /*isVol = */ false, /*AlwaysInline = */ true,
4197 /*CI=*/nullptr, std::nullopt, DstInfo,
4199
4200 MemOpChains.push_back(Cpy);
4201 } else {
4202 SDValue Store =
4203 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4204 MemOpChains.push_back(Store);
4205 }
4206 }
4207 }
4208
4209 if (!MemOpChains.empty())
4210 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4211
4212 SDValue ReadFirstLaneID =
4213 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4214
4215 SDValue TokenGlue;
4216 if (CLI.ConvergenceControlToken) {
4217 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4219 }
4220
4221 // Build a sequence of copy-to-reg nodes chained together with token chain
4222 // and flag operands which copy the outgoing args into the appropriate regs.
4223 SDValue InGlue;
4224
4225 unsigned ArgIdx = 0;
4226 for (auto [Reg, Val] : RegsToPass) {
4227 if (ArgIdx++ >= NumSpecialInputs &&
4228 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4229 // For chain calls, the inreg arguments are required to be
4230 // uniform. Speculatively insert a readfirstlane in case we cannot prove
4231 // they are uniform.
4232 //
4233 // For other calls, if an inreg argument is known to be uniform,
4234 // speculatively insert a readfirstlane in case it is in a VGPR.
4235 //
4236 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4237 // value, so let that continue to produce invalid code.
4238
4239 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4240 if (TokenGlue)
4241 ReadfirstlaneArgs.push_back(TokenGlue);
4243 ReadfirstlaneArgs);
4244 }
4245
4246 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
4247 InGlue = Chain.getValue(1);
4248 }
4249
4250 // We don't usually want to end the call-sequence here because we would tidy
4251 // the frame up *after* the call; however, in the ABI-changing tail-call case
4252 // we've carefully laid out the parameters so that when sp is reset they'll be
4253 // in the correct location.
4254 if (IsTailCall && !IsSibCall) {
4255 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
4256 InGlue = Chain.getValue(1);
4257 }
4258
4259 std::vector<SDValue> Ops({Chain});
4260
4261 // Add a redundant copy of the callee global which will not be legalized, as
4262 // we need direct access to the callee later.
4264 const GlobalValue *GV = GSD->getGlobal();
4265 Ops.push_back(Callee);
4266 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
4267 } else {
4268 if (IsTailCall) {
4269 // isEligibleForTailCallOptimization considered whether the call target is
4270 // divergent, but we may still end up with a uniform value in a VGPR.
4271 // Insert a readfirstlane just in case.
4272 SDValue ReadFirstLaneID =
4273 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4274
4275 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4276 if (TokenGlue)
4277 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
4278 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
4279 ReadfirstlaneArgs);
4280 }
4281
4282 Ops.push_back(Callee);
4283 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
4284 }
4285
4286 if (IsTailCall) {
4287 // Each tail call may have to adjust the stack by a different amount, so
4288 // this information must travel along with the operation for eventual
4289 // consumption by emitEpilogue.
4290 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4291 }
4292
4293 if (IsChainCallConv)
4294 llvm::append_range(Ops, ChainCallSpecialArgs);
4295
4296 // Add argument registers to the end of the list so that they are known live
4297 // into the call.
4298 for (auto &[Reg, Val] : RegsToPass)
4299 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
4300
4301 // Add a register mask operand representing the call-preserved registers.
4302 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4303 assert(Mask && "Missing call preserved mask for calling convention");
4304 Ops.push_back(DAG.getRegisterMask(Mask));
4305
4306 if (SDValue Token = CLI.ConvergenceControlToken) {
4308 GlueOps.push_back(Token);
4309 if (InGlue)
4310 GlueOps.push_back(InGlue);
4311
4312 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
4313 MVT::Glue, GlueOps),
4314 0);
4315 }
4316
4317 if (InGlue)
4318 Ops.push_back(InGlue);
4319
4320 // If we're doing a tail call, use a TC_RETURN here rather than an
4321 // actual call instruction.
4322 if (IsTailCall) {
4323 MFI.setHasTailCall();
4324 unsigned OPC = AMDGPUISD::TC_RETURN;
4325 switch (CallConv) {
4328 break;
4331 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4333 break;
4334 }
4335
4336 // If the caller is a whole wave function, we need to use a special opcode
4337 // so we can patch up EXEC.
4338 if (Info->isWholeWaveFunction())
4340
4341 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4342 }
4343
4344 // Returns a chain and a flag for retval copy to use.
4345 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4346 Chain = Call.getValue(0);
4347 InGlue = Call.getValue(1);
4348
4349 uint64_t CalleePopBytes = NumBytes;
4350 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4351 if (!Ins.empty())
4352 InGlue = Chain.getValue(1);
4353
4354 // Handle result values, copying them out of physregs into vregs that we
4355 // return.
4356 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4357 InVals, /*IsThisReturn=*/false, SDValue());
4358}
4359
4360// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4361// except for:
4362// 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4363// 2. Scaled size, where scale = wave-reduction(alloca-size) * wave-size
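// For example, on a wave64 target a 16-byte per-lane allocation advances the
// stack pointer by 16 << 6 = 1024 bytes.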
4365 SelectionDAG &DAG) const {
4366 const MachineFunction &MF = DAG.getMachineFunction();
4368
4369 SDLoc dl(Op);
4370 EVT VT = Op.getValueType();
4371 SDValue Chain = Op.getOperand(0);
4372 Register SPReg = Info->getStackPtrOffsetReg();
4373
4374 // Chain the dynamic stack allocation so that it doesn't modify the stack
4375 // pointer when other instructions are using the stack.
4376 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4377
4378 SDValue Size = Op.getOperand(1);
4379 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4380 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4381
4382 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4384 "Stack grows upwards for AMDGPU");
4385
4386 Chain = BaseAddr.getValue(1);
4387 Align StackAlign = TFL->getStackAlign();
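// If the requested alignment exceeds the stack alignment, over-align the base
// address. The per-lane alignment is also scaled by the wave size (e.g. a
// 16-byte request rounds BaseAddr up to a multiple of 16 << 6 = 1024 on
// wave64), mirroring the wave-size scaling applied to the allocation size
// below.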
4388 if (Alignment > StackAlign) {
4389 uint64_t ScaledAlignment = Alignment.value()
4390 << Subtarget->getWavefrontSizeLog2();
4391 uint64_t StackAlignMask = ScaledAlignment - 1;
4392 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4393 DAG.getConstant(StackAlignMask, dl, VT));
4394 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4395 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4396 }
4397
4398 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4399 SDValue NewSP;
4401 // For a constant-sized alloca, scale the alloca size by the wave size
4402 SDValue ScaledSize = DAG.getNode(
4403 ISD::SHL, dl, VT, Size,
4404 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4405 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4406 } else {
4407 // For a dynamically sized alloca, perform a wave-wide reduction to get the
4408 // max of the (divergent) alloca size and then scale it by the wave size
4409 SDValue WaveReduction =
4410 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4411 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4412 Size, DAG.getConstant(0, dl, MVT::i32));
4413 SDValue ScaledSize = DAG.getNode(
4414 ISD::SHL, dl, VT, Size,
4415 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4416 NewSP =
4417 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4418 SDValue ReadFirstLaneID =
4419 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4420 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4421 NewSP);
4422 }
4423
4424 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4425 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4426
4427 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4428}
4429
4431 if (Op.getValueType() != MVT::i32)
4432 return Op; // Defer to cannot select error.
4433
4435 SDLoc SL(Op);
4436
4437 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4438
4439 // Convert from wave uniform to swizzled vector address. This should protect
4440 // from any edge cases where the stacksave result isn't directly used with
4441 // stackrestore.
4442 SDValue VectorAddress =
4443 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4444 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4445}
4446
4448 SelectionDAG &DAG) const {
4449 SDLoc SL(Op);
4450 assert(Op.getValueType() == MVT::i32);
4451
4452 uint32_t BothRoundHwReg =
4454 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4455
4456 SDValue IntrinID =
4457 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4458 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4459 Op.getOperand(0), IntrinID, GetRoundBothImm);
4460
4461 // There are two rounding modes, one for f32 and one for f64/f16. We only
4462 // report in the standard value range if both are the same.
4463 //
4464 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4465 // ties away from zero is not supported, and the other values are rotated by
4466 // 1.
4467 //
4468 // If the two rounding modes are not the same, report a target defined value.
4469
4470 // Mode register rounding mode fields:
4471 //
4472 // [1:0] Single-precision round mode.
4473 // [3:2] Double/Half-precision round mode.
4474 //
4475 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4476 //
4477 // Hardware Spec
4478 // Toward-0 3 0
4479 // Nearest Even 0 1
4480 // +Inf 1 2
4481 // -Inf 2 3
4482 // NearestAway0 N/A 4
4483 //
4484 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4485 // table we can index by the raw hardware mode.
4486 //
4487 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
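// For example, if both 2-bit fields are 0 (nearest even), the shifted index
// is 0 and the extracted nibble should be 1, the standard FLT_ROUNDS
// "to nearest" value per the table above. Entries of 4 or more are
// target-defined extended values and get an extra offset of 4 below.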
4488
4489 SDValue BitTable =
4491
4492 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4493 SDValue RoundModeTimesNumBits =
4494 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4495
4496 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4497 // knew only one mode was demanded.
4498 SDValue TableValue =
4499 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4500 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4501
4502 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4503 SDValue TableEntry =
4504 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4505
4506 // There's a gap between the 4-bit encoded table values and the actual enum
4507 // values, so offset by 4 if it's an extended value.
4508 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4509 SDValue IsStandardValue =
4510 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4511 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4512 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4513 TableEntry, EnumOffset);
4514
4515 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4516}
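// Illustrative sketch (not part of the original source): the scalar
// computation the DAG nodes above implement, assuming a 64-bit table of 16
// 4-bit entries indexed by the raw MODE.fp_round value. The helper name is
// hypothetical and <cstdint> is assumed for uint64_t.
static unsigned decodeFltRoundsEntry(uint64_t Table, unsigned HwMode) {
  unsigned Entry = (Table >> (HwMode * 4)) & 0xf; // index scaled to bit offset
  return Entry < 4 ? Entry : Entry + 4;           // extended values offset by 4
}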
4517
4519 SelectionDAG &DAG) const {
4520 SDLoc SL(Op);
4521
4522 SDValue NewMode = Op.getOperand(1);
4523 assert(NewMode.getValueType() == MVT::i32);
4524
4525 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4526 // hardware MODE.fp_round values.
4527 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4528 uint32_t ClampedVal = std::min(
4529 static_cast<uint32_t>(ConstMode->getZExtValue()),
4531 NewMode = DAG.getConstant(
4532 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4533 } else {
4534 // If we know the input can only be one of the supported standard modes in
4535 // the range 0-3, we can use a simplified mapping to hardware values.
4536 KnownBits KB = DAG.computeKnownBits(NewMode);
4537 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4538 // The supported standard values are 0-3. The extended values start at 8. We
4539 // need to offset by 4 if the value is in the extended range.
4540
4541 if (UseReducedTable) {
4542 // Truncate to the low 32-bits.
4543 SDValue BitTable = DAG.getConstant(
4544 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4545
4546 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4547 SDValue RoundModeTimesNumBits =
4548 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4549
4550 NewMode =
4551 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4552
4553 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4554 // the table extracted bits into inline immediates.
4555 } else {
4556 // table_index = umin(value, value - 4)
4557 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
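      // For a standard value (0-3), NewMode - 4 wraps around as unsigned and
      // the UMIN selects NewMode itself; for an extended value (>= 8, see
      // above) it selects NewMode - 4, so e.g. extended value 8 maps to table
      // index 4.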
4558 SDValue BitTable =
4560
4561 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4562 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4563 SDValue IndexVal =
4564 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4565
4566 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4567 SDValue RoundModeTimesNumBits =
4568 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4569
4570 SDValue TableValue =
4571 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4572 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4573
4574 // No need to mask out the high bits since the setreg will ignore them
4575 // anyway.
4576 NewMode = TruncTable;
4577 }
4578
4579 // Insert a readfirstlane in case the value is a VGPR. We could do this
4580 // earlier and keep more operations scalar, but that interferes with
4581 // combining the source.
4582 SDValue ReadFirstLaneID =
4583 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4584 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4585 ReadFirstLaneID, NewMode);
4586 }
4587
4588 // N.B. The setreg will be later folded into s_round_mode on supported
4589 // targets.
4590 SDValue IntrinID =
4591 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4592 uint32_t BothRoundHwReg =
4594 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4595
4596 SDValue SetReg =
4597 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4598 IntrinID, RoundBothImm, NewMode);
4599
4600 return SetReg;
4601}
4602
4604 if (Op->isDivergent() &&
4605 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4606 // Cannot do I$ prefetch with divergent pointer.
4607 return SDValue();
4608
4609 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4613 break;
4615 if (Subtarget->hasSafeSmemPrefetch())
4616 break;
4617 [[fallthrough]];
4618 default:
4619 return SDValue();
4620 }
4621
4622 // I$ prefetch
4623 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4624 return SDValue();
4625
4626 return Op;
4627}
4628
4629// Work around DAG legality rules that are based only on the result type.
4631 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4632 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4633 EVT SrcVT = Src.getValueType();
4634
4635 if (SrcVT.getScalarType() != MVT::bf16)
4636 return Op;
4637
4638 SDLoc SL(Op);
4639 SDValue BitCast =
4640 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4641
4642 EVT DstVT = Op.getValueType();
4643 if (IsStrict)
4644 llvm_unreachable("Need STRICT_BF16_TO_FP");
4645
4646 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4647}
4648
4650 SDLoc SL(Op);
4651 if (Op.getValueType() != MVT::i64)
4652 return Op;
4653
4654 uint32_t ModeHwReg =
4656 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4657 uint32_t TrapHwReg =
4659 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4660
4661 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4662 SDValue IntrinID =
4663 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4664 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4665 Op.getOperand(0), IntrinID, ModeHwRegImm);
4666 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4667 Op.getOperand(0), IntrinID, TrapHwRegImm);
4668 SDValue TokenReg =
4669 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4670 GetTrapReg.getValue(1));
4671
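  // Pack the two 32-bit reads into a single i64: on this little-endian target,
  // element 0 of the v2i32 (the MODE read) ends up in the low 32 bits and
  // element 1 (the trap-status read) in the high 32 bits.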
4672 SDValue CvtPtr =
4673 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4674 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4675
4676 return DAG.getMergeValues({Result, TokenReg}, SL);
4677}
4678
4680 SDLoc SL(Op);
4681 if (Op.getOperand(1).getValueType() != MVT::i64)
4682 return Op;
4683
4684 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4685 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4686 DAG.getConstant(0, SL, MVT::i32));
4687 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4688 DAG.getConstant(1, SL, MVT::i32));
4689
4690 SDValue ReadFirstLaneID =
4691 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4692 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4693 ReadFirstLaneID, NewModeReg);
4694 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4695 ReadFirstLaneID, NewTrapReg);
4696
4697 unsigned ModeHwReg =
4699 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4700 unsigned TrapHwReg =
4702 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4703
4704 SDValue IntrinID =
4705 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4706 SDValue SetModeReg =
4707 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4708 IntrinID, ModeHwRegImm, NewModeReg);
4709 SDValue SetTrapReg =
4710 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4711 IntrinID, TrapHwRegImm, NewTrapReg);
4712 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4713}
4714
4716 const MachineFunction &MF) const {
4717 const Function &Fn = MF.getFunction();
4718
4720 .Case("m0", AMDGPU::M0)
4721 .Case("exec", AMDGPU::EXEC)
4722 .Case("exec_lo", AMDGPU::EXEC_LO)
4723 .Case("exec_hi", AMDGPU::EXEC_HI)
4724 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4725 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4726 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4727 .Default(Register());
4728 if (!Reg)
4729 return Reg;
4730
4731 if (!Subtarget->hasFlatScrRegister() &&
4732 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4733 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
4734 "\" for subtarget."));
4735 }
4736
4737 switch (Reg) {
4738 case AMDGPU::M0:
4739 case AMDGPU::EXEC_LO:
4740 case AMDGPU::EXEC_HI:
4741 case AMDGPU::FLAT_SCR_LO:
4742 case AMDGPU::FLAT_SCR_HI:
4743 if (VT.getSizeInBits() == 32)
4744 return Reg;
4745 break;
4746 case AMDGPU::EXEC:
4747 case AMDGPU::FLAT_SCR:
4748 if (VT.getSizeInBits() == 64)
4749 return Reg;
4750 break;
4751 default:
4752 llvm_unreachable("missing register type checking");
4753 }
4754
4756 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4757}
4758
4759// If kill is not the last instruction, split the block so kill is always a
4760// proper terminator.
4763 MachineBasicBlock *BB) const {
4764 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
4766 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4767 return SplitBB;
4768}
4769
4770// Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4771// \p MI will be the only instruction in the loop body block. Otherwise, it will
4772// be the first instruction in the remainder block.
4773//
4774/// \returns { LoopBody, Remainder }
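// The resulting control flow is:
//
//   MBB -> LoopBB -> RemainderBB
//           ^   |
//           +---+        (LoopBB is its own successor)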
4775static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4777 MachineFunction *MF = MBB.getParent();
4779
4780 // To insert the loop we need to split the block. Move everything after this
4781 // point to a new block, and insert a new empty block between the two.
4783 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4785 ++MBBI;
4786
4787 MF->insert(MBBI, LoopBB);
4788 MF->insert(MBBI, RemainderBB);
4789
4790 LoopBB->addSuccessor(LoopBB);
4791 LoopBB->addSuccessor(RemainderBB);
4792
4793 // Move the rest of the block into a new block.
4794 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4795
4796 if (InstInLoop) {
4797 auto Next = std::next(I);
4798
4799 // Move instruction to loop body.
4800 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4801
4802 // Move the rest of the block.
4803 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4804 } else {
4805 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4806 }
4807
4808 MBB.addSuccessor(LoopBB);
4809
4810 return std::pair(LoopBB, RemainderBB);
4811}
4812
4813/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4815 MachineBasicBlock *MBB = MI.getParent();
4817 auto I = MI.getIterator();
4818 auto E = std::next(I);
4819
4820 // clang-format off
4821 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4822 .addImm(0);
4823 // clang-format on
4824
4825 MIBundleBuilder Bundler(*MBB, I, E);
4826 finalizeBundle(*MBB, Bundler.begin());
4827}
4828
4831 MachineBasicBlock *BB) const {
4832 const DebugLoc &DL = MI.getDebugLoc();
4833
4835
4837
4838 // Apparently kill flags are only valid if the def is in the same block?
4839 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4840 Src->setIsKill(false);
4841
4842 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
4843
4844 MachineBasicBlock::iterator I = LoopBB->end();
4845
4846 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4848
4849 // Clear TRAP_STS.MEM_VIOL
4850 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4851 .addImm(0)
4852 .addImm(EncodedReg);
4853
4855
4856 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4857
4858 // Load and check TRAP_STS.MEM_VIOL
4859 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4860 .addImm(EncodedReg);
4861
4862 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4863 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4864 .addReg(Reg, RegState::Kill)
4865 .addImm(0);
4866 // clang-format off
4867 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4868 .addMBB(LoopBB);
4869 // clang-format on
4870
4871 return RemainderBB;
4872}
4873
4874// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4875// wavefront. If the value is uniform and just happens to be in a VGPR, this
4876// will only do one iteration. In the worst case, this will loop 64 times.
4877//
4878// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
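// In outline, each iteration of the emitted waterfall loop:
//   1. reads the index of the first active lane into an SGPR
//      (V_READFIRSTLANE_B32),
//   2. compares it against every lane's index (V_CMP_EQ_U32),
//   3. uses S_AND_SAVEEXEC to restrict EXEC to the matching lanes,
//   4. uses S_XOR with the saved mask to switch EXEC to the not-yet-handled
//      lanes and loops back while any remain (S_CBRANCH_EXECNZ).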
4881 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4882 const DebugLoc &DL, const MachineOperand &Idx,
4883 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4884 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4885 Register &SGPRIdxReg) {
4886
4887 MachineFunction *MF = OrigBB.getParent();
4888 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4889 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4891
4892 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4893 Register PhiExec = MRI.createVirtualRegister(BoolRC);
4894 Register NewExec = MRI.createVirtualRegister(BoolRC);
4895 Register CurrentIdxReg =
4896 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4897 Register CondReg = MRI.createVirtualRegister(BoolRC);
4898
4899 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4900 .addReg(InitReg)
4901 .addMBB(&OrigBB)
4902 .addReg(ResultReg)
4903 .addMBB(&LoopBB);
4904
4905 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4906 .addReg(InitSaveExecReg)
4907 .addMBB(&OrigBB)
4908 .addReg(NewExec)
4909 .addMBB(&LoopBB);
4910
4911 // Read the next variant <- also loop target.
4912 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4913 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
4914
4915 // Compare the just read M0 value to all possible Idx values.
4916 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4917 .addReg(CurrentIdxReg)
4918 .addReg(Idx.getReg(), 0, Idx.getSubReg());
4919
4920 // Update EXEC, save the original EXEC value to VCC.
4921 BuildMI(LoopBB, I, DL,
4922 TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4923 : AMDGPU::S_AND_SAVEEXEC_B64),
4924 NewExec)
4925 .addReg(CondReg, RegState::Kill);
4926
4927 MRI.setSimpleHint(NewExec, CondReg);
4928
4929 if (UseGPRIdxMode) {
4930 if (Offset == 0) {
4931 SGPRIdxReg = CurrentIdxReg;
4932 } else {
4933 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4934 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4935 .addReg(CurrentIdxReg, RegState::Kill)
4936 .addImm(Offset);
4937 }
4938 } else {
4939 // Move index from VCC into M0
4940 if (Offset == 0) {
4941 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
4942 .addReg(CurrentIdxReg, RegState::Kill);
4943 } else {
4944 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4945 .addReg(CurrentIdxReg, RegState::Kill)
4946 .addImm(Offset);
4947 }
4948 }
4949
4950 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4951 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4952 MachineInstr *InsertPt =
4953 BuildMI(LoopBB, I, DL,
4954 TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4955 : AMDGPU::S_XOR_B64_term),
4956 Exec)
4957 .addReg(Exec)
4958 .addReg(NewExec);
4959
4960 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4961 // s_cbranch_scc0?
4962
4963 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4964 // clang-format off
4965 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4966 .addMBB(&LoopBB);
4967 // clang-format on
4968
4969 return InsertPt->getIterator();
4970}
4971
4972// This has slightly sub-optimal regalloc when the source vector is killed by
4973// the read. The register allocator does not understand that the kill is
4974// per-workitem, so the source vector is kept alive for the whole loop and we
4975// end up not re-using a subregister from it, using 1 more VGPR than necessary.
4976// This extra VGPR was saved when this was expanded after register allocation.
4979 unsigned InitResultReg, unsigned PhiReg, int Offset,
4980 bool UseGPRIdxMode, Register &SGPRIdxReg) {
4981 MachineFunction *MF = MBB.getParent();
4982 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4983 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4985 const DebugLoc &DL = MI.getDebugLoc();
4987
4988 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
4989 Register DstReg = MI.getOperand(0).getReg();
4990 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4991 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
4992 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4993 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4994
4995 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4996
4997 // Save the EXEC mask
4998 // clang-format off
4999 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
5000 .addReg(Exec);
5001 // clang-format on
5002
5003 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
5004
5005 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5006
5007 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
5008 InitResultReg, DstReg, PhiReg, TmpExec,
5009 Offset, UseGPRIdxMode, SGPRIdxReg);
5010
5011 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
5012 MachineFunction::iterator MBBI(LoopBB);
5013 ++MBBI;
5014 MF->insert(MBBI, LandingPad);
5015 LoopBB->removeSuccessor(RemainderBB);
5016 LandingPad->addSuccessor(RemainderBB);
5017 LoopBB->addSuccessor(LandingPad);
5018 MachineBasicBlock::iterator First = LandingPad->begin();
5019 // clang-format off
5020 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
5021 .addReg(SaveExec);
5022 // clang-format on
5023
5024 return InsPt;
5025}
5026
5027// Returns subreg index, offset
5028static std::pair<unsigned, int>
5029 computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
5030 const TargetRegisterClass *SuperRC, unsigned VecReg,
5031 int Offset) {
5032 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
5033
5034 // Skip out of bounds offsets, or else we would end up using an undefined
5035 // register.
5036 if (Offset >= NumElts || Offset < 0)
5037 return std::pair(AMDGPU::sub0, Offset);
5038
5039 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
5040}
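// For example, with a 128-bit (4 x 32-bit) super-register class, an offset of
// 2 maps to (sub2, 0), a statically known subregister, while an offset of 7 is
// out of bounds and comes back unchanged as (sub0, 7).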
5041
5042 static void setM0ToIndexFromSGPR(const SIInstrInfo *TII,
5043 MachineRegisterInfo &MRI, MachineInstr &MI,
5044 int Offset) {
5045 MachineBasicBlock *MBB = MI.getParent();
5046 const DebugLoc &DL = MI.getDebugLoc();
5047 MachineBasicBlock::iterator I(&MI);
5048
5049 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5050
5051 assert(Idx->getReg() != AMDGPU::NoRegister);
5052
5053 if (Offset == 0) {
5054 // clang-format off
5055 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5056 .add(*Idx);
5057 // clang-format on
5058 } else {
5059 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5060 .add(*Idx)
5061 .addImm(Offset);
5062 }
5063}
5064
5065 static Register getIndirectSGPRIdx(const SIInstrInfo *TII,
5066 MachineRegisterInfo &MRI, MachineInstr &MI,
5067 int Offset) {
5068 MachineBasicBlock *MBB = MI.getParent();
5069 const DebugLoc &DL = MI.getDebugLoc();
5070 MachineBasicBlock::iterator I(&MI);
5071
5072 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5073
5074 if (Offset == 0)
5075 return Idx->getReg();
5076
5077 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5078 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
5079 .add(*Idx)
5080 .addImm(Offset);
5081 return Tmp;
5082}
5083
5084 static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
5085 MachineBasicBlock &MBB,
5086 const GCNSubtarget &ST) {
5087 const SIInstrInfo *TII = ST.getInstrInfo();
5088 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5089 MachineFunction *MF = MBB.getParent();
5090 MachineRegisterInfo &MRI = MF->getRegInfo();
5091
5092 Register Dst = MI.getOperand(0).getReg();
5093 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5094 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5095 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5096
5097 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
5098 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5099
5100 unsigned SubReg;
5101 std::tie(SubReg, Offset) =
5102 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
5103
5104 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5105
5106 // Check for a SGPR index.
5107 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5108 MachineBasicBlock::iterator I(&MI);
5109 const DebugLoc &DL = MI.getDebugLoc();
5110
5111 if (UseGPRIdxMode) {
5112 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5113 // to avoid interfering with other uses, so probably requires a new
5114 // optimization pass.
5115 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
5116
5117 const MCInstrDesc &GPRIDXDesc =
5118 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5119 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5120 .addReg(SrcReg)
5121 .addReg(Idx)
5122 .addImm(SubReg);
5123 } else {
5124 setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
5125
5126 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5127 .addReg(SrcReg, 0, SubReg)
5128 .addReg(SrcReg, RegState::Implicit);
5129 }
5130
5131 MI.eraseFromParent();
5132
5133 return &MBB;
5134 }
5135
5136 // Control flow needs to be inserted if indexing with a VGPR.
5137 const DebugLoc &DL = MI.getDebugLoc();
5138 MachineBasicBlock::iterator I(&MI);
5139
5140 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5141 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5142
5143 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
5144
5145 Register SGPRIdxReg;
5146 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
5147 UseGPRIdxMode, SGPRIdxReg);
5148
5149 MachineBasicBlock *LoopBB = InsPt->getParent();
5150
5151 if (UseGPRIdxMode) {
5152 const MCInstrDesc &GPRIDXDesc =
5153 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5154
5155 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5156 .addReg(SrcReg)
5157 .addReg(SGPRIdxReg)
5158 .addImm(SubReg);
5159 } else {
5160 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5161 .addReg(SrcReg, 0, SubReg)
5162 .addReg(SrcReg, RegState::Implicit);
5163 }
5164
5165 MI.eraseFromParent();
5166
5167 return LoopBB;
5168}
5169
5170 static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
5171 MachineBasicBlock &MBB,
5172 const GCNSubtarget &ST) {
5173 const SIInstrInfo *TII = ST.getInstrInfo();
5174 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5175 MachineFunction *MF = MBB.getParent();
5176 MachineRegisterInfo &MRI = MF->getRegInfo();
5177
5178 Register Dst = MI.getOperand(0).getReg();
5179 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
5180 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5181 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
5182 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5183 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
5184 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5185
5186 // This can be an immediate, but will be folded later.
5187 assert(Val->getReg());
5188
5189 unsigned SubReg;
5190 std::tie(SubReg, Offset) =
5191 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
5192 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5193
5194 if (Idx->getReg() == AMDGPU::NoRegister) {
5195 MachineBasicBlock::iterator I(&MI);
5196 const DebugLoc &DL = MI.getDebugLoc();
5197
5198 assert(Offset == 0);
5199
5200 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
5201 .add(*SrcVec)
5202 .add(*Val)
5203 .addImm(SubReg);
5204
5205 MI.eraseFromParent();
5206 return &MBB;
5207 }
5208
5209 // Check for a SGPR index.
5210 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5211 MachineBasicBlock::iterator I(&MI);
5212 const DebugLoc &DL = MI.getDebugLoc();
5213
5214 if (UseGPRIdxMode) {
5215 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
5216
5217 const MCInstrDesc &GPRIDXDesc =
5218 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5219 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5220 .addReg(SrcVec->getReg())
5221 .add(*Val)
5222 .addReg(Idx)
5223 .addImm(SubReg);
5224 } else {
5225 setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
5226
5227 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5228 TRI.getRegSizeInBits(*VecRC), 32, false);
5229 BuildMI(MBB, I, DL, MovRelDesc, Dst)
5230 .addReg(SrcVec->getReg())
5231 .add(*Val)
5232 .addImm(SubReg);
5233 }
5234 MI.eraseFromParent();
5235 return &MBB;
5236 }
5237
5238 // Control flow needs to be inserted if indexing with a VGPR.
5239 if (Val->isReg())
5240 MRI.clearKillFlags(Val->getReg());
5241
5242 const DebugLoc &DL = MI.getDebugLoc();
5243
5244 Register PhiReg = MRI.createVirtualRegister(VecRC);
5245
5246 Register SGPRIdxReg;
5247 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5248 UseGPRIdxMode, SGPRIdxReg);
5249 MachineBasicBlock *LoopBB = InsPt->getParent();
5250
5251 if (UseGPRIdxMode) {
5252 const MCInstrDesc &GPRIDXDesc =
5253 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5254
5255 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5256 .addReg(PhiReg)
5257 .add(*Val)
5258 .addReg(SGPRIdxReg)
5259 .addImm(SubReg);
5260 } else {
5261 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5262 TRI.getRegSizeInBits(*VecRC), 32, false);
5263 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5264 .addReg(PhiReg)
5265 .add(*Val)
5266 .addImm(SubReg);
5267 }
5268
5269 MI.eraseFromParent();
5270 return LoopBB;
5271}
5272
5273 static MachineBasicBlock *Expand64BitScalarArithmetic(MachineInstr &MI,
5274 MachineBasicBlock *BB) {
5275 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5276 // For GFX12, we emit s_add_u64 and s_sub_u64.
5277 MachineFunction *MF = BB->getParent();
5278 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5279 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5280 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5281 const DebugLoc &DL = MI.getDebugLoc();
5282 MachineOperand &Dest = MI.getOperand(0);
5283 MachineOperand &Src0 = MI.getOperand(1);
5284 MachineOperand &Src1 = MI.getOperand(2);
5285 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5286 if (ST.hasScalarAddSub64()) {
5287 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5288 // clang-format off
5289 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5290 .add(Src0)
5291 .add(Src1);
5292 // clang-format on
5293 } else {
5294 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5295 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5296
5297 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5298 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5299
5300 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5301 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5302 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5303 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5304
5305 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5306 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5307 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5308 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5309
5310 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5311 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5312 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
5313 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
5314 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5315 .addReg(DestSub0)
5316 .addImm(AMDGPU::sub0)
5317 .addReg(DestSub1)
5318 .addImm(AMDGPU::sub1);
5319 }
5320 MI.eraseFromParent();
5321 return BB;
5322}
5323
5324 static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
5325 switch (Opc) {
5326 case AMDGPU::S_MIN_U32:
5327 return std::numeric_limits<uint32_t>::max();
5328 case AMDGPU::S_MIN_I32:
5329 return std::numeric_limits<int32_t>::max();
5330 case AMDGPU::S_MAX_U32:
5331 return std::numeric_limits<uint32_t>::min();
5332 case AMDGPU::S_MAX_I32:
5333 return std::numeric_limits<int32_t>::min();
5334 case AMDGPU::S_ADD_I32:
5335 case AMDGPU::S_SUB_I32:
5336 case AMDGPU::S_OR_B32:
5337 case AMDGPU::S_XOR_B32:
5338 return std::numeric_limits<uint32_t>::min();
5339 case AMDGPU::S_AND_B32:
5340 return std::numeric_limits<uint32_t>::max();
5341 default:
5343 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5344 }
5345}
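// The identity is the value for which (identity op x) == x, so it can safely
// seed the accumulator before any active lane is folded in: e.g. UINT32_MAX
// for umin/and, 0 for umax/add/sub/or/xor.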
5346
5347 static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) {
5348 switch (Opc) {
5349 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5350 return std::numeric_limits<uint64_t>::max();
5351 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5352 return std::numeric_limits<int64_t>::max();
5353 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5354 return std::numeric_limits<uint64_t>::min();
5355 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5356 return std::numeric_limits<int64_t>::min();
5357 case AMDGPU::S_ADD_U64_PSEUDO:
5358 case AMDGPU::S_SUB_U64_PSEUDO:
5359 return std::numeric_limits<uint64_t>::min();
5360 default:
5362 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5363 }
5364}
5365
5366static bool is32bitWaveReduceOperation(unsigned Opc) {
5367 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5368 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5369 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5370 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5371 Opc == AMDGPU::S_XOR_B32;
5372}
5373
5374 static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5375 MachineBasicBlock &BB,
5376 const GCNSubtarget &ST,
5377 unsigned Opc) {
5378 MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
5379 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5380 const DebugLoc &DL = MI.getDebugLoc();
5381 const SIInstrInfo *TII = ST.getInstrInfo();
5382
5383 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5384 Register SrcReg = MI.getOperand(1).getReg();
5385 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5386 Register DstReg = MI.getOperand(0).getReg();
5387 MachineBasicBlock *RetBB = nullptr;
5388 if (isSGPR) {
5389 switch (Opc) {
5390 case AMDGPU::S_MIN_U32:
5391 case AMDGPU::S_MIN_I32:
5392 case AMDGPU::S_MAX_U32:
5393 case AMDGPU::S_MAX_I32:
5394 case AMDGPU::S_AND_B32:
5395 case AMDGPU::S_OR_B32: {
5396 // Idempotent operations.
5397 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5398 RetBB = &BB;
5399 break;
5400 }
5401 case AMDGPU::V_CMP_LT_U64_e64: // umin
5402 case AMDGPU::V_CMP_LT_I64_e64: // min
5403 case AMDGPU::V_CMP_GT_U64_e64: // umax
5404 case AMDGPU::V_CMP_GT_I64_e64: { // max
5405 // Idempotent operations.
5406 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
5407 RetBB = &BB;
5408 break;
5409 }
5410 case AMDGPU::S_XOR_B32:
5411 case AMDGPU::S_ADD_I32:
5412 case AMDGPU::S_ADD_U64_PSEUDO:
5413 case AMDGPU::S_SUB_I32:
5414 case AMDGPU::S_SUB_U64_PSEUDO: {
5415 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5416 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5417 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5418 Register NumActiveLanes =
5419 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5420
5421 bool IsWave32 = ST.isWave32();
5422 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5423 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5424 unsigned BitCountOpc =
5425 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5426
5427 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5428
5429 auto NewAccumulator =
5430 BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
5431 .addReg(ExecMask);
5432
5433 switch (Opc) {
5434 case AMDGPU::S_XOR_B32: {
5435 // Performing an XOR operation on a uniform value
5436 // depends on the parity of the number of active lanes.
5437 // For even parity, the result will be 0, for odd
5438 // parity the result will be the same as the input value.
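// E.g. three active lanes give x ^ x ^ x = x while four give 0, so the result
// below is simply SrcReg * (NumActiveLanes & 1).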
5439 Register ParityRegister =
5440 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5441
5442 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5443 .addReg(NewAccumulator->getOperand(0).getReg())
5444 .addImm(1)
5445 .setOperandDead(3); // Dead scc
5446 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5447 .addReg(SrcReg)
5448 .addReg(ParityRegister);
5449 break;
5450 }
5451 case AMDGPU::S_SUB_I32: {
5452 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5453
5454 // Take the negation of the source operand.
5455 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5456 .addImm(0)
5457 .addReg(SrcReg);
5458 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5459 .addReg(NegatedVal)
5460 .addReg(NewAccumulator->getOperand(0).getReg());
5461 break;
5462 }
5463 case AMDGPU::S_ADD_I32: {
5464 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5465 .addReg(SrcReg)
5466 .addReg(NewAccumulator->getOperand(0).getReg());
5467 break;
5468 }
5469 case AMDGPU::S_ADD_U64_PSEUDO:
5470 case AMDGPU::S_SUB_U64_PSEUDO: {
5471 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5472 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5473 Register Op1H_Op0L_Reg =
5474 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5475 Register Op1L_Op0H_Reg =
5476 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5477 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5478 Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5479 Register NegatedValLo =
5480 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5481 Register NegatedValHi =
5482 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5483
5484 const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
5485 const TargetRegisterClass *Src1SubRC =
5486 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5487
5488 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5489 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5490 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5491 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5492
5493 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5494 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5495 .addImm(0)
5496 .addReg(NewAccumulator->getOperand(0).getReg())
5497 .setOperandDead(3); // Dead scc
5498 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5499 .addReg(NegatedValLo)
5500 .addImm(31)
5501 .setOperandDead(3); // Dead scc
5502 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5503 .add(Op1L)
5504 .addReg(NegatedValHi);
5505 }
5506 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5507 ? NegatedValLo
5508 : NewAccumulator->getOperand(0).getReg();
5509 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5510 .add(Op1L)
5511 .addReg(LowOpcode);
5512 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5513 .add(Op1L)
5514 .addReg(LowOpcode);
5515 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5516 .add(Op1H)
5517 .addReg(LowOpcode);
5518
5519 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5520 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5521 .addReg(CarryReg)
5522 .addReg(Op1H_Op0L_Reg)
5523 .setOperandDead(3); // Dead scc
5524
5525 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5526 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5527 .addReg(HiVal)
5528 .addReg(Op1L_Op0H_Reg)
5529 .setOperandDead(3); // Dead scc
5530 }
5531 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5532 .addReg(DestSub0)
5533 .addImm(AMDGPU::sub0)
5534 .addReg(DestSub1)
5535 .addImm(AMDGPU::sub1);
5536 break;
5537 }
5538 }
5539 RetBB = &BB;
5540 }
5541 }
5542 } else {
5543 // TODO: Implement the DPP strategy and switch based on the immediate
5544 // strategy operand. For now, for all the cases (default, Iterative and
5545 // DPP) we use the iterative approach by default.
5546
5547 // To reduce the VGPR using the iterative approach, we need to iterate
5548 // over all the active lanes. Lowering consists of a ComputeLoop,
5549 // which iterates over only the active lanes. We use a copy of the EXEC
5550 // register as the induction variable, and every active lane clears its bit
5551 // with bitset0 so that we get the next active lane in the next iteration.
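// Roughly, each ComputeLoop iteration built below is: accumulator/active-bits
// PHIs, S_FF1 to find the lowest remaining active lane, V_READLANE to pull
// that lane's value into an SGPR, the scalar reduction op folding it into the
// accumulator, S_BITSET0 to retire the lane, and an S_CMP/S_CBRANCH_SCC1 pair
// that loops while any active bits remain.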
5552 MachineBasicBlock::iterator I = BB.end();
5553 Register SrcReg = MI.getOperand(1).getReg();
5554 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5555
5556 // Create Control flow for loop
5557 // Split MI's Machine Basic block into For loop
5558 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
5559
5560 // Create virtual registers required for lowering.
5561 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5562 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5563 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5564 Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
5565 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5566 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5567 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5568 Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5569 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
5570
5571 bool IsWave32 = ST.isWave32();
5572 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5573 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5574
5575 // Create initial values of induction variable from Exec, Accumulator and
5576 // insert branch instr to newly created ComputeBlock
5577 BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
5578 if (is32BitOpc) {
5579 uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc);
5580 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5581 .addImm(IdentityValue);
5582 } else {
5583 uint64_t IdentityValue = getIdentityValueFor64BitWaveReduction(Opc);
5584 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
5585 .addImm(IdentityValue);
5586 }
5587 // clang-format off
5588 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5589 .addMBB(ComputeLoop);
5590 // clang-format on
5591
5592 // Start constructing ComputeLoop
5593 I = ComputeLoop->begin();
5594 auto Accumulator =
5595 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5596 .addReg(IdentityValReg)
5597 .addMBB(&BB);
5598 auto ActiveBits =
5599 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5600 .addReg(LoopIterator)
5601 .addMBB(&BB);
5602
5603 I = ComputeLoop->end();
5604 MachineInstr *NewAccumulator;
5605 // Perform the computations
5606 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5607 BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5608 .addReg(ActiveBitsReg);
5609 if (is32BitOpc) {
5610 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5611 LaneValueReg)
5612 .addReg(SrcReg)
5613 .addReg(FF1Reg);
5614 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5615 .addReg(Accumulator->getOperand(0).getReg())
5616 .addReg(LaneValueReg);
5617 } else {
5618 Register LaneValueLoReg =
5619 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5620 Register LaneValueHiReg =
5621 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5622 Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5623 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5624 const TargetRegisterClass *SrcSubRC =
5625 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5626 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5627 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5628 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5629 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5630 // lane value input should be in an sgpr
5631 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5632 LaneValueLoReg)
5633 .add(Op1L)
5634 .addReg(FF1Reg);
5635 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5636 LaneValueHiReg)
5637 .add(Op1H)
5638 .addReg(FF1Reg);
5639 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5640 TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
5641 .addReg(LaneValueLoReg)
5642 .addImm(AMDGPU::sub0)
5643 .addReg(LaneValueHiReg)
5644 .addImm(AMDGPU::sub1);
5645 switch (Opc) {
5646 case AMDGPU::V_CMP_GT_I64_e64:
5647 case AMDGPU::V_CMP_GT_U64_e64:
5648 case AMDGPU::V_CMP_LT_I64_e64:
5649 case AMDGPU::V_CMP_LT_U64_e64: {
5650 Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
5651 Register ComparisonResultReg =
5652 MRI.createVirtualRegister(WaveMaskRegClass);
5653 const TargetRegisterClass *VregClass = TRI->getVGPR64Class();
5654 const TargetRegisterClass *VSubRegClass =
5655 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
5656 Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
5657 MachineOperand SrcReg0Sub0 =
5658 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5659 VregClass, AMDGPU::sub0, VSubRegClass);
5660 MachineOperand SrcReg0Sub1 =
5661 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5662 VregClass, AMDGPU::sub1, VSubRegClass);
5663 BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
5664 AccumulatorVReg)
5665 .add(SrcReg0Sub0)
5666 .addImm(AMDGPU::sub0)
5667 .add(SrcReg0Sub1)
5668 .addImm(AMDGPU::sub1);
5669 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
5670 .addReg(LaneValue->getOperand(0).getReg())
5671 .addReg(AccumulatorVReg);
5672
5673 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5674 BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
5675 .addReg(LaneMaskReg)
5676 .addReg(ActiveBitsReg);
5677
5678 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5679 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
5680 .addReg(LaneValue->getOperand(0).getReg())
5681 .addReg(Accumulator->getOperand(0).getReg());
5682 break;
5683 }
5684 case AMDGPU::S_ADD_U64_PSEUDO:
5685 case AMDGPU::S_SUB_U64_PSEUDO: {
5686 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5687 .addReg(Accumulator->getOperand(0).getReg())
5688 .addReg(LaneValue->getOperand(0).getReg());
5689 ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
5690 break;
5691 }
5692 }
5693 }
5694 // Manipulate the iterator to get the next active lane
5695 unsigned BITSETOpc =
5696 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5697 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5698 .addReg(FF1Reg)
5699 .addReg(ActiveBitsReg);
5700
5701 // Add phi nodes
5702 Accumulator.addReg(DstReg).addMBB(ComputeLoop);
5703 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
5704
5705 // Creating branching
5706 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5707 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5708 .addReg(NewActiveBitsReg)
5709 .addImm(0);
5710 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5711 .addMBB(ComputeLoop);
5712
5713 RetBB = ComputeEnd;
5714 }
5715 MI.eraseFromParent();
5716 return RetBB;
5717}
5718
5719 MachineBasicBlock *
5720 SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
5721 MachineBasicBlock *BB) const {
5722
5723 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5724 MachineFunction *MF = BB->getParent();
5725 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
5726
5727 switch (MI.getOpcode()) {
5728 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5729 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5730 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
5731 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
5732 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5733 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
5734 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
5735 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
5736 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5737 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5738 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
5739 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
5740 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5741 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
5742 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
5743 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
5744 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5745 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5746 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
5747 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
5748 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5749 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5750 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
5751 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
5752 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5753 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
5754 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5755 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
5756 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5757 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
5758 case AMDGPU::S_UADDO_PSEUDO:
5759 case AMDGPU::S_USUBO_PSEUDO: {
5760 const DebugLoc &DL = MI.getDebugLoc();
5761 MachineOperand &Dest0 = MI.getOperand(0);
5762 MachineOperand &Dest1 = MI.getOperand(1);
5763 MachineOperand &Src0 = MI.getOperand(2);
5764 MachineOperand &Src1 = MI.getOperand(3);
5765
5766 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5767 ? AMDGPU::S_ADD_I32
5768 : AMDGPU::S_SUB_I32;
5769 // clang-format off
5770 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
5771 .add(Src0)
5772 .add(Src1);
5773 // clang-format on
5774
5775 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
5776 .addImm(1)
5777 .addImm(0);
5778
5779 MI.eraseFromParent();
5780 return BB;
5781 }
5782 case AMDGPU::S_ADD_U64_PSEUDO:
5783 case AMDGPU::S_SUB_U64_PSEUDO: {
5784 return Expand64BitScalarArithmetic(MI, BB);
5785 }
5786 case AMDGPU::V_ADD_U64_PSEUDO:
5787 case AMDGPU::V_SUB_U64_PSEUDO: {
5788 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5789 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5790 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5791 const DebugLoc &DL = MI.getDebugLoc();
5792
5793 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5794
5795 MachineOperand &Dest = MI.getOperand(0);
5796 MachineOperand &Src0 = MI.getOperand(1);
5797 MachineOperand &Src1 = MI.getOperand(2);
5798
5799 if (ST.hasAddSubU64Insts()) {
5800 auto I = BuildMI(*BB, MI, DL,
5801 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
5802 : AMDGPU::V_SUB_U64_e64),
5803 Dest.getReg())
5804 .add(Src0)
5805 .add(Src1)
5806 .addImm(0); // clamp
5807 TII->legalizeOperands(*I);
5808 MI.eraseFromParent();
5809 return BB;
5810 }
5811
5812 if (IsAdd && ST.hasLshlAddU64Inst()) {
5813 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
5814 Dest.getReg())
5815 .add(Src0)
5816 .addImm(0)
5817 .add(Src1);
5818 TII->legalizeOperands(*Add);
5819 MI.eraseFromParent();
5820 return BB;
5821 }
5822
5823 const auto *CarryRC = TRI->getWaveMaskRegClass();
5824
5825 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5826 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5827
5828 Register CarryReg = MRI.createVirtualRegister(CarryRC);
5829 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
5830
5831 const TargetRegisterClass *Src0RC = Src0.isReg()
5832 ? MRI.getRegClass(Src0.getReg())
5833 : &AMDGPU::VReg_64RegClass;
5834 const TargetRegisterClass *Src1RC = Src1.isReg()
5835 ? MRI.getRegClass(Src1.getReg())
5836 : &AMDGPU::VReg_64RegClass;
5837
5838 const TargetRegisterClass *Src0SubRC =
5839 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5840 const TargetRegisterClass *Src1SubRC =
5841 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5842
5843 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
5844 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5845 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
5846 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5847
5848 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
5849 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5850 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
5851 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5852
5853 unsigned LoOpc =
5854 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5855 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5856 .addReg(CarryReg, RegState::Define)
5857 .add(SrcReg0Sub0)
5858 .add(SrcReg1Sub0)
5859 .addImm(0); // clamp bit
5860
5861 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5862 MachineInstr *HiHalf =
5863 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5864 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
5865 .add(SrcReg0Sub1)
5866 .add(SrcReg1Sub1)
5867 .addReg(CarryReg, RegState::Kill)
5868 .addImm(0); // clamp bit
5869
5870 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5871 .addReg(DestSub0)
5872 .addImm(AMDGPU::sub0)
5873 .addReg(DestSub1)
5874 .addImm(AMDGPU::sub1);
5875 TII->legalizeOperands(*LoHalf);
5876 TII->legalizeOperands(*HiHalf);
5877 MI.eraseFromParent();
5878 return BB;
5879 }
5880 case AMDGPU::S_ADD_CO_PSEUDO:
5881 case AMDGPU::S_SUB_CO_PSEUDO: {
5882 // This pseudo has a chance to be selected
5883 // only from a uniform add/subcarry node. All the VGPR operands
5884 // are therefore assumed to be splat vectors.
5885 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5886 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5887 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5888 MachineBasicBlock::iterator MII = MI;
5889 const DebugLoc &DL = MI.getDebugLoc();
5890 MachineOperand &Dest = MI.getOperand(0);
5891 MachineOperand &CarryDest = MI.getOperand(1);
5892 MachineOperand &Src0 = MI.getOperand(2);
5893 MachineOperand &Src1 = MI.getOperand(3);
5894 MachineOperand &Src2 = MI.getOperand(4);
5895 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5896 ? AMDGPU::S_ADDC_U32
5897 : AMDGPU::S_SUBB_U32;
5898 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5899 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5900 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5901 .addReg(Src0.getReg());
5902 Src0.setReg(RegOp0);
5903 }
5904 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5905 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5906 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5907 .addReg(Src1.getReg());
5908 Src1.setReg(RegOp1);
5909 }
5910 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5911 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
5912 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5913 .addReg(Src2.getReg());
5914 Src2.setReg(RegOp2);
5915 }
5916
5917 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
5918 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5919 assert(WaveSize == 64 || WaveSize == 32);
5920
5921 if (WaveSize == 64) {
5922 if (ST.hasScalarCompareEq64()) {
5923 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5924 .addReg(Src2.getReg())
5925 .addImm(0);
5926 } else {
5927 const TargetRegisterClass *SubRC =
5928 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5929 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5930 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5931 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5932 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5933 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5934
5935 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5936 .add(Src2Sub0)
5937 .add(Src2Sub1);
5938
5939 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5940 .addReg(Src2_32, RegState::Kill)
5941 .addImm(0);
5942 }
5943 } else {
5944 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5945 .addReg(Src2.getReg())
5946 .addImm(0);
5947 }
5948
5949 // clang-format off
5950 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg())
5951 .add(Src0)
5952 .add(Src1);
5953 // clang-format on
5954
5955 unsigned SelOpc =
5956 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5957
5958 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
5959 .addImm(-1)
5960 .addImm(0);
5961
5962 MI.eraseFromParent();
5963 return BB;
5964 }
5965 case AMDGPU::SI_INIT_M0: {
5966 MachineOperand &M0Init = MI.getOperand(0);
5967 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5968 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
5969 AMDGPU::M0)
5970 .add(M0Init);
5971 MI.eraseFromParent();
5972 return BB;
5973 }
5974 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
5975 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
5976 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5977 TII->get(AMDGPU::S_CMP_EQ_U32))
5978 .addImm(0)
5979 .addImm(0);
5980 return BB;
5981 }
5982 case AMDGPU::GET_GROUPSTATICSIZE: {
5983 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5984 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5985 DebugLoc DL = MI.getDebugLoc();
5986 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
5987 .add(MI.getOperand(0))
5988 .addImm(MFI->getLDSSize());
5989 MI.eraseFromParent();
5990 return BB;
5991 }
5992 case AMDGPU::GET_SHADERCYCLESHILO: {
5994 MachineRegisterInfo &MRI = MF->getRegInfo();
5995 const DebugLoc &DL = MI.getDebugLoc();
5996 // The algorithm is:
5997 //
5998 // hi1 = getreg(SHADER_CYCLES_HI)
5999 // lo1 = getreg(SHADER_CYCLES_LO)
6000 // hi2 = getreg(SHADER_CYCLES_HI)
6001 //
6002 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
6003 // Otherwise there was overflow and the result is hi2:0. In both cases the
6004 // result should represent the actual time at some point during the sequence
6005 // of three getregs.
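// Worked example: if the counter's high half changes between the first and
// last read (hi1 != hi2), lo1 may belong to the old epoch, so hi2:0 is used
// instead; that is still a value the counter actually held at some point
// during the three reads.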
6006 using namespace AMDGPU::Hwreg;
6007 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6008 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
6009 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6010 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6011 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
6012 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6013 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6014 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
6015 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6016 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
6017 .addReg(RegHi1)
6018 .addReg(RegHi2);
6019 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6020 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
6021 .addReg(RegLo1)
6022 .addImm(0);
6023 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
6024 .add(MI.getOperand(0))
6025 .addReg(RegLo)
6026 .addImm(AMDGPU::sub0)
6027 .addReg(RegHi2)
6028 .addImm(AMDGPU::sub1);
6029 MI.eraseFromParent();
6030 return BB;
6031 }
6032 case AMDGPU::SI_INDIRECT_SRC_V1:
6033 case AMDGPU::SI_INDIRECT_SRC_V2:
6034 case AMDGPU::SI_INDIRECT_SRC_V4:
6035 case AMDGPU::SI_INDIRECT_SRC_V8:
6036 case AMDGPU::SI_INDIRECT_SRC_V9:
6037 case AMDGPU::SI_INDIRECT_SRC_V10:
6038 case AMDGPU::SI_INDIRECT_SRC_V11:
6039 case AMDGPU::SI_INDIRECT_SRC_V12:
6040 case AMDGPU::SI_INDIRECT_SRC_V16:
6041 case AMDGPU::SI_INDIRECT_SRC_V32:
6042 return emitIndirectSrc(MI, *BB, *getSubtarget());
6043 case AMDGPU::SI_INDIRECT_DST_V1:
6044 case AMDGPU::SI_INDIRECT_DST_V2:
6045 case AMDGPU::SI_INDIRECT_DST_V4:
6046 case AMDGPU::SI_INDIRECT_DST_V8:
6047 case AMDGPU::SI_INDIRECT_DST_V9:
6048 case AMDGPU::SI_INDIRECT_DST_V10:
6049 case AMDGPU::SI_INDIRECT_DST_V11:
6050 case AMDGPU::SI_INDIRECT_DST_V12:
6051 case AMDGPU::SI_INDIRECT_DST_V16:
6052 case AMDGPU::SI_INDIRECT_DST_V32:
6053 return emitIndirectDst(MI, *BB, *getSubtarget());
6054 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6055 case AMDGPU::SI_KILL_I1_PSEUDO:
6056 return splitKillBlock(MI, BB);
6057 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6058 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
6059 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
6060 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6061
6062 Register Dst = MI.getOperand(0).getReg();
6063 const MachineOperand &Src0 = MI.getOperand(1);
6064 const MachineOperand &Src1 = MI.getOperand(2);
6065 const DebugLoc &DL = MI.getDebugLoc();
6066 Register SrcCond = MI.getOperand(3).getReg();
6067
6068 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6069 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6070 const auto *CondRC = TRI->getWaveMaskRegClass();
6071 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
6072
6073 const TargetRegisterClass *Src0RC = Src0.isReg()
6074 ? MRI.getRegClass(Src0.getReg())
6075 : &AMDGPU::VReg_64RegClass;
6076 const TargetRegisterClass *Src1RC = Src1.isReg()
6077 ? MRI.getRegClass(Src1.getReg())
6078 : &AMDGPU::VReg_64RegClass;
6079
6080 const TargetRegisterClass *Src0SubRC =
6081 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6082 const TargetRegisterClass *Src1SubRC =
6083 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6084
6085 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
6086 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6087 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
6088 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6089
6090 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
6091 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6092 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
6093 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6094
6095 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
6096 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
6097 .addImm(0)
6098 .add(Src0Sub0)
6099 .addImm(0)
6100 .add(Src1Sub0)
6101 .addReg(SrcCondCopy);
6102 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
6103 .addImm(0)
6104 .add(Src0Sub1)
6105 .addImm(0)
6106 .add(Src1Sub1)
6107 .addReg(SrcCondCopy);
6108
6109 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
6110 .addReg(DstLo)
6111 .addImm(AMDGPU::sub0)
6112 .addReg(DstHi)
6113 .addImm(AMDGPU::sub1);
6114 MI.eraseFromParent();
6115 return BB;
6116 }
6117 case AMDGPU::SI_BR_UNDEF: {
6119 const DebugLoc &DL = MI.getDebugLoc();
6120 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
6121 .add(MI.getOperand(0));
6122 Br->getOperand(1).setIsUndef(); // read undef SCC
6123 MI.eraseFromParent();
6124 return BB;
6125 }
6126 case AMDGPU::ADJCALLSTACKUP:
6127 case AMDGPU::ADJCALLSTACKDOWN: {
6128 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6129 MachineInstrBuilder MIB(*MF, &MI);
6130 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
6131 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
6132 return BB;
6133 }
6134 case AMDGPU::SI_CALL_ISEL: {
6136 const DebugLoc &DL = MI.getDebugLoc();
6137
6138 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
6139
6140 MachineInstrBuilder MIB;
6141 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6142
6143 for (const MachineOperand &MO : MI.operands())
6144 MIB.add(MO);
6145
6146 MIB.cloneMemRefs(MI);
6147 MI.eraseFromParent();
6148 return BB;
6149 }
6150 case AMDGPU::V_ADD_CO_U32_e32:
6151 case AMDGPU::V_SUB_CO_U32_e32:
6152 case AMDGPU::V_SUBREV_CO_U32_e32: {
6153 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
6154 const DebugLoc &DL = MI.getDebugLoc();
6155 unsigned Opc = MI.getOpcode();
6156
6157 bool NeedClampOperand = false;
6158 if (TII->pseudoToMCOpcode(Opc) == -1) {
6159 Opc = AMDGPU::getVOPe64(Opc);
6160 NeedClampOperand = true;
6161 }
6162
6163 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
6164 if (TII->isVOP3(*I)) {
6165 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
6166 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6167 I.addReg(TRI->getVCC(), RegState::Define);
6168 }
6169 I.add(MI.getOperand(1)).add(MI.getOperand(2));
6170 if (NeedClampOperand)
6171 I.addImm(0); // clamp bit for e64 encoding
6172
6173 TII->legalizeOperands(*I);
6174
6175 MI.eraseFromParent();
6176 return BB;
6177 }
6178 case AMDGPU::V_ADDC_U32_e32:
6179 case AMDGPU::V_SUBB_U32_e32:
6180 case AMDGPU::V_SUBBREV_U32_e32:
6181 // These instructions have an implicit use of vcc which counts towards the
6182 // constant bus limit.
6183 TII->legalizeOperands(MI);
6184 return BB;
6185 case AMDGPU::DS_GWS_INIT:
6186 case AMDGPU::DS_GWS_SEMA_BR:
6187 case AMDGPU::DS_GWS_BARRIER:
6188 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
6189 [[fallthrough]];
6190 case AMDGPU::DS_GWS_SEMA_V:
6191 case AMDGPU::DS_GWS_SEMA_P:
6192 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6193 // A s_waitcnt 0 is required to be the instruction immediately following.
6194 if (getSubtarget()->hasGWSAutoReplay()) {
6195 bundleInstWithWaitcnt(MI);
6196 return BB;
6197 }
6198
6199 return emitGWSMemViolTestLoop(MI, BB);
6200 case AMDGPU::S_SETREG_B32: {
6201 // Try to optimize cases that only set the denormal mode or rounding mode.
6202 //
6203 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
6204 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
6205 // instead.
6206 //
6207 // FIXME: This could be predicates on the immediate, but tablegen doesn't
6208 // allow you to have a no side effect instruction in the output of a
6209 // sideeffecting pattern.
6210 auto [ID, Offset, Width] =
6211 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
6212 if (ID != AMDGPU::Hwreg::ID_MODE)
6213 return BB;
6214
6215 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
6216 const unsigned SetMask = WidthMask << Offset;
6217
6218 if (getSubtarget()->hasDenormModeInst()) {
6219 unsigned SetDenormOp = 0;
6220 unsigned SetRoundOp = 0;
6221
6222 // The dedicated instructions can only set the whole denorm or round mode
6223 // at once, not a subset of bits in either.
6224 if (SetMask ==
6225 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
6226 // If this fully sets both the round and denorm mode, emit the two
6227 // dedicated instructions for these.
6228 SetRoundOp = AMDGPU::S_ROUND_MODE;
6229 SetDenormOp = AMDGPU::S_DENORM_MODE;
6230 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
6231 SetRoundOp = AMDGPU::S_ROUND_MODE;
6232 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
6233 SetDenormOp = AMDGPU::S_DENORM_MODE;
6234 }
6235
6236 if (SetRoundOp || SetDenormOp) {
6237 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
6238 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
6239 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6240 unsigned ImmVal = Def->getOperand(1).getImm();
6241 if (SetRoundOp) {
6242 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
6243 .addImm(ImmVal & 0xf);
6244
6245 // If we also have the denorm mode, get just the denorm mode bits.
6246 ImmVal >>= 4;
6247 }
6248
6249 if (SetDenormOp) {
6250 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
6251 .addImm(ImmVal & 0xf);
6252 }
6253
6254 MI.eraseFromParent();
6255 return BB;
6256 }
6257 }
6258 }
6259
6260 // If only FP bits are touched, use the no side effects pseudo.
6261 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
6262 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
6263 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
6264
6265 return BB;
6266 }
6267 case AMDGPU::S_INVERSE_BALLOT_U32:
6268 case AMDGPU::S_INVERSE_BALLOT_U64:
6269 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
6270 // necessary. After that they are equivalent to a COPY.
6271 MI.setDesc(TII->get(AMDGPU::COPY));
6272 return BB;
6273 case AMDGPU::ENDPGM_TRAP: {
6274 const DebugLoc &DL = MI.getDebugLoc();
6275 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
6276 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
6277 MI.addOperand(MachineOperand::CreateImm(0));
6278 return BB;
6279 }
6280
6281 // We need a block split to make the real endpgm a terminator. We also don't
6282 // want to break phis in successor blocks, so we can't just delete to the
6283 // end of the block.
6284
6285 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
6286 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6287 MF->push_back(TrapBB);
6288 // clang-format off
6289 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
6290 .addImm(0);
6291 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
6292 .addMBB(TrapBB);
6293 // clang-format on
6294
6295 BB->addSuccessor(TrapBB);
6296 MI.eraseFromParent();
6297 return SplitBB;
6298 }
6299 case AMDGPU::SIMULATED_TRAP: {
6300 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6301 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
6302 MachineBasicBlock *SplitBB =
6303 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6304 MI.eraseFromParent();
6305 return SplitBB;
6306 }
6307 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6308 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6310
6311 // During ISel, it's difficult to propagate the original EXEC mask to use as
6312 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
6313 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
6314 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6315 Register OriginalExec = Setup->getOperand(0).getReg();
6316 MF->getRegInfo().clearKillFlags(OriginalExec);
6317 MI.getOperand(0).setReg(OriginalExec);
6318 return BB;
6319 }
6320 default:
6321 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
6322 if (!MI.mayStore())
6324 return BB;
6325 }
6326 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
6327 }
6328}
6329
6330 bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
6331 // This currently forces unfolding various combinations of fsub into fma with
6332 // free fneg'd operands. As long as we have fast FMA (controlled by
6333 // isFMAFasterThanFMulAndFAdd), we should perform these.
6334
6335 // When fma is quarter rate, for f64 where add / sub are at best half rate,
6336 // most of these combines appear to be cycle neutral but save on instruction
6337 // count / code size.
6338 return true;
6339}
6340
6341 bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; }
6342
6343 EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
6344 EVT VT) const {
6345 if (!VT.isVector()) {
6346 return MVT::i1;
6347 }
6348 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
6349}
6350
6351 MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
6352 // TODO: Should i16 be used always if legal? For now it would force VALU
6353 // shifts.
6354 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6355}
6356
6357 LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const {
6358 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6359 ? Ty.changeElementSize(16)
6360 : Ty.changeElementSize(32);
6361}
6362
6363 // Answering this is somewhat tricky and depends on the specific device, which
6364 // may have different rates for fma or for all f64 operations.
6365//
6366// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
6367// regardless of which device (although the number of cycles differs between
6368// devices), so it is always profitable for f64.
6369//
6370// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
6371// only on full rate devices. Normally, we should prefer selecting v_mad_f32
6372// which we can always do even without fused FP ops since it returns the same
6373// result as the separate operations and since it is always full
6374// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
6375// however does not support denormals, so we do report fma as faster if we have
6376// a fast fma device and require denormals.
6377//
6378 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
6379 EVT VT) const {
6380 VT = VT.getScalarType();
6381
6382 switch (VT.getSimpleVT().SimpleTy) {
6383 case MVT::f32: {
6384 // If mad is not available this depends only on if f32 fma is full rate.
6385 if (!Subtarget->hasMadMacF32Insts())
6386 return Subtarget->hasFastFMAF32();
6387
6388 // Otherwise f32 mad is always full rate and returns the same result as
6389 // the separate operations so should be preferred over fma.
6390 // However does not support denormals.
6391 if (!denormalModeIsFlushAllF32(MF))
6392 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6393
6394 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
6395 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6396 }
6397 case MVT::f64:
6398 return true;
6399 case MVT::f16:
6400 case MVT::bf16:
6401 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
6402 default:
6403 break;
6404 }
6405
6406 return false;
6407}
6408
6409 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
6410 LLT Ty) const {
6411 switch (Ty.getScalarSizeInBits()) {
6412 case 16:
6413 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
6414 case 32:
6415 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
6416 case 64:
6417 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
6418 default:
6419 break;
6420 }
6421
6422 return false;
6423}
6424
6425 bool SITargetLowering::isFMADLegal(const MachineInstr &MI, const LLT Ty) const {
6426 if (!Ty.isScalar())
6427 return false;
6428
6429 if (Ty.getScalarSizeInBits() == 16)
6430 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
6431 if (Ty.getScalarSizeInBits() == 32)
6432 return Subtarget->hasMadMacF32Insts() &&
6433 denormalModeIsFlushAllF32(*MI.getMF());
6434
6435 return false;
6436}
6437
6438 bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
6439 const SDNode *N) const {
6440 // TODO: Check future ftz flag
6441 // v_mad_f32/v_mac_f32 do not support denormals.
6442 EVT VT = N->getValueType(0);
6443 if (VT == MVT::f32)
6444 return Subtarget->hasMadMacF32Insts() &&
6445 denormalModeIsFlushAllF32(DAG.getMachineFunction());
6446 if (VT == MVT::f16) {
6447 return Subtarget->hasMadF16() &&
6448 denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
6449 }
6450
6451 return false;
6452}
6453
6454//===----------------------------------------------------------------------===//
6455// Custom DAG Lowering Operations
6456//===----------------------------------------------------------------------===//
6457
6458// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6459// wider vector type is legal.
6460 SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
6461 SelectionDAG &DAG) const {
6462 unsigned Opc = Op.getOpcode();
6463 EVT VT = Op.getValueType();
6464 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
6465 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
6466 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6467 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
6468
6469 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
6470
6471 SDLoc SL(Op);
6472 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
6473 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
6474
6475 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6476}
6477
6478// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6479// wider vector type is legal.
6480 SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
6481 SelectionDAG &DAG) const {
6482 unsigned Opc = Op.getOpcode();
6483 EVT VT = Op.getValueType();
6484 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6485 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6486 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6487 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6488 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6489 VT == MVT::v32bf16);
6490
6491 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
6492 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6493
6494 SDLoc SL(Op);
6495
6496 SDValue OpLo =
6497 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6498 SDValue OpHi =
6499 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6500
6501 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6502}
6503
6504 SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
6505 SelectionDAG &DAG) const {
6506 unsigned Opc = Op.getOpcode();
6507 EVT VT = Op.getValueType();
6508 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6509 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6510 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6511 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6512 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6513 VT == MVT::v32bf16);
6514
6515 SDValue Op0 = Op.getOperand(0);
6516 auto [Lo0, Hi0] = Op0.getValueType().isVector()
6517 ? DAG.SplitVectorOperand(Op.getNode(), 0)
6518 : std::pair(Op0, Op0);
6519
6520 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6521 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
6522
6523 SDLoc SL(Op);
6524 auto ResVT = DAG.GetSplitDestVTs(VT);
6525
6526 SDValue OpLo =
6527 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
6528 SDValue OpHi =
6529 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
6530
6531 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6532}
6533
6534 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
6535 switch (Op.getOpcode()) {
6536 default:
6537 return AMDGPUTargetLowering::LowerOperation(Op, DAG);
6538 case ISD::BRCOND:
6539 return LowerBRCOND(Op, DAG);
6540 case ISD::RETURNADDR:
6541 return LowerRETURNADDR(Op, DAG);
6542 case ISD::LOAD: {
6543 SDValue Result = LowerLOAD(Op, DAG);
6544 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6545 "Load should return a value and a chain");
6546 return Result;
6547 }
6548 case ISD::FSQRT: {
6549 EVT VT = Op.getValueType();
6550 if (VT == MVT::f32)
6551 return lowerFSQRTF32(Op, DAG);
6552 if (VT == MVT::f64)
6553 return lowerFSQRTF64(Op, DAG);
6554 return SDValue();
6555 }
6556 case ISD::FSIN:
6557 case ISD::FCOS:
6558 return LowerTrig(Op, DAG);
6559 case ISD::SELECT:
6560 return LowerSELECT(Op, DAG);
6561 case ISD::FDIV:
6562 return LowerFDIV(Op, DAG);
6563 case ISD::FFREXP:
6564 return LowerFFREXP(Op, DAG);
6565 case ISD::ATOMIC_CMP_SWAP:
6566 return LowerATOMIC_CMP_SWAP(Op, DAG);
6567 case ISD::STORE:
6568 return LowerSTORE(Op, DAG);
6569 case ISD::GlobalAddress: {
6570 MachineFunction &MF = DAG.getMachineFunction();
6571 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6572 return LowerGlobalAddress(MFI, Op, DAG);
6573 }
6574 case ISD::INTRINSIC_WO_CHAIN:
6575 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6576 case ISD::INTRINSIC_W_CHAIN:
6577 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6578 case ISD::INTRINSIC_VOID:
6579 return LowerINTRINSIC_VOID(Op, DAG);
6580 case ISD::ADDRSPACECAST:
6581 return lowerADDRSPACECAST(Op, DAG);
6582 case ISD::INSERT_SUBVECTOR:
6583 return lowerINSERT_SUBVECTOR(Op, DAG);
6584 case ISD::INSERT_VECTOR_ELT:
6585 return lowerINSERT_VECTOR_ELT(Op, DAG);
6586 case ISD::EXTRACT_VECTOR_ELT:
6587 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
6588 case ISD::VECTOR_SHUFFLE:
6589 return lowerVECTOR_SHUFFLE(Op, DAG);
6590 case ISD::SCALAR_TO_VECTOR:
6591 return lowerSCALAR_TO_VECTOR(Op, DAG);
6592 case ISD::BUILD_VECTOR:
6593 return lowerBUILD_VECTOR(Op, DAG);
6594 case ISD::FP_ROUND:
6595 case ISD::STRICT_FP_ROUND:
6596 return lowerFP_ROUND(Op, DAG);
6597 case ISD::TRAP:
6598 return lowerTRAP(Op, DAG);
6599 case ISD::DEBUGTRAP:
6600 return lowerDEBUGTRAP(Op, DAG);
6601 case ISD::ABS:
6602 case ISD::FABS:
6603 case ISD::FNEG:
6604 case ISD::FCANONICALIZE:
6605 case ISD::BSWAP:
6606 return splitUnaryVectorOp(Op, DAG);
6607 case ISD::FMINNUM:
6608 case ISD::FMAXNUM:
6609 return lowerFMINNUM_FMAXNUM(Op, DAG);
6610 case ISD::FMINIMUMNUM:
6611 case ISD::FMAXIMUMNUM:
6612 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
6613 case ISD::FMINIMUM:
6614 case ISD::FMAXIMUM:
6615 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
6616 case ISD::FLDEXP:
6617 case ISD::STRICT_FLDEXP:
6618 return lowerFLDEXP(Op, DAG);
6619 case ISD::FMA:
6620 return splitTernaryVectorOp(Op, DAG);
6621 case ISD::FP_TO_SINT:
6622 case ISD::FP_TO_UINT:
6623 return LowerFP_TO_INT(Op, DAG);
6624 case ISD::SHL:
6625 case ISD::SRA:
6626 case ISD::SRL:
6627 case ISD::ADD:
6628 case ISD::SUB:
6629 case ISD::SMIN:
6630 case ISD::SMAX:
6631 case ISD::UMIN:
6632 case ISD::UMAX:
6633 case ISD::FADD:
6634 case ISD::FMUL:
6635 case ISD::FMINNUM_IEEE:
6636 case ISD::FMAXNUM_IEEE:
6637 case ISD::UADDSAT:
6638 case ISD::USUBSAT:
6639 case ISD::SADDSAT:
6640 case ISD::SSUBSAT:
6641 return splitBinaryVectorOp(Op, DAG);
6642 case ISD::FCOPYSIGN:
6643 return lowerFCOPYSIGN(Op, DAG);
6644 case ISD::MUL:
6645 return lowerMUL(Op, DAG);
6646 case ISD::SMULO:
6647 case ISD::UMULO:
6648 return lowerXMULO(Op, DAG);
6649 case ISD::SMUL_LOHI:
6650 case ISD::UMUL_LOHI:
6651 return lowerXMUL_LOHI(Op, DAG);
6652 case ISD::DYNAMIC_STACKALLOC:
6653 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6654 case ISD::STACKSAVE:
6655 return LowerSTACKSAVE(Op, DAG);
6656 case ISD::GET_ROUNDING:
6657 return lowerGET_ROUNDING(Op, DAG);
6658 case ISD::SET_ROUNDING:
6659 return lowerSET_ROUNDING(Op, DAG);
6660 case ISD::PREFETCH:
6661 return lowerPREFETCH(Op, DAG);
6662 case ISD::FP_EXTEND:
6663 case ISD::STRICT_FP_EXTEND:
6664 return lowerFP_EXTEND(Op, DAG);
6665 case ISD::GET_FPENV:
6666 return lowerGET_FPENV(Op, DAG);
6667 case ISD::SET_FPENV:
6668 return lowerSET_FPENV(Op, DAG);
6669 }
6670 return SDValue();
6671}
6672
6673// Used for D16: Casts the result of an instruction into the right vector,
6674// packs values if loads return unpacked values.
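// For example, on subtargets with unpacked D16 memory instructions, a d16 load
// of v4f16 is selected with one i32 per element (v4i32), each holding a 16-bit
// value in its low half; the code below truncates each element to i16, rebuilds
// the vector, and bitcasts it back to the requested FP type.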
6675static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
6676 const SDLoc &DL, SelectionDAG &DAG,
6677 bool Unpacked) {
6678 if (!LoadVT.isVector())
6679 return Result;
6680
6681 // Cast back to the original packed type or to a larger type that is a
6682 // multiple of 32 bits for D16. Widening the return type is required for
6683 // legalization.
6684 EVT FittingLoadVT = LoadVT;
6685 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6686 FittingLoadVT =
6687 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
6688 LoadVT.getVectorNumElements() + 1);
6689 }
6690
6691 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6692 // Truncate to v2i16/v4i16.
6693 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6694
6695 // Workaround legalizer not scalarizing truncate after vector op
6696 // legalization but not creating intermediate vector trunc.
6697 SmallVector<SDValue, 4> Elts;
6698 DAG.ExtractVectorElements(Result, Elts);
6699 for (SDValue &Elt : Elts)
6700 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6701
6702 // Pad illegal v1i16/v3f16 to v4i16
6703 if ((LoadVT.getVectorNumElements() % 2) == 1)
6704 Elts.push_back(DAG.getPOISON(MVT::i16));
6705
6706 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6707
6708 // Bitcast to original type (v2f16/v4f16).
6709 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6710 }
6711
6712 // Cast back to the original packed type.
6713 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6714}
6715
6716SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6717 SelectionDAG &DAG,
6718 ArrayRef<SDValue> Ops,
6719 bool IsIntrinsic) const {
6720 SDLoc DL(M);
6721
6722 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6723 EVT LoadVT = M->getValueType(0);
6724
6725 EVT EquivLoadVT = LoadVT;
6726 if (LoadVT.isVector()) {
6727 if (Unpacked) {
6728 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6729 LoadVT.getVectorNumElements());
6730 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
6731 // Widen v3f16 to legal type
6732 EquivLoadVT =
6733 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
6734 LoadVT.getVectorNumElements() + 1);
6735 }
6736 }
6737
6738 // Change from v4f16/v2f16 to EquivLoadVT.
6739 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
6740
6741 SDValue Load = DAG.getMemIntrinsicNode(
6742 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
6743 M->getMemoryVT(), M->getMemOperand());
6744
6745 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
6746
6747 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
6748}
6749
6750SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
6751 SelectionDAG &DAG,
6752 ArrayRef<SDValue> Ops) const {
6753 SDLoc DL(M);
6754 EVT LoadVT = M->getValueType(0);
6755 EVT EltType = LoadVT.getScalarType();
6756 EVT IntVT = LoadVT.changeTypeToInteger();
6757
6758 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
6759
6760 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6761 bool IsTFE = M->getNumValues() == 3;
6762
6763 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
6764 : AMDGPUISD::BUFFER_LOAD_FORMAT)
6765 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
6766 : AMDGPUISD::BUFFER_LOAD;
6767
6768 if (IsD16) {
6769 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
6770 }
6771
6772 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6773 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
6774 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6775 IsTFE);
6776
6777 if (isTypeLegal(LoadVT)) {
6778 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6779 M->getMemOperand(), DAG);
6780 }
6781
6782 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
6783 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6784 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
6785 M->getMemOperand(), DAG);
6786 return DAG.getMergeValues(
6787 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6788 DL);
6789}
6790
6791static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
6792 SelectionDAG &DAG) {
6793 EVT VT = N->getValueType(0);
6794 unsigned CondCode = N->getConstantOperandVal(3);
6795 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6796 return DAG.getPOISON(VT);
6797
6798 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6799
6800 SDValue LHS = N->getOperand(1);
6801 SDValue RHS = N->getOperand(2);
6802
6803 SDLoc DL(N);
6804
6805 EVT CmpVT = LHS.getValueType();
6806 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
6807 unsigned PromoteOp =
6808 ICmpInst::isSigned(IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6809 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
6810 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
6811 }
6812
6813 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
6814
6815 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6816 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6817
6818 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
6819 DAG.getCondCode(CCOpcode));
6820 if (VT.bitsEq(CCVT))
6821 return SetCC;
6822 return DAG.getZExtOrTrunc(SetCC, DL, VT);
6823}
6824
6825static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
6826 SelectionDAG &DAG) {
6827 EVT VT = N->getValueType(0);
6828
6829 unsigned CondCode = N->getConstantOperandVal(3);
6830 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
6831 return DAG.getPOISON(VT);
6832
6833 SDValue Src0 = N->getOperand(1);
6834 SDValue Src1 = N->getOperand(2);
6835 EVT CmpVT = Src0.getValueType();
6836 SDLoc SL(N);
6837
6838 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
6839 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6840 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6841 }
6842
6843 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
6844 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
6845 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6846 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6847 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
6848 DAG.getCondCode(CCOpcode));
6849 if (VT.bitsEq(CCVT))
6850 return SetCC;
6851 return DAG.getZExtOrTrunc(SetCC, SL, VT);
6852}
6853
6854static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
6855 SelectionDAG &DAG) {
6856 EVT VT = N->getValueType(0);
6857 SDValue Src = N->getOperand(1);
6858 SDLoc SL(N);
6859
6860 if (Src.getOpcode() == ISD::SETCC) {
6861 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
6862 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
6863 Src.getOperand(1), Src.getOperand(2));
6864 }
6865 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
6866 // (ballot 0) -> 0
6867 if (Arg->isZero())
6868 return DAG.getConstant(0, SL, VT);
6869
6870 // (ballot 1) -> EXEC/EXEC_LO
6871 if (Arg->isOne()) {
6872 Register Exec;
6873 if (VT.getScalarSizeInBits() == 32)
6874 Exec = AMDGPU::EXEC_LO;
6875 else if (VT.getScalarSizeInBits() == 64)
6876 Exec = AMDGPU::EXEC;
6877 else
6878 return SDValue();
6879
6880 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
6881 }
6882 }
6883
6884 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
6885 // ISD::SETNE)
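// The AMDGPUISD::SETCC result is a lane mask as wide as the wavefront (i32 for
// wave32, i64 for wave64); when the ballot's return type has a different width
// it is zero-extended or truncated to match.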
6886 return DAG.getNode(
6887 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
6888 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
6889}
6890
6891static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
6892 SelectionDAG &DAG) {
6893 EVT VT = N->getValueType(0);
6894 unsigned ValSize = VT.getSizeInBits();
6895 unsigned IID = N->getConstantOperandVal(0);
6896 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6897 IID == Intrinsic::amdgcn_permlanex16;
6898 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6899 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6900 SDLoc SL(N);
6901 MVT IntVT = MVT::getIntegerVT(ValSize);
6902 const GCNSubtarget *ST = TLI.getSubtarget();
6903 unsigned SplitSize = 32;
6904 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
6905 ST->hasDPALU_DPP() &&
6906 AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
6907 SplitSize = 64;
6908
6909 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
6910 SDValue Src2, MVT ValT) -> SDValue {
6911 SmallVector<SDValue, 6> Operands;
6912 switch (IID) {
6913 case Intrinsic::amdgcn_permlane16:
6914 case Intrinsic::amdgcn_permlanex16:
6915 case Intrinsic::amdgcn_update_dpp:
6916 Operands.push_back(N->getOperand(6));
6917 Operands.push_back(N->getOperand(5));
6918 Operands.push_back(N->getOperand(4));
6919 [[fallthrough]];
6920 case Intrinsic::amdgcn_writelane:
6921 Operands.push_back(Src2);
6922 [[fallthrough]];
6923 case Intrinsic::amdgcn_readlane:
6924 case Intrinsic::amdgcn_set_inactive:
6925 case Intrinsic::amdgcn_set_inactive_chain_arg:
6926 case Intrinsic::amdgcn_mov_dpp8:
6927 Operands.push_back(Src1);
6928 [[fallthrough]];
6929 case Intrinsic::amdgcn_readfirstlane:
6930 case Intrinsic::amdgcn_permlane64:
6931 Operands.push_back(Src0);
6932 break;
6933 default:
6934 llvm_unreachable("unhandled lane op");
6935 }
6936
6937 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
6938 std::reverse(Operands.begin(), Operands.end());
6939
6940 if (SDNode *GL = N->getGluedNode()) {
6941 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6942 GL = GL->getOperand(0).getNode();
6943 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6944 SDValue(GL, 0)));
6945 }
6946
6947 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
6948 };
6949
6950 SDValue Src0 = N->getOperand(1);
6951 SDValue Src1, Src2;
6952 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6953 IID == Intrinsic::amdgcn_mov_dpp8 ||
6954 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6955 Src1 = N->getOperand(2);
6956 if (IID == Intrinsic::amdgcn_writelane ||
6957 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
6958 Src2 = N->getOperand(3);
6959 }
6960
6961 if (ValSize == SplitSize) {
6962 // Already legal
6963 return SDValue();
6964 }
6965
6966 if (ValSize < 32) {
6967 bool IsFloat = VT.isFloatingPoint();
6968 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
6969 SL, MVT::i32);
6970
6971 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6972 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
6973 SL, MVT::i32);
6974 }
6975
6976 if (IID == Intrinsic::amdgcn_writelane) {
6977 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
6978 SL, MVT::i32);
6979 }
6980
6981 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
6982 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
6983 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
6984 }
6985
6986 if (ValSize % SplitSize != 0)
6987 return SDValue();
6988
6989 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
6990 EVT VT = N->getValueType(0);
6991 unsigned NE = VT.getVectorNumElements();
6992 EVT EltVT = VT.getVectorElementType();
6993 SmallVector<SDValue, 8> Scalars;
6994 unsigned NumOperands = N->getNumOperands();
6995 SmallVector<SDValue, 4> Operands(NumOperands);
6996 SDNode *GL = N->getGluedNode();
6997
6998 // only handle convergencectrl_glue
6999 assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7000
7001 for (unsigned i = 0; i != NE; ++i) {
7002 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7003 ++j) {
7004 SDValue Operand = N->getOperand(j);
7005 EVT OperandVT = Operand.getValueType();
7006 if (OperandVT.isVector()) {
7007 // A vector operand; extract a single element.
7008 EVT OperandEltVT = OperandVT.getVectorElementType();
7009 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
7010 Operand, DAG.getVectorIdxConstant(i, SL));
7011 } else {
7012 // A scalar operand; just use it as is.
7013 Operands[j] = Operand;
7014 }
7015 }
7016
7017 if (GL)
7018 Operands[NumOperands - 1] =
7019 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7020 SDValue(GL->getOperand(0).getNode(), 0));
7021
7022 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
7023 }
7024
7025 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
7026 return DAG.getBuildVector(VecVT, SL, Scalars);
7027 };
7028
7029 if (VT.isVector()) {
7030 switch (MVT::SimpleValueType EltTy =
7031 VT.getVectorElementType().getSimpleVT().SimpleTy) {
7032 case MVT::i32:
7033 case MVT::f32:
7034 if (SplitSize == 32) {
7035 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
7036 return unrollLaneOp(LaneOp.getNode());
7037 }
7038 [[fallthrough]];
7039 case MVT::i16:
7040 case MVT::f16:
7041 case MVT::bf16: {
7042 unsigned SubVecNumElt =
7043 SplitSize / VT.getVectorElementType().getSizeInBits();
7044 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
7045 SmallVector<SDValue, 4> Pieces;
7046 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7047 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7048 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
7049 DAG.getConstant(EltIdx, SL, MVT::i32));
7050
7051 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7052 IsPermLane16)
7053 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
7054 DAG.getConstant(EltIdx, SL, MVT::i32));
7055
7056 if (IID == Intrinsic::amdgcn_writelane)
7057 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
7058 DAG.getConstant(EltIdx, SL, MVT::i32));
7059
7060 Pieces.push_back(
7061 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7062 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7063 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7064 EltIdx += SubVecNumElt;
7065 }
7066 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
7067 }
7068 default:
7069 // Handle all other cases by bitcasting to i32 vectors
7070 break;
7071 }
7072 }
7073
7074 MVT VecVT =
7075 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
7076 Src0 = DAG.getBitcast(VecVT, Src0);
7077
7078 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7079 Src1 = DAG.getBitcast(VecVT, Src1);
7080
7081 if (IID == Intrinsic::amdgcn_writelane)
7082 Src2 = DAG.getBitcast(VecVT, Src2);
7083
7084 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7085 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
7086 return DAG.getBitcast(VT, UnrolledLaneOp);
7087}
7088
7089void SITargetLowering::ReplaceNodeResults(SDNode *N,
7090 SmallVectorImpl<SDValue> &Results,
7091 SelectionDAG &DAG) const {
7092 switch (N->getOpcode()) {
7093 case ISD::INSERT_VECTOR_ELT: {
7094 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
7095 Results.push_back(Res);
7096 return;
7097 }
7098 case ISD::EXTRACT_VECTOR_ELT: {
7099 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
7100 Results.push_back(Res);
7101 return;
7102 }
7103 case ISD::INTRINSIC_WO_CHAIN: {
7104 unsigned IID = N->getConstantOperandVal(0);
7105 switch (IID) {
7106 case Intrinsic::amdgcn_make_buffer_rsrc:
7107 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
7108 return;
7109 case Intrinsic::amdgcn_cvt_pkrtz: {
7110 SDValue Src0 = N->getOperand(1);
7111 SDValue Src1 = N->getOperand(2);
7112 SDLoc SL(N);
7113 SDValue Cvt =
7114 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
7115 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
7116 return;
7117 }
7118 case Intrinsic::amdgcn_cvt_pknorm_i16:
7119 case Intrinsic::amdgcn_cvt_pknorm_u16:
7120 case Intrinsic::amdgcn_cvt_pk_i16:
7121 case Intrinsic::amdgcn_cvt_pk_u16: {
7122 SDValue Src0 = N->getOperand(1);
7123 SDValue Src1 = N->getOperand(2);
7124 SDLoc SL(N);
7125 unsigned Opcode;
7126
7127 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7128 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7129 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7130 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7131 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7132 Opcode = AMDGPUISD::CVT_PK_I16_I32;
7133 else
7134 Opcode = AMDGPUISD::CVT_PK_U16_U32;
7135
7136 EVT VT = N->getValueType(0);
7137 if (isTypeLegal(VT))
7138 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
7139 else {
7140 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
7141 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
7142 }
7143 return;
7144 }
7145 case Intrinsic::amdgcn_s_buffer_load: {
7146 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
7147 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
7148 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
7149 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
7150 // s_buffer_load_i8.
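// For a uniform offset the 8-bit load is emitted below as a 32-bit
// SBUFFER_LOAD_UBYTE and truncated back to i8; for a divergent offset it
// falls back to a VGPR buffer load via handleByteShortBufferLoads.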
7151 if (!Subtarget->hasScalarSubwordLoads())
7152 return;
7153 SDValue Op = SDValue(N, 0);
7154 SDValue Rsrc = Op.getOperand(1);
7155 SDValue Offset = Op.getOperand(2);
7156 SDValue CachePolicy = Op.getOperand(3);
7157 EVT VT = Op.getValueType();
7158 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
7159 SDLoc DL(Op);
7160 MachineFunction &MF = DAG.getMachineFunction();
7161 const DataLayout &DataLayout = DAG.getDataLayout();
7162 Align Alignment =
7163 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
7164 MachineMemOperand *MMO = MF.getMachineMemOperand(
7165 MachinePointerInfo(),
7166 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
7167 MachineMemOperand::MOInvariant,
7168 VT.getStoreSize(), Alignment);
7169 SDValue LoadVal;
7170 if (!Offset->isDivergent()) {
7171 SDValue Ops[] = {Rsrc, // source register
7172 Offset, CachePolicy};
7173 SDValue BufferLoad =
7174 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
7175 DAG.getVTList(MVT::i32), Ops, VT, MMO);
7176 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
7177 } else {
7178 SDValue Ops[] = {
7179 DAG.getEntryNode(), // Chain
7180 Rsrc, // rsrc
7181 DAG.getConstant(0, DL, MVT::i32), // vindex
7182 {}, // voffset
7183 {}, // soffset
7184 {}, // offset
7185 CachePolicy, // cachepolicy
7186 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7187 };
7188 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
7189 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
7190 }
7191 Results.push_back(LoadVal);
7192 return;
7193 }
7194 case Intrinsic::amdgcn_dead: {
7195 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
7196 Results.push_back(DAG.getPOISON(N->getValueType(I)));
7197 return;
7198 }
7199 }
7200 break;
7201 }
7202 case ISD::INTRINSIC_W_CHAIN: {
7203 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
7204 if (Res.getOpcode() == ISD::MERGE_VALUES) {
7205 // FIXME: Hacky
7206 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
7207 Results.push_back(Res.getOperand(I));
7208 }
7209 } else {
7210 Results.push_back(Res);
7211 Results.push_back(Res.getValue(1));
7212 }
7213 return;
7214 }
7215
7216 break;
7217 }
7218 case ISD::SELECT: {
7219 SDLoc SL(N);
7220 EVT VT = N->getValueType(0);
7221 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
7222 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
7223 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
7224
7225 EVT SelectVT = NewVT;
7226 if (NewVT.bitsLT(MVT::i32)) {
7227 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
7228 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
7229 SelectVT = MVT::i32;
7230 }
7231
7232 SDValue NewSelect =
7233 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
7234
7235 if (NewVT != SelectVT)
7236 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
7237 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
7238 return;
7239 }
7240 case ISD::FNEG: {
7241 if (N->getValueType(0) != MVT::v2f16)
7242 break;
7243
7244 SDLoc SL(N);
7245 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7246
7247 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
7248 DAG.getConstant(0x80008000, SL, MVT::i32));
7249 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7250 return;
7251 }
7252 case ISD::FABS: {
7253 if (N->getValueType(0) != MVT::v2f16)
7254 break;
7255
7256 SDLoc SL(N);
7257 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7258
7259 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
7260 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
7261 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7262 return;
7263 }
7264 case ISD::FSQRT: {
7265 if (N->getValueType(0) != MVT::f16)
7266 break;
7267 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
7268 break;
7269 }
7270 default:
7271 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
7272 break;
7273 }
7274}
7275
7276/// Helper function for LowerBRCOND
7277static SDNode *findUser(SDValue Value, unsigned Opcode) {
7278
7279 for (SDUse &U : Value->uses()) {
7280 if (U.get() != Value)
7281 continue;
7282
7283 if (U.getUser()->getOpcode() == Opcode)
7284 return U.getUser();
7285 }
7286 return nullptr;
7287}
7288
7289unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7290 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
7291 switch (Intr->getConstantOperandVal(1)) {
7292 case Intrinsic::amdgcn_if:
7293 return AMDGPUISD::IF;
7294 case Intrinsic::amdgcn_else:
7295 return AMDGPUISD::ELSE;
7296 case Intrinsic::amdgcn_loop:
7297 return AMDGPUISD::LOOP;
7298 case Intrinsic::amdgcn_end_cf:
7299 llvm_unreachable("should not occur");
7300 default:
7301 return 0;
7302 }
7303 }
7304
7305 // break, if_break, else_break are all only used as inputs to loop, not
7306 // directly as branch conditions.
7307 return 0;
7308}
7309
7316
7317bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
7318 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7319 return false;
7320
7321 // FIXME: Either avoid relying on address space here or change the default
7322 // address space for functions to avoid the explicit check.
7323 return (GV->getValueType()->isFunctionTy() ||
7326}
7327
7328bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
7329 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
7330}
7331
7332bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
7333 if (!GV->hasExternalLinkage())
7334 return true;
7335
7336 const auto OS = getTargetMachine().getTargetTriple().getOS();
7337 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
7338}
7339
7340/// This transforms the control flow intrinsics to get the branch destination
7341/// as the last parameter, and also switches the branch target with BR if the need arises.
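/// For example, a BRCOND whose condition comes from llvm.amdgcn.if is rewritten
/// into an AMDGPUISD::IF node that carries the branch destination as its final
/// operand; the intrinsic's remaining results are forwarded to their CopyToReg
/// users and the original intrinsic is unlinked from the chain.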
7342SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
7343 SDLoc DL(BRCOND);
7344
7345 SDNode *Intr = BRCOND.getOperand(1).getNode();
7346 SDValue Target = BRCOND.getOperand(2);
7347 SDNode *BR = nullptr;
7348 SDNode *SetCC = nullptr;
7349
7350 if (Intr->getOpcode() == ISD::SETCC) {
7351 // As long as we negate the condition everything is fine
7352 SetCC = Intr;
7353 Intr = SetCC->getOperand(0).getNode();
7354
7355 } else {
7356 // Get the target from BR if we don't negate the condition
7357 BR = findUser(BRCOND, ISD::BR);
7358 assert(BR && "brcond missing unconditional branch user");
7359 Target = BR->getOperand(1);
7360 }
7361
7362 unsigned CFNode = isCFIntrinsic(Intr);
7363 if (CFNode == 0) {
7364 // This is a uniform branch so we don't need to legalize.
7365 return BRCOND;
7366 }
7367
7368 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
7370
7371 assert(!SetCC ||
7372 (SetCC->getConstantOperandVal(1) == 1 &&
7373 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
7374 ISD::SETNE));
7375
7376 // operands of the new intrinsic call
7378 if (HaveChain)
7379 Ops.push_back(BRCOND.getOperand(0));
7380
7381 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
7382 Ops.push_back(Target);
7383
7384 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
7385
7386 // build the new intrinsic call
7387 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
7388
7389 if (!HaveChain) {
7390 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
7391
7392 Result = DAG.getMergeValues(Ops, DL).getNode();
7393 }
7394
7395 if (BR) {
7396 // Give the branch instruction our target
7397 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
7398 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
7399 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
7400 }
7401
7402 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
7403
7404 // Copy the intrinsic results to registers
7405 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
7406 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
7407 if (!CopyToReg)
7408 continue;
7409
7410 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
7411 SDValue(Result, i - 1), SDValue());
7412
7413 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
7414 }
7415
7416 // Remove the old intrinsic from the chain
7417 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
7418 Intr->getOperand(0));
7419
7420 return Chain;
7421}
7422
7423SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
7424 MVT VT = Op.getSimpleValueType();
7425 SDLoc DL(Op);
7426 // Checking the depth
7427 if (Op.getConstantOperandVal(0) != 0)
7428 return DAG.getConstant(0, DL, VT);
7429
7430 MachineFunction &MF = DAG.getMachineFunction();
7431 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7432 // Check for kernel and shader functions
7433 if (Info->isEntryFunction())
7434 return DAG.getConstant(0, DL, VT);
7435
7436 MachineFrameInfo &MFI = MF.getFrameInfo();
7437 // There is a call to @llvm.returnaddress in this function
7438 MFI.setReturnAddressIsTaken(true);
7439
7440 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
7441 // Get the return address reg and mark it as an implicit live-in
7442 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
7443 getRegClassFor(VT, Op.getNode()->isDivergent()));
7444
7445 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
7446}
7447
7448SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
7449 const SDLoc &DL, EVT VT) const {
7450 return Op.getValueType().bitsLE(VT)
7451 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
7452 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
7453 DAG.getTargetConstant(0, DL, MVT::i32));
7454}
7455
7456SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
7457 SelectionDAG &DAG) const {
7458 EVT DstVT = Op.getValueType();
7459 unsigned NumElts = DstVT.getVectorNumElements();
7460 assert(NumElts > 2 && isPowerOf2_32(NumElts));
7461
7462 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
7463
7464 SDLoc DL(Op);
7465 unsigned Opc = Op.getOpcode();
7466 SDValue Flags = Op.getOperand(1);
7467 EVT HalfDstVT =
7468 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
7469 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
7470 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
7471
7472 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
7473}
7474
7475SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
7476 SDValue Src = Op.getOperand(0);
7477 EVT SrcVT = Src.getValueType();
7478 EVT DstVT = Op.getValueType();
7479
7480 if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
7481 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
7482 if (SrcVT.getScalarType() != MVT::f32)
7483 return SDValue();
7484 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
7485 }
7486
7487 if (SrcVT.getScalarType() != MVT::f64)
7488 return Op;
7489
7490 SDLoc DL(Op);
7491 if (DstVT == MVT::f16) {
7492 // TODO: Handle strictfp
7493 if (Op.getOpcode() != ISD::FP_ROUND)
7494 return Op;
7495
7496 if (!Subtarget->has16BitInsts()) {
7497 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
7498 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7499 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7500 }
7501 if (Op->getFlags().hasApproximateFuncs()) {
7502 SDValue Flags = Op.getOperand(1);
7503 SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
7504 return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
7505 }
7506 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
7507 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7508 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7509 }
7510
7511 assert(DstVT.getScalarType() == MVT::bf16 &&
7512 "custom lower FP_ROUND for f16 or bf16");
7513 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
7514
7515 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
7516 // hardware f32 -> bf16 instruction.
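// Rounding f64 to f32 with round-to-odd keeps the "inexact" information in the
// lowest mantissa bit of the intermediate f32, so the second rounding step to
// bf16 cannot introduce a double-rounding error.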
7517 EVT F32VT = SrcVT.isVector() ? SrcVT.changeVectorElementType(MVT::f32) :
7518 MVT::f32;
7519 SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
7520 return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
7521 DAG.getTargetConstant(0, DL, MVT::i32));
7522}
7523
7524SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
7525 SelectionDAG &DAG) const {
7526 EVT VT = Op.getValueType();
7527 const MachineFunction &MF = DAG.getMachineFunction();
7528 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7529 bool IsIEEEMode = Info->getMode().IEEE;
7530
7531 // FIXME: Assert during selection that this is only selected for
7532 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
7533 // mode functions, but this happens to be OK since it's only done in cases
7534 // where there is known no sNaN.
7535 if (IsIEEEMode)
7536 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
7537
7538 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7539 VT == MVT::v16bf16)
7540 return splitBinaryVectorOp(Op, DAG);
7541 return Op;
7542}
7543
7544SDValue
7545SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
7546 SelectionDAG &DAG) const {
7547 EVT VT = Op.getValueType();
7548 const MachineFunction &MF = DAG.getMachineFunction();
7549 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7550 bool IsIEEEMode = Info->getMode().IEEE;
7551
7552 if (IsIEEEMode)
7553 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
7554
7555 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7556 VT == MVT::v16bf16)
7557 return splitBinaryVectorOp(Op, DAG);
7558 return Op;
7559}
7560
7561SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
7562 SelectionDAG &DAG) const {
7563 EVT VT = Op.getValueType();
7564 if (VT.isVector())
7565 return splitBinaryVectorOp(Op, DAG);
7566
7567 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7568 !Subtarget->hasMinimum3Maximum3F16() &&
7569 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7570 "should not need to widen f16 minimum/maximum to v2f16");
7571
7572 // Widen f16 operation to v2f16
7573
7574 // fminimum f16:x, f16:y ->
7575 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
7576 // (v2f16 (scalar_to_vector y))), 0
7577 SDLoc SL(Op);
7578 SDValue WideSrc0 =
7579 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
7580 SDValue WideSrc1 =
7581 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
7582
7583 SDValue Widened =
7584 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7585
7586 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
7587 DAG.getConstant(0, SL, MVT::i32));
7588}
7589
7590SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
7591 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
7592 EVT VT = Op.getValueType();
7593 assert(VT == MVT::f16);
7594
7595 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
7596 EVT ExpVT = Exp.getValueType();
7597 if (ExpVT == MVT::i16)
7598 return Op;
7599
7600 SDLoc DL(Op);
7601
7602 // Correct the exponent type for f16 to i16.
7603 // Clamp the range of the exponent to the instruction's range.
7604
7605 // TODO: This should be a generic narrowing legalization, and can easily be
7606 // done for GlobalISel.
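// For example, an i32 exponent of 70000 is clamped to 32767 before the
// truncate to i16; ldexp on f16 already saturates to infinity (or flushes to
// zero) well inside the i16 range, so the clamp preserves the result.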
7607
7608 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
7609 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
7610
7611 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
7612 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
7613
7614 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
7615
7616 if (IsStrict) {
7617 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
7618 {Op.getOperand(0), Op.getOperand(1), TruncExp});
7619 }
7620
7621 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
7622}
7623
7624static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
7625 switch (Op->getOpcode()) {
7626 case ISD::SRA:
7627 case ISD::SMIN:
7628 case ISD::SMAX:
7629 return ISD::SIGN_EXTEND;
7630 case ISD::SRL:
7631 case ISD::UMIN:
7632 case ISD::UMAX:
7633 return ISD::ZERO_EXTEND;
7634 case ISD::ADD:
7635 case ISD::SUB:
7636 case ISD::AND:
7637 case ISD::OR:
7638 case ISD::XOR:
7639 case ISD::SHL:
7640 case ISD::SELECT:
7641 case ISD::MUL:
7642 // operation result won't be influenced by garbage high bits.
7643 // TODO: are all of those cases correct, and are there more?
7644 return ISD::ANY_EXTEND;
7645 case ISD::SETCC: {
7646 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7647 return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7648 }
7649 default:
7650 llvm_unreachable("unexpected opcode!");
7651 }
7652}
7653
7654SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
7655 DAGCombinerInfo &DCI) const {
7656 const unsigned Opc = Op.getOpcode();
7657 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
7658 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
7659 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
7660 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
7661 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
7662
7663 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
7664 : Op->getOperand(0).getValueType();
7665 auto ExtTy = OpTy.changeElementType(MVT::i32);
7666
7667 if (DCI.isBeforeLegalizeOps() ||
7668 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
7669 return SDValue();
7670
7671 auto &DAG = DCI.DAG;
7672
7673 SDLoc DL(Op);
7674 SDValue LHS;
7675 SDValue RHS;
7676 if (Opc == ISD::SELECT) {
7677 LHS = Op->getOperand(1);
7678 RHS = Op->getOperand(2);
7679 } else {
7680 LHS = Op->getOperand(0);
7681 RHS = Op->getOperand(1);
7682 }
7683
7684 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
7685 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
7686
7687 // Special case: for shifts, the RHS always needs a zext.
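// An any-extend could leave garbage in the high bits of the promoted shift
// amount and turn an in-range i16 shift into an out-of-range i32 shift, so the
// amount itself is always zero-extended.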
7688 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
7689 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
7690 else
7691 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
7692
7693 // setcc always returns i1/i1 vec so no need to truncate after.
7694 if (Opc == ISD::SETCC) {
7695 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7696 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
7697 }
7698
7699 // For other ops, we extend the operation's return type as well so we need to
7700 // truncate back to the original type.
7701 SDValue NewVal;
7702 if (Opc == ISD::SELECT)
7703 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
7704 else
7705 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
7706
7707 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
7708}
7709
7710SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
7711 SDValue Mag = Op.getOperand(0);
7712 EVT MagVT = Mag.getValueType();
7713
7714 if (MagVT.getVectorNumElements() > 2)
7715 return splitBinaryVectorOp(Op, DAG);
7716
7717 SDValue Sign = Op.getOperand(1);
7718 EVT SignVT = Sign.getValueType();
7719
7720 if (MagVT == SignVT)
7721 return Op;
7722
7723 // fcopysign v2f16:mag, v2f32:sign ->
7724 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
7725
7726 SDLoc SL(Op);
7727 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
7728 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
7729
7730 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
7731
7732 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
7733}
7734
7735// Custom lowering for vector multiplications and s_mul_u64.
7736SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
7737 EVT VT = Op.getValueType();
7738
7739 // Split vector operands.
7740 if (VT.isVector())
7741 return splitBinaryVectorOp(Op, DAG);
7742
7743 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
7744
7745 // There are four ways to lower s_mul_u64:
7746 //
7747 // 1. If all the operands are uniform, then we lower it as it is.
7748 //
7749 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
7750 // multiplications because there is not a vector equivalent of s_mul_u64.
7751 //
7752 // 3. If the cost model decides that it is more efficient to use vector
7753 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
7754 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
7755 //
7756 // 4. If the cost model decides to use vector registers and both of the
7757 // operands are zero-extended/sign-extended from 32-bits, then we split the
7758 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
7759 // possible to check if the operands are zero-extended or sign-extended in
7760 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
7761 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
7762 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
7763 // If the cost model decides that we have to use vector registers, then
7764 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
7765 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
7766 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
7767 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
7768 // SIInstrInfo.cpp .
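// Concretely, the checks below use computeKnownBits/ComputeNumSignBits: if
// both operands have at least 32 known leading zero bits the node becomes
// S_MUL_U64_U32_PSEUDO, and if both have at least 33 sign bits it becomes
// S_MUL_I64_I32_PSEUDO; otherwise the uniform s_mul_u64 is kept as is.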
7769
7770 if (Op->isDivergent())
7771 return SDValue();
7772
7773 SDValue Op0 = Op.getOperand(0);
7774 SDValue Op1 = Op.getOperand(1);
7775 // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64
7776 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
7777 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
7778 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
7779 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
7780 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
7781 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
7782 SDLoc SL(Op);
7783 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7784 return SDValue(
7785 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
7786 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
7787 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
7788 if (Op0SignBits >= 33 && Op1SignBits >= 33)
7789 return SDValue(
7790 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
7791 // If all the operands are uniform, then we lower s_mul_u64 as it is.
7792 return Op;
7793}
7794
7795SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
7796 EVT VT = Op.getValueType();
7797 SDLoc SL(Op);
7798 SDValue LHS = Op.getOperand(0);
7799 SDValue RHS = Op.getOperand(1);
7800 bool isSigned = Op.getOpcode() == ISD::SMULO;
7801
7802 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
7803 const APInt &C = RHSC->getAPIntValue();
7804 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
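// e.g. umulo(i32 %x, 8): Result = %x << 3 and Overflow is set when
// (%x << 3) >> 3 != %x, i.e. when any of the top three bits of %x are set.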
7805 if (C.isPowerOf2()) {
7806 // smulo(x, signed_min) is same as umulo(x, signed_min).
7807 bool UseArithShift = isSigned && !C.isMinSignedValue();
7808 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
7809 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
7810 SDValue Overflow =
7811 DAG.getSetCC(SL, MVT::i1,
7812 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
7813 Result, ShiftAmt),
7814 LHS, ISD::SETNE);
7815 return DAG.getMergeValues({Result, Overflow}, SL);
7816 }
7817 }
7818
7819 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
7820 SDValue Top =
7821 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
7822
7823 SDValue Sign = isSigned
7824 ? DAG.getNode(ISD::SRA, SL, VT, Result,
7825 DAG.getConstant(VT.getScalarSizeInBits() - 1,
7826 SL, MVT::i32))
7827 : DAG.getConstant(0, SL, VT);
7828 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
7829
7830 return DAG.getMergeValues({Result, Overflow}, SL);
7831}
7832
7833SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
7834 if (Op->isDivergent()) {
7835 // Select to V_MAD_[IU]64_[IU]32.
7836 return Op;
7837 }
7838 if (Subtarget->hasSMulHi()) {
7839 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
7840 return SDValue();
7841 }
7842 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
7843 // calculate the high part, so we might as well do the whole thing with
7844 // V_MAD_[IU]64_[IU]32.
7845 return Op;
7846}
7847
7848SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
7849 if (!Subtarget->isTrapHandlerEnabled() ||
7850 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7851 return lowerTrapEndpgm(Op, DAG);
7852
7853 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
7854 : lowerTrapHsaQueuePtr(Op, DAG);
7855}
7856
7857SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
7858 SDLoc SL(Op);
7859 SDValue Chain = Op.getOperand(0);
7860 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
7861}
7862
7863SDValue
7864SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
7865 const SDLoc &DL, Align Alignment,
7866 ImplicitParameter Param) const {
7867 MachineFunction &MF = DAG.getMachineFunction();
7868 uint64_t Offset = getImplicitParameterOffset(MF, Param);
7869 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
7870 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
7871 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
7872 MachineMemOperand::MODereferenceable |
7873 MachineMemOperand::MOInvariant);
7874}
7875
7876SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
7877 SelectionDAG &DAG) const {
7878 SDLoc SL(Op);
7879 SDValue Chain = Op.getOperand(0);
7880
7881 SDValue QueuePtr;
7882 // For code object version 5, QueuePtr is passed through implicit kernarg.
7883 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7884 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
7885 QueuePtr =
7886 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
7887 } else {
7888 MachineFunction &MF = DAG.getMachineFunction();
7889 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7890 Register UserSGPR = Info->getQueuePtrUserSGPR();
7891
7892 if (UserSGPR == AMDGPU::NoRegister) {
7893 // We probably are in a function incorrectly marked with
7894 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
7895 // trap, so just use a null pointer.
7896 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
7897 } else {
7898 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
7899 MVT::i64);
7900 }
7901 }
7902
7903 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
7904 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
7905
7906 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
7907 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
7908 ToReg.getValue(1)};
7909 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7910}
7911
7912SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
7913 SDLoc SL(Op);
7914 SDValue Chain = Op.getOperand(0);
7915
7916 // We need to simulate the 's_trap 2' instruction on targets that run in
7917 // PRIV=1 (where it is treated as a nop).
7918 if (Subtarget->hasPrivEnabledTrap2NopBug())
7919 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
7920
7921 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
7922 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7923 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7924}
7925
7926SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
7927 SDLoc SL(Op);
7928 SDValue Chain = Op.getOperand(0);
7929 MachineFunction &MF = DAG.getMachineFunction();
7930
7931 if (!Subtarget->isTrapHandlerEnabled() ||
7932 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7933 LLVMContext &Ctx = MF.getFunction().getContext();
7934 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
7935 "debugtrap handler not supported",
7936 Op.getDebugLoc(), DS_Warning));
7937 return Chain;
7938 }
7939
7940 uint64_t TrapID =
7941 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
7942 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7943 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7944}
7945
7946SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
7947 SelectionDAG &DAG) const {
7948 if (Subtarget->hasApertureRegs()) {
7949 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
7950 ? AMDGPU::SRC_SHARED_BASE
7951 : AMDGPU::SRC_PRIVATE_BASE;
7952 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
7953 !Subtarget->hasGloballyAddressableScratch()) &&
7954 "Cannot use src_private_base with globally addressable scratch!");
7955 // Note: this feature (register) is broken. When used as a 32-bit operand,
7956 // it returns a wrong value (all zeroes?). The real value is in the upper 32
7957 // bits.
7958 //
7959 // To work around the issue, directly emit a 64 bit mov from this register
7960 // then extract the high bits. Note that this shouldn't even result in a
7961 // shift being emitted and simply become a pair of registers (e.g.):
7962 // s_mov_b64 s[6:7], src_shared_base
7963 // v_mov_b32_e32 v1, s7
7964 //
7965 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
7966 // coalescing would kick in and it would think it's okay to use the "HI"
7967 // subregister directly (instead of extracting the HI 32 bits) which is an
7968 // artificial (unusable) register.
7969 // Register TableGen definitions would need an overhaul to get rid of the
7970 // artificial "HI" aperture registers and prevent this kind of issue from
7971 // happening.
7972 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
7973 DAG.getRegister(ApertureRegNo, MVT::i64));
7974 return DAG.getNode(
7975 ISD::TRUNCATE, DL, MVT::i32,
7976 DAG.getNode(ISD::SRL, DL, MVT::i64,
7977 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
7978 }
7979
7980 // For code object version 5, private_base and shared_base are passed through
7981 // implicit kernargs.
7982 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7983 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
7984 ImplicitParameter Param =
7985 (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
7986 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
7987 }
7988
7989 MachineFunction &MF = DAG.getMachineFunction();
7990 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7991 Register UserSGPR = Info->getQueuePtrUserSGPR();
7992 if (UserSGPR == AMDGPU::NoRegister) {
7993 // We probably are in a function incorrectly marked with
7994 // amdgpu-no-queue-ptr. This is undefined.
7995 return DAG.getPOISON(MVT::i32);
7996 }
7997
7998 SDValue QueuePtr =
7999 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
8000
8001 // Offset into amd_queue_t for group_segment_aperture_base_hi /
8002 // private_segment_aperture_base_hi.
8003 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
8004
8005 SDValue Ptr =
8006 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
8007
8008 // TODO: Use custom target PseudoSourceValue.
8009 // TODO: We should use the value from the IR intrinsic call, but it might not
8010 // be available and how do we get it?
8011 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
8012 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
8013 commonAlignment(Align(64), StructOffset),
8014 MachineMemOperand::MODereferenceable |
8015 MachineMemOperand::MOInvariant);
8016}
8017
8018/// Return true if the value is a known valid address, such that a null check is
8019/// not necessary.
8020static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
8021 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
8022 if (isa<FrameIndexSDNode, GlobalAddressSDNode, BasicBlockSDNode>(Val))
8023 return true;
8024
8025 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
8026 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
8027
8028 // TODO: Search through arithmetic, handle arguments and loads
8029 // marked nonnull.
8030 return false;
8031}
8032
8033SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
8034 SelectionDAG &DAG) const {
8035 SDLoc SL(Op);
8036
8037 const AMDGPUTargetMachine &TM =
8038 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
8039
8040 unsigned DestAS, SrcAS;
8041 SDValue Src;
8042 bool IsNonNull = false;
8043 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
8044 SrcAS = ASC->getSrcAddressSpace();
8045 Src = ASC->getOperand(0);
8046 DestAS = ASC->getDestAddressSpace();
8047 } else {
8048 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
8049 Op.getConstantOperandVal(0) ==
8050 Intrinsic::amdgcn_addrspacecast_nonnull);
8051 Src = Op->getOperand(1);
8052 SrcAS = Op->getConstantOperandVal(2);
8053 DestAS = Op->getConstantOperandVal(3);
8054 IsNonNull = true;
8055 }
8056
8057 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
8058
8059 // flat -> local/private
8060 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
8061 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
8062 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
8063 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8064
8065 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
8066 Subtarget->hasGloballyAddressableScratch()) {
8067 // flat -> private with globally addressable scratch: subtract
8068 // src_flat_scratch_base_lo.
8069 SDValue FlatScratchBaseLo(
8070 DAG.getMachineNode(
8071 AMDGPU::S_MOV_B32, SL, MVT::i32,
8072 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8073 0);
8074 Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
8075 }
8076
8077 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8078 return Ptr;
8079
8080 unsigned NullVal = TM.getNullPointerValue(DestAS);
8081 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8082 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
8083
8084 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
8085 SegmentNullPtr);
8086 }
8087 }
8088
8089 // local/private -> flat
8090 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
8091 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
8092 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
8093 SDValue CvtPtr;
8094 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
8095 Subtarget->hasGloballyAddressableScratch()) {
8096 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
8097 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
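// The TID term only affects the high 32 bits of that sum, so the shift applied
// to the high word below is 57 - 32 - log2(wavesize): 20 for wave32 (TID << 52
// overall) and 19 for wave64 (TID << 51 overall).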
8098 SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
8099 SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
8100 ThreadID = DAG.getNode(
8101 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8102 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
8103 AllOnes, ThreadID);
8104 if (Subtarget->isWave64())
8105 ThreadID = DAG.getNode(
8106 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8107 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
8108 AllOnes, ThreadID);
8109 SDValue ShAmt = DAG.getShiftAmountConstant(
8110 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8111 SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
8112 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
8113 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8114 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
8115 // 64-bit hi:lo value.
8116 SDValue FlatScratchBase = {
8117 DAG.getMachineNode(
8118 AMDGPU::S_MOV_B64, SL, MVT::i64,
8119 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8120 0};
8121 CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8122 } else {
8123 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8124 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
8125 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8126 }
8127
8128 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8129 return CvtPtr;
8130
8131 unsigned NullVal = TM.getNullPointerValue(SrcAS);
8132 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8133
8134 SDValue NonNull =
8135 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
8136
8137 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
8138 FlatNullPtr);
8139 }
8140 }
8141
8142 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8143 Op.getValueType() == MVT::i64) {
8144 const SIMachineFunctionInfo *Info =
8145 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
8146 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
8147 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
8148 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
8149 }
8150
8151 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8152 Src.getValueType() == MVT::i64)
8153 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8154
8155 // global <-> flat are no-ops and never emitted.
8156
8157 // Invalid casts are poison.
8158 return DAG.getPOISON(Op->getValueType(0));
8159}
8160
8161// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
8162// the small vector and inserting them into the big vector. That is better than
8163// the default expansion of doing it via a stack slot. Even though the use of
8164// the stack slot would be optimized away afterwards, the stack slot itself
8165// remains.
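// For example, inserting a v2i16 subvector at an even index of a v4i16 vector
// becomes a single INSERT_VECTOR_ELT of an i32 into the v2i32 bitcast of the
// destination (see the 16-bit path below), instead of a round trip through a
// stack slot.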
8166SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
8167 SelectionDAG &DAG) const {
8168 SDValue Vec = Op.getOperand(0);
8169 SDValue Ins = Op.getOperand(1);
8170 SDValue Idx = Op.getOperand(2);
8171 EVT VecVT = Vec.getValueType();
8172 EVT InsVT = Ins.getValueType();
8173 EVT EltVT = VecVT.getVectorElementType();
8174 unsigned InsNumElts = InsVT.getVectorNumElements();
8175 unsigned IdxVal = Idx->getAsZExtVal();
8176 SDLoc SL(Op);
8177
8178 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
8179 // Insert 32-bit registers at a time.
8180 assert(InsNumElts % 2 == 0 && "expect legal vector types");
8181
8182 unsigned VecNumElts = VecVT.getVectorNumElements();
8183 EVT NewVecVT =
8184 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
8185 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8186 : EVT::getVectorVT(*DAG.getContext(),
8187 MVT::i32, InsNumElts / 2);
8188
8189 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
8190 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
8191
8192 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
8193 SDValue Elt;
8194 if (InsNumElts == 2) {
8195 Elt = Ins;
8196 } else {
8197 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
8198 DAG.getConstant(I, SL, MVT::i32));
8199 }
8200 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
8201 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
8202 }
8203
8204 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
8205 }
8206
8207 for (unsigned I = 0; I != InsNumElts; ++I) {
8208 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
8209 DAG.getConstant(I, SL, MVT::i32));
8210 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
8211 DAG.getConstant(IdxVal + I, SL, MVT::i32));
8212 }
8213 return Vec;
8214}
8215
8216SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
8217 SelectionDAG &DAG) const {
8218 SDValue Vec = Op.getOperand(0);
8219 SDValue InsVal = Op.getOperand(1);
8220 SDValue Idx = Op.getOperand(2);
8221 EVT VecVT = Vec.getValueType();
8222 EVT EltVT = VecVT.getVectorElementType();
8223 unsigned VecSize = VecVT.getSizeInBits();
8224 unsigned EltSize = EltVT.getSizeInBits();
8225 SDLoc SL(Op);
8226
8227 // Specially handle the case of v4i16 with static indexing.
8228 unsigned NumElts = VecVT.getVectorNumElements();
8229 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
8230 if (NumElts == 4 && EltSize == 16 && KIdx) {
8231 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
8232
8233 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8234 DAG.getConstant(0, SL, MVT::i32));
8235 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8236 DAG.getConstant(1, SL, MVT::i32));
8237
8238 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
8239 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
8240
8241 unsigned Idx = KIdx->getZExtValue();
8242 bool InsertLo = Idx < 2;
8243 SDValue InsHalf = DAG.getNode(
8244 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
8245 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
8246 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8247
8248 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
8249
8250 SDValue Concat =
8251 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
8252 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8253
8254 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
8255 }
8256
8257 // Static indexing does not lower to stack access, and hence there is no need
8258 // for special custom lowering to avoid stack access.
8259 if (isa<ConstantSDNode>(Idx))
8260 return SDValue();
8261
8262 // Avoid stack access for dynamic indexing by custom lowering to
8263 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
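// Worked example (illustrative): for v4i16 with dynamic index 2, EltSize is
// 16, so ScaledIdx = idx << 4 = 32 and BFM = 0xFFFF << 32. The splatted value
// is ANDed with BFM, the original vector with ~BFM, and the two are ORed.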
8264
8265 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
8266
8267 MVT IntVT = MVT::getIntegerVT(VecSize);
8268
8269 // Convert vector index to bit-index and get the required bit mask.
8270 assert(isPowerOf2_32(EltSize));
8271 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
8272 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8273 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8274 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
8275 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
8276
8277 // 1. Create a congruent vector with the target value in each element.
8278 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
8279 DAG.getSplatBuildVector(VecVT, SL, InsVal));
8280
8281 // 2. Mask off all other indices except the required index within (1).
8282 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
8283
8284 // 3. Mask off the required index within the target vector.
8285 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8286 SDValue RHS =
8287 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
8288
8289 // 4. Get (2) and (3) ORed into the target vector.
8290 SDValue BFI =
8291 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
8292
8293 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
8294}
8295
8296SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
8297 SelectionDAG &DAG) const {
8298 SDLoc SL(Op);
8299
8300 EVT ResultVT = Op.getValueType();
8301 SDValue Vec = Op.getOperand(0);
8302 SDValue Idx = Op.getOperand(1);
8303 EVT VecVT = Vec.getValueType();
8304 unsigned VecSize = VecVT.getSizeInBits();
8305 EVT EltVT = VecVT.getVectorElementType();
8306
8307 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
8308
8309 // Make sure we do any optimizations that will make it easier to fold
8310 // source modifiers before obscuring it with bit operations.
8311
8312 // XXX - Why doesn't this get called when vector_shuffle is expanded?
8313 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
8314 return Combined;
8315
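// For 128/256/512-bit vectors, split the vector into two halves, select the
// half that holds the element (Idx > NElem/2 - 1 picks Hi), and then extract
// within the chosen half using the masked index.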
8316 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8317 SDValue Lo, Hi;
8318 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
8319
8320 if (VecSize == 128) {
8321 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
8322 Lo = DAG.getBitcast(LoVT,
8323 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8324 DAG.getConstant(0, SL, MVT::i32)));
8325 Hi = DAG.getBitcast(HiVT,
8326 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8327 DAG.getConstant(1, SL, MVT::i32)));
8328 } else if (VecSize == 256) {
8329 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
8330 SDValue Parts[4];
8331 for (unsigned P = 0; P < 4; ++P) {
8332 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8333 DAG.getConstant(P, SL, MVT::i32));
8334 }
8335
8336 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8337 Parts[0], Parts[1]));
8338 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8339 Parts[2], Parts[3]));
8340 } else {
8341 assert(VecSize == 512);
8342
8343 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
8344 SDValue Parts[8];
8345 for (unsigned P = 0; P < 8; ++P) {
8346 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8347 DAG.getConstant(P, SL, MVT::i32));
8348 }
8349
8350 Lo = DAG.getBitcast(LoVT,
8351 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8352 Parts[0], Parts[1], Parts[2], Parts[3]));
8353 Hi = DAG.getBitcast(HiVT,
8354 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8355 Parts[4], Parts[5], Parts[6], Parts[7]));
8356 }
8357
8358 EVT IdxVT = Idx.getValueType();
8359 unsigned NElem = VecVT.getVectorNumElements();
8360 assert(isPowerOf2_32(NElem));
8361 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
8362 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
8363 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
8364 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
8365 }
8366
8367 assert(VecSize <= 64);
8368
8369 MVT IntVT = MVT::getIntegerVT(VecSize);
8370
8371 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
8372 SDValue VecBC = peekThroughBitcasts(Vec);
8373 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8374 SDValue Src = VecBC.getOperand(0);
8375 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8376 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
8377 }
8378
8379 unsigned EltSize = EltVT.getSizeInBits();
8380 assert(isPowerOf2_32(EltSize));
8381
8382 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8383
8384 // Convert vector index to bit-index (* EltSize)
8385 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8386
8387 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8388 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
8389
8390 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8391 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
8392 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
8393 }
8394
8395 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
8396}
8397
8398static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
8399 assert(Elt % 2 == 0);
8400 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8401}
8402
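// True if the mask pair at Elt selects an <odd, even> pair of source
// elements, e.g. <3, 2>; such pairs can be handled by a small 2-element
// shuffle of the aligned subvectors in lowerVECTOR_SHUFFLE below.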
8403static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
8404 assert(Elt % 2 == 0);
8405 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8406 !(Mask[Elt + 1] & 1);
8407}
8408
8409SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
8410 SelectionDAG &DAG) const {
8411 SDLoc SL(Op);
8412 EVT ResultVT = Op.getValueType();
8413 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
8414 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
8415 const int NewSrcNumElts = 2;
8416 MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
8417 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
8418
8419 // Break up the shuffle into registers sized pieces.
8420 //
8421 // We're trying to form sub-shuffles that the register allocation pipeline
8422 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
8423 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
8424 // pair of copies into a consecutive register copy, so use the ordinary
8425 // extract_vector_elt lowering unless we can use the shuffle.
8426 //
8427 // TODO: This is a bit of a hack, and we should probably always use
8428 // extract_subvector for the largest possible subvector we can (or at least
8429 // use it for PackVT-aligned pieces). However, we have worse support for
8430 // combines on them and don't directly treat extract_subvector /
8431 // insert_subvector as legal. The DAG scheduler also ends up doing a worse
8432 // job with the extract_subvectors.
8433 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
8434
8435 // vector_shuffle <0,1,6,7> lhs, rhs
8436 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
8437 //
8438 // vector_shuffle <6,7,2,3> lhs, rhs
8439 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
8440 //
8441 // vector_shuffle <6,7,0,1> lhs, rhs
8442 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
8443
8444 // Avoid scalarizing when both halves are reading from consecutive elements.
8445
8446 // If we're treating 2 element shuffles as legal, also create odd-to-even
8447 // shuffles of neighboring pairs.
8448 //
8449 // vector_shuffle <3,2,7,6> lhs, rhs
8450 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
8451 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
8452
8453 SmallVector<SDValue, 16> Pieces;
8454 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
8455 if (ShouldUseConsecutiveExtract &&
8456 elementPairIsContiguous(SVN->getMask(), I)) {
8457 const int Idx = SVN->getMaskElt(I);
8458 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8459 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8460 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
8461 SVN->getOperand(VecIdx),
8462 DAG.getConstant(EltIdx, SL, MVT::i32));
8463 Pieces.push_back(SubVec);
8464 } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
8465 isOperationLegal(ISD::VECTOR_SHUFFLE, PackVT)) {
8466 int Idx0 = SVN->getMaskElt(I);
8467 int Idx1 = SVN->getMaskElt(I + 1);
8468
8469 SDValue SrcOp0 = SVN->getOperand(0);
8470 SDValue SrcOp1 = SrcOp0;
8471 if (Idx0 >= SrcNumElts) {
8472 SrcOp0 = SVN->getOperand(1);
8473 Idx0 -= SrcNumElts;
8474 }
8475
8476 if (Idx1 >= SrcNumElts) {
8477 SrcOp1 = SVN->getOperand(1);
8478 Idx1 -= SrcNumElts;
8479 }
8480
8481 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8482 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8483
8484 // Extract nearest even aligned piece.
8485 SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
8486 DAG.getConstant(AlignedIdx0, SL, MVT::i32));
8487 SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
8488 DAG.getConstant(AlignedIdx1, SL, MVT::i32));
8489
8490 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8491 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8492
8493 SDValue Result0 = SubVec0;
8494 SDValue Result1 = SubVec0;
8495
8496 if (SubVec0 != SubVec1) {
8497 NewMaskIdx1 += NewSrcNumElts;
8498 Result1 = SubVec1;
8499 } else {
8500 Result1 = DAG.getPOISON(PackVT);
8501 }
8502
8503 SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
8504 {NewMaskIdx0, NewMaskIdx1});
8505 Pieces.push_back(Shuf);
8506 } else {
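// Fallback: extract the two source elements individually and rebuild the
// 2-element piece with a build_vector.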
8507 const int Idx0 = SVN->getMaskElt(I);
8508 const int Idx1 = SVN->getMaskElt(I + 1);
8509 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8510 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8511 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8512 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8513
8514 SDValue Vec0 = SVN->getOperand(VecIdx0);
8515 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
8516 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
8517
8518 SDValue Vec1 = SVN->getOperand(VecIdx1);
8519 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
8520 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
8521 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
8522 }
8523 }
8524
8525 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
8526}
8527
8528SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
8529 SelectionDAG &DAG) const {
8530 SDValue SVal = Op.getOperand(0);
8531 EVT ResultVT = Op.getValueType();
8532 EVT SValVT = SVal.getValueType();
8533 SDValue UndefVal = DAG.getPOISON(SValVT);
8534 SDLoc SL(Op);
8535
8536 SmallVector<SDValue, 8> VElts;
8537 VElts.push_back(SVal);
8538 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
8539 VElts.push_back(UndefVal);
8540
8541 return DAG.getBuildVector(ResultVT, SL, VElts);
8542}
8543
8544SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
8545 SelectionDAG &DAG) const {
8546 SDLoc SL(Op);
8547 EVT VT = Op.getValueType();
8548
8549 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8550 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
8551
8552 SDValue Lo = Op.getOperand(0);
8553 SDValue Hi = Op.getOperand(1);
8554
8555 // Avoid adding defined bits with the zero_extend.
8556 if (Hi.isUndef()) {
8557 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8558 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
8559 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
8560 }
8561
8562 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
8563 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
8564
8565 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
8566 DAG.getConstant(16, SL, MVT::i32));
8567 if (Lo.isUndef())
8568 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
8569
8570 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8571 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
8572
8573 SDValue Or =
8574 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
8575 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
8576 }
8577
8578 // Split into 2-element chunks.
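// e.g. a v8f16 build_vector becomes four v2f16 pieces, each bitcast to i32,
// assembled into a v4i32 build_vector and bitcast back to v8f16.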
8579 const unsigned NumParts = VT.getVectorNumElements() / 2;
8580 EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
8581 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
8582
8583 SmallVector<SDValue, 4> Casts;
8584 for (unsigned P = 0; P < NumParts; ++P) {
8585 SDValue Vec = DAG.getBuildVector(
8586 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
8587 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
8588 }
8589
8590 SDValue Blend =
8591 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
8592 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
8593}
8594
8595bool SITargetLowering::isOffsetFoldingLegal(
8596 const GlobalAddressSDNode *GA) const {
8597 // OSes that use ELF REL relocations (instead of RELA) can only store a
8598 // 32-bit addend in the instruction, so it is not safe to allow offset folding
8599 // which can create arbitrary 64-bit addends. (This is only a problem for
8600 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
8601 // the high 32 bits of the addend.)
8602 //
8603 // This should be kept in sync with how HasRelocationAddend is initialized in
8604 // the constructor of ELFAMDGPUAsmBackend.
8605 if (!Subtarget->isAmdHsaOS())
8606 return false;
8607
8608 // We can fold offsets for anything that doesn't require a GOT relocation.
8609 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
8610 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
8611 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
8612 !shouldEmitGOTReloc(GA->getGlobal());
8613}
8614
8615static SDValue
8616buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
8617 const SDLoc &DL, int64_t Offset, EVT PtrVT,
8618 unsigned GAFlags = SIInstrInfo::MO_NONE) {
8619 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
8620 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
8621 // lowered to the following code sequence:
8622 //
8623 // For constant address space:
8624 // s_getpc_b64 s[0:1]
8625 // s_add_u32 s0, s0, $symbol
8626 // s_addc_u32 s1, s1, 0
8627 //
8628 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8629 // a fixup or relocation is emitted to replace $symbol with a literal
8630 // constant, which is a pc-relative offset from the encoding of the $symbol
8631 // operand to the global variable.
8632 //
8633 // For global address space:
8634 // s_getpc_b64 s[0:1]
8635 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
8636 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
8637 //
8638 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8639 // fixups or relocations are emitted to replace $symbol@*@lo and
8640 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
8641 // which is a 64-bit pc-relative offset from the encoding of the $symbol
8642 // operand to the global variable.
8643 if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
8644 assert(GAFlags != SIInstrInfo::MO_NONE);
8645
8646 SDValue Ptr =
8647 DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
8648 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
8649 }
8650
8651 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
8652 SDValue PtrHi;
8653 if (GAFlags == SIInstrInfo::MO_NONE)
8654 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
8655 else
8656 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
8657 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
8658}
8659
8660SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
8661 SDValue Op,
8662 SelectionDAG &DAG) const {
8663 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
8664 SDLoc DL(GSD);
8665 EVT PtrVT = Op.getValueType();
8666
8667 const GlobalValue *GV = GSD->getGlobal();
8668 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8669 shouldUseLDSConstAddress(GV)) ||
8670 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
8671 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
8672 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8673 GV->hasExternalLinkage()) {
8674 Type *Ty = GV->getValueType();
8675 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
8676 // zero-sized type in other languages to declare dynamic shared
8677 // memory whose size is not known at compile time. Such arrays are
8678 // allocated by the runtime and placed directly after the statically
8679 // allocated ones; they all share the same offset.
8680 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
8681 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
8682 // Adjust alignment for that dynamic shared memory array.
8683 Function &F = DAG.getMachineFunction().getFunction();
8684 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
8685 MFI->setUsesDynamicLDS(true);
8686 return SDValue(
8687 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
8688 }
8689 }
8690 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
8691 }
8692
8693 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
8694 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
8695 SIInstrInfo::MO_ABS32_LO);
8696 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
8697 }
8698
8699 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
8700 if (Subtarget->has64BitLiterals()) {
8701 SDValue Addr = DAG.getTargetGlobalAddress(
8702 GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
8703 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
8704 0);
8705 }
8706
8707 SDValue AddrLo = DAG.getTargetGlobalAddress(
8708 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
8709 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
8710
8711 SDValue AddrHi = DAG.getTargetGlobalAddress(
8712 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
8713 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
8714
8715 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
8716 }
8717
8718 if (shouldEmitFixup(GV))
8719 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
8720
8721 if (shouldEmitPCReloc(GV))
8722 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
8723 SIInstrInfo::MO_REL32);
8724
8725 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
8726 SIInstrInfo::MO_GOTPCREL32);
8727 PointerType *PtrTy =
8728 PointerType::get(*DAG.getContext(), AMDGPUAS::CONSTANT_ADDRESS);
8729 const DataLayout &DataLayout = DAG.getDataLayout();
8730 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
8731 MachinePointerInfo PtrInfo =
8732 MachinePointerInfo::getGOT(DAG.getMachineFunction());
8733
8734 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
8735 MachineMemOperand::MODereferenceable |
8736 MachineMemOperand::MOInvariant);
8737}
8738
8739SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
8740 const SDLoc &DL, SDValue V) const {
8741 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
8742 // the destination register.
8743 //
8744 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
8745 // so we will end up with redundant moves to m0.
8746 //
8747 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
8748
8749 // A Null SDValue creates a glue result.
8750 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
8751 V, Chain);
8752 return SDValue(M0, 0);
8753}
8754
8755SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
8756 MVT VT,
8757 unsigned Offset) const {
8758 SDLoc SL(Op);
8759 SDValue Param = lowerKernargMemParameter(
8760 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
8761 // The local size values will have the hi 16-bits as zero.
8762 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
8763 DAG.getValueType(VT));
8764}
8765
8766static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8767 EVT VT) {
8768 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8769 DAG.getMachineFunction().getFunction(),
8770 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
8771 return DAG.getPOISON(VT);
8772}
8773
8774static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8775 EVT VT) {
8776 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8777 DAG.getMachineFunction().getFunction(),
8778 "intrinsic not supported on subtarget", DL.getDebugLoc()));
8779 return DAG.getPOISON(VT);
8780}
8781
8782static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
8783 ArrayRef<SDValue> Elts) {
8784 assert(!Elts.empty());
8785 MVT Type;
8786 unsigned NumElts = Elts.size();
8787
8788 if (NumElts <= 12) {
8789 Type = MVT::getVectorVT(MVT::f32, NumElts);
8790 } else {
8791 assert(Elts.size() <= 16);
8792 Type = MVT::v16f32;
8793 NumElts = 16;
8794 }
8795
8796 SmallVector<SDValue, 16> VecElts(NumElts);
8797 for (unsigned i = 0; i < Elts.size(); ++i) {
8798 SDValue Elt = Elts[i];
8799 if (Elt.getValueType() != MVT::f32)
8800 Elt = DAG.getBitcast(MVT::f32, Elt);
8801 VecElts[i] = Elt;
8802 }
8803 for (unsigned i = Elts.size(); i < NumElts; ++i)
8804 VecElts[i] = DAG.getPOISON(MVT::f32);
8805
8806 if (NumElts == 1)
8807 return VecElts[0];
8808 return DAG.getBuildVector(Type, DL, VecElts);
8809}
8810
8811static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
8812 SDValue Src, int ExtraElts) {
8813 EVT SrcVT = Src.getValueType();
8814
8815 SmallVector<SDValue, 8> Elts;
8816
8817 if (SrcVT.isVector())
8818 DAG.ExtractVectorElements(Src, Elts);
8819 else
8820 Elts.push_back(Src);
8821
8822 SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
8823 while (ExtraElts--)
8824 Elts.push_back(Undef);
8825
8826 return DAG.getBuildVector(CastVT, DL, Elts);
8827}
8828
8829// Re-construct the required return value for an image load intrinsic.
8830// This is more complicated due to the optional use of TexFailCtrl, which
8831// means the required return type is an aggregate.
8832static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
8833 ArrayRef<EVT> ResultTypes, bool IsTexFail,
8834 bool Unpacked, bool IsD16, int DMaskPop,
8835 int NumVDataDwords, bool IsAtomicPacked16Bit,
8836 const SDLoc &DL) {
8837 // Determine the required return type. This is the same regardless of
8838 // IsTexFail flag
8839 EVT ReqRetVT = ResultTypes[0];
8840 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
8841 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
8842 ? (ReqRetNumElts + 1) / 2
8843 : ReqRetNumElts;
8844
8845 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
8846
8847 MVT DataDwordVT =
8848 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
8849
8850 MVT MaskPopVT =
8851 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
8852
8853 SDValue Data(Result, 0);
8854 SDValue TexFail;
8855
8856 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
8857 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
8858 if (MaskPopVT.isVector()) {
8859 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
8860 SDValue(Result, 0), ZeroIdx);
8861 } else {
8862 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
8863 SDValue(Result, 0), ZeroIdx);
8864 }
8865 }
8866
8867 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
8868 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
8869 NumDataDwords - MaskPopDwords);
8870
8871 if (IsD16)
8872 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
8873
8874 EVT LegalReqRetVT = ReqRetVT;
8875 if (!ReqRetVT.isVector()) {
8876 if (!Data.getValueType().isInteger())
8877 Data = DAG.getNode(ISD::BITCAST, DL,
8878 Data.getValueType().changeTypeToInteger(), Data);
8879 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
8880 } else {
8881 // We need to widen the return vector to a legal type
8882 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
8883 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
8884 LegalReqRetVT =
8886 ReqRetVT.getVectorNumElements() + 1);
8887 }
8888 }
8889 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
8890
8891 if (IsTexFail) {
8892 TexFail =
8893 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
8894 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
8895
8896 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
8897 }
8898
8899 if (Result->getNumValues() == 1)
8900 return Data;
8901
8902 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
8903}
8904
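// Decode the texfailctrl immediate: bit 0 enables TFE and bit 1 enables LWE.
// Returns false if any other bit is set, so callers can reject unsupported
// values.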
8905static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
8906 SDValue *LWE, bool &IsTexFail) {
8907 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
8908
8909 uint64_t Value = TexFailCtrlConst->getZExtValue();
8910 if (Value) {
8911 IsTexFail = true;
8912 }
8913
8914 SDLoc DL(TexFailCtrlConst);
8915 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
8916 Value &= ~(uint64_t)0x1;
8917 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
8918 Value &= ~(uint64_t)0x2;
8919
8920 return Value == 0;
8921}
8922
8923static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
8924 MVT PackVectorVT,
8925 SmallVectorImpl<SDValue> &PackedAddrs,
8926 unsigned DimIdx, unsigned EndIdx,
8927 unsigned NumGradients) {
8928 SDLoc DL(Op);
8929 for (unsigned I = DimIdx; I < EndIdx; I++) {
8930 SDValue Addr = Op.getOperand(I);
8931
8932 // Gradients are packed with undef for each coordinate.
8933 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
8934 // 1D: undef,dx/dh; undef,dx/dv
8935 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
8936 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
8937 if (((I + 1) >= EndIdx) ||
8938 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
8939 I == DimIdx + NumGradients - 1))) {
8940 if (Addr.getValueType() != MVT::i16)
8941 Addr = DAG.getBitcast(MVT::i16, Addr);
8942 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
8943 } else {
8944 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
8945 I++;
8946 }
8947 Addr = DAG.getBitcast(MVT::f32, Addr);
8948 PackedAddrs.push_back(Addr);
8949 }
8950}
8951
8952SDValue SITargetLowering::lowerImage(SDValue Op,
8953 const AMDGPU::ImageDimIntrinsicInfo *Intr,
8954 SelectionDAG &DAG, bool WithChain) const {
8955 SDLoc DL(Op);
8956 MachineFunction &MF = DAG.getMachineFunction();
8957 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
8958 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
8959 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
8960 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
8961 unsigned IntrOpcode = Intr->BaseOpcode;
8962 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
8963 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
8964 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
8965
8966 SmallVector<EVT, 3> ResultTypes(Op->values());
8967 SmallVector<EVT, 3> OrigResultTypes(Op->values());
8968 bool IsD16 = false;
8969 bool IsG16 = false;
8970 bool IsA16 = false;
8971 SDValue VData;
8972 int NumVDataDwords = 0;
8973 bool AdjustRetType = false;
8974 bool IsAtomicPacked16Bit = false;
8975
8976 // Offset of intrinsic arguments
8977 const unsigned ArgOffset = WithChain ? 2 : 1;
8978
8979 unsigned DMask;
8980 unsigned DMaskLanes = 0;
8981
8982 if (BaseOpcode->Atomic) {
8983 VData = Op.getOperand(2);
8984
8985 IsAtomicPacked16Bit =
8986 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
8987 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
8988
8989 bool Is64Bit = VData.getValueSizeInBits() == 64;
8990 if (BaseOpcode->AtomicX2) {
8991 SDValue VData2 = Op.getOperand(3);
8992 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
8993 {VData, VData2});
8994 if (Is64Bit)
8995 VData = DAG.getBitcast(MVT::v4i32, VData);
8996
8997 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
8998 DMask = Is64Bit ? 0xf : 0x3;
8999 NumVDataDwords = Is64Bit ? 4 : 2;
9000 } else {
9001 DMask = Is64Bit ? 0x3 : 0x1;
9002 NumVDataDwords = Is64Bit ? 2 : 1;
9003 }
9004 } else {
9005 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
9006 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
9007
9008 if (BaseOpcode->Store) {
9009 VData = Op.getOperand(2);
9010
9011 MVT StoreVT = VData.getSimpleValueType();
9012 if (StoreVT.getScalarType() == MVT::f16) {
9013 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9014 return Op; // D16 is unsupported for this instruction
9015
9016 IsD16 = true;
9017 VData = handleD16VData(VData, DAG, true);
9018 }
9019
9020 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
9021 } else if (!BaseOpcode->NoReturn) {
9022 // Work out the num dwords based on the dmask popcount and underlying type
9023 // and whether packing is supported.
9024 MVT LoadVT = ResultTypes[0].getSimpleVT();
9025 if (LoadVT.getScalarType() == MVT::f16) {
9026 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9027 return Op; // D16 is unsupported for this instruction
9028
9029 IsD16 = true;
9030 }
9031
9032 // Confirm that the return type is large enough for the dmask specified
9033 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
9034 (!LoadVT.isVector() && DMaskLanes > 1))
9035 return Op;
9036
9037 // The SQ block of gfx8 and gfx9 does not estimate register use correctly
9038 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
9039 // instructions.
9040 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9041 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9042 NumVDataDwords = (DMaskLanes + 1) / 2;
9043 else
9044 NumVDataDwords = DMaskLanes;
9045
9046 AdjustRetType = true;
9047 }
9048 }
9049
9050 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
9051 SmallVector<SDValue, 4> VAddrs;
9052
9053 // Check for 16 bit addresses or derivatives and pack if true.
9054 MVT VAddrVT =
9055 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
9056 MVT VAddrScalarVT = VAddrVT.getScalarType();
9057 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9058 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9059
9060 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
9061 VAddrScalarVT = VAddrVT.getScalarType();
9062 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9063 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9064
9065 // Push back extra arguments.
9066 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
9067 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
9068 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
9069 // Special handling of bias when A16 is on. Bias is of type half but
9070 // occupies full 32-bit.
9071 SDValue Bias = DAG.getBuildVector(
9072 MVT::v2f16, DL,
9073 {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
9074 VAddrs.push_back(Bias);
9075 } else {
9076 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
9077 "Bias needs to be converted to 16 bit in A16 mode");
9078 VAddrs.push_back(Op.getOperand(ArgOffset + I));
9079 }
9080 }
9081
9082 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
9083 // 16 bit gradients are supported, but are tied to the A16 control
9084 // so both gradients and addresses must be 16 bit
9085 LLVM_DEBUG(
9086 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
9087 "require 16 bit args for both gradients and addresses");
9088 return Op;
9089 }
9090
9091 if (IsA16) {
9092 if (!ST->hasA16()) {
9093 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
9094 "support 16 bit addresses\n");
9095 return Op;
9096 }
9097 }
9098
9099 // We've dealt with incorrect input so we know that if IsA16, IsG16
9100 // are set then we have to compress/pack operands (either address,
9101 // gradient or both)
9102 // In the case where a16 and gradients are tied (no G16 support) then we
9103 // have already verified that both IsA16 and IsG16 are true
9104 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
9105 // Activate g16
9106 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9107 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
9108 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
9109 }
9110
9111 // Add gradients (packed or unpacked)
9112 if (IsG16) {
9113 // Pack the gradients
9114 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
9115 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
9116 ArgOffset + Intr->GradientStart,
9117 ArgOffset + Intr->CoordStart, Intr->NumGradients);
9118 } else {
9119 for (unsigned I = ArgOffset + Intr->GradientStart;
9120 I < ArgOffset + Intr->CoordStart; I++)
9121 VAddrs.push_back(Op.getOperand(I));
9122 }
9123
9124 // Add addresses (packed or unpacked)
9125 if (IsA16) {
9126 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
9127 ArgOffset + Intr->CoordStart, VAddrEnd,
9128 0 /* No gradients */);
9129 } else {
9130 // Add uncompressed address
9131 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
9132 VAddrs.push_back(Op.getOperand(I));
9133 }
9134
9135 // If the register allocator cannot place the address registers contiguously
9136 // without introducing moves, then using the non-sequential address encoding
9137 // is always preferable, since it saves VALU instructions and is usually a
9138 // wash in terms of code size or even better.
9139 //
9140 // However, we currently have no way of hinting to the register allocator that
9141 // MIMG addresses should be placed contiguously when it is possible to do so,
9142 // so force non-NSA for the common 2-address case as a heuristic.
9143 //
9144 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
9145 // allocation when possible.
9146 //
9147 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
9148 // set of the remaining addresses.
9149 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
9150 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
9151 const bool UseNSA = ST->hasNSAEncoding() &&
9152 VAddrs.size() >= ST->getNSAThreshold(MF) &&
9153 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
9154 const bool UsePartialNSA =
9155 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
9156
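// With partial NSA, only the trailing addresses are packed into a single
// contiguous vector register; the first NSAMaxSize - 1 addresses stay as
// separate operands (see how Ops is populated below).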
9157 SDValue VAddr;
9158 if (UsePartialNSA) {
9159 VAddr = getBuildDwordsVector(DAG, DL,
9160 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9161 } else if (!UseNSA) {
9162 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
9163 }
9164
9165 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
9166 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
9167 SDValue Unorm;
9168 if (!BaseOpcode->Sampler) {
9169 Unorm = True;
9170 } else {
9171 uint64_t UnormConst =
9172 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
9173
9174 Unorm = UnormConst ? True : False;
9175 }
9176
9177 SDValue TFE;
9178 SDValue LWE;
9179 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
9180 bool IsTexFail = false;
9181 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9182 return Op;
9183
9184 if (IsTexFail) {
9185 if (!DMaskLanes) {
9186 // Expecting to get an error flag since TFC is on - and dmask is 0
9187 // Force dmask to be at least 1 otherwise the instruction will fail
9188 DMask = 0x1;
9189 DMaskLanes = 1;
9190 NumVDataDwords = 1;
9191 }
9192 NumVDataDwords += 1;
9193 AdjustRetType = true;
9194 }
9195
9196 // Something earlier may have tagged the return type as needing adjustment.
9197 // This happens if the instruction is a load or has TexFailCtrl flags set.
9198 if (AdjustRetType) {
9199 // NumVDataDwords reflects the true number of dwords required in the return
9200 // type
9201 if (DMaskLanes == 0 && !BaseOpcode->Store) {
9202 // This is a no-op load. This can be eliminated
9203 SDValue Undef = DAG.getPOISON(Op.getValueType());
9204 if (isa<MemSDNode>(Op))
9205 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
9206 return Undef;
9207 }
9208
9209 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
9210 MVT::i32, NumVDataDwords)
9211 : MVT::i32;
9212
9213 ResultTypes[0] = NewVT;
9214 if (ResultTypes.size() == 3) {
9215 // Original result was aggregate type used for TexFailCtrl results
9216 // The actual instruction returns as a vector type which has now been
9217 // created. Remove the aggregate result.
9218 ResultTypes.erase(&ResultTypes[1]);
9219 }
9220 }
9221
9222 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
9223 if (BaseOpcode->Atomic)
9224 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
9225 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
9226 AMDGPU::CPol::VOLATILE))
9227 return Op;
9228
9229 SmallVector<SDValue, 26> Ops;
9230 if (BaseOpcode->Store || BaseOpcode->Atomic)
9231 Ops.push_back(VData); // vdata
9232 if (UsePartialNSA) {
9233 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
9234 Ops.push_back(VAddr);
9235 } else if (UseNSA)
9236 append_range(Ops, VAddrs);
9237 else
9238 Ops.push_back(VAddr);
9239 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
9240 EVT RsrcVT = Rsrc.getValueType();
9241 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9242 return Op;
9243 Ops.push_back(Rsrc);
9244 if (BaseOpcode->Sampler) {
9245 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
9246 if (Samp.getValueType() != MVT::v4i32)
9247 return Op;
9248 Ops.push_back(Samp);
9249 }
9250 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
9251 if (IsGFX10Plus)
9252 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
9253 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9254 Ops.push_back(Unorm);
9255 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
9256 Ops.push_back(IsA16 && // r128, a16 for gfx9
9257 ST->hasFeature(AMDGPU::FeatureR128A16)
9258 ? True
9259 : False);
9260 if (IsGFX10Plus)
9261 Ops.push_back(IsA16 ? True : False);
9262
9263 if (!Subtarget->hasGFX90AInsts())
9264 Ops.push_back(TFE); // tfe
9265 else if (TFE->getAsZExtVal()) {
9266 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9267 MF.getFunction(),
9268 "TFE is not supported on this GPU", DL.getDebugLoc()));
9269 }
9270
9271 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9272 Ops.push_back(LWE); // lwe
9273 if (!IsGFX10Plus)
9274 Ops.push_back(DimInfo->DA ? True : False);
9275 if (BaseOpcode->HasD16)
9276 Ops.push_back(IsD16 ? True : False);
9277 if (isa<MemSDNode>(Op))
9278 Ops.push_back(Op.getOperand(0)); // chain
9279
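// Pick the MIMG opcode variant for the target generation and addressing mode
// (NSA vs. contiguous), falling back to older encodings when no gfx90a/gfx8
// variant exists.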
9280 int NumVAddrDwords =
9281 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
9282 int Opcode = -1;
9283
9284 if (IsGFX12Plus) {
9285 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
9286 NumVDataDwords, NumVAddrDwords);
9287 } else if (IsGFX11Plus) {
9288 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9289 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9290 : AMDGPU::MIMGEncGfx11Default,
9291 NumVDataDwords, NumVAddrDwords);
9292 } else if (IsGFX10Plus) {
9293 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9294 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9295 : AMDGPU::MIMGEncGfx10Default,
9296 NumVDataDwords, NumVAddrDwords);
9297 } else {
9298 if (Subtarget->hasGFX90AInsts()) {
9299 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
9300 NumVDataDwords, NumVAddrDwords);
9301 if (Opcode == -1) {
9302 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9303 MF.getFunction(),
9304 "requested image instruction is not supported on this GPU",
9305 DL.getDebugLoc()));
9306
9307 unsigned Idx = 0;
9308 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
9309 for (EVT VT : OrigResultTypes) {
9310 if (VT == MVT::Other)
9311 RetValues[Idx++] = Op.getOperand(0); // Chain
9312 else
9313 RetValues[Idx++] = DAG.getPOISON(VT);
9314 }
9315
9316 return DAG.getMergeValues(RetValues, DL);
9317 }
9318 }
9319 if (Opcode == -1 &&
9320 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9321 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
9322 NumVDataDwords, NumVAddrDwords);
9323 if (Opcode == -1)
9324 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
9325 NumVDataDwords, NumVAddrDwords);
9326 }
9327 if (Opcode == -1)
9328 return Op;
9329
9330 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
9331 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
9332 MachineMemOperand *MemRef = MemOp->getMemOperand();
9333 DAG.setNodeMemRefs(NewNode, {MemRef});
9334 }
9335
9336 if (BaseOpcode->AtomicX2) {
9337 SmallVector<SDValue, 1> Elt;
9338 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
9339 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
9340 }
9341 if (BaseOpcode->NoReturn)
9342 return SDValue(NewNode, 0);
9343 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
9344 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9345 NumVDataDwords, IsAtomicPacked16Bit, DL);
9346}
9347
9348SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
9349 SDValue Offset, SDValue CachePolicy,
9350 SelectionDAG &DAG) const {
9351 MachineFunction &MF = DAG.getMachineFunction();
9352
9353 const DataLayout &DataLayout = DAG.getDataLayout();
9354 Align Alignment =
9355 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
9356
9357 MachineMemOperand *MMO = MF.getMachineMemOperand(
9358 MachinePointerInfo(),
9359 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
9360 MachineMemOperand::MOInvariant,
9361 VT.getStoreSize(), Alignment);
9362
9363 if (!Offset->isDivergent()) {
9364 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
9365
9366 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
9367 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
9368 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
9369 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
9370 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9371 SDValue BufferLoad =
9372 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
9373 DAG.getVTList(MVT::i32), Ops, VT, MMO);
9374 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
9375 }
9376
9377 // Widen vec3 load to vec4.
9378 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
9379 !Subtarget->hasScalarDwordx3Loads()) {
9380 EVT WidenedVT =
9381 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
9382 auto WidenedOp = DAG.getMemIntrinsicNode(
9383 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
9384 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
9385 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
9386 DAG.getVectorIdxConstant(0, DL));
9387 return Subvector;
9388 }
9389
9390 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
9391 DAG.getVTList(VT), Ops, VT, MMO);
9392 }
9393
9394 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
9395 // assume that the buffer is unswizzled.
9396 SDValue Ops[] = {
9397 DAG.getEntryNode(), // Chain
9398 Rsrc, // rsrc
9399 DAG.getConstant(0, DL, MVT::i32), // vindex
9400 {}, // voffset
9401 {}, // soffset
9402 {}, // offset
9403 CachePolicy, // cachepolicy
9404 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9405 };
9406 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9407 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
9408 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
9409 }
9410
9411 SmallVector<SDValue, 4> Loads;
9412 unsigned NumLoads = 1;
9413 MVT LoadVT = VT.getSimpleVT();
9414 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
9415 assert((LoadVT.getScalarType() == MVT::i32 ||
9416 LoadVT.getScalarType() == MVT::f32));
9417
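// Wide results are split into several 4-dword buffer loads whose results are
// concatenated back together after the loop below.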
9418 if (NumElts == 8 || NumElts == 16) {
9419 NumLoads = NumElts / 4;
9420 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
9421 }
9422
9423 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
9424
9425 // Use the alignment to ensure that the required offsets will fit into the
9426 // immediate offsets.
9427 setBufferOffsets(Offset, DAG, &Ops[3],
9428 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
9429
9430 uint64_t InstOffset = Ops[5]->getAsZExtVal();
9431 for (unsigned i = 0; i < NumLoads; ++i) {
9432 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
9433 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
9434 LoadVT, MMO, DAG));
9435 }
9436
9437 if (NumElts == 8 || NumElts == 16)
9438 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
9439
9440 return Loads[0];
9441}
9442
9443SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
9444 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
9445 if (!Subtarget->hasArchitectedSGPRs())
9446 return {};
9447 SDLoc SL(Op);
9448 MVT VT = MVT::i32;
9449 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
9450 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
9451 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
9452}
9453
9454SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
9455 unsigned Dim,
9456 const ArgDescriptor &Arg) const {
9457 SDLoc SL(Op);
9458 MachineFunction &MF = DAG.getMachineFunction();
9459 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
9460 if (MaxID == 0)
9461 return DAG.getConstant(0, SL, MVT::i32);
9462
9463 // It's undefined behavior if a function marked with the amdgpu-no-*
9464 // attributes uses the corresponding intrinsic.
9465 if (!Arg)
9466 return DAG.getPOISON(Op->getValueType(0));
9467
9468 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
9469 SDLoc(DAG.getEntryNode()), Arg);
9470
9471 // Don't bother inserting AssertZext for packed IDs since we're emitting the
9472 // masking operations anyway.
9473 //
9474 // TODO: We could assert the top bit is 0 for the source copy.
9475 if (Arg.isMasked())
9476 return Val;
9477
9478 // Preserve the known bits after expansion to a copy.
9479 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
9480 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
9481 DAG.getValueType(SmallVT));
9482}
9483
9484SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
9485 SelectionDAG &DAG) const {
9486 MachineFunction &MF = DAG.getMachineFunction();
9487 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
9488
9489 EVT VT = Op.getValueType();
9490 SDLoc DL(Op);
9491 unsigned IntrinsicID = Op.getConstantOperandVal(0);
9492
9493 // TODO: Should this propagate fast-math-flags?
9494
9495 switch (IntrinsicID) {
9496 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9497 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
9498 return emitNonHSAIntrinsicError(DAG, DL, VT);
9499 return getPreloadedValue(DAG, *MFI, VT,
9500 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
9501 }
9502 case Intrinsic::amdgcn_dispatch_ptr:
9503 case Intrinsic::amdgcn_queue_ptr: {
9504 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
9505 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9506 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
9507 DL.getDebugLoc()));
9508 return DAG.getPOISON(VT);
9509 }
9510
9511 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9512 ? AMDGPUFunctionArgInfo::DISPATCH_PTR
9513 : AMDGPUFunctionArgInfo::QUEUE_PTR;
9514 return getPreloadedValue(DAG, *MFI, VT, RegID);
9515 }
9516 case Intrinsic::amdgcn_implicitarg_ptr: {
9517 if (MFI->isEntryFunction())
9518 return getImplicitArgPtr(DAG, DL);
9519 return getPreloadedValue(DAG, *MFI, VT,
9520 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
9521 }
9522 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9523 if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
9524 // This only makes sense to call in a kernel, so just lower to null.
9525 return DAG.getConstant(0, DL, VT);
9526 }
9527
9528 return getPreloadedValue(DAG, *MFI, VT,
9529 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
9530 }
9531 case Intrinsic::amdgcn_dispatch_id: {
9532 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
9533 }
9534 case Intrinsic::amdgcn_rcp:
9535 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
9536 case Intrinsic::amdgcn_rsq:
9537 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9538 case Intrinsic::amdgcn_rsq_legacy:
9539 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9540 return emitRemovedIntrinsicError(DAG, DL, VT);
9541 return SDValue();
9542 case Intrinsic::amdgcn_rcp_legacy:
9543 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9544 return emitRemovedIntrinsicError(DAG, DL, VT);
9545 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
9546 case Intrinsic::amdgcn_rsq_clamp: {
9547 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9548 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
9549
9550 Type *Type = VT.getTypeForEVT(*DAG.getContext());
9551 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
9552 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
9553
9554 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9555 SDValue Tmp =
9556 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
9557 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
9558 DAG.getConstantFP(Min, DL, VT));
9559 }
9560 case Intrinsic::r600_read_ngroups_x:
9561 if (Subtarget->isAmdHsaOS())
9562 return emitNonHSAIntrinsicError(DAG, DL, VT);
9563
9564 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9565 SI::KernelInputOffsets::NGROUPS_X, Align(4),
9566 false);
9567 case Intrinsic::r600_read_ngroups_y:
9568 if (Subtarget->isAmdHsaOS())
9569 return emitNonHSAIntrinsicError(DAG, DL, VT);
9570
9571 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9572 SI::KernelInputOffsets::NGROUPS_Y, Align(4),
9573 false);
9574 case Intrinsic::r600_read_ngroups_z:
9575 if (Subtarget->isAmdHsaOS())
9576 return emitNonHSAIntrinsicError(DAG, DL, VT);
9577
9578 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9579 SI::KernelInputOffsets::NGROUPS_Z, Align(4),
9580 false);
9581 case Intrinsic::r600_read_local_size_x:
9582 if (Subtarget->isAmdHsaOS())
9583 return emitNonHSAIntrinsicError(DAG, DL, VT);
9584
9585 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9586 SI::KernelInputOffsets::LOCAL_SIZE_X);
9587 case Intrinsic::r600_read_local_size_y:
9588 if (Subtarget->isAmdHsaOS())
9589 return emitNonHSAIntrinsicError(DAG, DL, VT);
9590
9591 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9592 SI::KernelInputOffsets::LOCAL_SIZE_Y);
9593 case Intrinsic::r600_read_local_size_z:
9594 if (Subtarget->isAmdHsaOS())
9595 return emitNonHSAIntrinsicError(DAG, DL, VT);
9596
9597 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9598 SI::KernelInputOffsets::LOCAL_SIZE_Z);
9599 case Intrinsic::amdgcn_workgroup_id_x:
9600 return getPreloadedValue(DAG, *MFI, VT,
9601 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
9602 case Intrinsic::amdgcn_workgroup_id_y:
9603 return getPreloadedValue(DAG, *MFI, VT,
9604 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
9605 case Intrinsic::amdgcn_workgroup_id_z:
9606 return getPreloadedValue(DAG, *MFI, VT,
9607 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
9608 case Intrinsic::amdgcn_wave_id:
9609 return lowerWaveID(DAG, Op);
9610 case Intrinsic::amdgcn_lds_kernel_id: {
9611 if (MFI->isEntryFunction())
9612 return getLDSKernelId(DAG, DL);
9613 return getPreloadedValue(DAG, *MFI, VT,
9614 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
9615 }
9616 case Intrinsic::amdgcn_workitem_id_x:
9617 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
9618 case Intrinsic::amdgcn_workitem_id_y:
9619 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
9620 case Intrinsic::amdgcn_workitem_id_z:
9621 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
9622 case Intrinsic::amdgcn_wavefrontsize:
9623 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
9624 SDLoc(Op), MVT::i32);
9625 case Intrinsic::amdgcn_s_buffer_load: {
9626 unsigned CPol = Op.getConstantOperandVal(3);
9627 // s_buffer_load, because of how it's optimized, can't be volatile
9628 // so reject ones with the volatile bit set.
9629 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
9630 ? AMDGPU::CPol::ALL
9631 : AMDGPU::CPol::ALL_pregfx12))
9632 return Op;
9633 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
9634 Op.getOperand(3), DAG);
9635 }
9636 case Intrinsic::amdgcn_fdiv_fast:
9637 return lowerFDIV_FAST(Op, DAG);
9638 case Intrinsic::amdgcn_sin:
9639 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
9640
9641 case Intrinsic::amdgcn_cos:
9642 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
9643
9644 case Intrinsic::amdgcn_mul_u24:
9645 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
9646 Op.getOperand(2));
9647 case Intrinsic::amdgcn_mul_i24:
9648 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
9649 Op.getOperand(2));
9650
9651 case Intrinsic::amdgcn_log_clamp: {
9652 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9653 return SDValue();
9654
9655 return emitRemovedIntrinsicError(DAG, DL, VT);
9656 }
9657 case Intrinsic::amdgcn_fract:
9658 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
9659
9660 case Intrinsic::amdgcn_class:
9661 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
9662 Op.getOperand(2));
9663 case Intrinsic::amdgcn_div_fmas:
9664 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
9665 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9666
9667 case Intrinsic::amdgcn_div_fixup:
9668 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
9669 Op.getOperand(2), Op.getOperand(3));
9670
9671 case Intrinsic::amdgcn_div_scale: {
9672 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
9673
9674 // Translate to the operands expected by the machine instruction. The
9675 // first parameter must be the same as the first instruction.
9676 SDValue Numerator = Op.getOperand(1);
9677 SDValue Denominator = Op.getOperand(2);
9678
9679 // Note this order is opposite of the machine instruction's operations,
9680 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
9681 // intrinsic has the numerator as the first operand to match a normal
9682 // division operation.
9683
9684 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
9685
9686 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
9687 Denominator, Numerator);
9688 }
9689 case Intrinsic::amdgcn_icmp: {
9690 // There is a Pat that handles this variant, so return it as-is.
9691 if (Op.getOperand(1).getValueType() == MVT::i1 &&
9692 Op.getConstantOperandVal(2) == 0 &&
9693 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
9694 return Op;
9695 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
9696 }
9697 case Intrinsic::amdgcn_fcmp: {
9698 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
9699 }
9700 case Intrinsic::amdgcn_ballot:
9701 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
9702 case Intrinsic::amdgcn_fmed3:
9703 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
9704 Op.getOperand(2), Op.getOperand(3));
9705 case Intrinsic::amdgcn_fdot2:
9706 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
9707 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9708 case Intrinsic::amdgcn_fmul_legacy:
9709 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
9710 Op.getOperand(2));
9711 case Intrinsic::amdgcn_sffbh:
9712 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
9713 case Intrinsic::amdgcn_sbfe:
9714 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
9715 Op.getOperand(2), Op.getOperand(3));
9716 case Intrinsic::amdgcn_ubfe:
9717 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
9718 Op.getOperand(2), Op.getOperand(3));
9719 case Intrinsic::amdgcn_cvt_pkrtz:
9720 case Intrinsic::amdgcn_cvt_pknorm_i16:
9721 case Intrinsic::amdgcn_cvt_pknorm_u16:
9722 case Intrinsic::amdgcn_cvt_pk_i16:
9723 case Intrinsic::amdgcn_cvt_pk_u16: {
9724 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
9725 EVT VT = Op.getValueType();
9726 unsigned Opcode;
9727
9728 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
9729 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
9730 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
9731 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
9732 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
9733 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
9734 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
9735 Opcode = AMDGPUISD::CVT_PK_I16_I32;
9736 else
9737 Opcode = AMDGPUISD::CVT_PK_U16_U32;
9738
9739 if (isTypeLegal(VT))
9740 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
9741
9742 SDValue Node =
9743 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
9744 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
9745 }
9746 case Intrinsic::amdgcn_fmad_ftz:
9747 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
9748 Op.getOperand(2), Op.getOperand(3));
9749
9750 case Intrinsic::amdgcn_if_break:
9751 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
9752 Op->getOperand(1), Op->getOperand(2)),
9753 0);
9754
9755 case Intrinsic::amdgcn_groupstaticsize: {
9756 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
9757 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
9758 return Op;
9759
9760 const Module *M = MF.getFunction().getParent();
9761 const GlobalValue *GV =
9762 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
9763 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
9764 SIInstrInfo::MO_ABS32_LO);
9765 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
9766 }
9767 case Intrinsic::amdgcn_is_shared:
9768 case Intrinsic::amdgcn_is_private: {
9769 SDLoc SL(Op);
9770 SDValue SrcVec =
9771 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
9772 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
9773 DAG.getConstant(1, SL, MVT::i32));
9774
9775 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
9776 ? AMDGPUAS::LOCAL_ADDRESS
9777 : AMDGPUAS::PRIVATE_ADDRESS;
9778 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
9779 Subtarget->hasGloballyAddressableScratch()) {
9780 SDValue FlatScratchBaseHi(
9781 DAG.getMachineNode(
9782 AMDGPU::S_MOV_B32, DL, MVT::i32,
9783 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
9784 0);
9785 // Test bits 63..58 against the aperture address.
9786 return DAG.getSetCC(
9787 SL, MVT::i1,
9788 DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
9789 DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
9790 }
9791
9792 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
9793 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
9794 }
9795 case Intrinsic::amdgcn_perm:
9796 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
9797 Op.getOperand(2), Op.getOperand(3));
9798 case Intrinsic::amdgcn_reloc_constant: {
9799 Module *M = MF.getFunction().getParent();
9800 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
9801 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
9802 auto *RelocSymbol = cast<GlobalVariable>(
9803 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
9804 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
9805 SIInstrInfo::MO_ABS32_LO);
9806 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
9807 }
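// SWMMAC intrinsics: the sparsity index key must have the width expected by
// instruction selection. If it already does, leave the node untouched;
// otherwise any-extend or truncate it and rebuild the intrinsic.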
9808 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
9809 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
9810 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
9811 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
9812 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
9813 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
9814 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
9815 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
9816 if (Op.getOperand(4).getValueType() == MVT::i32)
9817 return SDValue();
9818
9819 SDLoc SL(Op);
9820 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
9821 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
9822 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9823 Op.getOperand(3), IndexKeyi32);
9824 }
9825 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
9826 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
9827 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
9828 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
9829 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
9830 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
9831 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
9832 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
9833 if (Op.getOperand(4).getValueType() == MVT::i64)
9834 return SDValue();
9835
9836 SDLoc SL(Op);
9837 auto IndexKeyi64 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
9838 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
9839 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9840 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
9841 Op.getOperand(6)});
9842 }
9843 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
9844 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
9845 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
9846 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
9847 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
9848 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
9849 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
9850 ? MVT::i64
9851 : MVT::i32;
9852 if (Op.getOperand(6).getValueType() == IndexKeyTy)
9853 return SDValue();
9854
9855 SDLoc SL(Op);
9856 auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
9857 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
9858 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9859 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
9860 IndexKey, Op.getOperand(7),
9861 Op.getOperand(8)}); // No clamp operand
9862 }
9863 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
9864 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
9865 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
9866 if (Op.getOperand(6).getValueType() == MVT::i32)
9867 return SDValue();
9868
9869 SDLoc SL(Op);
9870 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
9871 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
9872 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9873 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
9874 IndexKeyi32, Op.getOperand(7)});
9875 }
9876 case Intrinsic::amdgcn_addrspacecast_nonnull:
9877 return lowerADDRSPACECAST(Op, DAG);
9878 case Intrinsic::amdgcn_readlane:
9879 case Intrinsic::amdgcn_readfirstlane:
9880 case Intrinsic::amdgcn_writelane:
9881 case Intrinsic::amdgcn_permlane16:
9882 case Intrinsic::amdgcn_permlanex16:
9883 case Intrinsic::amdgcn_permlane64:
9884 case Intrinsic::amdgcn_set_inactive:
9885 case Intrinsic::amdgcn_set_inactive_chain_arg:
9886 case Intrinsic::amdgcn_mov_dpp8:
9887 case Intrinsic::amdgcn_update_dpp:
9888 return lowerLaneOp(*this, Op.getNode(), DAG);
9889 case Intrinsic::amdgcn_dead: {
9890 SmallVector<SDValue, 8> Poisons;
9891 for (const EVT ValTy : Op.getNode()->values())
9892 Poisons.push_back(DAG.getPOISON(ValTy));
9893 return DAG.getMergeValues(Poisons, SDLoc(Op));
9894 }
9895 default:
9896 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9897 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
9898 return lowerImage(Op, ImageDimIntr, DAG, false);
9899
9900 return Op;
9901 }
9902}
9903
9904 // On targets that do not support a constant in the soffset field, turn a
9905 // zero soffset into SGPR_NULL to avoid generating an extra s_mov with zero.
9906 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
9907 const GCNSubtarget *Subtarget) {
9908 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
9909 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
9910 return SOffset;
9911}
9912
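// Lower a raw buffer atomic intrinsic to the common buffer-atomic node layout.
// Raw variants carry no vindex operand, so a zero vindex is supplied and the
// idxen flag is cleared.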
9913SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
9914 SelectionDAG &DAG,
9915 unsigned NewOpcode) const {
9916 SDLoc DL(Op);
9917
9918 SDValue VData = Op.getOperand(2);
9919 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9920 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9921 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9922 SDValue Ops[] = {
9923 Op.getOperand(0), // Chain
9924 VData, // vdata
9925 Rsrc, // rsrc
9926 DAG.getConstant(0, DL, MVT::i32), // vindex
9927 VOffset, // voffset
9928 SOffset, // soffset
9929 Offset, // offset
9930 Op.getOperand(6), // cachepolicy
9931 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9932 };
9933
9934 auto *M = cast<MemSDNode>(Op);
9935
9936 EVT MemVT = VData.getValueType();
9937 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
9938 M->getMemOperand());
9939}
9940
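// Struct buffer atomics are lowered the same way, except that they take an
// explicit vindex operand and therefore set the idxen flag.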
9941SDValue
9942SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
9943 unsigned NewOpcode) const {
9944 SDLoc DL(Op);
9945
9946 SDValue VData = Op.getOperand(2);
9947 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9948 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
9949 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9950 SDValue Ops[] = {
9951 Op.getOperand(0), // Chain
9952 VData, // vdata
9953 Rsrc, // rsrc
9954 Op.getOperand(4), // vindex
9955 VOffset, // voffset
9956 SOffset, // soffset
9957 Offset, // offset
9958 Op.getOperand(7), // cachepolicy
9959 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9960 };
9961
9962 auto *M = cast<MemSDNode>(Op);
9963
9964 EVT MemVT = VData.getValueType();
9965 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
9966 M->getMemOperand());
9967}
9968
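// Lower target intrinsics that read memory and therefore carry a chain, such
// as buffer, tbuffer and BVH-intersection loads.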
9969SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
9970 SelectionDAG &DAG) const {
9971 unsigned IntrID = Op.getConstantOperandVal(1);
9972 SDLoc DL(Op);
9973
9974 switch (IntrID) {
9975 case Intrinsic::amdgcn_ds_ordered_add:
9976 case Intrinsic::amdgcn_ds_ordered_swap: {
9977 MemSDNode *M = cast<MemSDNode>(Op);
9978 SDValue Chain = M->getOperand(0);
9979 SDValue M0 = M->getOperand(2);
9980 SDValue Value = M->getOperand(3);
9981 unsigned IndexOperand = M->getConstantOperandVal(7);
9982 unsigned WaveRelease = M->getConstantOperandVal(8);
9983 unsigned WaveDone = M->getConstantOperandVal(9);
9984
9985 unsigned OrderedCountIndex = IndexOperand & 0x3f;
9986 IndexOperand &= ~0x3f;
9987 unsigned CountDw = 0;
9988
9989 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
9990 CountDw = (IndexOperand >> 24) & 0xf;
9991 IndexOperand &= ~(0xf << 24);
9992
9993 if (CountDw < 1 || CountDw > 4) {
9994 const Function &Fn = DAG.getMachineFunction().getFunction();
9995 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9996 Fn, "ds_ordered_count: dword count must be between 1 and 4",
9997 DL.getDebugLoc()));
9998 CountDw = 1;
9999 }
10000 }
10001
10002 if (IndexOperand) {
10003 const Function &Fn = DAG.getMachineFunction().getFunction();
10004 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10005 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
10006 }
10007
10008 if (WaveDone && !WaveRelease) {
10009 // TODO: Move this to IR verifier
10010 const Function &Fn = DAG.getMachineFunction().getFunction();
10011 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10012 Fn, "ds_ordered_count: wave_done requires wave_release",
10013 DL.getDebugLoc()));
10014 }
10015
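// Pack the operands into the 16-bit DS offset field: offset0 holds the
// ordered-count index scaled to a byte offset; offset1 packs wave_release,
// wave_done, the instruction kind, the shader type (pre-GFX11) and the dword
// count minus one (GFX10+).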
10016 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10017 unsigned ShaderType =
10018 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
10019 unsigned Offset0 = OrderedCountIndex << 2;
10020 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10021
10022 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
10023 Offset1 |= (CountDw - 1) << 6;
10024
10025 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
10026 Offset1 |= ShaderType << 2;
10027
10028 unsigned Offset = Offset0 | (Offset1 << 8);
10029
10030 SDValue Ops[] = {
10031 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
10032 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
10033 };
10034 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
10035 M->getVTList(), Ops, M->getMemoryVT(),
10036 M->getMemOperand());
10037 }
10038 case Intrinsic::amdgcn_raw_buffer_load:
10039 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10040 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10041 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10042 case Intrinsic::amdgcn_raw_buffer_load_format:
10043 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10044 const bool IsFormat =
10045 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10046 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10047
10048 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10049 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10050 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10051 SDValue Ops[] = {
10052 Op.getOperand(0), // Chain
10053 Rsrc, // rsrc
10054 DAG.getConstant(0, DL, MVT::i32), // vindex
10055 VOffset, // voffset
10056 SOffset, // soffset
10057 Offset, // offset
10058 Op.getOperand(5), // cachepolicy, swizzled buffer
10059 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10060 };
10061
10062 auto *M = cast<MemSDNode>(Op);
10063 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
10064 }
10065 case Intrinsic::amdgcn_struct_buffer_load:
10066 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10067 case Intrinsic::amdgcn_struct_buffer_load_format:
10068 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10069 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10070 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10071 const bool IsFormat =
10072 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10073 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10074
10075 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10076 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10077 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10078 SDValue Ops[] = {
10079 Op.getOperand(0), // Chain
10080 Rsrc, // rsrc
10081 Op.getOperand(3), // vindex
10082 VOffset, // voffset
10083 SOffset, // soffset
10084 Offset, // offset
10085 Op.getOperand(6), // cachepolicy, swizzled buffer
10086 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10087 };
10088
10089 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
10090 }
10091 case Intrinsic::amdgcn_raw_tbuffer_load:
10092 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10093 MemSDNode *M = cast<MemSDNode>(Op);
10094 EVT LoadVT = Op.getValueType();
10095 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10096 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10097 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10098
10099 SDValue Ops[] = {
10100 Op.getOperand(0), // Chain
10101 Rsrc, // rsrc
10102 DAG.getConstant(0, DL, MVT::i32), // vindex
10103 VOffset, // voffset
10104 SOffset, // soffset
10105 Offset, // offset
10106 Op.getOperand(5), // format
10107 Op.getOperand(6), // cachepolicy, swizzled buffer
10108 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10109 };
10110
10111 if (LoadVT.getScalarType() == MVT::f16)
10112 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10113 Ops);
10114 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10115 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10116 DAG);
10117 }
10118 case Intrinsic::amdgcn_struct_tbuffer_load:
10119 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10120 MemSDNode *M = cast<MemSDNode>(Op);
10121 EVT LoadVT = Op.getValueType();
10122 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10123 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10124 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10125
10126 SDValue Ops[] = {
10127 Op.getOperand(0), // Chain
10128 Rsrc, // rsrc
10129 Op.getOperand(3), // vindex
10130 VOffset, // voffset
10131 SOffset, // soffset
10132 Offset, // offset
10133 Op.getOperand(6), // format
10134 Op.getOperand(7), // cachepolicy, swizzled buffer
10135 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10136 };
10137
10138 if (LoadVT.getScalarType() == MVT::f16)
10139 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10140 Ops);
10141 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10142 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10143 DAG);
10144 }
10145 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10146 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10147 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
10148 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10149 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10150 return lowerStructBufferAtomicIntrin(Op, DAG,
10151 AMDGPUISD::BUFFER_ATOMIC_FADD);
10152 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10153 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10154 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
10155 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10156 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10157 return lowerStructBufferAtomicIntrin(Op, DAG,
10158 AMDGPUISD::BUFFER_ATOMIC_FMIN);
10159 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10160 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10161 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
10162 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10163 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10164 return lowerStructBufferAtomicIntrin(Op, DAG,
10165 AMDGPUISD::BUFFER_ATOMIC_FMAX);
10166 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10167 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10168 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
10169 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10170 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10171 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10172 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10173 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10174 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10175 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10176 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10177 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
10178 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10179 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10180 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
10181 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10182 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10183 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
10184 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10185 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10186 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
10187 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10188 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10189 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10190 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10191 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10192 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10193 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10194 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10195 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10196 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10197 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10198 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10199 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10200 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10201 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10202 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10203 return lowerRawBufferAtomicIntrin(Op, DAG,
10204 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10205 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10206 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10207 return lowerStructBufferAtomicIntrin(Op, DAG,
10208 AMDGPUISD::BUFFER_ATOMIC_SWAP);
10209 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10210 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10211 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10212 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10213 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10214 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10215 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10216 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10217 return lowerStructBufferAtomicIntrin(Op, DAG,
10218 AMDGPUISD::BUFFER_ATOMIC_SMIN);
10219 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10220 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10221 return lowerStructBufferAtomicIntrin(Op, DAG,
10222 AMDGPUISD::BUFFER_ATOMIC_UMIN);
10223 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10224 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10225 return lowerStructBufferAtomicIntrin(Op, DAG,
10226 AMDGPUISD::BUFFER_ATOMIC_SMAX);
10227 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10228 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10229 return lowerStructBufferAtomicIntrin(Op, DAG,
10230 AMDGPUISD::BUFFER_ATOMIC_UMAX);
10231 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10232 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10233 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10234 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10235 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10236 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10237 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10238 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10239 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10240 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10241 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10242 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10243 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10244 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10245 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10246 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10247 return lowerStructBufferAtomicIntrin(Op, DAG,
10248 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10249
10250 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10251 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10252 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
10253 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10254 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10255 SDValue Ops[] = {
10256 Op.getOperand(0), // Chain
10257 Op.getOperand(2), // src
10258 Op.getOperand(3), // cmp
10259 Rsrc, // rsrc
10260 DAG.getConstant(0, DL, MVT::i32), // vindex
10261 VOffset, // voffset
10262 SOffset, // soffset
10263 Offset, // offset
10264 Op.getOperand(7), // cachepolicy
10265 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10266 };
10267 EVT VT = Op.getValueType();
10268 auto *M = cast<MemSDNode>(Op);
10269
10270 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10271 Op->getVTList(), Ops, VT,
10272 M->getMemOperand());
10273 }
10274 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10275 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10276 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
10277 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
10278 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
10279 SDValue Ops[] = {
10280 Op.getOperand(0), // Chain
10281 Op.getOperand(2), // src
10282 Op.getOperand(3), // cmp
10283 Rsrc, // rsrc
10284 Op.getOperand(5), // vindex
10285 VOffset, // voffset
10286 SOffset, // soffset
10287 Offset, // offset
10288 Op.getOperand(8), // cachepolicy
10289 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10290 };
10291 EVT VT = Op.getValueType();
10292 auto *M = cast<MemSDNode>(Op);
10293
10294 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10295 Op->getVTList(), Ops, VT,
10296 M->getMemOperand());
10297 }
10298 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10299 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10300 MemSDNode *M = cast<MemSDNode>(Op);
10301 SDValue NodePtr = M->getOperand(2);
10302 SDValue RayExtent = M->getOperand(3);
10303 SDValue InstanceMask = M->getOperand(4);
10304 SDValue RayOrigin = M->getOperand(5);
10305 SDValue RayDir = M->getOperand(6);
10306 SDValue Offsets = M->getOperand(7);
10307 SDValue TDescr = M->getOperand(8);
10308
10309 assert(NodePtr.getValueType() == MVT::i64);
10310 assert(RayDir.getValueType() == MVT::v3f32);
10311
10312 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10313 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10314 return SDValue();
10315 }
10316
10317 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10318 const unsigned NumVDataDwords = 10;
10319 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10320 int Opcode = AMDGPU::getMIMGOpcode(
10321 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10322 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10323 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10324 assert(Opcode != -1);
10325
10327 Ops.push_back(NodePtr);
10328 Ops.push_back(DAG.getBuildVector(
10329 MVT::v2i32, DL,
10330 {DAG.getBitcast(MVT::i32, RayExtent),
10331 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10332 Ops.push_back(RayOrigin);
10333 Ops.push_back(RayDir);
10334 Ops.push_back(Offsets);
10335 Ops.push_back(TDescr);
10336 Ops.push_back(M->getChain());
10337
10338 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10339 MachineMemOperand *MemRef = M->getMemOperand();
10340 DAG.setNodeMemRefs(NewNode, {MemRef});
10341 return SDValue(NewNode, 0);
10342 }
10343 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10344 MemSDNode *M = cast<MemSDNode>(Op);
10345 SDValue NodePtr = M->getOperand(2);
10346 SDValue RayExtent = M->getOperand(3);
10347 SDValue RayOrigin = M->getOperand(4);
10348 SDValue RayDir = M->getOperand(5);
10349 SDValue RayInvDir = M->getOperand(6);
10350 SDValue TDescr = M->getOperand(7);
10351
10352 assert(NodePtr.getValueType() == MVT::i32 ||
10353 NodePtr.getValueType() == MVT::i64);
10354 assert(RayDir.getValueType() == MVT::v3f16 ||
10355 RayDir.getValueType() == MVT::v3f32);
10356
10357 if (!Subtarget->hasGFX10_AEncoding()) {
10358 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10359 return SDValue();
10360 }
10361
10362 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
10363 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
10364 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10365 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
10366 const bool Is64 = NodePtr.getValueType() == MVT::i64;
10367 const unsigned NumVDataDwords = 4;
10368 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10369 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10370 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10371 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
10372 IsGFX12Plus;
10373 const unsigned BaseOpcodes[2][2] = {
10374 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10375 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10376 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10377 int Opcode;
10378 if (UseNSA) {
10379 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10380 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10381 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10382 : AMDGPU::MIMGEncGfx10NSA,
10383 NumVDataDwords, NumVAddrDwords);
10384 } else {
10385 assert(!IsGFX12Plus);
10386 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10387 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10388 : AMDGPU::MIMGEncGfx10Default,
10389 NumVDataDwords, NumVAddrDwords);
10390 }
10391 assert(Opcode != -1);
10392
10393 SmallVector<SDValue, 16> Ops;
10394
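// packLanes appends a three-element vector operand as dwords: 32-bit lanes
// are pushed directly, while 16-bit lanes are packed in pairs, folding into
// the previously pushed element when the operand does not start on a dword
// boundary.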
10395 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
10396 SmallVector<SDValue, 3> Lanes;
10397 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
10398 if (Lanes[0].getValueSizeInBits() == 32) {
10399 for (unsigned I = 0; I < 3; ++I)
10400 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
10401 } else {
10402 if (IsAligned) {
10403 Ops.push_back(DAG.getBitcast(
10404 MVT::i32,
10405 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
10406 Ops.push_back(Lanes[2]);
10407 } else {
10408 SDValue Elt0 = Ops.pop_back_val();
10409 Ops.push_back(DAG.getBitcast(
10410 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
10411 Ops.push_back(DAG.getBitcast(
10412 MVT::i32,
10413 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
10414 }
10415 }
10416 };
10417
10418 if (UseNSA && IsGFX11Plus) {
10419 Ops.push_back(NodePtr);
10420 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10421 Ops.push_back(RayOrigin);
10422 if (IsA16) {
10423 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
10424 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
10425 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
10426 for (unsigned I = 0; I < 3; ++I) {
10427 MergedLanes.push_back(DAG.getBitcast(
10428 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
10429 {DirLanes[I], InvDirLanes[I]})));
10430 }
10431 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
10432 } else {
10433 Ops.push_back(RayDir);
10434 Ops.push_back(RayInvDir);
10435 }
10436 } else {
10437 if (Is64)
10438 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
10439 2);
10440 else
10441 Ops.push_back(NodePtr);
10442
10443 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10444 packLanes(RayOrigin, true);
10445 packLanes(RayDir, true);
10446 packLanes(RayInvDir, false);
10447 }
10448
10449 if (!UseNSA) {
10450 // Build a single vector containing all the operands prepared so far.
10451 if (NumVAddrDwords > 12) {
10452 SDValue Undef = DAG.getPOISON(MVT::i32);
10453 Ops.append(16 - Ops.size(), Undef);
10454 }
10455 assert(Ops.size() >= 8 && Ops.size() <= 12);
10456 SDValue MergedOps =
10457 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
10458 Ops.clear();
10459 Ops.push_back(MergedOps);
10460 }
10461
10462 Ops.push_back(TDescr);
10463 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
10464 Ops.push_back(M->getChain());
10465
10466 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10467 MachineMemOperand *MemRef = M->getMemOperand();
10468 DAG.setNodeMemRefs(NewNode, {MemRef});
10469 return SDValue(NewNode, 0);
10470 }
10471 case Intrinsic::amdgcn_global_atomic_fmin_num:
10472 case Intrinsic::amdgcn_global_atomic_fmax_num:
10473 case Intrinsic::amdgcn_flat_atomic_fmin_num:
10474 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10475 MemSDNode *M = cast<MemSDNode>(Op);
10476 SDValue Ops[] = {
10477 M->getOperand(0), // Chain
10478 M->getOperand(2), // Ptr
10479 M->getOperand(3) // Value
10480 };
10481 unsigned Opcode = 0;
10482 switch (IntrID) {
10483 case Intrinsic::amdgcn_global_atomic_fmin_num:
10484 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10485 Opcode = ISD::ATOMIC_LOAD_FMIN;
10486 break;
10487 }
10488 case Intrinsic::amdgcn_global_atomic_fmax_num:
10489 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10490 Opcode = ISD::ATOMIC_LOAD_FMAX;
10491 break;
10492 }
10493 default:
10494 llvm_unreachable("unhandled atomic opcode");
10495 }
10496 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
10497 Ops, M->getMemOperand());
10498 }
10499 case Intrinsic::amdgcn_s_get_barrier_state:
10500 case Intrinsic::amdgcn_s_get_named_barrier_state: {
10501 SDValue Chain = Op->getOperand(0);
10502 SmallVector<SDValue, 2> Ops;
10503 unsigned Opc;
10504
10505 if (isa<ConstantSDNode>(Op->getOperand(2))) {
10506 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
10507 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10508 BarID = (BarID >> 4) & 0x3F;
10509 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10510 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10511 Ops.push_back(K);
10512 Ops.push_back(Chain);
10513 } else {
10514 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10515 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
10516 SDValue M0Val;
10517 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
10518 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10519 M0Val = SDValue(
10520 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10521 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10522 0);
10523 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10524 } else
10525 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
10526 }
10527
10528 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10529 return SDValue(NewMI, 0);
10530 }
10531 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
10532 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
10533 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
10534 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
10535 SDValue Chain = Op->getOperand(0);
10536 SDValue Ptr = Op->getOperand(2);
10537 EVT VT = Op->getValueType(0);
10538 return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
10539 Chain, Ptr, MII->getMemOperand());
10540 }
10541 default:
10542
10543 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10544 AMDGPU::getImageDimIntrinsicInfo(IntrID))
10545 return lowerImage(Op, ImageDimIntr, DAG, true);
10546
10547 return SDValue();
10548 }
10549}
10550
10551// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
10552// dwordx4 if on SI and handle TFE loads.
10553SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
10554 SDVTList VTList,
10555 ArrayRef<SDValue> Ops, EVT MemVT,
10556 MachineMemOperand *MMO,
10557 SelectionDAG &DAG) const {
10558 LLVMContext &C = *DAG.getContext();
10559 MachineFunction &MF = DAG.getMachineFunction();
10560 EVT VT = VTList.VTs[0];
10561
10562 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
10563 bool IsTFE = VTList.NumVTs == 3;
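// TFE loads return an extra status dword: widen the result to
// NumValueDWords + 1 i32s, then split the loaded value and the status apart.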
10564 if (IsTFE) {
10565 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
10566 unsigned NumOpDWords = NumValueDWords + 1;
10567 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
10568 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
10569 MachineMemOperand *OpDWordsMMO =
10570 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
10571 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
10572 OpDWordsVT, OpDWordsMMO, DAG);
10573 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10574 DAG.getVectorIdxConstant(NumValueDWords, DL));
10575 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
10576 SDValue ValueDWords =
10577 NumValueDWords == 1
10578 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
10579 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
10580 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
10581 ZeroIdx);
10582 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
10583 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10584 }
10585
10586 if (!Subtarget->hasDwordx3LoadStores() &&
10587 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10588 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
10589 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
10590 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
10591 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
10592 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
10593 WidenedMemVT, WidenedMMO);
10594 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
10595 DAG.getVectorIdxConstant(0, DL));
10596 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
10597 }
10598
10599 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
10600}
10601
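// Prepare 16-bit (D16) store data for buffer and image stores: unpack it on
// subtargets with unpacked D16 memory instructions, work around the gfx8.1
// d16 image-store register-counting bug, and widen illegal 3-element vectors.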
10602SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
10603 bool ImageStore) const {
10604 EVT StoreVT = VData.getValueType();
10605
10606 // No change for f16 and legal vector D16 types.
10607 if (!StoreVT.isVector())
10608 return VData;
10609
10610 SDLoc DL(VData);
10611 unsigned NumElements = StoreVT.getVectorNumElements();
10612
10613 if (Subtarget->hasUnpackedD16VMem()) {
10614 // We need to unpack the packed data to store.
10615 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10616 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10617
10618 EVT EquivStoreVT =
10619 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
10620 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
10621 return DAG.UnrollVectorOp(ZExt.getNode());
10622 }
10623
10624 // The sq block of gfx8.1 does not estimate register use correctly for d16
10625 // image store instructions. The data operand is computed as if it were not a
10626 // d16 image instruction.
10627 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
10628 // Bitcast to i16
10629 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10630 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10631
10632 // Decompose into scalars
10633 SmallVector<SDValue, 4> Elts;
10634 DAG.ExtractVectorElements(IntVData, Elts);
10635
10636 // Group pairs of i16 into v2i16 and bitcast to i32
10637 SmallVector<SDValue, 4> PackedElts;
10638 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
10639 SDValue Pair =
10640 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
10641 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10642 PackedElts.push_back(IntPair);
10643 }
10644 if ((NumElements % 2) == 1) {
10645 // Handle v3i16
10646 unsigned I = Elts.size() / 2;
10647 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
10648 {Elts[I * 2], DAG.getPOISON(MVT::i16)});
10649 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10650 PackedElts.push_back(IntPair);
10651 }
10652
10653 // Pad with poison values.
10654 PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));
10655
10656 // Build final vector
10657 EVT VecVT =
10658 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
10659 return DAG.getBuildVector(VecVT, DL, PackedElts);
10660 }
10661
10662 if (NumElements == 3) {
10663 EVT IntStoreVT =
10664 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
10665 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10666
10667 EVT WidenedStoreVT = EVT::getVectorVT(
10668 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
10669 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
10670 WidenedStoreVT.getStoreSizeInBits());
10671 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
10672 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
10673 }
10674
10675 assert(isTypeLegal(StoreVT));
10676 return VData;
10677}
10678
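// Lower target intrinsics that are used only for their side effects, such as
// exports, buffer/tbuffer stores, loads to LDS, barriers and prefetches.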
10679SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
10680 SelectionDAG &DAG) const {
10681 SDLoc DL(Op);
10682 SDValue Chain = Op.getOperand(0);
10683 unsigned IntrinsicID = Op.getConstantOperandVal(1);
10684 MachineFunction &MF = DAG.getMachineFunction();
10685
10686 switch (IntrinsicID) {
10687 case Intrinsic::amdgcn_exp_compr: {
10688 if (!Subtarget->hasCompressedExport()) {
10689 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10691 "intrinsic not supported on subtarget", DL.getDebugLoc()));
10692 }
10693 SDValue Src0 = Op.getOperand(4);
10694 SDValue Src1 = Op.getOperand(5);
10695 // Hack around illegal type on SI by directly selecting it.
10696 if (isTypeLegal(Src0.getValueType()))
10697 return SDValue();
10698
10699 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
10700 SDValue Undef = DAG.getPOISON(MVT::f32);
10701 const SDValue Ops[] = {
10702 Op.getOperand(2), // tgt
10703 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
10704 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
10705 Undef, // src2
10706 Undef, // src3
10707 Op.getOperand(7), // vm
10708 DAG.getTargetConstant(1, DL, MVT::i1), // compr
10709 Op.getOperand(3), // en
10710 Op.getOperand(0) // Chain
10711 };
10712
10713 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
10714 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
10715 }
10716
10717 case Intrinsic::amdgcn_struct_tbuffer_store:
10718 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
10719 SDValue VData = Op.getOperand(2);
10720 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
10721 if (IsD16)
10722 VData = handleD16VData(VData, DAG);
10723 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10724 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10725 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10726 SDValue Ops[] = {
10727 Chain,
10728 VData, // vdata
10729 Rsrc, // rsrc
10730 Op.getOperand(4), // vindex
10731 VOffset, // voffset
10732 SOffset, // soffset
10733 Offset, // offset
10734 Op.getOperand(7), // format
10735 Op.getOperand(8), // cachepolicy, swizzled buffer
10736 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10737 };
10738 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
10739 : AMDGPUISD::TBUFFER_STORE_FORMAT;
10740 MemSDNode *M = cast<MemSDNode>(Op);
10741 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
10742 M->getMemoryVT(), M->getMemOperand());
10743 }
10744
10745 case Intrinsic::amdgcn_raw_tbuffer_store:
10746 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
10747 SDValue VData = Op.getOperand(2);
10748 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
10749 if (IsD16)
10750 VData = handleD16VData(VData, DAG);
10751 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10752 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10753 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10754 SDValue Ops[] = {
10755 Chain,
10756 VData, // vdata
10757 Rsrc, // rsrc
10758 DAG.getConstant(0, DL, MVT::i32), // vindex
10759 VOffset, // voffset
10760 SOffset, // soffset
10761 Offset, // offset
10762 Op.getOperand(6), // format
10763 Op.getOperand(7), // cachepolicy, swizzled buffer
10764 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10765 };
10766 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
10767 : AMDGPUISD::TBUFFER_STORE_FORMAT;
10768 MemSDNode *M = cast<MemSDNode>(Op);
10769 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
10770 M->getMemoryVT(), M->getMemOperand());
10771 }
10772
10773 case Intrinsic::amdgcn_raw_buffer_store:
10774 case Intrinsic::amdgcn_raw_ptr_buffer_store:
10775 case Intrinsic::amdgcn_raw_buffer_store_format:
10776 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
10777 const bool IsFormat =
10778 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
10779 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
10780
10781 SDValue VData = Op.getOperand(2);
10782 EVT VDataVT = VData.getValueType();
10783 EVT EltType = VDataVT.getScalarType();
10784 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
10785 if (IsD16) {
10786 VData = handleD16VData(VData, DAG);
10787 VDataVT = VData.getValueType();
10788 }
10789
10790 if (!isTypeLegal(VDataVT)) {
10791 VData =
10792 DAG.getNode(ISD::BITCAST, DL,
10793 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
10794 }
10795
10796 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10797 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10798 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10799 SDValue Ops[] = {
10800 Chain,
10801 VData,
10802 Rsrc,
10803 DAG.getConstant(0, DL, MVT::i32), // vindex
10804 VOffset, // voffset
10805 SOffset, // soffset
10806 Offset, // offset
10807 Op.getOperand(6), // cachepolicy, swizzled buffer
10808 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10809 };
10810 unsigned Opc =
10811 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
10812 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
10813 MemSDNode *M = cast<MemSDNode>(Op);
10814
10815 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
10816 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
10817 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
10818
10819 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
10820 M->getMemoryVT(), M->getMemOperand());
10821 }
10822
10823 case Intrinsic::amdgcn_struct_buffer_store:
10824 case Intrinsic::amdgcn_struct_ptr_buffer_store:
10825 case Intrinsic::amdgcn_struct_buffer_store_format:
10826 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
10827 const bool IsFormat =
10828 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
10829 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
10830
10831 SDValue VData = Op.getOperand(2);
10832 EVT VDataVT = VData.getValueType();
10833 EVT EltType = VDataVT.getScalarType();
10834 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
10835
10836 if (IsD16) {
10837 VData = handleD16VData(VData, DAG);
10838 VDataVT = VData.getValueType();
10839 }
10840
10841 if (!isTypeLegal(VDataVT)) {
10842 VData =
10843 DAG.getNode(ISD::BITCAST, DL,
10844 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
10845 }
10846
10847 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10848 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10849 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10850 SDValue Ops[] = {
10851 Chain,
10852 VData,
10853 Rsrc,
10854 Op.getOperand(4), // vindex
10855 VOffset, // voffset
10856 SOffset, // soffset
10857 Offset, // offset
10858 Op.getOperand(7), // cachepolicy, swizzled buffer
10859 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10860 };
10861 unsigned Opc =
10862 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
10863 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
10864 MemSDNode *M = cast<MemSDNode>(Op);
10865
10866 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
10867 EVT VDataType = VData.getValueType().getScalarType();
10868 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
10869 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
10870
10871 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
10872 M->getMemoryVT(), M->getMemOperand());
10873 }
10874 case Intrinsic::amdgcn_raw_buffer_load_lds:
10875 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
10876 case Intrinsic::amdgcn_struct_buffer_load_lds:
10877 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
10878 if (!Subtarget->hasVMemToLDSLoad())
10879 return SDValue();
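// Select the MUBUF *_LDS opcode from the load size and from whether vindex
// and/or voffset operands are present.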
10880 unsigned Opc;
10881 bool HasVIndex =
10882 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
10883 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
10884 unsigned OpOffset = HasVIndex ? 1 : 0;
10885 SDValue VOffset = Op.getOperand(5 + OpOffset);
10886 bool HasVOffset = !isNullConstant(VOffset);
10887 unsigned Size = Op->getConstantOperandVal(4);
10888
10889 switch (Size) {
10890 default:
10891 return SDValue();
10892 case 1:
10893 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
10894 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
10895 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
10896 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
10897 break;
10898 case 2:
10899 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
10900 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
10901 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
10902 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
10903 break;
10904 case 4:
10905 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
10906 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
10907 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
10908 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
10909 break;
10910 case 12:
10911 if (!Subtarget->hasLDSLoadB96_B128())
10912 return SDValue();
10913 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
10914 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
10915 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
10916 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
10917 break;
10918 case 16:
10919 if (!Subtarget->hasLDSLoadB96_B128())
10920 return SDValue();
10921 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
10922 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
10923 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
10924 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
10925 break;
10926 }
10927
10928 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
10929
10930 SmallVector<SDValue, 8> Ops;
10931
10932 if (HasVIndex && HasVOffset)
10933 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
10934 {Op.getOperand(5), // VIndex
10935 VOffset}));
10936 else if (HasVIndex)
10937 Ops.push_back(Op.getOperand(5));
10938 else if (HasVOffset)
10939 Ops.push_back(VOffset);
10940
10941 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10942 Ops.push_back(Rsrc);
10943 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
10944 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
10945 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10946 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
10947 Ops.push_back(DAG.getTargetConstant(
10948 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
10949 DL, MVT::i8)); // cpol
10950 Ops.push_back(DAG.getTargetConstant(
10951 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
10952 ? 1
10953 : 0,
10954 DL, MVT::i8)); // swz
10955 Ops.push_back(M0Val.getValue(0)); // Chain
10956 Ops.push_back(M0Val.getValue(1)); // Glue
10957
10958 auto *M = cast<MemSDNode>(Op);
10959 MachineMemOperand *LoadMMO = M->getMemOperand();
10960 // Don't set the offset value here because the pointer points to the base of
10961 // the buffer.
10962 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
10963
10964 MachinePointerInfo StorePtrI = LoadPtrI;
10965 LoadPtrI.V = PoisonValue::get(
10966 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
10967 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
10968 StorePtrI.V = nullptr;
10969 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
10970 auto F = LoadMMO->getFlags() &
10971 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
10972 LoadMMO =
10973 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
10974 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10975
10976 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
10977 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
10978 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
10979
10980 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
10981 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
10982
10983 return SDValue(Load, 0);
10984 }
10985 // Buffers are handled by LowerBufferFatPointers, and we're going to go
10986 // for "trust me" that the remaining cases are global pointers until
10987 // such time as we can put two mem operands on an intrinsic.
10988 case Intrinsic::amdgcn_load_to_lds:
10989 case Intrinsic::amdgcn_global_load_lds: {
10990 if (!Subtarget->hasVMemToLDSLoad())
10991 return SDValue();
10992
10993 unsigned Opc;
10994 unsigned Size = Op->getConstantOperandVal(4);
10995 switch (Size) {
10996 default:
10997 return SDValue();
10998 case 1:
10999 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11000 break;
11001 case 2:
11002 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11003 break;
11004 case 4:
11005 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11006 break;
11007 case 12:
11008 if (!Subtarget->hasLDSLoadB96_B128())
11009 return SDValue();
11010 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11011 break;
11012 case 16:
11013 if (!Subtarget->hasLDSLoadB96_B128())
11014 return SDValue();
11015 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11016 break;
11017 }
11018
11019 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11020
11021 SmallVector<SDValue, 6> Ops;
11022
11023 SDValue Addr = Op.getOperand(2); // Global ptr
11024 SDValue VOffset;
11025 // Try to split SAddr and VOffset. Global and LDS pointers share the same
11026 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
11027 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
11028 SDValue LHS = Addr.getOperand(0);
11029 SDValue RHS = Addr.getOperand(1);
11030
11031 if (LHS->isDivergent())
11032 std::swap(LHS, RHS);
11033
11034 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
11035 RHS.getOperand(0).getValueType() == MVT::i32) {
11036 // add (i64 sgpr), (zero_extend (i32 vgpr))
11037 Addr = LHS;
11038 VOffset = RHS.getOperand(0);
11039 }
11040 }
11041
11042 Ops.push_back(Addr);
11043 if (!Addr->isDivergent()) {
11044 Opc = AMDGPU::getGlobalSaddrOp(Opc);
11045 if (!VOffset)
11046 VOffset =
11047 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
11048 DAG.getTargetConstant(0, DL, MVT::i32)),
11049 0);
11050 Ops.push_back(VOffset);
11051 }
11052
11053 Ops.push_back(Op.getOperand(5)); // Offset
11054 Ops.push_back(Op.getOperand(6)); // CPol
11055 Ops.push_back(M0Val.getValue(0)); // Chain
11056 Ops.push_back(M0Val.getValue(1)); // Glue
11057
11058 auto *M = cast<MemSDNode>(Op);
11059 MachineMemOperand *LoadMMO = M->getMemOperand();
11060 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11061 LoadPtrI.Offset = Op->getConstantOperandVal(5);
11062 MachinePointerInfo StorePtrI = LoadPtrI;
11063 LoadPtrI.V = PoisonValue::get(
11064 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
11065 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
11066 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
11067 auto F = LoadMMO->getFlags() &
11068 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
11069 LoadMMO =
11070 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
11071 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11072 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11073 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
11074 LoadMMO->getAAInfo());
11075
11076 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11077 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11078
11079 return SDValue(Load, 0);
11080 }
11081 case Intrinsic::amdgcn_end_cf:
11082 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
11083 Op->getOperand(2), Chain),
11084 0);
11085 case Intrinsic::amdgcn_s_barrier_init:
11086 case Intrinsic::amdgcn_s_barrier_signal_var: {
11087 // these two intrinsics have two operands: barrier pointer and member count
11088 SDValue Chain = Op->getOperand(0);
11089 SmallVector<SDValue, 2> Ops;
11090 SDValue BarOp = Op->getOperand(2);
11091 SDValue CntOp = Op->getOperand(3);
11092 SDValue M0Val;
11093 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11094 ? AMDGPU::S_BARRIER_INIT_M0
11095 : AMDGPU::S_BARRIER_SIGNAL_M0;
11096 // extract the BarrierID from bits 4-9 of BarOp
11097 SDValue BarID;
11098 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11099 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11100 BarID =
11101 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
11102 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11103 0);
11104 // Member count should be put into M0[ShAmt:+6]
11105 // Barrier ID should be put into M0[5:0]
11106 M0Val =
11107 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
11108 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11109 0);
11110 constexpr unsigned ShAmt = 16;
11111 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
11112 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
11113
11114 M0Val = SDValue(
11115 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
11116
11117 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11118
11119 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11120 return SDValue(NewMI, 0);
11121 }
11122 case Intrinsic::amdgcn_s_barrier_join: {
11123 // this intrinsic has a single operand: the barrier pointer
11124 SDValue Chain = Op->getOperand(0);
11125 SmallVector<SDValue, 2> Ops;
11126 SDValue BarOp = Op->getOperand(2);
11127 unsigned Opc;
11128
11129 if (isa<ConstantSDNode>(BarOp)) {
11130 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
11131 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11132
11133 // extract the BarrierID from bits 4-9 of the immediate
11134 unsigned BarID = (BarVal >> 4) & 0x3F;
11135 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
11136 Ops.push_back(K);
11137 Ops.push_back(Chain);
11138 } else {
11139 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11140
11141 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
11142 SDValue M0Val;
11143 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11144 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11145 M0Val =
11146 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
11147 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11148 0);
11149 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11150 }
11151
11152 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11153 return SDValue(NewMI, 0);
11154 }
11155 case Intrinsic::amdgcn_s_prefetch_data: {
11156 // For non-global address space preserve the chain and remove the call.
11157 if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
11158 return Op.getOperand(0);
11159 return Op;
11160 }
11161 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11162 SDValue Ops[] = {
11163 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
11164 Op.getOperand(3), // offset
11165 Op.getOperand(4), // length
11166 };
11167
11168 MemSDNode *M = cast<MemSDNode>(Op);
11169 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
11170 Op->getVTList(), Ops, M->getMemoryVT(),
11171 M->getMemOperand());
11172 }
11173 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11174 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11175 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11176 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11177 SDValue Chain = Op->getOperand(0);
11178 SDValue Ptr = Op->getOperand(2);
11179 SDValue Val = Op->getOperand(3);
11180 return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
11181 Ptr, MII->getMemOperand());
11182 }
11183 default: {
11184 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11185 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
11186 return lowerImage(Op, ImageDimIntr, DAG, true);
11187
11188 return Op;
11189 }
11190 }
11191}
11192
11193// Return whether the operation has NoUnsignedWrap property.
11194static bool isNoUnsignedWrap(SDValue Addr) {
11195 return (Addr.getOpcode() == ISD::ADD &&
11196 Addr->getFlags().hasNoUnsignedWrap()) ||
11197 Addr->getOpcode() == ISD::OR;
11198}
11199
11200 bool SITargetLowering::shouldPreservePtrArith(const Function &F,
11201 EVT PtrVT) const {
11202 return UseSelectionDAGPTRADD && PtrVT == MVT::i64;
11203}
11204
11205// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
11206// offset (the offset that is included in bounds checking and swizzling, to be
11207// split between the instruction's voffset and immoffset fields) and soffset
11208// (the offset that is excluded from bounds checking and swizzling, to go in
11209// the instruction's soffset field). This function takes the first kind of
11210// offset and figures out how to split it between voffset and immoffset.
11211std::pair<SDValue, SDValue>
11212SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
11213 SDLoc DL(Offset);
11214 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
11215 SDValue N0 = Offset;
11216 ConstantSDNode *C1 = nullptr;
11217
11218 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
11219 N0 = SDValue();
11220 else if (DAG.isBaseWithConstantOffset(N0)) {
11221 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
11222 // being added, so we can only safely match a 32-bit addition with no
11223 // unsigned overflow.
11224 bool CheckNUW = AMDGPU::isGFX1250(*Subtarget);
11225 if (!CheckNUW || isNoUnsignedWrap(N0)) {
11226 C1 = cast<ConstantSDNode>(N0.getOperand(1));
11227 N0 = N0.getOperand(0);
11228 }
11229 }
11230
11231 if (C1) {
11232 unsigned ImmOffset = C1->getZExtValue();
11233 // If the immediate value is too big for the immoffset field, put only bits
11234 // that would normally fit in the immoffset field. The remaining value that
11235 // is copied/added for the voffset field is a large power of 2, and it
11236 // stands more chance of being CSEd with the copy/add for another similar
11237 // load/store.
11238 // However, do not do that rounding down if that is a negative
11239 // number, as it appears to be illegal to have a negative offset in the
11240 // vgpr, even if adding the immediate offset makes it positive.
11241 unsigned Overflow = ImmOffset & ~MaxImm;
11242 ImmOffset -= Overflow;
11243 if ((int32_t)Overflow < 0) {
11244 Overflow += ImmOffset;
11245 ImmOffset = 0;
11246 }
11247 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
11248 if (Overflow) {
11249 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
11250 if (!N0)
11251 N0 = OverflowVal;
11252 else {
11253 SDValue Ops[] = {N0, OverflowVal};
11254 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
11255 }
11256 }
11257 }
11258 if (!N0)
11259 N0 = DAG.getConstant(0, DL, MVT::i32);
11260 if (!C1)
11261 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
11262 return {N0, SDValue(C1, 0)};
11263}
11264
11265// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
11266// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
11267// pointed to by Offsets.
11268void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
11269 SelectionDAG &DAG, SDValue *Offsets,
11270 Align Alignment) const {
11271 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11272 SDLoc DL(CombinedOffset);
11273 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
11274 uint32_t Imm = C->getZExtValue();
11275 uint32_t SOffset, ImmOffset;
11276 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11277 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
11278 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11279 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11280 return;
11281 }
11282 }
11283 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
11284 SDValue N0 = CombinedOffset.getOperand(0);
11285 SDValue N1 = CombinedOffset.getOperand(1);
11286 uint32_t SOffset, ImmOffset;
11287 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
11288 if (Offset >= 0 &&
11289 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
11290 Offsets[0] = N0;
11291 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11292 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11293 return;
11294 }
11295 }
11296
11297 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11298 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
11299 : DAG.getConstant(0, DL, MVT::i32);
11300
11301 Offsets[0] = CombinedOffset;
11302 Offsets[1] = SOffsetZero;
11303 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
11304}
11305
11306SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11307 SelectionDAG &DAG) const {
11308 if (!MaybePointer.getValueType().isScalarInteger())
11309 return MaybePointer;
11310
11311 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
11312 return Rsrc;
11313}
11314
11315// Wrap a global or flat pointer into a buffer intrinsic using the flags
11316// specified in the intrinsic.
11317SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
11318 SelectionDAG &DAG) const {
11319 SDLoc Loc(Op);
11320
11321 SDValue Pointer = Op->getOperand(1);
11322 SDValue Stride = Op->getOperand(2);
11323 SDValue NumRecords = Op->getOperand(3);
11324 SDValue Flags = Op->getOperand(4);
11325
11326 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11327 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
11328 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
11329 std::optional<uint32_t> ConstStride = std::nullopt;
11330 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
11331 ConstStride = ConstNode->getZExtValue();
11332
11333 SDValue NewHighHalf = Masked;
11334 if (!ConstStride || *ConstStride != 0) {
11335 SDValue ShiftedStride;
11336 if (ConstStride) {
11337 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
11338 } else {
11339 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
11340 ShiftedStride =
11341 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11342 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
11343 }
11344 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
11345 }
11346
11347 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
11348 NewHighHalf, NumRecords, Flags);
11349 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
11350 return RsrcPtr;
11351}
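// Illustrative sketch (not part of this file): how the second dword of the
// v4i32 resource is assembled above from the pointer's high half and the
// stride. Assumed field layout: base_address[47:32] in bits [15:0] and the
// stride in bits [31:16].
#include <cstdint>

static uint32_t packRsrcWord1Sketch(uint64_t Pointer, uint16_t Stride) {
  uint32_t HighHalf = (uint32_t)(Pointer >> 32);
  uint32_t Masked = HighHalf & 0x0000ffffu;  // keep base_address[47:32]
  return Masked | ((uint32_t)Stride << 16);  // place the stride in the top half
}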
11352
11353// Handle 8-bit and 16-bit buffer loads
11354SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
11355 EVT LoadVT, SDLoc DL,
11357 MachineMemOperand *MMO,
11358 bool IsTFE) const {
11359 EVT IntVT = LoadVT.changeTypeToInteger();
11360
11361 if (IsTFE) {
11362 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
11365 MachineFunction &MF = DAG.getMachineFunction();
11366 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
11367 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
11368 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
11369 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11370 DAG.getConstant(1, DL, MVT::i32));
11371 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11372 DAG.getConstant(0, DL, MVT::i32));
11373 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
11374 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
11375 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
11376 }
11377
11378 unsigned Opc = LoadVT.getScalarType() == MVT::i8
11381
11382 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
11383 SDValue BufferLoad =
11384 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
11385 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
11386 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
11387
11388 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
11389}
11390
11391// Handle 8-bit and 16-bit buffer stores
11392SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
11393 EVT VDataType, SDLoc DL,
11394 SDValue Ops[],
11395 MemSDNode *M) const {
11396 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11397 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
11398
11399 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
11400 Ops[1] = BufferStoreExt;
11401 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
11402 : AMDGPUISD::BUFFER_STORE_SHORT;
11403 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
11404 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
11405 M->getMemOperand());
11406}
11407
11409 SDValue Op, const SDLoc &SL, EVT VT) {
11410 if (VT.bitsLT(Op.getValueType()))
11411 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
11412
11413 switch (ExtType) {
11414 case ISD::SEXTLOAD:
11415 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
11416 case ISD::ZEXTLOAD:
11417 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
11418 case ISD::EXTLOAD:
11419 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
11420 case ISD::NON_EXTLOAD:
11421 return Op;
11422 }
11423
11424 llvm_unreachable("invalid ext type");
11425}
11426
11427// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
11428// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
11429SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
11430 DAGCombinerInfo &DCI) const {
11431 SelectionDAG &DAG = DCI.DAG;
11432 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
11433 return SDValue();
11434
11435 // FIXME: Constant loads should all be marked invariant.
11436 unsigned AS = Ld->getAddressSpace();
11437 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
11439 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
11440 return SDValue();
11441
11442 // Don't do this early, since it may interfere with adjacent load merging for
11443 // illegal types. We can avoid losing alignment information for exotic types
11444 // pre-legalize.
11445 EVT MemVT = Ld->getMemoryVT();
11446 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
11447 MemVT.getSizeInBits() >= 32)
11448 return SDValue();
11449
11450 SDLoc SL(Ld);
11451
11452 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
11453 "unexpected vector extload");
11454
11455 // TODO: Drop only high part of range.
11456 SDValue Ptr = Ld->getBasePtr();
11457 SDValue NewLoad = DAG.getLoad(
11458 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
11459 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
11460 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
11461 nullptr); // Drop ranges
11462
11463 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
11464 if (MemVT.isFloatingPoint()) {
11466 "unexpected fp extload");
11467 TruncVT = MemVT.changeTypeToInteger();
11468 }
11469
11470 SDValue Cvt = NewLoad;
11471 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
11472 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
11473 DAG.getValueType(TruncVT));
11474 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
11476 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
11477 } else {
11479 }
11480
11481 EVT VT = Ld->getValueType(0);
11482 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
11483
11484 DCI.AddToWorklist(Cvt.getNode());
11485
11486 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
11487 // the appropriate extension from the 32-bit load.
11488 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
11489 DCI.AddToWorklist(Cvt.getNode());
11490
11491 // Handle conversion back to floating point if necessary.
11492 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
11493
11494 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
11495}
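// Illustrative sketch (not part of this file): the extend-in-register step the
// combine above emits after the widened 32-bit load, shown on plain integers
// for an 8-bit memory type.
#include <cstdint>

static int32_t signExtendInReg8(uint32_t Loaded) {
  // SIGN_EXTEND_INREG with an i8 value type: reinterpret the low byte as
  // signed and widen it back to 32 bits.
  return (int32_t)(int8_t)(Loaded & 0xff);
}

static uint32_t zeroExtendInReg8(uint32_t Loaded) {
  // getZeroExtendInReg: mask off everything above the memory type's width.
  return Loaded & 0xff;
}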
11496
11498 const SIMachineFunctionInfo &Info) {
11499 // TODO: Should check if the address can definitely not access stack.
11500 if (Info.isEntryFunction())
11501 return Info.getUserSGPRInfo().hasFlatScratchInit();
11502 return true;
11503}
11504
11505SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
11506 SDLoc DL(Op);
11507 LoadSDNode *Load = cast<LoadSDNode>(Op);
11508 ISD::LoadExtType ExtType = Load->getExtensionType();
11509 EVT MemVT = Load->getMemoryVT();
11510 MachineMemOperand *MMO = Load->getMemOperand();
11511
11512 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
11513 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
11514 return SDValue();
11515
11516 // FIXME: Copied from PPC
11517 // First, load into 32 bits, then truncate to 1 bit.
11518
11519 SDValue Chain = Load->getChain();
11520 SDValue BasePtr = Load->getBasePtr();
11521
11522 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
11523
11524 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
11525 RealMemVT, MMO);
11526
11527 if (!MemVT.isVector()) {
11528 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
11529 NewLD.getValue(1)};
11530
11531 return DAG.getMergeValues(Ops, DL);
11532 }
11533
11535 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
11536 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
11537 DAG.getConstant(I, DL, MVT::i32));
11538
11539 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
11540 }
11541
11542 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
11543
11544 return DAG.getMergeValues(Ops, DL);
11545 }
11546
11547 if (!MemVT.isVector())
11548 return SDValue();
11549
11550 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
11551 "Custom lowering for non-i32 vectors hasn't been implemented.");
11552
11553 Align Alignment = Load->getAlign();
11554 unsigned AS = Load->getAddressSpace();
11555 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
11556 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
11557 return SplitVectorLoad(Op, DAG);
11558 }
11559
11560 MachineFunction &MF = DAG.getMachineFunction();
11561 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11562 // If there is a possibility that flat instruction access scratch memory
11563 // then we need to use the same legalization rules we use for private.
11564 if (AS == AMDGPUAS::FLAT_ADDRESS &&
11565 !Subtarget->hasMultiDwordFlatScratchAddressing())
11566 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
11569
11570 unsigned NumElements = MemVT.getVectorNumElements();
11571
11572 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11574 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
11575 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
11577 if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
11578 Alignment >= Align(4) && NumElements < 32) {
11579 if (MemVT.isPow2VectorType() ||
11580 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
11581 return SDValue();
11582 return WidenOrSplitVectorLoad(Op, DAG);
11583 }
11584 // Non-uniform loads will be selected to MUBUF instructions, so they
11585 // have the same legalization requirements as global and private
11586 // loads.
11587 //
11588 }
11589 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11592 if (NumElements > 4)
11593 return SplitVectorLoad(Op, DAG);
11594 // v3 loads not supported on SI.
11595 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11596 return WidenOrSplitVectorLoad(Op, DAG);
11597
11598 // v3 and v4 loads are supported for private and global memory.
11599 return SDValue();
11600 }
11601 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11602 // Depending on the setting of the private_element_size field in the
11603 // resource descriptor, we can only make private accesses up to a certain
11604 // size.
11605 switch (Subtarget->getMaxPrivateElementSize()) {
11606 case 4: {
11607 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
11608 return DAG.getMergeValues({Op0, Op1}, DL);
11609 }
11610 case 8:
11611 if (NumElements > 2)
11612 return SplitVectorLoad(Op, DAG);
11613 return SDValue();
11614 case 16:
11615 // Same as global/flat
11616 if (NumElements > 4)
11617 return SplitVectorLoad(Op, DAG);
11618 // v3 loads not supported on SI.
11619 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11620 return WidenOrSplitVectorLoad(Op, DAG);
11621
11622 return SDValue();
11623 default:
11624 llvm_unreachable("unsupported private_element_size");
11625 }
11626 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11627 unsigned Fast = 0;
11628 auto Flags = Load->getMemOperand()->getFlags();
11630 Load->getAlign(), Flags, &Fast) &&
11631 Fast > 1)
11632 return SDValue();
11633
11634 if (MemVT.isVector())
11635 return SplitVectorLoad(Op, DAG);
11636 }
11637
11639 MemVT, *Load->getMemOperand())) {
11640 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
11641 return DAG.getMergeValues({Op0, Op1}, DL);
11642 }
11643
11644 return SDValue();
11645}
11646
11647SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
11648 EVT VT = Op.getValueType();
11649 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
11650 VT.getSizeInBits() == 512)
11651 return splitTernaryVectorOp(Op, DAG);
11652
11653 assert(VT.getSizeInBits() == 64);
11654
11655 SDLoc DL(Op);
11656 SDValue Cond = DAG.getFreeze(Op.getOperand(0));
11657
11658 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
11659 SDValue One = DAG.getConstant(1, DL, MVT::i32);
11660
11661 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
11662 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
11663
11664 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
11665 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
11666
11667 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
11668
11669 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
11670 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
11671
11672 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
11673
11674 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
11675 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
11676}
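// Illustrative sketch (not part of this file): the 64-bit select above,
// expressed as two 32-bit selects on the low and high halves of the operands.
#include <cstdint>

static uint64_t select64Sketch(bool Cond, uint64_t A, uint64_t B) {
  uint32_t Lo = Cond ? (uint32_t)A : (uint32_t)B;
  uint32_t Hi = Cond ? (uint32_t)(A >> 32) : (uint32_t)(B >> 32);
  return ((uint64_t)Hi << 32) | Lo;
}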
11677
11678// Catch division cases where we can use shortcuts with rcp and rsq
11679// instructions.
11680SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
11681 SelectionDAG &DAG) const {
11682 SDLoc SL(Op);
11683 SDValue LHS = Op.getOperand(0);
11684 SDValue RHS = Op.getOperand(1);
11685 EVT VT = Op.getValueType();
11686 const SDNodeFlags Flags = Op->getFlags();
11687
11688 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
11689
11690 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
11691 // Without !fpmath accuracy information, we can't do more because we don't
11692 // know exactly whether rcp is accurate enough to meet the !fpmath requirement.
11693 // f16 is always accurate enough.
11694 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
11695 return SDValue();
11696
11697 if (CLHS->isExactlyValue(1.0)) {
11698 // v_rcp_f32 and v_rsq_f32 do not support denormals and, according to
11699 // the CI documentation, have a worst case error of 1 ulp.
11700 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
11701 // use it as long as we aren't trying to use denormals.
11702 //
11703 // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
11704
11705 // 1.0 / sqrt(x) -> rsq(x)
11706
11707 // XXX - Is afn sufficient to do this for f64? The maximum ULP
11708 // error seems really high at 2^29 ULP.
11709 // 1.0 / x -> rcp(x)
11710 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
11711 }
11712
11713 // Same as for 1.0, but expand the sign out of the constant.
11714 if (CLHS->isExactlyValue(-1.0)) {
11715 // -1.0 / x -> rcp (fneg x)
11716 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
11717 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
11718 }
11719 }
11720
11721 // For f16 and bf16 require afn or arcp.
11722 // For f32 require afn.
11723 if (!AllowInaccurateRcp &&
11724 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
11725 return SDValue();
11726
11727 // Turn into multiply by the reciprocal.
11728 // x / y -> x * (1.0 / y)
11729 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
11730 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
11731}
11732
11733SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
11734 SelectionDAG &DAG) const {
11735 SDLoc SL(Op);
11736 SDValue X = Op.getOperand(0);
11737 SDValue Y = Op.getOperand(1);
11738 EVT VT = Op.getValueType();
11739 const SDNodeFlags Flags = Op->getFlags();
11740
11741 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
11742 if (!AllowInaccurateDiv)
11743 return SDValue();
11744
11745 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
11746 SDValue One = DAG.getConstantFP(1.0, SL, VT);
11747
11748 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
11749 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
11750
11751 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
11752 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
11753 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
11754 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
11755 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
11756 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
11757}
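// Illustrative sketch (not part of this file): the refinement sequence above
// on host doubles, with 1.0 / Y standing in for the hardware's approximate
// reciprocal (an assumption made only so the sketch is self-contained).
#include <cmath>

static double fastFDiv64Sketch(double X, double Y) {
  double R = 1.0 / Y;                        // stands in for AMDGPUISD::RCP
  R = std::fma(std::fma(-Y, R, 1.0), R, R);  // first Newton-Raphson step
  R = std::fma(std::fma(-Y, R, 1.0), R, R);  // second Newton-Raphson step
  double Ret = X * R;
  double Err = std::fma(-Y, Ret, X);         // residual: X - Y * Ret
  return std::fma(Err, R, Ret);              // final correction
}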
11758
11759static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
11760 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
11761 SDNodeFlags Flags) {
11762 if (GlueChain->getNumValues() <= 1) {
11763 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
11764 }
11765
11766 assert(GlueChain->getNumValues() == 3);
11767
11768 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
11769 switch (Opcode) {
11770 default:
11771 llvm_unreachable("no chain equivalent for opcode");
11772 case ISD::FMUL:
11773 Opcode = AMDGPUISD::FMUL_W_CHAIN;
11774 break;
11775 }
11776
11777 return DAG.getNode(Opcode, SL, VTList,
11778 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
11779 Flags);
11780}
11781
11782static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
11783 EVT VT, SDValue A, SDValue B, SDValue C,
11784 SDValue GlueChain, SDNodeFlags Flags) {
11785 if (GlueChain->getNumValues() <= 1) {
11786 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
11787 }
11788
11789 assert(GlueChain->getNumValues() == 3);
11790
11791 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
11792 switch (Opcode) {
11793 default:
11794 llvm_unreachable("no chain equivalent for opcode");
11795 case ISD::FMA:
11796 Opcode = AMDGPUISD::FMA_W_CHAIN;
11797 break;
11798 }
11799
11800 return DAG.getNode(Opcode, SL, VTList,
11801 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
11802 Flags);
11803}
11804
11805SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
11806 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
11807 return FastLowered;
11808
11809 SDLoc SL(Op);
11810 EVT VT = Op.getValueType();
11811 SDValue LHS = Op.getOperand(0);
11812 SDValue RHS = Op.getOperand(1);
11813
11814 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
11815 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
11816
11817 if (VT == MVT::bf16) {
11818 SDValue ExtDiv =
11819 DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
11820 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
11821 DAG.getTargetConstant(0, SL, MVT::i32));
11822 }
11823
11824 assert(VT == MVT::f16);
11825
11826 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
11827 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
11828 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
11829 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
11830 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
11831 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
11832 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
11833 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
11834 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
11835 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
11836 // q16.u = opx(V_CVT_F16_F32, q32.u);
11837 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
11838
11839 // We will use ISD::FMA on targets that don't support ISD::FMAD.
11840 unsigned FMADOpCode =
11842 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
11843 SDValue Rcp =
11844 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
11845 SDValue Quot =
11846 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
11847 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
11848 Op->getFlags());
11849 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
11850 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
11851 Op->getFlags());
11852 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
11853 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
11854 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
11855 DAG.getConstant(0xff800000, SL, MVT::i32));
11856 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
11857 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
11858 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
11859 DAG.getTargetConstant(0, SL, MVT::i32));
11860 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
11861 Op->getFlags());
11862}
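// Illustrative sketch (not part of this file): the f32 refinement sequence
// above on host floats, with 1.0f / B standing in for the hardware rcp
// estimate (an assumption for illustration). The 0xff800000 mask keeps only
// the sign and exponent of the error term, matching the AND above; the real
// lowering then rounds to f16 and applies DIV_FIXUP.
#include <cmath>
#include <cstdint>
#include <cstring>

static float fdiv16RefineSketch(float A, float B) {
  float Rcp = 1.0f / B;
  float Quot = A * Rcp;
  float Err = std::fmaf(-B, Quot, A);
  Quot = std::fmaf(Err, Rcp, Quot);
  Err = std::fmaf(-B, Quot, A);
  float Tmp = Err * Rcp;
  uint32_t Bits;
  std::memcpy(&Bits, &Tmp, sizeof(Bits));
  Bits &= 0xff800000u;  // sign + exponent only
  std::memcpy(&Tmp, &Bits, sizeof(Bits));
  return Tmp + Quot;
}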
11863
11864// Faster 2.5 ULP division that does not support denormals.
11865SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
11866 SDNodeFlags Flags = Op->getFlags();
11867 SDLoc SL(Op);
11868 SDValue LHS = Op.getOperand(1);
11869 SDValue RHS = Op.getOperand(2);
11870
11871 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
11872
11873 const APFloat K0Val(0x1p+96f);
11874 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
11875
11876 const APFloat K1Val(0x1p-32f);
11877 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
11878
11879 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
11880
11881 EVT SetCCVT =
11882 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
11883
11884 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
11885
11886 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
11887
11888 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
11889
11890 // rcp does not support denormals.
11891 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
11892
11893 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
11894
11895 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
11896}
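// Illustrative sketch (not part of this file): the scaling trick above. If the
// denominator is huge (> 2^96), pre-multiply it by 2^-32 so its reciprocal
// stays representable, then fold the same factor back into the result.
#include <cmath>

static float fdivFastSketch(float LHS, float RHS) {
  float Scale = (std::fabs(RHS) > 0x1.0p+96f) ? 0x1.0p-32f : 1.0f;
  float Rcp = 1.0f / (RHS * Scale);  // stands in for the hardware rcp
  return Scale * (LHS * Rcp);
}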
11897
11898// Returns the immediate value for setting the F32 denorm mode when using the
11899// S_DENORM_MODE instruction.
11902 const GCNSubtarget *ST) {
11903 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
11904 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
11905 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
11906 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
11907}
11908
11909SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
11910 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
11911 return FastLowered;
11912
11913 // The selection matcher assumes that anything with a chain selects to a
11914 // mayRaiseFPException machine instruction. Since we're introducing a chain
11915 // here, we need to explicitly report nofpexcept for the regular fdiv
11916 // lowering.
11917 SDNodeFlags Flags = Op->getFlags();
11918 Flags.setNoFPExcept(true);
11919
11920 SDLoc SL(Op);
11921 SDValue LHS = Op.getOperand(0);
11922 SDValue RHS = Op.getOperand(1);
11923
11924 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
11925
11926 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
11927
11928 SDValue DenominatorScaled =
11929 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
11930 SDValue NumeratorScaled =
11931 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
11932
11933 // Denominator is scaled to not be denormal, so using rcp is ok.
11934 SDValue ApproxRcp =
11935 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
11936 SDValue NegDivScale0 =
11937 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
11938
11939 using namespace AMDGPU::Hwreg;
11940 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
11941 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
11942
11943 const MachineFunction &MF = DAG.getMachineFunction();
11944 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
11945 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
11946
11947 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
11948 const bool HasDynamicDenormals =
11949 (DenormMode.Input == DenormalMode::Dynamic) ||
11950 (DenormMode.Output == DenormalMode::Dynamic);
11951
11952 SDValue SavedDenormMode;
11953
11954 if (!PreservesDenormals) {
11955 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
11956 // lowering. The chain dependence is insufficient, and we need glue. We do
11957 // not need the glue variants in a strictfp function.
11958
11959 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
11960
11961 SDValue Glue = DAG.getEntryNode();
11962 if (HasDynamicDenormals) {
11963 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
11964 DAG.getVTList(MVT::i32, MVT::Glue),
11965 {BitField, Glue});
11966 SavedDenormMode = SDValue(GetReg, 0);
11967
11968 Glue = DAG.getMergeValues(
11969 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
11970 }
11971
11972 SDNode *EnableDenorm;
11973 if (Subtarget->hasDenormModeInst()) {
11974 const SDValue EnableDenormValue =
11976
11977 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
11978 EnableDenormValue)
11979 .getNode();
11980 } else {
11981 const SDValue EnableDenormValue =
11982 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
11983 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
11984 {EnableDenormValue, BitField, Glue});
11985 }
11986
11987 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
11988 SDValue(EnableDenorm, 1)};
11989
11990 NegDivScale0 = DAG.getMergeValues(Ops, SL);
11991 }
11992
11993 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
11994 ApproxRcp, One, NegDivScale0, Flags);
11995
11996 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
11997 ApproxRcp, Fma0, Flags);
11998
11999 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
12000 Fma1, Flags);
12001
12002 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
12003 NumeratorScaled, Mul, Flags);
12004
12005 SDValue Fma3 =
12006 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
12007
12008 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
12009 NumeratorScaled, Fma3, Flags);
12010
12011 if (!PreservesDenormals) {
12012 SDNode *DisableDenorm;
12013 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12014 const SDValue DisableDenormValue = getSPDenormModeValue(
12015 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
12016
12017 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12018 DisableDenorm =
12019 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
12020 Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
12021 .getNode();
12022 } else {
12023 assert(HasDynamicDenormals == (bool)SavedDenormMode);
12024 const SDValue DisableDenormValue =
12025 HasDynamicDenormals
12026 ? SavedDenormMode
12027 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
12028
12029 DisableDenorm = DAG.getMachineNode(
12030 AMDGPU::S_SETREG_B32, SL, MVT::Other,
12031 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
12032 }
12033
12034 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
12035 SDValue(DisableDenorm, 0), DAG.getRoot());
12036 DAG.setRoot(OutputChain);
12037 }
12038
12039 SDValue Scale = NumeratorScaled.getValue(1);
12040 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
12041 {Fma4, Fma1, Fma3, Scale}, Flags);
12042
12043 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
12044}
12045
12046SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
12047 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
12048 return FastLowered;
12049
12050 SDLoc SL(Op);
12051 SDValue X = Op.getOperand(0);
12052 SDValue Y = Op.getOperand(1);
12053
12054 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
12055
12056 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
12057
12058 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
12059
12060 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
12061
12062 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
12063
12064 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
12065
12066 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
12067
12068 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
12069
12070 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
12071
12072 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
12073 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
12074
12075 SDValue Fma4 =
12076 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
12077
12078 SDValue Scale;
12079
12080 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12081 // Workaround a hardware bug on SI where the condition output from div_scale
12082 // is not usable.
12083
12084 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
12085
12086 // Figure out which scale to use for div_fmas.
12087 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
12088 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
12089 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
12090 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
12091
12092 SDValue NumHi =
12093 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
12094 SDValue DenHi =
12095 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
12096
12097 SDValue Scale0Hi =
12098 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
12099 SDValue Scale1Hi =
12100 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
12101
12102 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
12103 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
12104 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
12105 } else {
12106 Scale = DivScale1.getValue(1);
12107 }
12108
12109 SDValue Fmas =
12110 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
12111
12112 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
12113}
12114
12115SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
12116 EVT VT = Op.getValueType();
12117
12118 if (VT == MVT::f32)
12119 return LowerFDIV32(Op, DAG);
12120
12121 if (VT == MVT::f64)
12122 return LowerFDIV64(Op, DAG);
12123
12124 if (VT == MVT::f16 || VT == MVT::bf16)
12125 return LowerFDIV16(Op, DAG);
12126
12127 llvm_unreachable("Unexpected type for fdiv");
12128}
12129
12130SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
12131 SDLoc dl(Op);
12132 SDValue Val = Op.getOperand(0);
12133 EVT VT = Val.getValueType();
12134 EVT ResultExpVT = Op->getValueType(1);
12135 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12136
12137 SDValue Mant = DAG.getNode(
12139 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
12140
12141 SDValue Exp = DAG.getNode(
12142 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
12143 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
12144
12145 if (Subtarget->hasFractBug()) {
12146 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
12147 SDValue Inf =
12149
12150 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
12151 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
12152 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
12153 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
12154 }
12155
12156 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
12157 return DAG.getMergeValues({Mant, CastExp}, dl);
12158}
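// Illustrative sketch (not part of this file): what the lowering above
// computes, phrased with std::frexp. On subtargets with the fract bug the
// hardware intrinsics misbehave for infinities and NaNs, so those inputs are
// passed through with a zero exponent.
#include <cmath>
#include <utility>

static std::pair<double, int> frexpSketch(double Val, bool HasFractBug) {
  int Exp = 0;
  double Mant = std::frexp(Val, &Exp);
  if (HasFractBug && !std::isfinite(Val)) {
    Exp = 0;     // the select picks Zero for the exponent
    Mant = Val;  // and the original value for the mantissa
  }
  return {Mant, Exp};
}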
12159
12160SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
12161 SDLoc DL(Op);
12162 StoreSDNode *Store = cast<StoreSDNode>(Op);
12163 EVT VT = Store->getMemoryVT();
12164
12165 if (VT == MVT::i1) {
12166 return DAG.getTruncStore(
12167 Store->getChain(), DL,
12168 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
12169 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
12170 }
12171
12172 assert(VT.isVector() &&
12173 Store->getValue().getValueType().getScalarType() == MVT::i32);
12174
12175 unsigned AS = Store->getAddressSpace();
12176 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
12177 Store->getAlign().value() < VT.getStoreSize() &&
12178 VT.getSizeInBits() > 32) {
12179 return SplitVectorStore(Op, DAG);
12180 }
12181
12182 MachineFunction &MF = DAG.getMachineFunction();
12183 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12184 // If there is a possibility that flat instruction access scratch memory
12185 // then we need to use the same legalization rules we use for private.
12186 if (AS == AMDGPUAS::FLAT_ADDRESS &&
12187 !Subtarget->hasMultiDwordFlatScratchAddressing())
12188 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
12191
12192 unsigned NumElements = VT.getVectorNumElements();
12194 if (NumElements > 4)
12195 return SplitVectorStore(Op, DAG);
12196 // v3 stores not supported on SI.
12197 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12198 return SplitVectorStore(Op, DAG);
12199
12201 VT, *Store->getMemOperand()))
12202 return expandUnalignedStore(Store, DAG);
12203
12204 return SDValue();
12205 }
12206 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
12207 switch (Subtarget->getMaxPrivateElementSize()) {
12208 case 4:
12209 return scalarizeVectorStore(Store, DAG);
12210 case 8:
12211 if (NumElements > 2)
12212 return SplitVectorStore(Op, DAG);
12213 return SDValue();
12214 case 16:
12215 if (NumElements > 4 ||
12216 (NumElements == 3 && !Subtarget->enableFlatScratch()))
12217 return SplitVectorStore(Op, DAG);
12218 return SDValue();
12219 default:
12220 llvm_unreachable("unsupported private_element_size");
12221 }
12222 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
12223 unsigned Fast = 0;
12224 auto Flags = Store->getMemOperand()->getFlags();
12226 Store->getAlign(), Flags, &Fast) &&
12227 Fast > 1)
12228 return SDValue();
12229
12230 if (VT.isVector())
12231 return SplitVectorStore(Op, DAG);
12232
12233 return expandUnalignedStore(Store, DAG);
12234 }
12235
12236 // Probably an invalid store. If so we'll end up emitting a selection error.
12237 return SDValue();
12238}
12239
12240// Avoid the full correct expansion for f32 sqrt when promoting from f16.
12241SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
12242 SDLoc SL(Op);
12243 assert(!Subtarget->has16BitInsts());
12244 SDNodeFlags Flags = Op->getFlags();
12245 SDValue Ext =
12246 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
12247
12248 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
12249 SDValue Sqrt =
12250 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
12251
12252 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
12253 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
12254}
12255
12256SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
12257 SDLoc DL(Op);
12258 SDNodeFlags Flags = Op->getFlags();
12259 MVT VT = Op.getValueType().getSimpleVT();
12260 const SDValue X = Op.getOperand(0);
12261
12262 if (allowApproxFunc(DAG, Flags)) {
12263 // Instruction is 1ulp but ignores denormals.
12264 return DAG.getNode(
12266 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
12267 }
12268
12269 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
12270 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
12271
12272 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
12273
12274 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
12275
12276 SDValue SqrtX =
12277 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
12278
12279 SDValue SqrtS;
12280 if (needsDenormHandlingF32(DAG, X, Flags)) {
12281 SDValue SqrtID =
12282 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
12283 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
12284
12285 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
12286 SDValue SqrtSNextDownInt =
12287 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12288 DAG.getAllOnesConstant(DL, MVT::i32));
12289 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
12290
12291 SDValue NegSqrtSNextDown =
12292 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
12293
12294 SDValue SqrtVP =
12295 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
12296
12297 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12298 DAG.getConstant(1, DL, MVT::i32));
12299 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
12300
12301 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
12302 SDValue SqrtVS =
12303 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
12304
12305 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
12306 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
12307
12308 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
12309 Flags);
12310
12311 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
12312 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
12313 Flags);
12314 } else {
12315 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
12316
12317 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
12318
12319 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
12320 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
12321 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
12322
12323 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
12324 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
12325 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
12326
12327 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
12328 SDValue SqrtD =
12329 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
12330 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
12331 }
12332
12333 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
12334
12335 SDValue ScaledDown =
12336 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
12337
12338 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
12339 SDValue IsZeroOrInf =
12340 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12341 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12342
12343 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
12344}
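// Illustrative sketch (not part of this file): the scaling wrapper above. Tiny
// inputs are multiplied by 2^32 before the approximate sqrt so denormals are
// avoided, and the result is scaled back by 2^-16 (i.e. sqrt(2^-32)). Zero and
// +inf inputs take the final passthrough select. std::sqrt stands in for the
// refined estimate.
#include <cmath>

static float sqrtF32ScaleSketch(float X) {
  bool NeedScale = X < 0x1.0p-96f;
  float SqrtX = NeedScale ? X * 0x1.0p+32f : X;
  float S = std::sqrt(SqrtX);
  if (NeedScale)
    S *= 0x1.0p-16f;
  return (SqrtX == 0.0f || (std::isinf(SqrtX) && SqrtX > 0.0f)) ? SqrtX : S;
}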
12345
12346SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
12347 // For the double type, the SQRT and RSQ instructions don't have the required
12348 // precision, so we apply Goldschmidt's algorithm to improve the result:
12349 //
12350 // y0 = rsq(x)
12351 // g0 = x * y0
12352 // h0 = 0.5 * y0
12353 //
12354 // r0 = 0.5 - h0 * g0
12355 // g1 = g0 * r0 + g0
12356 // h1 = h0 * r0 + h0
12357 //
12358 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
12359 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
12360 // h2 = h1 * r1 + h1
12361 //
12362 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
12363 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
12364 //
12365 // sqrt(x) = g3
12366
12367 SDNodeFlags Flags = Op->getFlags();
12368
12369 SDLoc DL(Op);
12370
12371 SDValue X = Op.getOperand(0);
12372 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
12373
12374 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
12375
12376 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
12377
12378 // Scale up input if it is too small.
12379 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
12380 SDValue ScaleUp =
12381 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
12382 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
12383
12384 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
12385
12386 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
12387
12388 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
12389 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
12390
12391 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
12392 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
12393
12394 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
12395
12396 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
12397
12398 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
12399 SDValue SqrtD0 =
12400 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
12401
12402 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
12403
12404 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
12405 SDValue SqrtD1 =
12406 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
12407
12408 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
12409
12410 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
12411 SDValue ScaleDown =
12412 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
12413 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
12414
12415 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
12416 // with finite only or nsz because rsq(+/-0) = +/-inf
12417
12418 // TODO: Check for DAZ and expand to subnormals
12419 SDValue IsZeroOrInf =
12420 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12421 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12422
12423 // If x is +INF, +0, or -0, use its original value
12424 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
12425 Flags);
12426}
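// Illustrative sketch (not part of this file): the Goldschmidt iteration from
// the comment above, on host doubles, with 1/sqrt(x) standing in for the
// hardware rsq estimate (an assumption; the real estimate is less accurate,
// which is why the refinement exists). The 2^-767 input scaling and the
// zero/inf passthrough are omitted here.
#include <cmath>

static double sqrtF64GoldschmidtSketch(double X) {
  double Y0 = 1.0 / std::sqrt(X);      // y0 = rsq(x)
  double G0 = X * Y0;                  // g0 = x * y0
  double H0 = 0.5 * Y0;                // h0 = 0.5 * y0
  double R0 = std::fma(-H0, G0, 0.5);  // r0 = 0.5 - h0 * g0
  double H1 = std::fma(H0, R0, H0);    // h1 = h0 * r0 + h0
  double G1 = std::fma(G0, R0, G0);    // g1 = g0 * r0 + g0
  double D0 = std::fma(-G1, G1, X);    // d0 = x - g1 * g1
  double G2 = std::fma(D0, H1, G1);    // g2 = d0 * h1 + g1
  double D1 = std::fma(-G2, G2, X);    // d1 = x - g2 * g2
  return std::fma(D1, H1, G2);         // g3 = d1 * h1 + g2 ~= sqrt(x)
}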
12427
12428SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
12429 SDLoc DL(Op);
12430 EVT VT = Op.getValueType();
12431 SDValue Arg = Op.getOperand(0);
12432 SDValue TrigVal;
12433
12434 // Propagate fast-math flags so that the multiply we introduce can be folded
12435 // if Arg is already the result of a multiply by constant.
12436 auto Flags = Op->getFlags();
12437
12438 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
12439
12440 if (Subtarget->hasTrigReducedRange()) {
12441 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12442 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
12443 } else {
12444 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12445 }
12446
12447 switch (Op.getOpcode()) {
12448 case ISD::FCOS:
12449 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
12450 case ISD::FSIN:
12451 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
12452 default:
12453 llvm_unreachable("Wrong trig opcode");
12454 }
12455}
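// Illustrative sketch (not part of this file): the range reduction above.
// SIN_HW/COS_HW take their argument in units of full turns; on subtargets with
// a reduced trig range the argument is additionally wrapped into [0, 1) with
// FRACT first. std::sin stands in for the hardware instruction.
#include <cmath>

static float sinLoweringSketch(float Arg, bool HasTrigReducedRange) {
  const double TwoPi = 6.283185307179586476925;
  float Turns = (float)(Arg / TwoPi);  // the FMUL by 0.5 * 1/pi above
  if (HasTrigReducedRange)
    Turns -= std::floor(Turns);        // FRACT keeps the fractional part
  return std::sin(Turns * (float)TwoPi);
}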
12456
12457SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
12458 SelectionDAG &DAG) const {
12459 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
12460 assert(AtomicNode->isCompareAndSwap());
12461 unsigned AS = AtomicNode->getAddressSpace();
12462
12463 // No custom lowering required for local address space
12465 return Op;
12466
12467 // Non-local address spaces require custom lowering for atomic compare and
12468 // swap; the cmp and swap values are packed into a v2i32 (or v2i64 for _X2).
12469 SDLoc DL(Op);
12470 SDValue ChainIn = Op.getOperand(0);
12471 SDValue Addr = Op.getOperand(1);
12472 SDValue Old = Op.getOperand(2);
12473 SDValue New = Op.getOperand(3);
12474 EVT VT = Op.getValueType();
12475 MVT SimpleVT = VT.getSimpleVT();
12476 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
12477
12478 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
12479 SDValue Ops[] = {ChainIn, Addr, NewOld};
12480
12482 Op->getVTList(), Ops, VT,
12483 AtomicNode->getMemOperand());
12484}
12485
12486//===----------------------------------------------------------------------===//
12487// Custom DAG optimizations
12488//===----------------------------------------------------------------------===//
12489
12490SDValue
12491SITargetLowering::performUCharToFloatCombine(SDNode *N,
12492 DAGCombinerInfo &DCI) const {
12493 EVT VT = N->getValueType(0);
12494 EVT ScalarVT = VT.getScalarType();
12495 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
12496 return SDValue();
12497
12498 SelectionDAG &DAG = DCI.DAG;
12499 SDLoc DL(N);
12500
12501 SDValue Src = N->getOperand(0);
12502 EVT SrcVT = Src.getValueType();
12503
12504 // TODO: We could try to match extracting the higher bytes, which would be
12505 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
12506 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
12507 // about in practice.
12508 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
12509 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
12510 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
12511 DCI.AddToWorklist(Cvt.getNode());
12512
12513 // For the f16 case, fold to a cast to f32 and then cast back to f16.
12514 if (ScalarVT != MVT::f32) {
12515 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
12516 DAG.getTargetConstant(0, DL, MVT::i32));
12517 }
12518 return Cvt;
12519 }
12520 }
12521
12522 return SDValue();
12523}
12524
12525SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
12526 DAGCombinerInfo &DCI) const {
12527 SDValue MagnitudeOp = N->getOperand(0);
12528 SDValue SignOp = N->getOperand(1);
12529
12530 // The generic combine for fcopysign + fp cast is too conservative with
12531 // vectors, and also gets confused by the splitting we will perform here, so
12532 // peek through FP casts.
12533 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
12534 SignOp.getOpcode() == ISD::FP_ROUND)
12535 SignOp = SignOp.getOperand(0);
12536
12537 SelectionDAG &DAG = DCI.DAG;
12538 SDLoc DL(N);
12539 EVT SignVT = SignOp.getValueType();
12540
12541 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
12542 // lower half with a copy.
12543 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
12544 EVT MagVT = MagnitudeOp.getValueType();
12545
12546 unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
12547
12548 if (MagVT.getScalarType() == MVT::f64) {
12549 EVT F32VT = MagVT.isVector()
12550 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12551 : MVT::v2f32;
12552
12553 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
12554
12556 for (unsigned I = 0; I != NumElts; ++I) {
12557 SDValue MagLo =
12558 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
12559 DAG.getConstant(2 * I, DL, MVT::i32));
12560 SDValue MagHi =
12561 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
12562 DAG.getConstant(2 * I + 1, DL, MVT::i32));
12563
12564 SDValue SignOpElt =
12565 MagVT.isVector()
12567 SignOp, DAG.getConstant(I, DL, MVT::i32))
12568 : SignOp;
12569
12570 SDValue HiOp =
12571 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
12572
12573 SDValue Vector =
12574 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
12575
12576 SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
12577 NewElts.push_back(NewElt);
12578 }
12579
12580 if (NewElts.size() == 1)
12581 return NewElts[0];
12582
12583 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
12584 }
12585
12586 if (SignVT.getScalarType() != MVT::f64)
12587 return SDValue();
12588
12589 // Reduce width of sign operand, we only need the highest bit.
12590 //
12591 // fcopysign f64:x, f64:y ->
12592 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
12593 // TODO: In some cases it might make sense to go all the way to f16.
12594
12595 EVT F32VT = MagVT.isVector()
12596 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12597 : MVT::v2f32;
12598
12599 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
12600
12601 SmallVector<SDValue, 8> F32Signs;
12602 for (unsigned I = 0; I != NumElts; ++I) {
12603 // Take sign from odd elements of cast vector
12604 SDValue SignAsF32 =
12605 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
12606 DAG.getConstant(2 * I + 1, DL, MVT::i32));
12607 F32Signs.push_back(SignAsF32);
12608 }
12609
12610 SDValue NewSign =
12611 NumElts == 1
12612 ? F32Signs.back()
12614 EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
12615 F32Signs);
12616
12617 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
12618 NewSign);
12619}
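// Illustrative sketch (not part of this file): the "f64 fcopysign is really an
// f32 copysign on the high bits" observation above, on plain bit patterns.
#include <cstdint>
#include <cstring>

static double copysignF64ViaHi32Sketch(double Mag, float SignSrc) {
  uint64_t MagBits;
  uint32_t SignBits;
  std::memcpy(&MagBits, &Mag, sizeof(MagBits));
  std::memcpy(&SignBits, &SignSrc, sizeof(SignBits));
  uint32_t Hi = (uint32_t)(MagBits >> 32);
  Hi = (Hi & 0x7fffffffu) | (SignBits & 0x80000000u);        // f32 copysign on hi32
  MagBits = ((uint64_t)Hi << 32) | (MagBits & 0xffffffffu);  // low half copied
  double Res;
  std::memcpy(&Res, &MagBits, sizeof(Res));
  return Res;
}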
12620
12621// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
12622// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
12623// bits
12624
12625// This is a variant of
12626// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
12627//
12628// The normal DAG combiner will do this, but only if the add has one use since
12629// that would increase the number of instructions.
12630//
12631// This prevents us from seeing a constant offset that can be folded into a
12632// memory instruction's addressing mode. If we know the resulting add offset of
12633// a pointer can be folded into an addressing offset, we can replace the pointer
12634// operand with the add of the new constant offset. This eliminates one of the uses,
12635// and may allow the remaining use to also be simplified.
12636//
12637SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
12638 EVT MemVT,
12639 DAGCombinerInfo &DCI) const {
12640 SDValue N0 = N->getOperand(0);
12641 SDValue N1 = N->getOperand(1);
12642
12643 // We only do this to handle cases where it's profitable when there are
12644 // multiple uses of the add, so defer to the standard combine.
12645 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
12646 N0->hasOneUse())
12647 return SDValue();
12648
12649 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
12650 if (!CN1)
12651 return SDValue();
12652
12653 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
12654 if (!CAdd)
12655 return SDValue();
12656
12657 SelectionDAG &DAG = DCI.DAG;
12658
12659 if (N0->getOpcode() == ISD::OR &&
12660 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
12661 return SDValue();
12662
12663 // If the resulting offset is too large, we can't fold it into the
12664 // addressing mode offset.
12665 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
12666 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
12667
12668 AddrMode AM;
12669 AM.HasBaseReg = true;
12670 AM.BaseOffs = Offset.getSExtValue();
12671 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
12672 return SDValue();
12673
12674 SDLoc SL(N);
12675 EVT VT = N->getValueType(0);
12676
12677 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
12678 SDValue COffset = DAG.getConstant(Offset, SL, VT);
12679
12680 SDNodeFlags Flags;
12681 Flags.setNoUnsignedWrap(
12682 N->getFlags().hasNoUnsignedWrap() &&
12683 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
12684
12685 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
12686}
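// Illustrative sketch (not part of this file): the identity this combine
// relies on, on plain integers (wrapping modulo 2^64). The second form exposes
// a constant that can fold into an addressing-mode offset.
#include <cstdint>

static uint64_t shlOfAddOriginal(uint64_t X, uint64_t C1, unsigned C2) {
  return (X + C1) << C2;
}
static uint64_t shlOfAddRewritten(uint64_t X, uint64_t C1, unsigned C2) {
  return (X << C2) + (C1 << C2);  // equal to the form above
}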
12687
12688/// MemSDNode::getBasePtr() does not work for intrinsics, whose pointer operand
12689/// is offset by the chain and intrinsic ID. Theoretically we would also need to
12690/// check the specific intrinsic, but they all place the pointer operand first.
12691static unsigned getBasePtrIndex(const MemSDNode *N) {
12692 switch (N->getOpcode()) {
12693 case ISD::STORE:
12696 return 2;
12697 default:
12698 return 1;
12699 }
12700}
12701
12702SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
12703 DAGCombinerInfo &DCI) const {
12704 SelectionDAG &DAG = DCI.DAG;
12705
12706 unsigned PtrIdx = getBasePtrIndex(N);
12707 SDValue Ptr = N->getOperand(PtrIdx);
12708
12709 // TODO: We could also do this for multiplies.
12710 if (Ptr.getOpcode() == ISD::SHL) {
12711 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
12712 N->getMemoryVT(), DCI);
12713 if (NewPtr) {
12714 SmallVector<SDValue, 8> NewOps(N->ops());
12715
12716 NewOps[PtrIdx] = NewPtr;
12717 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
12718 }
12719 }
12720
12721 return SDValue();
12722}
12723
12724static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
12725 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
12726 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
12727 (Opc == ISD::XOR && Val == 0);
12728}
12729
12730// Break up a 64-bit bit operation with a constant into two 32-bit and/or/xor. This
12731// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
12732// integer combine opportunities since most 64-bit operations are decomposed
12733// this way. TODO: We won't want this for SALU especially if it is an inline
12734// immediate.
12735SDValue SITargetLowering::splitBinaryBitConstantOp(
12736 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
12737 const ConstantSDNode *CRHS) const {
12738 uint64_t Val = CRHS->getZExtValue();
12739 uint32_t ValLo = Lo_32(Val);
12740 uint32_t ValHi = Hi_32(Val);
12741 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12742
12743 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
12745 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
12746 // We have 64-bit scalar and/or/xor, but do not have vector forms.
12747 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
12748 !CRHS->user_begin()->isDivergent())
12749 return SDValue();
12750
12751 // If we need to materialize a 64-bit immediate, it will be split up later
12752 // anyway. Avoid creating the harder to understand 64-bit immediate
12753 // materialization.
12754 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
12755 }
12756
12757 return SDValue();
12758}
12759
12761 if (V.getValueType() != MVT::i1)
12762 return false;
12763 switch (V.getOpcode()) {
12764 default:
12765 break;
12766 case ISD::SETCC:
12767 case ISD::IS_FPCLASS:
12769 return true;
12770 case ISD::AND:
12771 case ISD::OR:
12772 case ISD::XOR:
12773 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
12774 case ISD::SADDO:
12775 case ISD::UADDO:
12776 case ISD::SSUBO:
12777 case ISD::USUBO:
12778 case ISD::SMULO:
12779 case ISD::UMULO:
12780 return V.getResNo() == 1;
12782 unsigned IntrinsicID = V.getConstantOperandVal(0);
12783 switch (IntrinsicID) {
12784 case Intrinsic::amdgcn_is_shared:
12785 case Intrinsic::amdgcn_is_private:
12786 return true;
12787 default:
12788 return false;
12789 }
12790
12791 return false;
12792 }
12793 }
12794 return false;
12795}
12796
12797// If a constant has all zeroes or all ones within each byte, return it.
12798// Otherwise return 0.
12800 // 0xff for any zero byte in the mask
12801 uint32_t ZeroByteMask = 0;
12802 if (!(C & 0x000000ff))
12803 ZeroByteMask |= 0x000000ff;
12804 if (!(C & 0x0000ff00))
12805 ZeroByteMask |= 0x0000ff00;
12806 if (!(C & 0x00ff0000))
12807 ZeroByteMask |= 0x00ff0000;
12808 if (!(C & 0xff000000))
12809 ZeroByteMask |= 0xff000000;
12810 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
12811 if ((NonZeroByteMask & C) != NonZeroByteMask)
12812 return 0; // Partial bytes selected.
12813 return C;
12814}
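// Illustrative example (editorial note, not from the upstream source):
// getConstantPermuteMask(0x00ff00ff) returns 0x00ff00ff, since every byte is
// either all ones or all zeros, while getConstantPermuteMask(0x00000f00)
// returns 0 because byte 1 is only partially selected.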
12815
12816// Check if a node selects whole bytes from its operand 0 starting at a byte
12817// boundary while masking the rest. Returns select mask as in the v_perm_b32
12818 // or -1 if it does not succeed.
12819// Note byte select encoding:
12820// value 0-3 selects corresponding source byte;
12821// value 0xc selects zero;
12822// value 0xff selects 0xff.
12823 static uint32_t getPermuteMask(SDValue V) {
12824 assert(V.getValueSizeInBits() == 32);
12825
12826 if (V.getNumOperands() != 2)
12827 return ~0;
12828
12829 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
12830 if (!N1)
12831 return ~0;
12832
12833 uint32_t C = N1->getZExtValue();
12834
12835 switch (V.getOpcode()) {
12836 default:
12837 break;
12838 case ISD::AND:
12839 if (uint32_t ConstMask = getConstantPermuteMask(C))
12840 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
12841 break;
12842
12843 case ISD::OR:
12844 if (uint32_t ConstMask = getConstantPermuteMask(C))
12845 return (0x03020100 & ~ConstMask) | ConstMask;
12846 break;
12847
12848 case ISD::SHL:
12849 if (C % 8)
12850 return ~0;
12851
12852 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
12853
12854 case ISD::SRL:
12855 if (C % 8)
12856 return ~0;
12857
12858 return uint32_t(0x0c0c0c0c03020100ull >> C);
12859 }
12860
12861 return ~0;
12862}
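// Illustrative examples of the mapping above (editorial note, not from the
// upstream source): for `and x, 0x0000ffff` the constant mask is 0x0000ffff,
// so the result is (0x03020100 & 0x0000ffff) | (0x0c0c0c0c & 0xffff0000)
// = 0x0c0c0100, i.e. bytes 0-1 come from x and bytes 2-3 are zero. For
// `shl x, 8` the result is uint32_t((0x030201000c0c0c0cull << 8) >> 32)
// = 0x0201000c: byte 0 is zero and bytes 1-3 are source bytes 0-2.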
12863
12864SDValue SITargetLowering::performAndCombine(SDNode *N,
12865 DAGCombinerInfo &DCI) const {
12866 if (DCI.isBeforeLegalize())
12867 return SDValue();
12868
12869 SelectionDAG &DAG = DCI.DAG;
12870 EVT VT = N->getValueType(0);
12871 SDValue LHS = N->getOperand(0);
12872 SDValue RHS = N->getOperand(1);
12873
12874 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12875 if (VT == MVT::i64 && CRHS) {
12876 if (SDValue Split =
12877 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
12878 return Split;
12879 }
12880
12881 if (CRHS && VT == MVT::i32) {
12882 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
12883 // nb = number of trailing zeroes in mask
12884 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
12885 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
12886 uint64_t Mask = CRHS->getZExtValue();
12887 unsigned Bits = llvm::popcount(Mask);
12888 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
12889 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
12890 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
12891 unsigned Shift = CShift->getZExtValue();
12892 unsigned NB = CRHS->getAPIntValue().countr_zero();
12893 unsigned Offset = NB + Shift;
12894 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
12895 SDLoc SL(N);
12896 SDValue BFE =
12897 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
12898 DAG.getConstant(Offset, SL, MVT::i32),
12899 DAG.getConstant(Bits, SL, MVT::i32));
12900 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
12901 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
12902 DAG.getValueType(NarrowVT));
12903 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
12904 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
12905 return Shl;
12906 }
12907 }
12908 }
12909
12910 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12911 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
12912 isa<ConstantSDNode>(LHS.getOperand(2))) {
12913 uint32_t Sel = getConstantPermuteMask(Mask);
12914 if (!Sel)
12915 return SDValue();
12916
12917 // Select 0xc for all zero bytes
12918 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
12919 SDLoc DL(N);
12920 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12921 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
12922 }
12923 }
12924
12925 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
12926 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
12927 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
12928 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
12929 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
12930
12931 SDValue X = LHS.getOperand(0);
12932 SDValue Y = RHS.getOperand(0);
12933 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
12934 !isTypeLegal(X.getValueType()))
12935 return SDValue();
12936
12937 if (LCC == ISD::SETO) {
12938 if (X != LHS.getOperand(1))
12939 return SDValue();
12940
12941 if (RCC == ISD::SETUNE) {
12942 const ConstantFPSDNode *C1 =
12943 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
12944 if (!C1 || !C1->isInfinity() || C1->isNegative())
12945 return SDValue();
12946
12947 const uint32_t Mask = SIInstrFlags::N_NORMAL |
12948 SIInstrFlags::N_SUBNORMAL | SIInstrFlags::N_ZERO |
12949 SIInstrFlags::P_ZERO | SIInstrFlags::P_SUBNORMAL |
12950 SIInstrFlags::P_NORMAL;
12951
12952 static_assert(
12953 ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
12954 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
12955 0x3ff) == Mask,
12956 "mask not equal");
12957
12958 SDLoc DL(N);
12959 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
12960 DAG.getConstant(Mask, DL, MVT::i32));
12961 }
12962 }
12963 }
12964
12965 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
12966 std::swap(LHS, RHS);
12967
12968 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12969 RHS.hasOneUse()) {
12970 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
12971 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
12972 // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
12973 // | n_nan)
12974 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
12975 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
12976 (RHS.getOperand(0) == LHS.getOperand(0) &&
12977 LHS.getOperand(0) == LHS.getOperand(1))) {
12978 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
12979 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
12980 : Mask->getZExtValue() & OrdMask;
12981
12982 SDLoc DL(N);
12983 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
12984 DAG.getConstant(NewMask, DL, MVT::i32));
12985 }
12986 }
12987
12988 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
12989 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
12990 // and x, (sext cc from i1) => select cc, x, 0
12991 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
12992 std::swap(LHS, RHS);
12993 if (isBoolSGPR(RHS.getOperand(0)))
12994 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
12995 DAG.getConstant(0, SDLoc(N), MVT::i32));
12996 }
12997
12998 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12999 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13000 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13001 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13002 uint32_t LHSMask = getPermuteMask(LHS);
13003 uint32_t RHSMask = getPermuteMask(RHS);
13004 if (LHSMask != ~0u && RHSMask != ~0u) {
13005 // Canonicalize the expression in an attempt to have fewer unique masks
13006 // and therefore fewer registers used to hold the masks.
13007 if (LHSMask > RHSMask) {
13008 std::swap(LHSMask, RHSMask);
13009 std::swap(LHS, RHS);
13010 }
13011
13012 // Select 0xc for each lane used from source operand. Zero has 0xc mask
13013 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
13014 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13015 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13016
13017 // Check if we need to combine values from two sources within a byte.
13018 if (!(LHSUsedLanes & RHSUsedLanes) &&
13019 // If we select high and lower word keep it for SDWA.
13020 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
13021 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13022 // Each byte in each mask is either selector mask 0-3, or has higher
13023 // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
13024 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
13025 // mask which is not 0xff wins. By anding both masks we have a correct
13026 // result except that 0x0c shall be corrected to give 0x0c only.
13027 uint32_t Mask = LHSMask & RHSMask;
13028 for (unsigned I = 0; I < 32; I += 8) {
13029 uint32_t ByteSel = 0xff << I;
13030 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13031 Mask &= (0x0c << I) & 0xffffffff;
13032 }
13033
13034 // Add 4 to each active LHS lane. It will not affect any existing 0xff
13035 // or 0x0c.
13036 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
13037 SDLoc DL(N);
13038
13039 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13040 RHS.getOperand(0),
13041 DAG.getConstant(Sel, DL, MVT::i32));
13042 }
13043 }
13044 }
13045
13046 return SDValue();
13047}
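// Illustrative example (editorial note, not from the upstream source): a
// concrete instance of the BFE combine above. For `and (srl x, 8), 0xff00`,
// Bits = 8, NB = 8 and Offset = 16, so the combine produces
// `shl (AssertZext i8 (bfe_u32 x, 16, 8)), 8`, which places bits 16-23 of x
// into bits 8-15 of the result, matching the original expression.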
13048
13049// A key component of v_perm is a mapping between byte position of the src
13050// operands, and the byte position of the dest. To provide such, we need: 1. the
13051// node that provides x byte of the dest of the OR, and 2. the byte of the node
13052// used to provide that x byte. calculateByteProvider finds which node provides
13053// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
13054 // and finds an ultimate src and byte position. For example: The supported
13055// LoadCombine pattern for vector loads is as follows
13056 //                          t1
13057 //                          or
13058 //                        /    \
13059 //                      t2      t3
13060 //                    zext     shl
13061 //                      |      |  \
13062 //                     t4     t5   16
13063 //                     or   anyext
13064 //                   /    \    |
13065 //                  t6    t7   t8
13066 //                 srl   shl   or
13067 //                 / |   / \   / \
13068 //               t9 t10 t11 t12 t13 t14
13069 //            trunc* 8 trunc* 8  and  and
13070 //               |        |     / |   | \
13071 //              t15      t16  t17 t18 t19 t20
13072 //                            trunc* 255 srl -256
13073 //                               |        / \
13074 //                              t15     t15   16
13075//
13076// *In this example, the truncs are from i32->i16
13077//
13078// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
13079// respectively. calculateSrcByte would find (given node) -> ultimate src &
13080 // byte position: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
13081// After finding the mapping, we can combine the tree into vperm t15, t16,
13082// 0x05000407
13083
13084// Find the source and byte position from a node.
13085// \p DestByte is the byte position of the dest of the or that the src
13086// ultimately provides. \p SrcIndex is the byte of the src that maps to this
13087// dest of the or byte. \p Depth tracks how many recursive iterations we have
13088// performed.
13089static const std::optional<ByteProvider<SDValue>>
13090calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
13091 unsigned Depth = 0) {
13092 // We may need to recursively traverse a series of SRLs
13093 if (Depth >= 6)
13094 return std::nullopt;
13095
13096 if (Op.getValueSizeInBits() < 8)
13097 return std::nullopt;
13098
13099 if (Op.getValueType().isVector())
13100 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13101
13102 switch (Op->getOpcode()) {
13103 case ISD::TRUNCATE: {
13104 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13105 }
13106
13107 case ISD::SIGN_EXTEND:
13108 case ISD::ZERO_EXTEND:
13109 case ISD::SIGN_EXTEND_INREG: {
13110 SDValue NarrowOp = Op->getOperand(0);
13111 auto NarrowVT = NarrowOp.getValueType();
13112 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
13113 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13114 NarrowVT = VTSign->getVT();
13115 }
13116 if (!NarrowVT.isByteSized())
13117 return std::nullopt;
13118 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
13119
13120 if (SrcIndex >= NarrowByteWidth)
13121 return std::nullopt;
13122 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13123 }
13124
13125 case ISD::SRA:
13126 case ISD::SRL: {
13127 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13128 if (!ShiftOp)
13129 return std::nullopt;
13130
13131 uint64_t BitShift = ShiftOp->getZExtValue();
13132
13133 if (BitShift % 8 != 0)
13134 return std::nullopt;
13135
13136 SrcIndex += BitShift / 8;
13137
13138 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13139 }
13140
13141 default: {
13142 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13143 }
13144 }
13145 llvm_unreachable("fully handled switch");
13146}
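// Illustrative example (editorial note, not from the upstream source): a call
// such as calculateSrcByte(srl x, 16, /*DestByte=*/0) takes the SRL case,
// adds 16 / 8 = 2 to SrcIndex, and ends in the default case returning
// getSrc(x, 0, 2), i.e. byte 2 of x ultimately provides the requested byte.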
13147
13148// For a byte position in the result of an Or, traverse the tree and find the
13149// node (and the byte of the node) which ultimately provides this {Or,
13150// BytePosition}. \p Op is the operand we are currently examining. \p Index is
13151// the byte position of the Op that corresponds with the originally requested
13152// byte of the Or \p Depth tracks how many recursive iterations we have
13153// performed. \p StartingIndex is the originally requested byte of the Or
13154static const std::optional<ByteProvider<SDValue>>
13155calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
13156 unsigned StartingIndex = 0) {
13157 // Finding Src tree of RHS of or typically requires at least 1 additional
13158 // depth
13159 if (Depth > 6)
13160 return std::nullopt;
13161
13162 unsigned BitWidth = Op.getScalarValueSizeInBits();
13163 if (BitWidth % 8 != 0)
13164 return std::nullopt;
13165 if (Index > BitWidth / 8 - 1)
13166 return std::nullopt;
13167
13168 bool IsVec = Op.getValueType().isVector();
13169 switch (Op.getOpcode()) {
13170 case ISD::OR: {
13171 if (IsVec)
13172 return std::nullopt;
13173
13174 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
13175 StartingIndex);
13176 if (!RHS)
13177 return std::nullopt;
13178 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13179 StartingIndex);
13180 if (!LHS)
13181 return std::nullopt;
13182 // A well formed Or will have two ByteProviders for each byte, one of which
13183 // is constant zero
13184 if (!LHS->isConstantZero() && !RHS->isConstantZero())
13185 return std::nullopt;
13186 if (!LHS || LHS->isConstantZero())
13187 return RHS;
13188 if (!RHS || RHS->isConstantZero())
13189 return LHS;
13190 return std::nullopt;
13191 }
13192
13193 case ISD::AND: {
13194 if (IsVec)
13195 return std::nullopt;
13196
13197 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13198 if (!BitMaskOp)
13199 return std::nullopt;
13200
13201 uint32_t BitMask = BitMaskOp->getZExtValue();
13202 // Bits we expect for our StartingIndex
13203 uint32_t IndexMask = 0xFF << (Index * 8);
13204
13205 if ((IndexMask & BitMask) != IndexMask) {
13206 // If the result of the and partially provides the byte, then it
13207 // is not well formatted
13208 if (IndexMask & BitMask)
13209 return std::nullopt;
13210 return ByteProvider<SDValue>::getConstantZero();
13211 }
13212
13213 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
13214 }
13215
13216 case ISD::FSHR: {
13217 if (IsVec)
13218 return std::nullopt;
13219
13220 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
13221 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13222 if (!ShiftOp || Op.getValueType().isVector())
13223 return std::nullopt;
13224
13225 uint64_t BitsProvided = Op.getValueSizeInBits();
13226 if (BitsProvided % 8 != 0)
13227 return std::nullopt;
13228
13229 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13230 if (BitShift % 8)
13231 return std::nullopt;
13232
13233 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13234 uint64_t ByteShift = BitShift / 8;
13235
13236 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13237 uint64_t BytesProvided = BitsProvided / 8;
13238 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13239 NewIndex %= BytesProvided;
13240 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
13241 }
13242
13243 case ISD::SRA:
13244 case ISD::SRL: {
13245 if (IsVec)
13246 return std::nullopt;
13247
13248 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13249 if (!ShiftOp)
13250 return std::nullopt;
13251
13252 uint64_t BitShift = ShiftOp->getZExtValue();
13253 if (BitShift % 8)
13254 return std::nullopt;
13255
13256 auto BitsProvided = Op.getScalarValueSizeInBits();
13257 if (BitsProvided % 8 != 0)
13258 return std::nullopt;
13259
13260 uint64_t BytesProvided = BitsProvided / 8;
13261 uint64_t ByteShift = BitShift / 8;
13262 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
13263 // If the byte we are trying to provide (as tracked by index) falls in this
13264 // range, then the SRL provides the byte. The byte of interest of the src of
13265 // the SRL is Index + ByteShift
13266 return BytesProvided - ByteShift > Index
13267 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
13268 Index + ByteShift)
13269 : ByteProvider<SDValue>::getConstantZero();
13270 }
13271
13272 case ISD::SHL: {
13273 if (IsVec)
13274 return std::nullopt;
13275
13276 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13277 if (!ShiftOp)
13278 return std::nullopt;
13279
13280 uint64_t BitShift = ShiftOp->getZExtValue();
13281 if (BitShift % 8 != 0)
13282 return std::nullopt;
13283 uint64_t ByteShift = BitShift / 8;
13284
13285 // If we are shifting by an amount greater than (or equal to)
13286 // the index we are trying to provide, then it provides 0s. If not,
13287 // then this bytes are not definitively 0s, and the corresponding byte
13288 // of interest is Index - ByteShift of the src
13289 return Index < ByteShift
13290 ? ByteProvider<SDValue>::getConstantZero()
13291 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
13292 Depth + 1, StartingIndex);
13293 }
13294 case ISD::ANY_EXTEND:
13295 case ISD::SIGN_EXTEND:
13296 case ISD::ZERO_EXTEND:
13297 case ISD::SIGN_EXTEND_INREG:
13298 case ISD::AssertZext:
13299 case ISD::AssertSext: {
13300 if (IsVec)
13301 return std::nullopt;
13302
13303 SDValue NarrowOp = Op->getOperand(0);
13304 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
13305 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
13306 Op->getOpcode() == ISD::AssertZext ||
13307 Op->getOpcode() == ISD::AssertSext) {
13308 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13309 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13310 }
13311 if (NarrowBitWidth % 8 != 0)
13312 return std::nullopt;
13313 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13314
13315 if (Index >= NarrowByteWidth)
13316 return Op.getOpcode() == ISD::ZERO_EXTEND
13317 ? std::optional<ByteProvider<SDValue>>(
13318 ByteProvider<SDValue>::getConstantZero())
13319 : std::nullopt;
13320 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
13321 }
13322
13323 case ISD::TRUNCATE: {
13324 if (IsVec)
13325 return std::nullopt;
13326
13327 uint64_t NarrowByteWidth = BitWidth / 8;
13328
13329 if (NarrowByteWidth >= Index) {
13330 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13331 StartingIndex);
13332 }
13333
13334 return std::nullopt;
13335 }
13336
13337 case ISD::CopyFromReg: {
13338 if (BitWidth / 8 > Index)
13339 return calculateSrcByte(Op, StartingIndex, Index);
13340
13341 return std::nullopt;
13342 }
13343
13344 case ISD::LOAD: {
13345 auto *L = cast<LoadSDNode>(Op.getNode());
13346
13347 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13348 if (NarrowBitWidth % 8 != 0)
13349 return std::nullopt;
13350 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13351
13352 // If the width of the load does not reach the byte we are trying to provide for,
13353 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
13354 // question
13355 if (Index >= NarrowByteWidth) {
13356 return L->getExtensionType() == ISD::ZEXTLOAD
13357 ? std::optional<ByteProvider<SDValue>>(
13358 ByteProvider<SDValue>::getConstantZero())
13359 : std::nullopt;
13360 }
13361
13362 if (NarrowByteWidth > Index) {
13363 return calculateSrcByte(Op, StartingIndex, Index);
13364 }
13365
13366 return std::nullopt;
13367 }
13368
13369 case ISD::BSWAP: {
13370 if (IsVec)
13371 return std::nullopt;
13372
13373 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
13374 Depth + 1, StartingIndex);
13375 }
13376
13377 case ISD::EXTRACT_VECTOR_ELT: {
13378 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13379 if (!IdxOp)
13380 return std::nullopt;
13381 auto VecIdx = IdxOp->getZExtValue();
13382 auto ScalarSize = Op.getScalarValueSizeInBits();
13383 if (ScalarSize < 32)
13384 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13385 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
13386 StartingIndex, Index);
13387 }
13388
13389 case AMDGPUISD::PERM: {
13390 if (IsVec)
13391 return std::nullopt;
13392
13393 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13394 if (!PermMask)
13395 return std::nullopt;
13396
13397 auto IdxMask =
13398 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13399 if (IdxMask > 0x07 && IdxMask != 0x0c)
13400 return std::nullopt;
13401
13402 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13403 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13404
13405 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
13406 : ByteProvider<SDValue>(
13407 ByteProvider<SDValue>::getConstantZero());
13408 }
13409
13410 default: {
13411 return std::nullopt;
13412 }
13413 }
13414
13415 llvm_unreachable("fully handled switch");
13416}
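// Illustrative example (editorial note, not from the upstream source): for the
// SHL case above, calculateByteProvider(shl x, 8, Index = 0) reports a
// constant-zero byte (Index < ByteShift), while Index = 2 recurses into x
// asking for byte 1, since the shift moves source byte 1 into destination
// byte 2.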
13417
13418// Returns true if the Operand is a scalar and is 16 bits
13419static bool isExtendedFrom16Bits(SDValue &Operand) {
13420
13421 switch (Operand.getOpcode()) {
13422 case ISD::ANY_EXTEND:
13423 case ISD::SIGN_EXTEND:
13424 case ISD::ZERO_EXTEND: {
13425 auto OpVT = Operand.getOperand(0).getValueType();
13426 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
13427 }
13428 case ISD::LOAD: {
13429 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
13430 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
13431 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
13432 ExtType == ISD::EXTLOAD) {
13433 auto MemVT = L->getMemoryVT();
13434 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
13435 }
13436 return L->getMemoryVT().getSizeInBits() == 16;
13437 }
13438 default:
13439 return false;
13440 }
13441}
13442
13443// Returns true if the mask matches consecutive bytes, and the first byte
13444// begins at a power of 2 byte offset from 0th byte
13445static bool addresses16Bits(int Mask) {
13446 int Low8 = Mask & 0xff;
13447 int Hi8 = (Mask & 0xff00) >> 8;
13448
13449 assert(Low8 < 8 && Hi8 < 8);
13450 // Are the bytes contiguous in the order of increasing addresses.
13451 bool IsConsecutive = (Hi8 - Low8 == 1);
13452 // Is the first byte at location that is aligned for 16 bit instructions.
13453 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
13454 // In this case, we still need code to extract the 16 bit operand, so it
13455 // is better to use i8 v_perm
13456 bool Is16Aligned = !(Low8 % 2);
13457
13458 return IsConsecutive && Is16Aligned;
13459}
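// Illustrative example (editorial note, not from the upstream source): mask
// 0x0504 selects source bytes 4 and 5, which are consecutive and start at an
// even byte, so addresses16Bits returns true; mask 0x0201 selects bytes 1 and
// 2, which are consecutive but start at an odd byte, so it returns false.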
13460
13461// Do not lower into v_perm if the operands are actually 16 bit
13462// and the selected bits (based on PermMask) correspond with two
13463// easily addressable 16 bit operands.
13464 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
13465 SDValue &OtherOp) {
13466 int Low16 = PermMask & 0xffff;
13467 int Hi16 = (PermMask & 0xffff0000) >> 16;
13468
13469 auto TempOp = peekThroughBitcasts(Op);
13470 auto TempOtherOp = peekThroughBitcasts(OtherOp);
13471
13472 auto OpIs16Bit =
13473 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
13474 if (!OpIs16Bit)
13475 return true;
13476
13477 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13478 isExtendedFrom16Bits(TempOtherOp);
13479 if (!OtherOpIs16Bit)
13480 return true;
13481
13482 // Do we cleanly address both
13483 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
13484}
13485
13486 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
13487 unsigned DWordOffset) {
13488 SDValue Ret;
13489
13490 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
13491 // ByteProvider must be at least 8 bits
13492 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
13493
13494 if (TypeSize <= 32)
13495 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
13496
13497 if (Src.getValueType().isVector()) {
13498 auto ScalarTySize = Src.getScalarValueSizeInBits();
13499 auto ScalarTy = Src.getValueType().getScalarType();
13500 if (ScalarTySize == 32) {
13501 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
13502 DAG.getConstant(DWordOffset, SL, MVT::i32));
13503 }
13504 if (ScalarTySize > 32) {
13505 Ret = DAG.getNode(
13506 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
13507 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
13508 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
13509 if (ShiftVal)
13510 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
13511 DAG.getConstant(ShiftVal, SL, MVT::i32));
13512 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13513 }
13514
13515 assert(ScalarTySize < 32);
13516 auto NumElements = TypeSize / ScalarTySize;
13517 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
13518 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
13519 auto NumElementsIn32 = 32 / ScalarTySize;
13520 auto NumAvailElements = DWordOffset < Trunc32Elements
13521 ? NumElementsIn32
13522 : NumElements - NormalizedTrunc;
13523
13524 SmallVector<SDValue, 4> VecSrcs;
13525 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
13526 NumAvailElements);
13527
13528 Ret = DAG.getBuildVector(
13529 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
13530 VecSrcs);
13531 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13532 }
13533
13534 /// Scalar Type
13535 auto ShiftVal = 32 * DWordOffset;
13536 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
13537 DAG.getConstant(ShiftVal, SL, MVT::i32));
13538 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13539}
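// Illustrative example (editorial note, not from the upstream source): for a
// v4i32 source and DWordOffset = 2, the helper above emits
// extract_vector_elt(Src, 2); for an i64 scalar source and DWordOffset = 1 it
// emits srl(Src, 32) followed by a bitcast/truncate to i32.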
13540
13541 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
13542 SelectionDAG &DAG = DCI.DAG;
13543 [[maybe_unused]] EVT VT = N->getValueType(0);
13544 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
13545
13546 // VT is known to be MVT::i32, so we need to provide 4 bytes.
13547 assert(VT == MVT::i32);
13548 for (int i = 0; i < 4; i++) {
13549 // Find the ByteProvider that provides the ith byte of the result of OR
13550 std::optional<ByteProvider<SDValue>> P =
13551 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
13552 // TODO support constantZero
13553 if (!P || P->isConstantZero())
13554 return SDValue();
13555
13556 PermNodes.push_back(*P);
13557 }
13558 if (PermNodes.size() != 4)
13559 return SDValue();
13560
13561 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
13562 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
13563 uint64_t PermMask = 0x00000000;
13564 for (size_t i = 0; i < PermNodes.size(); i++) {
13565 auto PermOp = PermNodes[i];
13566 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
13567 // by sizeof(Src2) = 4
13568 int SrcByteAdjust = 4;
13569
13570 // If the Src uses a byte from a different DWORD, then it corresponds
13571 // with a different source.
13572 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
13573 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
13574 if (SecondSrc)
13575 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
13576 ((PermOp.SrcOffset / 4) != SecondSrc->second))
13577 return SDValue();
13578
13579 // Set the index of the second distinct Src node
13580 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
13581 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
13582 SrcByteAdjust = 0;
13583 }
13584 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
13585 assert(!DAG.getDataLayout().isBigEndian());
13586 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
13587 }
13588 SDLoc DL(N);
13589 SDValue Op = *PermNodes[FirstSrc.first].Src;
13590 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
13591 assert(Op.getValueSizeInBits() == 32);
13592
13593 // Check that we are not just extracting the bytes in order from an op
13594 if (!SecondSrc) {
13595 int Low16 = PermMask & 0xffff;
13596 int Hi16 = (PermMask & 0xffff0000) >> 16;
13597
13598 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
13599 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
13600
13601 // The perm op would really just produce Op. So combine into Op
13602 if (WellFormedLow && WellFormedHi)
13603 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
13604 }
13605
13606 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
13607
13608 if (SecondSrc) {
13609 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
13610 assert(OtherOp.getValueSizeInBits() == 32);
13611 }
13612
13613 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
13614
13615 assert(Op.getValueType().isByteSized() &&
13616 OtherOp.getValueType().isByteSized());
13617
13618 // If the ultimate src is less than 32 bits, then we will only be
13619 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
13620 // CalculateByteProvider would not have returned Op as source if we
13621 // used a byte that is outside its ValueType. Thus, we are free to
13622 // ANY_EXTEND as the extended bits are dont-cares.
13623 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
13624 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
13625
13626 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
13627 DAG.getConstant(PermMask, DL, MVT::i32));
13628 }
13629 return SDValue();
13630}
13631
13632SDValue SITargetLowering::performOrCombine(SDNode *N,
13633 DAGCombinerInfo &DCI) const {
13634 SelectionDAG &DAG = DCI.DAG;
13635 SDValue LHS = N->getOperand(0);
13636 SDValue RHS = N->getOperand(1);
13637
13638 EVT VT = N->getValueType(0);
13639 if (VT == MVT::i1) {
13640 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
13641 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13642 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
13643 SDValue Src = LHS.getOperand(0);
13644 if (Src != RHS.getOperand(0))
13645 return SDValue();
13646
13647 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
13648 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13649 if (!CLHS || !CRHS)
13650 return SDValue();
13651
13652 // Only 10 bits are used.
13653 static const uint32_t MaxMask = 0x3ff;
13654
13655 uint32_t NewMask =
13656 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
13657 SDLoc DL(N);
13658 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
13659 DAG.getConstant(NewMask, DL, MVT::i32));
13660 }
13661
13662 return SDValue();
13663 }
13664
13665 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13666 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
13667 LHS.getOpcode() == AMDGPUISD::PERM &&
13668 isa<ConstantSDNode>(LHS.getOperand(2))) {
13669 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
13670 if (!Sel)
13671 return SDValue();
13672
13673 Sel |= LHS.getConstantOperandVal(2);
13674 SDLoc DL(N);
13675 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13676 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13677 }
13678
13679 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13680 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13681 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13682 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13683
13684 // If all the uses of an or need to extract the individual elements, do not
13685 // attempt to lower into v_perm
13686 auto usesCombinedOperand = [](SDNode *OrUse) {
13687 // If we have any non-vectorized use, then it is a candidate for v_perm
13688 if (OrUse->getOpcode() != ISD::BITCAST ||
13689 !OrUse->getValueType(0).isVector())
13690 return true;
13691
13692 // If we have any non-vectorized use, then it is a candidate for v_perm
13693 for (auto *VUser : OrUse->users()) {
13694 if (!VUser->getValueType(0).isVector())
13695 return true;
13696
13697 // If the use of a vector is a store, then combining via a v_perm
13698 // is beneficial.
13699 // TODO -- whitelist more uses
13700 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
13701 if (VUser->getOpcode() == VectorwiseOp)
13702 return true;
13703 }
13704 return false;
13705 };
13706
13707 if (!any_of(N->users(), usesCombinedOperand))
13708 return SDValue();
13709
13710 uint32_t LHSMask = getPermuteMask(LHS);
13711 uint32_t RHSMask = getPermuteMask(RHS);
13712
13713 if (LHSMask != ~0u && RHSMask != ~0u) {
13714 // Canonicalize the expression in an attempt to have fewer unique masks
13715 // and therefore fewer registers used to hold the masks.
13716 if (LHSMask > RHSMask) {
13717 std::swap(LHSMask, RHSMask);
13718 std::swap(LHS, RHS);
13719 }
13720
13721 // Select 0xc for each lane used from source operand. Zero has 0xc mask
13722 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
13723 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13724 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13725
13726 // Check if we need to combine values from two sources within a byte.
13727 if (!(LHSUsedLanes & RHSUsedLanes) &&
13728 // If we select high and lower word keep it for SDWA.
13729 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
13730 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13731 // Kill zero bytes selected by other mask. Zero value is 0xc.
13732 LHSMask &= ~RHSUsedLanes;
13733 RHSMask &= ~LHSUsedLanes;
13734 // Add 4 to each active LHS lane
13735 LHSMask |= LHSUsedLanes & 0x04040404;
13736 // Combine masks
13737 uint32_t Sel = LHSMask | RHSMask;
13738 SDLoc DL(N);
13739
13740 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13741 RHS.getOperand(0),
13742 DAG.getConstant(Sel, DL, MVT::i32));
13743 }
13744 }
13745 if (LHSMask == ~0u || RHSMask == ~0u) {
13746 if (SDValue Perm = matchPERM(N, DCI))
13747 return Perm;
13748 }
13749 }
13750
13751 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
13752 return SDValue();
13753
13754 // TODO: This could be a generic combine with a predicate for extracting the
13755 // high half of an integer being free.
13756
13757 // (or i64:x, (zero_extend i32:y)) ->
13758 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
13759 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
13760 RHS.getOpcode() != ISD::ZERO_EXTEND)
13761 std::swap(LHS, RHS);
13762
13763 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
13764 SDValue ExtSrc = RHS.getOperand(0);
13765 EVT SrcVT = ExtSrc.getValueType();
13766 if (SrcVT == MVT::i32) {
13767 SDLoc SL(N);
13768 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
13769 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
13770
13771 DCI.AddToWorklist(LowOr.getNode());
13772 DCI.AddToWorklist(HiBits.getNode());
13773
13774 SDValue Vec =
13775 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
13776 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
13777 }
13778 }
13779
13780 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
13781 if (CRHS) {
13782 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
13783 N->getOperand(0), CRHS))
13784 return Split;
13785 }
13786
13787 return SDValue();
13788}
13789
13790SDValue SITargetLowering::performXorCombine(SDNode *N,
13791 DAGCombinerInfo &DCI) const {
13792 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
13793 return RV;
13794
13795 SDValue LHS = N->getOperand(0);
13796 SDValue RHS = N->getOperand(1);
13797
13798 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
13799 SelectionDAG &DAG = DCI.DAG;
13800
13801 EVT VT = N->getValueType(0);
13802 if (CRHS && VT == MVT::i64) {
13803 if (SDValue Split =
13804 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
13805 return Split;
13806 }
13807
13808 // Make sure to apply the 64-bit constant splitting fold before trying to fold
13809 // fneg-like xors into 64-bit select.
13810 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
13811 // This looks like an fneg, try to fold as a source modifier.
13812 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
13813 shouldFoldFNegIntoSrc(N, LHS)) {
13814 // xor (select c, a, b), 0x80000000 ->
13815 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
13816 SDLoc DL(N);
13817 SDValue CastLHS =
13818 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
13819 SDValue CastRHS =
13820 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
13821 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
13822 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
13823 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
13824 LHS->getOperand(0), FNegLHS, FNegRHS);
13825 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
13826 }
13827 }
13828
13829 return SDValue();
13830}
13831
13832SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
13833 DAGCombinerInfo &DCI) const {
13834 if (!Subtarget->has16BitInsts() ||
13835 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
13836 return SDValue();
13837
13838 EVT VT = N->getValueType(0);
13839 if (VT != MVT::i32)
13840 return SDValue();
13841
13842 SDValue Src = N->getOperand(0);
13843 if (Src.getValueType() != MVT::i16)
13844 return SDValue();
13845
13846 return SDValue();
13847}
13848
13849SDValue
13850SITargetLowering::performSignExtendInRegCombine(SDNode *N,
13851 DAGCombinerInfo &DCI) const {
13852 SDValue Src = N->getOperand(0);
13853 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
13854
13855 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
13856 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
13857 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
13858 VTSign->getVT() == MVT::i8) ||
13859 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
13860 VTSign->getVT() == MVT::i16))) {
13861 assert(Subtarget->hasScalarSubwordLoads() &&
13862 "s_buffer_load_{u8, i8} are supported "
13863 "in GFX12 (or newer) architectures.");
13864 EVT VT = Src.getValueType();
13865 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
13866 ? AMDGPUISD::SBUFFER_LOAD_BYTE
13867 : AMDGPUISD::SBUFFER_LOAD_SHORT;
13868 SDLoc DL(N);
13869 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
13870 SDValue Ops[] = {
13871 Src.getOperand(0), // source register
13872 Src.getOperand(1), // offset
13873 Src.getOperand(2) // cachePolicy
13874 };
13875 auto *M = cast<MemSDNode>(Src);
13876 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
13877 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
13878 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
13879 return LoadVal;
13880 }
13881 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
13882 VTSign->getVT() == MVT::i8) ||
13883 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
13884 VTSign->getVT() == MVT::i16)) &&
13885 Src.hasOneUse()) {
13886 auto *M = cast<MemSDNode>(Src);
13887 SDValue Ops[] = {Src.getOperand(0), // Chain
13888 Src.getOperand(1), // rsrc
13889 Src.getOperand(2), // vindex
13890 Src.getOperand(3), // voffset
13891 Src.getOperand(4), // soffset
13892 Src.getOperand(5), // offset
13893 Src.getOperand(6), Src.getOperand(7)};
13894 // replace with BUFFER_LOAD_BYTE/SHORT
13895 SDVTList ResList =
13896 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
13897 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
13898 ? AMDGPUISD::BUFFER_LOAD_BYTE
13899 : AMDGPUISD::BUFFER_LOAD_SHORT;
13900 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
13901 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
13902 return DCI.DAG.getMergeValues(
13903 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
13904 }
13905 return SDValue();
13906}
13907
13908SDValue SITargetLowering::performClassCombine(SDNode *N,
13909 DAGCombinerInfo &DCI) const {
13910 SelectionDAG &DAG = DCI.DAG;
13911 SDValue Mask = N->getOperand(1);
13912
13913 // fp_class x, 0 -> false
13914 if (isNullConstant(Mask))
13915 return DAG.getConstant(0, SDLoc(N), MVT::i1);
13916
13917 if (N->getOperand(0).isUndef())
13918 return DAG.getUNDEF(MVT::i1);
13919
13920 return SDValue();
13921}
13922
13923SDValue SITargetLowering::performRcpCombine(SDNode *N,
13924 DAGCombinerInfo &DCI) const {
13925 EVT VT = N->getValueType(0);
13926 SDValue N0 = N->getOperand(0);
13927
13928 if (N0.isUndef()) {
13929 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
13930 SDLoc(N), VT);
13931 }
13932
13933 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
13934 N0.getOpcode() == ISD::SINT_TO_FP)) {
13935 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
13936 N->getFlags());
13937 }
13938
13939 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
13940 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
13941 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
13942 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
13943 N->getFlags());
13944 }
13945
13947}
13948
13949 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
13950 unsigned MaxDepth) const {
13951 unsigned Opcode = Op.getOpcode();
13952 if (Opcode == ISD::FCANONICALIZE)
13953 return true;
13954
13955 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
13956 const auto &F = CFP->getValueAPF();
13957 if (F.isNaN() && F.isSignaling())
13958 return false;
13959 if (!F.isDenormal())
13960 return true;
13961
13962 DenormalMode Mode =
13963 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
13964 return Mode == DenormalMode::getIEEE();
13965 }
13966
13967 // If source is a result of another standard FP operation it is already in
13968 // canonical form.
13969 if (MaxDepth == 0)
13970 return false;
13971
13972 switch (Opcode) {
13973 // These will flush denorms if required.
13974 case ISD::FADD:
13975 case ISD::FSUB:
13976 case ISD::FMUL:
13977 case ISD::FCEIL:
13978 case ISD::FFLOOR:
13979 case ISD::FMA:
13980 case ISD::FMAD:
13981 case ISD::FSQRT:
13982 case ISD::FDIV:
13983 case ISD::FREM:
13984 case ISD::FP_ROUND:
13985 case ISD::FP_EXTEND:
13986 case ISD::FP16_TO_FP:
13987 case ISD::FP_TO_FP16:
13988 case ISD::BF16_TO_FP:
13989 case ISD::FP_TO_BF16:
13990 case ISD::FLDEXP:
13991 case AMDGPUISD::FMUL_LEGACY:
13992 case AMDGPUISD::FMAD_FTZ:
13993 case AMDGPUISD::RCP:
13994 case AMDGPUISD::RSQ:
13995 case AMDGPUISD::RSQ_CLAMP:
13996 case AMDGPUISD::RCP_LEGACY:
13997 case AMDGPUISD::RCP_IFLAG:
13998 case AMDGPUISD::LOG:
13999 case AMDGPUISD::EXP:
14000 case AMDGPUISD::DIV_SCALE:
14001 case AMDGPUISD::DIV_FMAS:
14002 case AMDGPUISD::DIV_FIXUP:
14003 case AMDGPUISD::FRACT:
14004 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14005 case AMDGPUISD::CVT_F32_UBYTE0:
14006 case AMDGPUISD::CVT_F32_UBYTE1:
14007 case AMDGPUISD::CVT_F32_UBYTE2:
14008 case AMDGPUISD::CVT_F32_UBYTE3:
14009 case AMDGPUISD::FP_TO_FP16:
14010 case AMDGPUISD::SIN_HW:
14011 case AMDGPUISD::COS_HW:
14012 return true;
14013
14014 // It can/will be lowered or combined as a bit operation.
14015 // Need to check their input recursively to handle.
14016 case ISD::FNEG:
14017 case ISD::FABS:
14018 case ISD::FCOPYSIGN:
14019 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14020
14021 case ISD::AND:
14022 if (Op.getValueType() == MVT::i32) {
14023 // Be careful as we only know it is a bitcast floating point type. It
14024 // could be f32, v2f16, we have no way of knowing. Luckily the constant
14025 // value that we optimize for, which comes up in fp32 to bf16 conversions,
14026 // is valid to optimize for all types.
14027 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
14028 if (RHS->getZExtValue() == 0xffff0000) {
14029 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14030 }
14031 }
14032 }
14033 break;
14034
14035 case ISD::FSIN:
14036 case ISD::FCOS:
14037 case ISD::FSINCOS:
14038 return Op.getValueType().getScalarType() != MVT::f16;
14039
14040 case ISD::FMINNUM:
14041 case ISD::FMAXNUM:
14042 case ISD::FMINNUM_IEEE:
14043 case ISD::FMAXNUM_IEEE:
14044 case ISD::FMINIMUM:
14045 case ISD::FMAXIMUM:
14046 case ISD::FMINIMUMNUM:
14047 case ISD::FMAXIMUMNUM:
14048 case AMDGPUISD::CLAMP:
14049 case AMDGPUISD::FMED3:
14050 case AMDGPUISD::FMAX3:
14051 case AMDGPUISD::FMIN3:
14052 case AMDGPUISD::FMAXIMUM3:
14053 case AMDGPUISD::FMINIMUM3: {
14054 // FIXME: Shouldn't treat the generic operations different based these.
14055 // However, we aren't really required to flush the result from
14056 // minnum/maxnum..
14057
14058 // snans will be quieted, so we only need to worry about denormals.
14059 if (Subtarget->supportsMinMaxDenormModes() ||
14060 // FIXME: denormalsEnabledForType is broken for dynamic
14061 denormalsEnabledForType(DAG, Op.getValueType()))
14062 return true;
14063
14064 // Flushing may be required.
14065 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
14066 // targets need to check their input recursively.
14067
14068 // FIXME: Does this apply with clamp? It's implemented with max.
14069 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
14070 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
14071 return false;
14072 }
14073
14074 return true;
14075 }
14076 case ISD::SELECT: {
14077 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
14078 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
14079 }
14080 case ISD::BUILD_VECTOR: {
14081 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
14082 SDValue SrcOp = Op.getOperand(i);
14083 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
14084 return false;
14085 }
14086
14087 return true;
14088 }
14089 case ISD::EXTRACT_VECTOR_ELT:
14090 case ISD::EXTRACT_SUBVECTOR: {
14091 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14092 }
14093 case ISD::INSERT_VECTOR_ELT: {
14094 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
14095 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
14096 }
14097 case ISD::UNDEF:
14098 // Could be anything.
14099 return false;
14100
14101 case ISD::BITCAST:
14102 // TODO: This is incorrect as it loses track of the operand's type. We may
14103 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
14104 // same bits that are canonicalized in one type need not be in the other.
14105 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14106 case ISD::TRUNCATE: {
14107 // Hack round the mess we make when legalizing extract_vector_elt
14108 if (Op.getValueType() == MVT::i16) {
14109 SDValue TruncSrc = Op.getOperand(0);
14110 if (TruncSrc.getValueType() == MVT::i32 &&
14111 TruncSrc.getOpcode() == ISD::BITCAST &&
14112 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
14113 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
14114 }
14115 }
14116 return false;
14117 }
14118 case ISD::INTRINSIC_WO_CHAIN: {
14119 unsigned IntrinsicID = Op.getConstantOperandVal(0);
14120 // TODO: Handle more intrinsics
14121 switch (IntrinsicID) {
14122 case Intrinsic::amdgcn_cvt_pkrtz:
14123 case Intrinsic::amdgcn_cubeid:
14124 case Intrinsic::amdgcn_frexp_mant:
14125 case Intrinsic::amdgcn_fdot2:
14126 case Intrinsic::amdgcn_rcp:
14127 case Intrinsic::amdgcn_rsq:
14128 case Intrinsic::amdgcn_rsq_clamp:
14129 case Intrinsic::amdgcn_rcp_legacy:
14130 case Intrinsic::amdgcn_rsq_legacy:
14131 case Intrinsic::amdgcn_trig_preop:
14132 case Intrinsic::amdgcn_tanh:
14133 case Intrinsic::amdgcn_log:
14134 case Intrinsic::amdgcn_exp2:
14135 case Intrinsic::amdgcn_sqrt:
14136 return true;
14137 default:
14138 break;
14139 }
14140
14141 break;
14142 }
14143 default:
14144 break;
14145 }
14146
14147 // FIXME: denormalsEnabledForType is broken for dynamic
14148 return denormalsEnabledForType(DAG, Op.getValueType()) &&
14149 DAG.isKnownNeverSNaN(Op);
14150}
14151
14152 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
14153 unsigned MaxDepth) const {
14154 const MachineRegisterInfo &MRI = MF.getRegInfo();
14155 MachineInstr *MI = MRI.getVRegDef(Reg);
14156 unsigned Opcode = MI->getOpcode();
14157
14158 if (Opcode == AMDGPU::G_FCANONICALIZE)
14159 return true;
14160
14161 std::optional<FPValueAndVReg> FCR;
14162 // Constant splat (can be padded with undef) or scalar constant.
14163 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
14164 if (FCR->Value.isSignaling())
14165 return false;
14166 if (!FCR->Value.isDenormal())
14167 return true;
14168
14169 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
14170 return Mode == DenormalMode::getIEEE();
14171 }
14172
14173 if (MaxDepth == 0)
14174 return false;
14175
14176 switch (Opcode) {
14177 case AMDGPU::G_FADD:
14178 case AMDGPU::G_FSUB:
14179 case AMDGPU::G_FMUL:
14180 case AMDGPU::G_FCEIL:
14181 case AMDGPU::G_FFLOOR:
14182 case AMDGPU::G_FRINT:
14183 case AMDGPU::G_FNEARBYINT:
14184 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14185 case AMDGPU::G_INTRINSIC_TRUNC:
14186 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14187 case AMDGPU::G_FMA:
14188 case AMDGPU::G_FMAD:
14189 case AMDGPU::G_FSQRT:
14190 case AMDGPU::G_FDIV:
14191 case AMDGPU::G_FREM:
14192 case AMDGPU::G_FPOW:
14193 case AMDGPU::G_FPEXT:
14194 case AMDGPU::G_FLOG:
14195 case AMDGPU::G_FLOG2:
14196 case AMDGPU::G_FLOG10:
14197 case AMDGPU::G_FPTRUNC:
14198 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14199 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14200 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14201 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14202 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14203 return true;
14204 case AMDGPU::G_FNEG:
14205 case AMDGPU::G_FABS:
14206 case AMDGPU::G_FCOPYSIGN:
14207 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
14208 case AMDGPU::G_FMINNUM:
14209 case AMDGPU::G_FMAXNUM:
14210 case AMDGPU::G_FMINNUM_IEEE:
14211 case AMDGPU::G_FMAXNUM_IEEE:
14212 case AMDGPU::G_FMINIMUM:
14213 case AMDGPU::G_FMAXIMUM:
14214 case AMDGPU::G_FMINIMUMNUM:
14215 case AMDGPU::G_FMAXIMUMNUM: {
14216 if (Subtarget->supportsMinMaxDenormModes() ||
14217 // FIXME: denormalsEnabledForType is broken for dynamic
14218 denormalsEnabledForType(MRI.getType(Reg), MF))
14219 return true;
14220
14221 [[fallthrough]];
14222 }
14223 case AMDGPU::G_BUILD_VECTOR:
14224 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
14225 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
14226 return false;
14227 return true;
14228 case AMDGPU::G_INTRINSIC:
14229 case AMDGPU::G_INTRINSIC_CONVERGENT:
14230 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
14231 case Intrinsic::amdgcn_fmul_legacy:
14232 case Intrinsic::amdgcn_fmad_ftz:
14233 case Intrinsic::amdgcn_sqrt:
14234 case Intrinsic::amdgcn_fmed3:
14235 case Intrinsic::amdgcn_sin:
14236 case Intrinsic::amdgcn_cos:
14237 case Intrinsic::amdgcn_log:
14238 case Intrinsic::amdgcn_exp2:
14239 case Intrinsic::amdgcn_log_clamp:
14240 case Intrinsic::amdgcn_rcp:
14241 case Intrinsic::amdgcn_rcp_legacy:
14242 case Intrinsic::amdgcn_rsq:
14243 case Intrinsic::amdgcn_rsq_clamp:
14244 case Intrinsic::amdgcn_rsq_legacy:
14245 case Intrinsic::amdgcn_div_scale:
14246 case Intrinsic::amdgcn_div_fmas:
14247 case Intrinsic::amdgcn_div_fixup:
14248 case Intrinsic::amdgcn_fract:
14249 case Intrinsic::amdgcn_cvt_pkrtz:
14250 case Intrinsic::amdgcn_cubeid:
14251 case Intrinsic::amdgcn_cubema:
14252 case Intrinsic::amdgcn_cubesc:
14253 case Intrinsic::amdgcn_cubetc:
14254 case Intrinsic::amdgcn_frexp_mant:
14255 case Intrinsic::amdgcn_fdot2:
14256 case Intrinsic::amdgcn_trig_preop:
14257 case Intrinsic::amdgcn_tanh:
14258 return true;
14259 default:
14260 break;
14261 }
14262
14263 [[fallthrough]];
14264 default:
14265 return false;
14266 }
14267
14268 llvm_unreachable("invalid operation");
14269}
14270
14271// Constant fold canonicalize.
14272SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
14273 const SDLoc &SL, EVT VT,
14274 const APFloat &C) const {
14275 // Flush denormals to 0 if not enabled.
14276 if (C.isDenormal()) {
14277 DenormalMode Mode =
14278 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
14279 if (Mode == DenormalMode::getPreserveSign()) {
14280 return DAG.getConstantFP(
14281 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
14282 }
14283
14284 if (Mode != DenormalMode::getIEEE())
14285 return SDValue();
14286 }
14287
14288 if (C.isNaN()) {
14289 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
14290 if (C.isSignaling()) {
14291 // Quiet a signaling NaN.
14292 // FIXME: Is this supposed to preserve payload bits?
14293 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14294 }
14295
14296 // Make sure it is the canonical NaN bitpattern.
14297 //
14298 // TODO: Can we use -1 as the canonical NaN value since it's an inline
14299 // immediate?
14300 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
14301 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14302 }
14303
14304 // Already canonical.
14305 return DAG.getConstantFP(C, SL, VT);
14306}
14307
14308 static bool vectorEltWillFoldAway(SDValue Op) {
14309 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
14310}
14311
14312SDValue
14313SITargetLowering::performFCanonicalizeCombine(SDNode *N,
14314 DAGCombinerInfo &DCI) const {
14315 SelectionDAG &DAG = DCI.DAG;
14316 SDValue N0 = N->getOperand(0);
14317 EVT VT = N->getValueType(0);
14318
14319 // fcanonicalize undef -> qnan
14320 if (N0.isUndef()) {
14321 APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics());
14322 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
14323 }
14324
14325 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
14326 EVT VT = N->getValueType(0);
14327 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
14328 }
14329
14330 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
14331 // (fcanonicalize k)
14332 //
14333 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
14334
14335 // TODO: This could be better with wider vectors that will be split to v2f16,
14336 // and to consider uses since there aren't that many packed operations.
14337 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
14338 isTypeLegal(MVT::v2f16)) {
14339 SDLoc SL(N);
14340 SDValue NewElts[2];
14341 SDValue Lo = N0.getOperand(0);
14342 SDValue Hi = N0.getOperand(1);
14343 EVT EltVT = Lo.getValueType();
14344
14346 for (unsigned I = 0; I != 2; ++I) {
14347 SDValue Op = N0.getOperand(I);
14348 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14349 NewElts[I] =
14350 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14351 } else if (Op.isUndef()) {
14352 // Handled below based on what the other operand is.
14353 NewElts[I] = Op;
14354 } else {
14355 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
14356 }
14357 }
14358
14359 // If one half is undef, and one is constant, prefer a splat vector rather
14360 // than the normal qNaN. If it's a register, prefer 0.0 since that's
14361 // cheaper to use and may be free with a packed operation.
14362 if (NewElts[0].isUndef()) {
14363 if (isa<ConstantFPSDNode>(NewElts[1]))
14364 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
14365 ? NewElts[1]
14366 : DAG.getConstantFP(0.0f, SL, EltVT);
14367 }
14368
14369 if (NewElts[1].isUndef()) {
14370 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
14371 ? NewElts[0]
14372 : DAG.getConstantFP(0.0f, SL, EltVT);
14373 }
14374
14375 return DAG.getBuildVector(VT, SL, NewElts);
14376 }
14377 }
14378
14379 return SDValue();
14380}
14381
14382static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
14383 switch (Opc) {
14384 case ISD::FMAXNUM:
14385 case ISD::FMAXNUM_IEEE:
14386 case ISD::FMAXIMUMNUM:
14387 return AMDGPUISD::FMAX3;
14388 case ISD::FMAXIMUM:
14389 return AMDGPUISD::FMAXIMUM3;
14390 case ISD::SMAX:
14391 return AMDGPUISD::SMAX3;
14392 case ISD::UMAX:
14393 return AMDGPUISD::UMAX3;
14394 case ISD::FMINNUM:
14395 case ISD::FMINNUM_IEEE:
14396 case ISD::FMINIMUMNUM:
14397 return AMDGPUISD::FMIN3;
14398 case ISD::FMINIMUM:
14399 return AMDGPUISD::FMINIMUM3;
14400 case ISD::SMIN:
14401 return AMDGPUISD::SMIN3;
14402 case ISD::UMIN:
14403 return AMDGPUISD::UMIN3;
14404 default:
14405 llvm_unreachable("Not a min/max opcode");
14406 }
14407}
14408
14409SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
14410 const SDLoc &SL, SDValue Src,
14411 SDValue MinVal,
14412 SDValue MaxVal,
14413 bool Signed) const {
14414
14415 // med3 comes from
14416 // min(max(x, K0), K1), K0 < K1
14417 // max(min(x, K0), K1), K1 < K0
14418 //
14419 // "MinVal" and "MaxVal" respectively refer to the rhs of the
14420 // min/max op.
14421 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
14422 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
14423
14424 if (!MinK || !MaxK)
14425 return SDValue();
14426
14427 if (Signed) {
14428 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
14429 return SDValue();
14430 } else {
14431 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
14432 return SDValue();
14433 }
14434
14435 EVT VT = MinK->getValueType(0);
14436 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
14437 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
14438 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
14439
14440 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
14441 // not available, but this is unlikely to be profitable as constants
14442 // will often need to be materialized & extended, especially on
14443 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
14444 return SDValue();
14445}
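// Illustrative example (editorial note, not from the upstream source): the
// classic byte-clamp pattern min(max(x, 0), 255) reaches this combine with
// MaxVal = 0 and MinVal = 255; since 0 < 255 the signed check passes and the
// result is smed3(x, 0, 255) for i32 (or for i16 when med3_16 is available).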
14446
14447 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
14448 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
14449 return C;
14450
14451 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
14452 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
14453 return C;
14454 }
14455
14456 return nullptr;
14457}
14458
14459SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
14460 const SDLoc &SL, SDValue Op0,
14461 SDValue Op1) const {
14462 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
14463 if (!K1)
14464 return SDValue();
14465
14466 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
14467 if (!K0)
14468 return SDValue();
14469
14470 // Ordered >= (although NaN inputs should have folded away by now).
14471 if (K0->getValueAPF() > K1->getValueAPF())
14472 return SDValue();
14473
14474 // med3 with a nan input acts like
14475 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
14476 //
14477  // So with a signaling nan input, the result depends on whether the IEEE
14478  // mode bit is enabled or not.
14479 // ieee=1
14480 // s0 snan: yields s2
14481 // s1 snan: yields s2
14482 // s2 snan: qnan
14483
14484 // s0 qnan: min(s1, s2)
14485 // s1 qnan: min(s0, s2)
14486 // s2 qnan: min(s0, s1)
14487
14488 // ieee=0
14489 // s0 snan: min(s1, s2)
14490 // s1 snan: min(s0, s2)
14491 // s2 snan: qnan
14492
14493 // s0 qnan: min(s1, s2)
14494 // s1 qnan: min(s0, s2)
14495 // s2 qnan: min(s0, s1)
14496 const MachineFunction &MF = DAG.getMachineFunction();
14497 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
14498
14499  // TODO: Check whether the IEEE mode bit is enabled. We can form fmed3 with
14500  // IEEE=0 regardless of whether the input is a signaling nan if op0 is
14501  // fmaximum or fmaximumnum; from fmaxnum_ieee we can only form it if IEEE=1.
14502 EVT VT = Op0.getValueType();
14503 if (Info->getMode().DX10Clamp) {
14504 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
14505 // hardware fmed3 behavior converting to a min.
14506 // FIXME: Should this be allowing -0.0?
14507 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
14508 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
14509 }
14510
14511 // med3 for f16 is only available on gfx9+, and not available for v2f16.
14512 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
14513 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
14514 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
14515 // then give the other result, which is different from med3 with a NaN
14516 // input.
14517 SDValue Var = Op0.getOperand(0);
14518 if (!DAG.isKnownNeverSNaN(Var))
14519 return SDValue();
14520
14521 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14522
14523 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
14524 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
14525 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
14526 SDValue(K0, 0), SDValue(K1, 0));
14527 }
14528 }
14529
14530 return SDValue();
14531}
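// [Editor's note: illustrative sketch, not part of the original file.] For
// the K0 == 0.0, K1 == 1.0 case above, fmed3 degenerates to a 0..1 clamp on
// ordinary inputs; the NaN and -0.0 subtleties are exactly what the
// DX10Clamp check is guarding. refClamp01 is a hypothetical reference, not
// an LLVM API.
static float refClamp01(float X) {
  // Matches v_med3_f32(X, 0.0, 1.0) for non-NaN X; a NaN X falls through
  // both comparisons here, whereas DX10 clamp would return 0.0.
  return X < 0.0f ? 0.0f : (X > 1.0f ? 1.0f : X);
}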
14532
14533/// \return true if the subtarget supports minimum3 and maximum3 with the given
14534/// base min/max opcode \p Opc for type \p VT.
14535static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
14536 EVT VT) {
14537 switch (Opc) {
14538 case ISD::FMINNUM:
14539 case ISD::FMAXNUM:
14540 case ISD::FMINNUM_IEEE:
14541 case ISD::FMAXNUM_IEEE:
14542 case ISD::FMINIMUMNUM:
14543 case ISD::FMAXIMUMNUM:
14546 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
14547 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
14548 case ISD::FMINIMUM:
14549 case ISD::FMAXIMUM:
14550 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
14551 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
14552 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
14553 case ISD::SMAX:
14554 case ISD::SMIN:
14555 case ISD::UMAX:
14556 case ISD::UMIN:
14557 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
14558 default:
14559 return false;
14560 }
14561
14562 llvm_unreachable("not a min/max opcode");
14563}
14564
14565SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
14566 DAGCombinerInfo &DCI) const {
14567 SelectionDAG &DAG = DCI.DAG;
14568
14569 EVT VT = N->getValueType(0);
14570 unsigned Opc = N->getOpcode();
14571 SDValue Op0 = N->getOperand(0);
14572 SDValue Op1 = N->getOperand(1);
14573
14574  // Only do this if the inner op has one use, since otherwise this just
14575  // increases register pressure for no benefit.
14576
14577 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
14578 // max(max(a, b), c) -> max3(a, b, c)
14579 // min(min(a, b), c) -> min3(a, b, c)
14580 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
14581 SDLoc DL(N);
14582 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14583 Op0.getOperand(0), Op0.getOperand(1), Op1);
14584 }
14585
14586 // Try commuted.
14587 // max(a, max(b, c)) -> max3(a, b, c)
14588 // min(a, min(b, c)) -> min3(a, b, c)
14589 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
14590 SDLoc DL(N);
14591 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14592 Op0, Op1.getOperand(0), Op1.getOperand(1));
14593 }
14594 }
14595
14596 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
14597 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
14598 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
14599 if (SDValue Med3 = performIntMed3ImmCombine(
14600 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
14601 return Med3;
14602 }
14603 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
14604 if (SDValue Med3 = performIntMed3ImmCombine(
14605 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
14606 return Med3;
14607 }
14608
14609 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
14610 if (SDValue Med3 = performIntMed3ImmCombine(
14611 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
14612 return Med3;
14613 }
14614 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
14615 if (SDValue Med3 = performIntMed3ImmCombine(
14616 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
14617 return Med3;
14618 }
14619
14620 // if !is_snan(x):
14621 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14622 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14623 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14624 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14625 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
14626 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
14627 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
14628       (Opc == AMDGPUISD::FMIN_LEGACY &&
14629        Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
14630 (VT == MVT::f32 || VT == MVT::f64 ||
14631 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
14632 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
14633 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
14634 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
14635 Op0.hasOneUse()) {
14636 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
14637 return Res;
14638 }
14639
14640 // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
14641  // for some types, but at a higher cost since they are implemented with a
14642  // three-operand form.
14643 const SDNodeFlags Flags = N->getFlags();
14644 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
14645 !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
14646 unsigned NewOpc =
14647 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
14648 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
14649 }
14650
14651 return SDValue();
14652}
14653
14654static bool isClampZeroToOne(SDValue A, SDValue B) {
14655  if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
14656    if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
14657      // FIXME: Should this be allowing -0.0?
14658 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
14659 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
14660 }
14661 }
14662
14663 return false;
14664}
14665
14666// FIXME: Should only worry about snans for version with chain.
14667SDValue SITargetLowering::performFMed3Combine(SDNode *N,
14668 DAGCombinerInfo &DCI) const {
14669 EVT VT = N->getValueType(0);
14670 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
14671 // NaNs. With a NaN input, the order of the operands may change the result.
14672
14673 SelectionDAG &DAG = DCI.DAG;
14674 SDLoc SL(N);
14675
14676 SDValue Src0 = N->getOperand(0);
14677 SDValue Src1 = N->getOperand(1);
14678 SDValue Src2 = N->getOperand(2);
14679
14680 if (isClampZeroToOne(Src0, Src1)) {
14681 // const_a, const_b, x -> clamp is safe in all cases including signaling
14682 // nans.
14683 // FIXME: Should this be allowing -0.0?
14684 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
14685 }
14686
14687 const MachineFunction &MF = DAG.getMachineFunction();
14688 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
14689
14690 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
14691 // handling no dx10-clamp?
14692 if (Info->getMode().DX10Clamp) {
14693    // If NaNs are clamped to 0, we are free to reorder the inputs.
14694
14695 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
14696 std::swap(Src0, Src1);
14697
14698 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
14699 std::swap(Src1, Src2);
14700
14701 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
14702 std::swap(Src0, Src1);
14703
14704 if (isClampZeroToOne(Src1, Src2))
14705 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
14706 }
14707
14708 return SDValue();
14709}
14710
14711SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
14712 DAGCombinerInfo &DCI) const {
14713 SDValue Src0 = N->getOperand(0);
14714 SDValue Src1 = N->getOperand(1);
14715 if (Src0.isUndef() && Src1.isUndef())
14716 return DCI.DAG.getUNDEF(N->getValueType(0));
14717 return SDValue();
14718}
14719
14720// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
14721// expanded into a set of cmp/select instructions.
14722bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
14723                                                unsigned NumElem,
14724 bool IsDivergentIdx,
14725 const GCNSubtarget *Subtarget) {
14726  if (UseDivergentRegisterIndexing)
14727    return false;
14728
14729 unsigned VecSize = EltSize * NumElem;
14730
14731  // Sub-dword vectors of two dwords or less have a better implementation.
14732 if (VecSize <= 64 && EltSize < 32)
14733 return false;
14734
14735  // Always expand the remaining sub-dword cases; otherwise they will be
14736  // lowered via memory.
14737 if (EltSize < 32)
14738 return true;
14739
14740 // Always do this if var-idx is divergent, otherwise it will become a loop.
14741 if (IsDivergentIdx)
14742 return true;
14743
14744 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
14745 unsigned NumInsts = NumElem /* Number of compares */ +
14746 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
14747
14748 // On some architectures (GFX9) movrel is not available and it's better
14749 // to expand.
14750 if (Subtarget->useVGPRIndexMode())
14751 return NumInsts <= 16;
14752
14753 // If movrel is available, use it instead of expanding for vector of 8
14754 // elements.
14755 if (Subtarget->hasMovrel())
14756 return NumInsts <= 15;
14757
14758 return true;
14759}
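// [Editor's note: illustrative sketch, not part of the original file.] The
// cost model above expands a dynamic index into one compare per element plus
// one v_cndmask_b32 per 32-bit slice of each element. A hypothetical
// standalone restatement of that estimate:
static unsigned refDynExtExpansionInsts(unsigned EltSize, unsigned NumElem) {
  unsigned Compares = NumElem;                   // one compare per element
  unsigned CndMasksPerElt = (EltSize + 31) / 32; // 32-bit slices per element
  return Compares + CndMasksPerElt * NumElem;
}
// e.g. a dynamic access into <8 x i64> costs 8 + 2 * 8 = 24 instructions,
// which exceeds both the 16 (VGPR index mode) and 15 (movrel) thresholds, so
// the cmp/select expansion is rejected in favor of indexing.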
14760
14761bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
14762  SDValue Idx = N->getOperand(N->getNumOperands() - 1);
14763 if (isa<ConstantSDNode>(Idx))
14764 return false;
14765
14766 SDValue Vec = N->getOperand(0);
14767 EVT VecVT = Vec.getValueType();
14768 EVT EltVT = VecVT.getVectorElementType();
14769 unsigned EltSize = EltVT.getSizeInBits();
14770 unsigned NumElem = VecVT.getVectorNumElements();
14771
14772  return SITargetLowering::shouldExpandVectorDynExt(
14773      EltSize, NumElem, Idx->isDivergent(), getSubtarget());
14774}
14775
14776SDValue
14777SITargetLowering::performExtractVectorEltCombine(SDNode *N,
14778 DAGCombinerInfo &DCI) const {
14779 SDValue Vec = N->getOperand(0);
14780 SelectionDAG &DAG = DCI.DAG;
14781
14782 EVT VecVT = Vec.getValueType();
14783 EVT VecEltVT = VecVT.getVectorElementType();
14784 EVT ResVT = N->getValueType(0);
14785
14786 unsigned VecSize = VecVT.getSizeInBits();
14787 unsigned VecEltSize = VecEltVT.getSizeInBits();
14788
14789 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
14790      allUsesHaveSourceMods(N)) {
14791    SDLoc SL(N);
14792 SDValue Idx = N->getOperand(1);
14793 SDValue Elt =
14794 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
14795 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
14796 }
14797
14798 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
14799 // =>
14800 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
14801 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
14802 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
14803 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
14804 SDLoc SL(N);
14805 SDValue Idx = N->getOperand(1);
14806 unsigned Opc = Vec.getOpcode();
14807
14808 switch (Opc) {
14809 default:
14810 break;
14811 // TODO: Support other binary operations.
14812 case ISD::FADD:
14813 case ISD::FSUB:
14814 case ISD::FMUL:
14815 case ISD::ADD:
14816 case ISD::UMIN:
14817 case ISD::UMAX:
14818 case ISD::SMIN:
14819 case ISD::SMAX:
14820 case ISD::FMAXNUM:
14821 case ISD::FMINNUM:
14822 case ISD::FMAXNUM_IEEE:
14823 case ISD::FMINNUM_IEEE:
14824 case ISD::FMAXIMUM:
14825 case ISD::FMINIMUM: {
14826 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
14827 Vec.getOperand(0), Idx);
14828 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
14829 Vec.getOperand(1), Idx);
14830
14831 DCI.AddToWorklist(Elt0.getNode());
14832 DCI.AddToWorklist(Elt1.getNode());
14833 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
14834 }
14835 }
14836 }
14837
14838 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
14839  if (shouldExpandVectorDynExt(N)) {
14840    SDLoc SL(N);
14841 SDValue Idx = N->getOperand(1);
14842 SDValue V;
14843 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
14844 SDValue IC = DAG.getVectorIdxConstant(I, SL);
14845 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
14846 if (I == 0)
14847 V = Elt;
14848 else
14849 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
14850 }
14851 return V;
14852 }
14853
14854 if (!DCI.isBeforeLegalize())
14855 return SDValue();
14856
14857 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
14858 // elements. This exposes more load reduction opportunities by replacing
14859 // multiple small extract_vector_elements with a single 32-bit extract.
14860 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
14861 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
14862 VecSize > 32 && VecSize % 32 == 0 && Idx) {
14863 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
14864
14865 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
14866 unsigned EltIdx = BitIndex / 32;
14867 unsigned LeftoverBitIdx = BitIndex % 32;
14868 SDLoc SL(N);
14869
14870 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
14871 DCI.AddToWorklist(Cast.getNode());
14872
14873 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
14874 DAG.getConstant(EltIdx, SL, MVT::i32));
14875 DCI.AddToWorklist(Elt.getNode());
14876 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
14877 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
14878 DCI.AddToWorklist(Srl.getNode());
14879
14880 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
14881 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
14882 DCI.AddToWorklist(Trunc.getNode());
14883
14884 if (VecEltVT == ResVT) {
14885 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
14886 }
14887
14888 assert(ResVT.isScalarInteger());
14889 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
14890 }
14891
14892 return SDValue();
14893}
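// [Editor's note: illustrative sketch, not part of the original file.] The
// sub-dword rewrite above only needs the bit position of the requested
// element within the 32-bit words of the vector. Hypothetical reference:
static void refSubDwordIndex(unsigned Idx, unsigned VecEltSize,
                             unsigned &EltIdx, unsigned &LeftoverBitIdx) {
  unsigned BitIndex = Idx * VecEltSize; // start bit of the small element
  EltIdx = BitIndex / 32;               // which i32 word to extract
  LeftoverBitIdx = BitIndex % 32;       // shift amount within that word
}
// e.g. element 5 of a loaded <8 x i8>: BitIndex = 40, so extract i32 word 1
// and shift right by 8 before truncating back to the 8-bit result.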
14894
14895SDValue
14896SITargetLowering::performInsertVectorEltCombine(SDNode *N,
14897 DAGCombinerInfo &DCI) const {
14898 SDValue Vec = N->getOperand(0);
14899 SDValue Idx = N->getOperand(2);
14900 EVT VecVT = Vec.getValueType();
14901 EVT EltVT = VecVT.getVectorElementType();
14902
14903 // INSERT_VECTOR_ELT (<n x e>, var-idx)
14904 // => BUILD_VECTOR n x select (e, const-idx)
14905  if (!shouldExpandVectorDynExt(N))
14906    return SDValue();
14907
14908 SelectionDAG &DAG = DCI.DAG;
14909 SDLoc SL(N);
14910 SDValue Ins = N->getOperand(1);
14911 EVT IdxVT = Idx.getValueType();
14912
14913  SmallVector<SDValue, 16> Ops;
14914  for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
14915 SDValue IC = DAG.getConstant(I, SL, IdxVT);
14916 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
14917 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
14918 Ops.push_back(V);
14919 }
14920
14921 return DAG.getBuildVector(VecVT, SL, Ops);
14922}
14923
14924/// Return the source of an fp_extend from f16 to f32, or a converted FP
14925/// constant.
14926static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
14927  if (Src.getOpcode() == ISD::FP_EXTEND &&
14928 Src.getOperand(0).getValueType() == MVT::f16) {
14929 return Src.getOperand(0);
14930 }
14931
14932 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
14933 APFloat Val = CFP->getValueAPF();
14934 bool LosesInfo = true;
14935    Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
14936    if (!LosesInfo)
14937 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
14938 }
14939
14940 return SDValue();
14941}
14942
14943SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
14944 DAGCombinerInfo &DCI) const {
14945 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
14946 "combine only useful on gfx8");
14947
14948 SDValue TruncSrc = N->getOperand(0);
14949 EVT VT = N->getValueType(0);
14950 if (VT != MVT::f16)
14951 return SDValue();
14952
14953 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
14954 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
14955 return SDValue();
14956
14957 SelectionDAG &DAG = DCI.DAG;
14958 SDLoc SL(N);
14959
14960 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
14961 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
14962 // casting back.
14963
14964 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
14965 // fmin(fmax(a, b), fmax(fmin(a, b), c))
14966 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
14967 if (!A)
14968 return SDValue();
14969
14970 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
14971 if (!B)
14972 return SDValue();
14973
14974 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
14975 if (!C)
14976 return SDValue();
14977
14978 // This changes signaling nan behavior. If an input is a signaling nan, it
14979 // would have been quieted by the fpext originally. We don't care because
14980 // these are unconstrained ops. If we needed to insert quieting canonicalizes
14981 // we would be worse off than just doing the promotion.
14982 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
14983 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
14984 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
14985 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
14986}
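// [Editor's note: illustrative sketch, not part of the original file.] The
// expansion built above is the usual median-of-three min/max network, shown
// here as a hypothetical scalar reference (NaN quieting ignored, as the
// comment above notes these are unconstrained ops):
static float refFMed3AsMinMax(float A, float B, float C) {
  float MinAB = A < B ? A : B;                  // fmin(a, b)
  float MaxAB = A < B ? B : A;                  // fmax(a, b)
  float MaxMinABC = MinAB < C ? C : MinAB;      // fmax(fmin(a, b), c)
  return MaxAB < MaxMinABC ? MaxAB : MaxMinABC; // fmin(fmax(a, b), ...)
}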
14987
14988unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
14989 const SDNode *N0,
14990 const SDNode *N1) const {
14991 EVT VT = N0->getValueType(0);
14992
14993 // Only do this if we are not trying to support denormals. v_mad_f32 does not
14994 // support denormals ever.
14995  if (((VT == MVT::f32 &&
14996        denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
14997       (VT == MVT::f16 && Subtarget->hasMadF16() &&
14998        denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
14999      isOperationLegal(ISD::FMAD, VT))
15000    return ISD::FMAD;
15001
15002 const TargetOptions &Options = DAG.getTarget().Options;
15003 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
15004 (N0->getFlags().hasAllowContract() &&
15005 N1->getFlags().hasAllowContract())) &&
15006      isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
15007    return ISD::FMA;
15008 }
15009
15010 return 0;
15011}
15012
15013// For a reassociatable opcode perform:
15014// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
15015SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
15016 SelectionDAG &DAG) const {
15017 EVT VT = N->getValueType(0);
15018 if (VT != MVT::i32 && VT != MVT::i64)
15019 return SDValue();
15020
15021 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
15022 return SDValue();
15023
15024 unsigned Opc = N->getOpcode();
15025 SDValue Op0 = N->getOperand(0);
15026 SDValue Op1 = N->getOperand(1);
15027
15028 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
15029 return SDValue();
15030
15031 if (Op0->isDivergent())
15032 std::swap(Op0, Op1);
15033
15034 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
15035 return SDValue();
15036
15037 SDValue Op2 = Op1.getOperand(1);
15038 Op1 = Op1.getOperand(0);
15039 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
15040 return SDValue();
15041
15042 if (Op1->isDivergent())
15043 std::swap(Op1, Op2);
15044
15045 SDLoc SL(N);
15046 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
15047 return DAG.getNode(Opc, SL, VT, Add1, Op2);
15048}
15049
15050static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
15051 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
15052  unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
15053  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
15054 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
15055 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
15056}
15057
15058// Fold
15059// y = lshr i64 x, 32
15060// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
15061// with Const.hi == -1
15062// To
15063// res = mad_u64_u32 y.lo, Const.lo, x.lo
15064static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
15065                                 SDValue MulLHS, SDValue MulRHS,
15066 SDValue AddRHS) {
15067 if (MulRHS.getOpcode() == ISD::SRL)
15068 std::swap(MulLHS, MulRHS);
15069
15070 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
15071 return SDValue();
15072
15073 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
15074 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
15075 MulLHS.getOperand(0) != AddRHS)
15076 return SDValue();
15077
15078  ConstantSDNode *Const = dyn_cast<ConstantSDNode>(MulRHS);
15079  if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
15080 return SDValue();
15081
15082 SDValue ConstMul =
15083 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
15084 return getMad64_32(DAG, SL, MVT::i64,
15085 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
15086 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
15087}
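// [Editor's worked check, not part of the original file.] Soundness of the
// fold above: with Hi_32(Const) == 0xffffffff we have, modulo 2^64,
// Const == Lo_32(Const) - 2^32, and x == (x >> 32) * 2^32 + Lo_32(x), so the
// -2^32 term cancels the shifted-out high half. Hypothetical reference:
#include <cstdint>
static uint64_t refMadWithSRL(uint64_t X, uint64_t Const) {
  uint64_t Y = X >> 32;              // the SRL operand
  uint64_t Lo = Const & 0xffffffffu; // Lo_32(Const)
  return Y * Lo + (X & 0xffffffffu); // mad_u64_u32 Y, Lo, X.lo
}
// For any X, and any Const whose high 32 bits are all ones:
//   refMadWithSRL(X, Const) == (X >> 32) * Const + X   (mod 2^64)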
15088
15089// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
15090// multiplies, if any.
15091//
15092// Full 64-bit multiplies that feed into an addition are lowered here instead
15093// of using the generic expansion. The generic expansion ends up with
15094// a tree of ADD nodes that prevents us from using the "add" part of the
15095// MAD instruction. The expansion produced here results in a chain of ADDs
15096// instead of a tree.
15097SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
15098 DAGCombinerInfo &DCI) const {
15099 assert(N->isAnyAdd());
15100
15101 SelectionDAG &DAG = DCI.DAG;
15102 EVT VT = N->getValueType(0);
15103 SDLoc SL(N);
15104 SDValue LHS = N->getOperand(0);
15105 SDValue RHS = N->getOperand(1);
15106
15107 if (VT.isVector())
15108 return SDValue();
15109
15110 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
15111 // result in scalar registers for uniform values.
15112 if (!N->isDivergent() && Subtarget->hasSMulHi())
15113 return SDValue();
15114
15115 unsigned NumBits = VT.getScalarSizeInBits();
15116 if (NumBits <= 32 || NumBits > 64)
15117 return SDValue();
15118
15119 if (LHS.getOpcode() != ISD::MUL) {
15120 assert(RHS.getOpcode() == ISD::MUL);
15121 std::swap(LHS, RHS);
15122 }
15123
15124 // Avoid the fold if it would unduly increase the number of multiplies due to
15125 // multiple uses, except on hardware with full-rate multiply-add (which is
15126 // part of full-rate 64-bit ops).
15127 if (!Subtarget->hasFullRate64Ops()) {
15128 unsigned NumUsers = 0;
15129 for (SDNode *User : LHS->users()) {
15130 // There is a use that does not feed into addition, so the multiply can't
15131 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
15132 if (!User->isAnyAdd())
15133 return SDValue();
15134
15135 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
15136 // MUL + 3xADD + 3xADDC over 3xMAD.
15137 ++NumUsers;
15138 if (NumUsers >= 3)
15139 return SDValue();
15140 }
15141 }
15142
15143 SDValue MulLHS = LHS.getOperand(0);
15144 SDValue MulRHS = LHS.getOperand(1);
15145 SDValue AddRHS = RHS;
15146
15147 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
15148 return FoldedMAD;
15149
15150 // Always check whether operands are small unsigned values, since that
15151 // knowledge is useful in more cases. Check for small signed values only if
15152 // doing so can unlock a shorter code sequence.
15153 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
15154 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
15155
15156 bool MulSignedLo = false;
15157 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
15158 MulSignedLo =
15159 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
15160 }
15161
15162 // The operands and final result all have the same number of bits. If
15163 // operands need to be extended, they can be extended with garbage. The
15164 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
15165 // truncated away in the end.
15166 if (VT != MVT::i64) {
15167 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
15168 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
15169 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
15170 }
15171
15172 // The basic code generated is conceptually straightforward. Pseudo code:
15173 //
15174 // accum = mad_64_32 lhs.lo, rhs.lo, accum
15175 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
15176 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
15177 //
15178 // The second and third lines are optional, depending on whether the factors
15179 // are {sign,zero}-extended or not.
15180 //
15181 // The actual DAG is noisier than the pseudo code, but only due to
15182 // instructions that disassemble values into low and high parts, and
15183 // assemble the final result.
15184 SDValue One = DAG.getConstant(1, SL, MVT::i32);
15185
15186 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
15187 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
15188 SDValue Accum =
15189 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
15190
15191 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
15192 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
15193
15194 if (!MulLHSUnsigned32) {
15195 auto MulLHSHi =
15196 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
15197 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
15198 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15199 }
15200
15201 if (!MulRHSUnsigned32) {
15202 auto MulRHSHi =
15203 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
15204 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
15205 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15206 }
15207
15208 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
15209 Accum = DAG.getBitcast(MVT::i64, Accum);
15210 }
15211
15212 if (VT != MVT::i64)
15213 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
15214 return Accum;
15215}
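// [Editor's worked check, not part of the original file.] The pseudo code in
// the comment above, restated in plain integer arithmetic (hypothetical
// reference): one 32x32->64 mad supplies the low half, and the two cross
// products are folded into the high half.
#include <cstdint>
static uint64_t refMad64(uint64_t LHS, uint64_t RHS, uint64_t Accum) {
  uint64_t LHSLo = LHS & 0xffffffffu, LHSHi = LHS >> 32;
  uint64_t RHSLo = RHS & 0xffffffffu, RHSHi = RHS >> 32;
  uint64_t Acc = LHSLo * RHSLo + Accum;           // accum = mad_64_32 lhs.lo, rhs.lo, accum
  uint32_t AccHi = (uint32_t)(Acc >> 32);
  AccHi += (uint32_t)(LHSHi * RHSLo);             // accum.hi += mul lhs.hi, rhs.lo
  AccHi += (uint32_t)(LHSLo * RHSHi);             // accum.hi += mul lhs.lo, rhs.hi
  return ((uint64_t)AccHi << 32) | (uint32_t)Acc; // == LHS * RHS + Accum (mod 2^64)
}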
15216
15217SDValue
15218SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
15219 DAGCombinerInfo &DCI) const {
15220 SDValue RHS = N->getOperand(1);
15221 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
15222 if (!CRHS)
15223 return SDValue();
15224
15225 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
15226 // common.
15227 uint64_t Val = CRHS->getZExtValue();
15228 if (countr_zero(Val) >= 32) {
15229 SelectionDAG &DAG = DCI.DAG;
15230 SDLoc SL(N);
15231 SDValue LHS = N->getOperand(0);
15232
15233 // Avoid carry machinery if we know the low half of the add does not
15234 // contribute to the final result.
15235 //
15236 // add i64:x, K if computeTrailingZeros(K) >= 32
15237 // => build_pair (add x.hi, K.hi), x.lo
15238
15239 // Breaking the 64-bit add here with this strange constant is unlikely
15240 // to interfere with addressing mode patterns.
15241
15242 SDValue Hi = getHiHalf64(LHS, DAG);
15243 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
15244 unsigned Opcode = N->getOpcode();
15245 if (Opcode == ISD::PTRADD)
15246 Opcode = ISD::ADD;
15247 SDValue AddHi =
15248 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
15249
15250 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
15251 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
15252 }
15253
15254 return SDValue();
15255}
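// [Editor's worked check, not part of the original file.] When the low 32
// bits of K are zero, the low half of the 64-bit add can never generate a
// carry, so only the high halves need to be added. Hypothetical reference:
#include <cstdint>
static uint64_t refAddHiOnly(uint64_t X, uint64_t K) {
  // Precondition: (K & 0xffffffff) == 0, i.e. countr_zero(K) >= 32.
  uint32_t Lo = (uint32_t)X;                               // low half unchanged
  uint32_t Hi = (uint32_t)(X >> 32) + (uint32_t)(K >> 32); // add x.hi, K.hi
  return ((uint64_t)Hi << 32) | Lo;                        // == X + K (mod 2^64)
}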
15256
15257// Collect the ultimate source of each of the mul node's operands, and confirm
15258// each operand is no wider than a single byte.
15259static std::optional<ByteProvider<SDValue>>
15260handleMulOperand(const SDValue &MulOperand) {
15261 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
15262 if (!Byte0 || Byte0->isConstantZero()) {
15263 return std::nullopt;
15264 }
15265 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
15266 if (Byte1 && !Byte1->isConstantZero()) {
15267 return std::nullopt;
15268 }
15269 return Byte0;
15270}
15271
15272static unsigned addPermMasks(unsigned First, unsigned Second) {
15273 unsigned FirstCs = First & 0x0c0c0c0c;
15274 unsigned SecondCs = Second & 0x0c0c0c0c;
15275 unsigned FirstNoCs = First & ~0x0c0c0c0c;
15276 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
15277
15278 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
15279 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
15280 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
15281 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
15282
15283 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
15284}
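// [Editor's note: illustrative sketch, not part of the original file.] In a
// v_perm_b32 selector, each mask byte picks one source byte, and the value
// 0x0c produces a constant 0x00. addPermMasks above merges two masks that
// are expected to be complementary: wherever one holds a real selector, the
// other holds the zero-byte selector 0x0c. A hypothetical byte-wise
// restatement:
static unsigned refMergePermByte(unsigned A, unsigned B) {
  // Exactly one of A and B is expected to be the zero-byte selector 0x0c.
  return (A == 0x0c) ? B : A;
}
// e.g. addPermMasks(0x050c0c0c, 0x0c0c0c07) == 0x050c0c07.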
15285
15286struct DotSrc {
15287  SDValue SrcOp;
15288  int64_t PermMask;
15289  int64_t DWordOffset;
15290};
15291
15292static void placeSources(ByteProvider<SDValue> &Src0,
15293                         ByteProvider<SDValue> &Src1,
15294                         SmallVectorImpl<DotSrc> &Src0s,
15295                         SmallVectorImpl<DotSrc> &Src1s, int Step) {
15296
15297 assert(Src0.Src.has_value() && Src1.Src.has_value());
15298 // Src0s and Src1s are empty, just place arbitrarily.
15299 if (Step == 0) {
15300 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
15301 Src0.SrcOffset / 4});
15302 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
15303 Src1.SrcOffset / 4});
15304 return;
15305 }
15306
15307 for (int BPI = 0; BPI < 2; BPI++) {
15308 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
15309 if (BPI == 1) {
15310 BPP = {Src1, Src0};
15311 }
15312 unsigned ZeroMask = 0x0c0c0c0c;
15313 unsigned FMask = 0xFF << (8 * (3 - Step));
15314
15315 unsigned FirstMask =
15316 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15317 unsigned SecondMask =
15318 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15319 // Attempt to find Src vector which contains our SDValue, if so, add our
15320 // perm mask to the existing one. If we are unable to find a match for the
15321 // first SDValue, attempt to find match for the second.
15322 int FirstGroup = -1;
15323 for (int I = 0; I < 2; I++) {
15324 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
15325 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
15326 return IterElt.SrcOp == *BPP.first.Src &&
15327 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15328 };
15329
15330 auto *Match = llvm::find_if(Srcs, MatchesFirst);
15331 if (Match != Srcs.end()) {
15332 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
15333 FirstGroup = I;
15334 break;
15335 }
15336 }
15337 if (FirstGroup != -1) {
15338 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
15339 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
15340 return IterElt.SrcOp == *BPP.second.Src &&
15341 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15342 };
15343 auto *Match = llvm::find_if(Srcs, MatchesSecond);
15344 if (Match != Srcs.end()) {
15345 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
15346 } else
15347 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15348 return;
15349 }
15350 }
15351
15352 // If we have made it here, then we could not find a match in Src0s or Src1s
15353 // for either Src0 or Src1, so just place them arbitrarily.
15354
15355 unsigned ZeroMask = 0x0c0c0c0c;
15356 unsigned FMask = 0xFF << (8 * (3 - Step));
15357
15358 Src0s.push_back(
15359 {*Src0.Src,
15360 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15361 Src0.SrcOffset / 4});
15362 Src1s.push_back(
15363 {*Src1.Src,
15364 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15365 Src1.SrcOffset / 4});
15366}
15367
15368static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
15369                              SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
15370 bool IsAny) {
15371
15372 // If we just have one source, just permute it accordingly.
15373 if (Srcs.size() == 1) {
15374 auto *Elt = Srcs.begin();
15375 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
15376
15377 // v_perm will produce the original value
15378 if (Elt->PermMask == 0x3020100)
15379 return EltOp;
15380
15381 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15382 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
15383 }
15384
15385 auto *FirstElt = Srcs.begin();
15386 auto *SecondElt = std::next(FirstElt);
15387
15388  SmallVector<SDValue, 2> Perms;
15389
15390 // If we have multiple sources in the chain, combine them via perms (using
15391 // calculated perm mask) and Ors.
15392 while (true) {
15393 auto FirstMask = FirstElt->PermMask;
15394 auto SecondMask = SecondElt->PermMask;
15395
15396 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15397 unsigned FirstPlusFour = FirstMask | 0x04040404;
15398    // 0x0c + 0x04 = 0x10, so ANDing with 0x0F will produce 0x00 for any
15399    // original 0x0c byte.
15400 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
15401
15402 auto PermMask = addPermMasks(FirstMask, SecondMask);
15403 auto FirstVal =
15404 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15405 auto SecondVal =
15406 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
15407
15408 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
15409 SecondVal,
15410 DAG.getConstant(PermMask, SL, MVT::i32)));
15411
15412 FirstElt = std::next(SecondElt);
15413 if (FirstElt == Srcs.end())
15414 break;
15415
15416 SecondElt = std::next(FirstElt);
15417 // If we only have a FirstElt, then just combine that into the cumulative
15418 // source node.
15419 if (SecondElt == Srcs.end()) {
15420 auto EltOp =
15421 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15422
15423 Perms.push_back(
15424 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15425 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
15426 break;
15427 }
15428 }
15429
15430 assert(Perms.size() == 1 || Perms.size() == 2);
15431 return Perms.size() == 2
15432 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
15433 : Perms[0];
15434}
15435
15436static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
15437 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15438 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15439 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15440 EntryMask += ZeroMask;
15441 }
15442}
15443
15444static bool isMul(const SDValue Op) {
15445 auto Opcode = Op.getOpcode();
15446
15447 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
15448 Opcode == AMDGPUISD::MUL_I24);
15449}
15450
15451static std::optional<bool>
15452checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
15453                       ByteProvider<SDValue> &Src1, const SDValue &S0Op,
15454 const SDValue &S1Op, const SelectionDAG &DAG) {
15455  // If both ops are i8s (pre legalize-dag), then the signedness semantics
15456  // of the dot4 are irrelevant.
15457 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
15458 return false;
15459
15460 auto Known0 = DAG.computeKnownBits(S0Op, 0);
15461 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
15462 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
15463 auto Known1 = DAG.computeKnownBits(S1Op, 0);
15464 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
15465 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
15466
15467 assert(!(S0IsUnsigned && S0IsSigned));
15468 assert(!(S1IsUnsigned && S1IsSigned));
15469
15470 // There are 9 possible permutations of
15471 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
15472
15473 // In two permutations, the sign bits are known to be the same for both Ops,
15474 // so simply return Signed / Unsigned corresponding to the MSB
15475
15476 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
15477 return S0IsSigned;
15478
15479 // In another two permutations, the sign bits are known to be opposite. In
15480 // this case return std::nullopt to indicate a bad match.
15481
15482 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
15483 return std::nullopt;
15484
15485  // In the remaining five permutations, we don't know the value of the sign
15486  // bit for at least one Op. Since we have a valid ByteProvider, we know that
15487  // the upper bits must be extension bits. Thus, the only ways for the sign
15488  // bit to be unknown are if it was sign extended from an unknown value, or if
15489  // it was any extended. In either case, it is correct to use the signed
15490  // version of the dot4 signedness semantics.
15491
15492  // In two of these permutations, we know the sign bit is set for
15493  // one op, and the other is unknown. It is okay to use the signed version of
15494  // dot4.
15495 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
15496 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
15497 return true;
15498
15499  // In one such permutation, we don't know either of the sign bits. It is okay
15500  // to use the signed version of dot4.
15501 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
15502 return true;
15503
15504  // In two of these permutations, we know the sign bit is unset for
15505  // one op, and the other is unknown. Return std::nullopt to indicate a
15506  // bad match.
15507 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
15508 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
15509 return std::nullopt;
15510
15511 llvm_unreachable("Fully covered condition");
15512}
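// [Editor's note: illustrative sketch, not part of the original file.] The
// case analysis above, restated over what is known about each operand's sign
// bit (true = known set, false = known clear, empty = unknown). Names are
// hypothetical; this is not an LLVM API.
#include <optional>
static std::optional<bool> refDotSignedness(std::optional<bool> S0Neg,
                                            std::optional<bool> S1Neg) {
  if (S0Neg && S1Neg) // Both known: they must agree; that shared sign wins.
    return *S0Neg == *S1Neg ? std::optional<bool>(*S0Neg) : std::nullopt;
  if (S0Neg) // Only S0 known: set => signed is fine, clear => bad match.
    return *S0Neg ? std::optional<bool>(true) : std::nullopt;
  if (S1Neg) // Only S1 known: same reasoning.
    return *S1Neg ? std::optional<bool>(true) : std::nullopt;
  return true; // Both unknown: extension bits make the signed form correct.
}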
15513
15514SDValue SITargetLowering::performAddCombine(SDNode *N,
15515 DAGCombinerInfo &DCI) const {
15516 SelectionDAG &DAG = DCI.DAG;
15517 EVT VT = N->getValueType(0);
15518 SDLoc SL(N);
15519 SDValue LHS = N->getOperand(0);
15520 SDValue RHS = N->getOperand(1);
15521
15522 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
15523 if (Subtarget->hasMad64_32()) {
15524 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
15525 return Folded;
15526 }
15527 }
15528
15529 if (SDValue V = reassociateScalarOps(N, DAG)) {
15530 return V;
15531 }
15532
15533 if (VT == MVT::i64) {
15534 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15535 return Folded;
15536 }
15537
15538 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
15539 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
15540 SDValue TempNode(N, 0);
15541 std::optional<bool> IsSigned;
15542    SmallVector<DotSrc, 4> Src0s;
15543    SmallVector<DotSrc, 4> Src1s;
15544    SmallVector<SDValue, 4> Src2s;
15545
15546 // Match the v_dot4 tree, while collecting src nodes.
15547 int ChainLength = 0;
15548 for (int I = 0; I < 4; I++) {
15549 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
15550 if (MulIdx == -1)
15551 break;
15552 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
15553 if (!Src0)
15554 break;
15555 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
15556 if (!Src1)
15557 break;
15558
15559 auto IterIsSigned = checkDot4MulSignedness(
15560 TempNode->getOperand(MulIdx), *Src0, *Src1,
15561 TempNode->getOperand(MulIdx)->getOperand(0),
15562 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
15563 if (!IterIsSigned)
15564 break;
15565 if (!IsSigned)
15566 IsSigned = *IterIsSigned;
15567 if (*IterIsSigned != *IsSigned)
15568 break;
15569 placeSources(*Src0, *Src1, Src0s, Src1s, I);
15570 auto AddIdx = 1 - MulIdx;
15571      // Allow the special case where add (add (mul24, 0), mul24) has already
15572      // been folded into add (mul24, mul24).
15573 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
15574 Src2s.push_back(TempNode->getOperand(AddIdx));
15575 auto Src0 =
15576 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
15577 if (!Src0)
15578 break;
15579 auto Src1 =
15580 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
15581 if (!Src1)
15582 break;
15583 auto IterIsSigned = checkDot4MulSignedness(
15584 TempNode->getOperand(AddIdx), *Src0, *Src1,
15585 TempNode->getOperand(AddIdx)->getOperand(0),
15586 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
15587 if (!IterIsSigned)
15588 break;
15589 assert(IsSigned);
15590 if (*IterIsSigned != *IsSigned)
15591 break;
15592 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
15593 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
15594 ChainLength = I + 2;
15595 break;
15596 }
15597
15598 TempNode = TempNode->getOperand(AddIdx);
15599 Src2s.push_back(TempNode);
15600 ChainLength = I + 1;
15601 if (TempNode->getNumOperands() < 2)
15602 break;
15603 LHS = TempNode->getOperand(0);
15604 RHS = TempNode->getOperand(1);
15605 }
15606
15607 if (ChainLength < 2)
15608 return SDValue();
15609
15610    // Masks were constructed with the assumption that we would find a chain of
15611    // length 4. If not, then we need to zero out the most significant bytes (via
15612    // a perm mask of 0x0c) so they do not affect the dot calculation.
15613 if (ChainLength < 4) {
15614 fixMasks(Src0s, ChainLength);
15615 fixMasks(Src1s, ChainLength);
15616 }
15617
15618 SDValue Src0, Src1;
15619
15620 // If we are just using a single source for both, and have permuted the
15621 // bytes consistently, we can just use the sources without permuting
15622 // (commutation).
15623 bool UseOriginalSrc = false;
15624 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
15625 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
15626 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
15627 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
15628 SmallVector<unsigned, 4> SrcBytes;
15629 auto Src0Mask = Src0s.begin()->PermMask;
15630 SrcBytes.push_back(Src0Mask & 0xFF000000);
15631 bool UniqueEntries = true;
15632 for (auto I = 1; I < 4; I++) {
15633 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
15634
15635 if (is_contained(SrcBytes, NextByte)) {
15636 UniqueEntries = false;
15637 break;
15638 }
15639 SrcBytes.push_back(NextByte);
15640 }
15641
15642 if (UniqueEntries) {
15643 UseOriginalSrc = true;
15644
15645 auto *FirstElt = Src0s.begin();
15646 auto FirstEltOp =
15647 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15648
15649 auto *SecondElt = Src1s.begin();
15650 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
15651 SecondElt->DWordOffset);
15652
15653 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
15654 MVT::getIntegerVT(32));
15655 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
15656 MVT::getIntegerVT(32));
15657 }
15658 }
15659
15660 if (!UseOriginalSrc) {
15661 Src0 = resolveSources(DAG, SL, Src0s, false, true);
15662 Src1 = resolveSources(DAG, SL, Src1s, false, true);
15663 }
15664
15665 assert(IsSigned);
15666 SDValue Src2 =
15667 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
15668
15669 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
15670 : Intrinsic::amdgcn_udot4,
15671 SL, MVT::i64);
15672
15673 assert(!VT.isVector());
15674 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
15675 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
15676
15677 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
15678 }
15679
15680 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
15681 return SDValue();
15682
15683 // add x, zext (setcc) => uaddo_carry x, 0, setcc
15684 // add x, sext (setcc) => usubo_carry x, 0, setcc
15685 unsigned Opc = LHS.getOpcode();
15686  if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
15687      Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
15688    std::swap(RHS, LHS);
15689
15690 Opc = RHS.getOpcode();
15691 switch (Opc) {
15692 default:
15693 break;
15694 case ISD::ZERO_EXTEND:
15695 case ISD::SIGN_EXTEND:
15696 case ISD::ANY_EXTEND: {
15697 auto Cond = RHS.getOperand(0);
15698 // If this won't be a real VOPC output, we would still need to insert an
15699 // extra instruction anyway.
15700 if (!isBoolSGPR(Cond))
15701 break;
15702 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
15703 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
15704    Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
15705    return DAG.getNode(Opc, SL, VTList, Args);
15706 }
15707 case ISD::UADDO_CARRY: {
15708 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
15709 if (!isNullConstant(RHS.getOperand(1)))
15710 break;
15711 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
15712 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
15713 }
15714 }
15715 return SDValue();
15716}
15717
15718SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
15719 DAGCombinerInfo &DCI) const {
15720 SelectionDAG &DAG = DCI.DAG;
15721 SDLoc DL(N);
15722 EVT VT = N->getValueType(0);
15723 SDValue N0 = N->getOperand(0);
15724 SDValue N1 = N->getOperand(1);
15725
15726 // The following folds transform PTRADDs into regular arithmetic in cases
15727 // where the PTRADD wouldn't be folded as an immediate offset into memory
15728 // instructions anyway. They are target-specific in that other targets might
15729 // prefer to not lose information about the pointer arithmetic.
15730
15731 // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
15732 // Adapted from DAGCombiner::visitADDLikeCommutative.
15733 SDValue V, K;
15734 if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
15735 SDNodeFlags ShlFlags = N1->getFlags();
15736 // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
15737 // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
15738 // preserved.
15739 SDNodeFlags NewShlFlags =
15740 ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
15741            ? SDNodeFlags::NoSignedWrap
15742            : SDNodeFlags();
15743 SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
15744 DCI.AddToWorklist(Inner.getNode());
15745 return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
15746 }
15747
15748 // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
15749 // performAddCombine.
15750 if (N1.getOpcode() == ISD::MUL) {
15751 if (Subtarget->hasMad64_32()) {
15752 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
15753 return Folded;
15754 }
15755 }
15756
15757 // If the 32 low bits of the constant are all zero, there is nothing to fold
15758 // into an immediate offset, so it's better to eliminate the unnecessary
15759 // addition for the lower 32 bits than to preserve the PTRADD.
15760 // Analogous to a fold in performAddCombine.
15761 if (VT == MVT::i64) {
15762 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15763 return Folded;
15764 }
15765
15766 if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) {
15767 // Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c) v) with
15768 // global address GA and constant c, such that c can be folded into GA.
15769 SDValue GAValue = N0.getOperand(0);
15770 if (const GlobalAddressSDNode *GA =
15771            dyn_cast<GlobalAddressSDNode>(GAValue)) {
15772      if (DCI.isBeforeLegalizeOps() && isOffsetFoldingLegal(GA)) {
15773 // If both additions in the original were NUW, reassociation preserves
15774 // that.
15775 SDNodeFlags Flags =
15776 (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
15777 SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags);
15778 DCI.AddToWorklist(Inner.getNode());
15779 return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags);
15780 }
15781 }
15782 }
15783
15784 if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
15785 return SDValue();
15786
15787 // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
15788 // y is not, and (add y, z) is used only once.
15789 // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
15790 // z is not, and (add y, z) is used only once.
15791 // The goal is to move constant offsets to the outermost ptradd, to create
15792 // more opportunities to fold offsets into memory instructions.
15793 // Together with the generic combines in DAGCombiner.cpp, this also
15794 // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
15795 //
15796 // This transform is here instead of in the general DAGCombiner as it can
15797 // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
15798 // AArch64's CPA.
15799 SDValue X = N0;
15800 SDValue Y = N1.getOperand(0);
15801 SDValue Z = N1.getOperand(1);
15802 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
15803 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
15804
15805 // If both additions in the original were NUW, reassociation preserves that.
15806 SDNodeFlags ReassocFlags =
15807 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
15808
15809 if (ZIsConstant != YIsConstant) {
15810 if (YIsConstant)
15811 std::swap(Y, Z);
15812 SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
15813 DCI.AddToWorklist(Inner.getNode());
15814 return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags);
15815 }
15816
15817 // If one of Y and Z is constant, they have been handled above. If both were
15818 // constant, the addition would have been folded in SelectionDAG::getNode
15819 // already. This ensures that the generic DAG combines won't undo the
15820 // following reassociation.
15821 assert(!YIsConstant && !ZIsConstant);
15822
15823 if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) {
15824 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
15825 // y are uniform and z isn't.
15826 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
15827 // z are uniform and y isn't.
15828 // The goal is to push uniform operands up in the computation, so that they
15829 // can be handled with scalar operations. We can't use reassociateScalarOps
15830 // for this since it requires two identical commutative operations to
15831 // reassociate.
15832 if (Y->isDivergent())
15833 std::swap(Y, Z);
15834 SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
15835 DCI.AddToWorklist(UniformInner.getNode());
15836 return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
15837 }
15838
15839 return SDValue();
15840}
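// [Editor's worked check, not part of the original file.] The first fold in
// performPtrAddCombine relies on a modular identity: shifting a negation is
// the negation of the shift, so adding it is a subtraction. Hypothetical
// reference:
#include <cstdint>
static uint64_t refPtrAddNegShl(uint64_t X, uint64_t V, unsigned K) {
  // x + ((0 - v) << k) == x - (v << k)   (mod 2^64, for k < 64)
  return X + ((0 - V) << K);
}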
15841
15842SDValue SITargetLowering::performSubCombine(SDNode *N,
15843 DAGCombinerInfo &DCI) const {
15844 SelectionDAG &DAG = DCI.DAG;
15845 EVT VT = N->getValueType(0);
15846
15847 if (VT == MVT::i64) {
15848 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15849 return Folded;
15850 }
15851
15852 if (VT != MVT::i32)
15853 return SDValue();
15854
15855 SDLoc SL(N);
15856 SDValue LHS = N->getOperand(0);
15857 SDValue RHS = N->getOperand(1);
15858
15859 // sub x, zext (setcc) => usubo_carry x, 0, setcc
15860 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
15861 unsigned Opc = RHS.getOpcode();
15862 switch (Opc) {
15863 default:
15864 break;
15865 case ISD::ZERO_EXTEND:
15866 case ISD::SIGN_EXTEND:
15867 case ISD::ANY_EXTEND: {
15868 auto Cond = RHS.getOperand(0);
15869 // If this won't be a real VOPC output, we would still need to insert an
15870 // extra instruction anyway.
15871 if (!isBoolSGPR(Cond))
15872 break;
15873 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
15874 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
15875    Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
15876    return DAG.getNode(Opc, SL, VTList, Args);
15877 }
15878 }
15879
15880 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
15881 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
15882 if (!isNullConstant(LHS.getOperand(1)))
15883 return SDValue();
15884 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
15885 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
15886 }
15887 return SDValue();
15888}
15889
15890SDValue
15891SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
15892 DAGCombinerInfo &DCI) const {
15893
15894 if (N->getValueType(0) != MVT::i32)
15895 return SDValue();
15896
15897 if (!isNullConstant(N->getOperand(1)))
15898 return SDValue();
15899
15900 SelectionDAG &DAG = DCI.DAG;
15901 SDValue LHS = N->getOperand(0);
15902
15903 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
15904 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
15905 unsigned LHSOpc = LHS.getOpcode();
15906 unsigned Opc = N->getOpcode();
15907 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
15908 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
15909 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
15910 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
15911 }
15912 return SDValue();
15913}
15914
15915SDValue SITargetLowering::performFAddCombine(SDNode *N,
15916 DAGCombinerInfo &DCI) const {
15917 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
15918 return SDValue();
15919
15920 SelectionDAG &DAG = DCI.DAG;
15921 EVT VT = N->getValueType(0);
15922
15923 SDLoc SL(N);
15924 SDValue LHS = N->getOperand(0);
15925 SDValue RHS = N->getOperand(1);
15926
15927 // These should really be instruction patterns, but writing patterns with
15928 // source modifiers is a pain.
15929
15930 // fadd (fadd (a, a), b) -> mad 2.0, a, b
15931 if (LHS.getOpcode() == ISD::FADD) {
15932 SDValue A = LHS.getOperand(0);
15933 if (A == LHS.getOperand(1)) {
15934 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
15935 if (FusedOp != 0) {
15936 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
15937 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
15938 }
15939 }
15940 }
15941
15942 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
15943 if (RHS.getOpcode() == ISD::FADD) {
15944 SDValue A = RHS.getOperand(0);
15945 if (A == RHS.getOperand(1)) {
15946 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
15947 if (FusedOp != 0) {
15948 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
15949 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
15950 }
15951 }
15952 }
15953
15954 return SDValue();
15955}
15956
15957SDValue SITargetLowering::performFSubCombine(SDNode *N,
15958 DAGCombinerInfo &DCI) const {
15959 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
15960 return SDValue();
15961
15962 SelectionDAG &DAG = DCI.DAG;
15963 SDLoc SL(N);
15964 EVT VT = N->getValueType(0);
15965 assert(!VT.isVector());
15966
15967 // Try to get the fneg to fold into the source modifier. This undoes generic
15968 // DAG combines and folds them into the mad.
15969 //
15970 // Only do this if we are not trying to support denormals. v_mad_f32 does
15971 // not support denormals ever.
15972 SDValue LHS = N->getOperand(0);
15973 SDValue RHS = N->getOperand(1);
15974 if (LHS.getOpcode() == ISD::FADD) {
15975 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
15976 SDValue A = LHS.getOperand(0);
15977 if (A == LHS.getOperand(1)) {
15978 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
15979 if (FusedOp != 0) {
15980 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
15981 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
15982
15983 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
15984 }
15985 }
15986 }
15987
15988 if (RHS.getOpcode() == ISD::FADD) {
15989 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
15990
15991 SDValue A = RHS.getOperand(0);
15992 if (A == RHS.getOperand(1)) {
15993 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
15994 if (FusedOp != 0) {
15995 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
15996 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
15997 }
15998 }
15999 }
16000
16001 return SDValue();
16002}
16003
16004SDValue SITargetLowering::performFDivCombine(SDNode *N,
16005 DAGCombinerInfo &DCI) const {
16006 SelectionDAG &DAG = DCI.DAG;
16007 SDLoc SL(N);
16008 EVT VT = N->getValueType(0);
16009 if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts())
16010 return SDValue();
16011
16012 SDValue LHS = N->getOperand(0);
16013 SDValue RHS = N->getOperand(1);
16014
16015 SDNodeFlags Flags = N->getFlags();
16016 SDNodeFlags RHSFlags = RHS->getFlags();
16017 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
16018 !RHS->hasOneUse())
16019 return SDValue();
16020
16021 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
16022 bool IsNegative = false;
16023 if (CLHS->isExactlyValue(1.0) ||
16024 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16025 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
16026 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
16027 if (RHS.getOpcode() == ISD::FSQRT) {
16028 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
16029 SDValue Rsq =
16030 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
16031 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
16032 }
16033 }
16034 }
16035
16036 return SDValue();
16037}
16038
16039SDValue SITargetLowering::performFMulCombine(SDNode *N,
16040 DAGCombinerInfo &DCI) const {
16041 SelectionDAG &DAG = DCI.DAG;
16042 EVT VT = N->getValueType(0);
16043 EVT ScalarVT = VT.getScalarType();
16044 EVT IntVT = VT.changeElementType(MVT::i32);
16045
16046 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
16047 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16048 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
16049 return SDValue();
16050 }
16051
16052 SDValue LHS = N->getOperand(0);
16053 SDValue RHS = N->getOperand(1);
16054
16055  // It is cheaper to realize i32 inline constants than to materialize f16 or
16056  // f64 (or even non-inline f32) values; this is possible via ldexp, as shown
16057  // below:
16058 //
16059 // Given : A = 2^a & B = 2^b ; where a and b are integers.
16060 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
16061 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
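 // For example: fmul x, (select y, 8.0, 0.5) --> fldexp(x, (select i32 y, 3, -1))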
16062 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16063 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
16064 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
16065 if (!TrueNode)
16066 return SDValue();
16067 const ConstantFPSDNode *FalseNode =
16068 isConstOrConstSplatFP(RHS.getOperand(2));
16069 if (!FalseNode)
16070 return SDValue();
16071
16072 if (TrueNode->isNegative() != FalseNode->isNegative())
16073 return SDValue();
16074
16075 // For f32, only non-inline constants should be transformed.
16076 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16077 if (ScalarVT == MVT::f32 &&
16078 TII->isInlineConstant(TrueNode->getValueAPF()) &&
16079 TII->isInlineConstant(FalseNode->getValueAPF()))
16080 return SDValue();
16081
16082 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
16083 if (TrueNodeExpVal == INT_MIN)
16084 return SDValue();
16085 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
16086 if (FalseNodeExpVal == INT_MIN)
16087 return SDValue();
16088
16089 SDLoc SL(N);
16090 SDValue SelectNode =
16091 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
16092 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
16093 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
16094
16095 LHS = TrueNode->isNegative()
16096 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
16097 : LHS;
16098
16099 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
16100 }
16101
16102 return SDValue();
16103}
16104
16105SDValue SITargetLowering::performFMACombine(SDNode *N,
16106 DAGCombinerInfo &DCI) const {
16107 SelectionDAG &DAG = DCI.DAG;
16108 EVT VT = N->getValueType(0);
16109 SDLoc SL(N);
16110
16111 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16112 return SDValue();
16113
16114 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
16115 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
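 // i.e. the dot product S0.x*S1.x + S0.y*S1.y + z, computed in one instruction.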
16116 SDValue Op1 = N->getOperand(0);
16117 SDValue Op2 = N->getOperand(1);
16118 SDValue FMA = N->getOperand(2);
16119
16120 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
16121 Op2.getOpcode() != ISD::FP_EXTEND)
16122 return SDValue();
16123
16124 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
16125 // regardless of the denorm mode setting. Therefore,
16126 // fp-contract is sufficient to allow generating fdot2.
16127 const TargetOptions &Options = DAG.getTarget().Options;
16128 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
16129 (N->getFlags().hasAllowContract() &&
16130 FMA->getFlags().hasAllowContract())) {
16131 Op1 = Op1.getOperand(0);
16132 Op2 = Op2.getOperand(0);
16133 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16134 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16135 return SDValue();
16136
16137 SDValue Vec1 = Op1.getOperand(0);
16138 SDValue Idx1 = Op1.getOperand(1);
16139 SDValue Vec2 = Op2.getOperand(0);
16140
16141 SDValue FMAOp1 = FMA.getOperand(0);
16142 SDValue FMAOp2 = FMA.getOperand(1);
16143 SDValue FMAAcc = FMA.getOperand(2);
16144
16145 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
16146 FMAOp2.getOpcode() != ISD::FP_EXTEND)
16147 return SDValue();
16148
16149 FMAOp1 = FMAOp1.getOperand(0);
16150 FMAOp2 = FMAOp2.getOperand(0);
16151 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16152 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16153 return SDValue();
16154
16155 SDValue Vec3 = FMAOp1.getOperand(0);
16156 SDValue Vec4 = FMAOp2.getOperand(0);
16157 SDValue Idx2 = FMAOp1.getOperand(1);
16158
16159 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
16160 // Idx1 and Idx2 cannot be the same.
16161 Idx1 == Idx2)
16162 return SDValue();
16163
16164 if (Vec1 == Vec2 || Vec3 == Vec4)
16165 return SDValue();
16166
16167 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
16168 return SDValue();
16169
16170 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16171 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
16172 DAG.getTargetConstant(0, SL, MVT::i1));
16173 }
16174 }
16175 return SDValue();
16176}
16177
16178SDValue SITargetLowering::performSetCCCombine(SDNode *N,
16179 DAGCombinerInfo &DCI) const {
16180 SelectionDAG &DAG = DCI.DAG;
16181 SDLoc SL(N);
16182
16183 SDValue LHS = N->getOperand(0);
16184 SDValue RHS = N->getOperand(1);
16185 EVT VT = LHS.getValueType();
16186 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
16187
16188 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
16189 if (!CRHS) {
16190 CRHS = dyn_cast<ConstantSDNode>(LHS);
16191 if (CRHS) {
16192 std::swap(LHS, RHS);
16193 CC = getSetCCSwappedOperands(CC);
16194 }
16195 }
16196
16197 if (CRHS) {
16198 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
16199 isBoolSGPR(LHS.getOperand(0))) {
16200 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
16201 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
16202 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
16203 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
16204 if ((CRHS->isAllOnes() &&
16205 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
16206 (CRHS->isZero() &&
16207 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
16208 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16209 DAG.getAllOnesConstant(SL, MVT::i1));
16210 if ((CRHS->isAllOnes() &&
16211 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
16212 (CRHS->isZero() &&
16213 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
16214 return LHS.getOperand(0);
16215 }
16216
16217 const APInt &CRHSVal = CRHS->getAPIntValue();
16218 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
16219 LHS.getOpcode() == ISD::SELECT &&
16220 isa<ConstantSDNode>(LHS.getOperand(1)) &&
16221 isa<ConstantSDNode>(LHS.getOperand(2)) &&
16222 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
16223 isBoolSGPR(LHS.getOperand(0))) {
16224 // Given CT != FT:
16225 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
16226 // setcc (select cc, CT, CF), CF, ne => cc
16227 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
16228 // setcc (select cc, CT, CF), CT, eq => cc
16229 const APInt &CT = LHS.getConstantOperandAPInt(1);
16230 const APInt &CF = LHS.getConstantOperandAPInt(2);
16231
16232 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
16233 (CT == CRHSVal && CC == ISD::SETNE))
16234 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16235 DAG.getAllOnesConstant(SL, MVT::i1));
16236 if ((CF == CRHSVal && CC == ISD::SETNE) ||
16237 (CT == CRHSVal && CC == ISD::SETEQ))
16238 return LHS.getOperand(0);
16239 }
16240 }
16241
16242 if (VT != MVT::f32 && VT != MVT::f64 &&
16243 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16244 return SDValue();
16245
16246 // Match isinf/isfinite pattern
16247 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
16248 // (fcmp one (fabs x), inf) -> (fp_class x,
16249 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
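 // Note |x| "one" +inf holds exactly when x is neither NaN nor an infinity,
 // i.e. when x is finite.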
16250 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
16251 LHS.getOpcode() == ISD::FABS) {
16252 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
16253 if (!CRHS)
16254 return SDValue();
16255
16256 const APFloat &APF = CRHS->getValueAPF();
16257 if (APF.isInfinity() && !APF.isNegative()) {
16258 const unsigned IsInfMask =
16259 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
16260 const unsigned IsFiniteMask =
16261 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
16262 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
16263 SIInstrFlags::P_SUBNORMAL;
16264 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
16265 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
16266 DAG.getConstant(Mask, SL, MVT::i32));
16267 }
16268 }
16269
16270 return SDValue();
16271}
16272
16273SDValue
16274SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
16275 DAGCombinerInfo &DCI) const {
16276 SelectionDAG &DAG = DCI.DAG;
16277 SDLoc SL(N);
16278 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
16279
16280 SDValue Src = N->getOperand(0);
16281 SDValue Shift = N->getOperand(0);
16282
16283 // TODO: Extend type shouldn't matter (assuming legal types).
16284 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
16285 Shift = Shift.getOperand(0);
16286
16287 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
16288 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
16289 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
16290 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
16291 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
16292 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
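 // In general, an srl by 8*k selects byte Offset+k of the shift source and an
 // shl by 8*k selects byte Offset-k; the fold below only applies when the
 // shift amount is a byte multiple and the new byte index is still in [0, 3].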
16293 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
16294 SDValue Shifted = DAG.getZExtOrTrunc(
16295 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
16296
16297 unsigned ShiftOffset = 8 * Offset;
16298 if (Shift.getOpcode() == ISD::SHL)
16299 ShiftOffset -= C->getZExtValue();
16300 else
16301 ShiftOffset += C->getZExtValue();
16302
16303 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16304 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
16305 MVT::f32, Shifted);
16306 }
16307 }
16308 }
16309
16310 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16311 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
16312 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
16313 // We simplified Src. If this node is not dead, visit it again so it is
16314 // folded properly.
16315 if (N->getOpcode() != ISD::DELETED_NODE)
16316 DCI.AddToWorklist(N);
16317 return SDValue(N, 0);
16318 }
16319
16320 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
16321 if (SDValue DemandedSrc =
16322 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
16323 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
16324
16325 return SDValue();
16326}
16327
16328SDValue SITargetLowering::performClampCombine(SDNode *N,
16329 DAGCombinerInfo &DCI) const {
16330 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
16331 if (!CSrc)
16332 return SDValue();
16333
16334 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16335 const APFloat &F = CSrc->getValueAPF();
16336 APFloat Zero = APFloat::getZero(F.getSemantics());
16337 if (F < Zero ||
16338 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16339 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
16340 }
16341
16342 APFloat One(F.getSemantics(), "1.0");
16343 if (F > One)
16344 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
16345
16346 return SDValue(CSrc, 0);
16347}
16348
16349SDValue SITargetLowering::performSelectCombine(SDNode *N,
16350 DAGCombinerInfo &DCI) const {
16351
16352 // Try to fold CMP + SELECT patterns with shared constants (both FP and
16353 // integer).
16354 // Detect when CMP and SELECT use the same constant and fold them to avoid
16355 // loading the constant twice. Specifically handles patterns like:
16356 // %cmp = icmp eq i32 %val, 4242
16357 // %sel = select i1 %cmp, i32 4242, i32 %other
16358 // It can be optimized to reuse %val instead of 4242 in select.
16359 SDValue Cond = N->getOperand(0);
16360 SDValue TrueVal = N->getOperand(1);
16361 SDValue FalseVal = N->getOperand(2);
16362
16363 // Check if condition is a comparison.
16364 if (Cond.getOpcode() != ISD::SETCC)
16365 return SDValue();
16366
16367 SDValue LHS = Cond.getOperand(0);
16368 SDValue RHS = Cond.getOperand(1);
16369 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
16370
16371 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
16372 bool isInteger = LHS.getValueType().isInteger();
16373
16374 // Handle simple floating-point and integer types only.
16375 if (!isFloatingPoint && !isInteger)
16376 return SDValue();
16377
16378 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
16379 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
16380 if (!isEquality && !isNonEquality)
16381 return SDValue();
16382
16383 SDValue ArgVal, ConstVal;
16384 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
16385 (isInteger && isa<ConstantSDNode>(RHS))) {
16386 ConstVal = RHS;
16387 ArgVal = LHS;
16388 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
16389 (isInteger && isa<ConstantSDNode>(LHS))) {
16390 ConstVal = LHS;
16391 ArgVal = RHS;
16392 } else {
16393 return SDValue();
16394 }
16395
16396 // Skip optimization for inlinable immediates.
16397 if (isFloatingPoint) {
16398 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
16399 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
16400 return SDValue();
16401 } else {
16402 if (AMDGPU::isInlinableIntLiteral(
16403 cast<ConstantSDNode>(ConstVal)->getSExtValue()))
16404 return SDValue();
16405 }
16406
16407 // For equality and non-equality comparisons, patterns:
16408 // select (setcc x, const), const, y -> select (setcc x, const), x, y
16409 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
16410 if (!(isEquality && TrueVal == ConstVal) &&
16411 !(isNonEquality && FalseVal == ConstVal))
16412 return SDValue();
16413
16414 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
16415 SDValue SelectRHS =
16416 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
16417 return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
16418 SelectLHS, SelectRHS);
16419}
16420
16421SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
16422 DAGCombinerInfo &DCI) const {
16423 switch (N->getOpcode()) {
16424 case ISD::ADD:
16425 case ISD::SUB:
16426 case ISD::SHL:
16427 case ISD::SRL:
16428 case ISD::SRA:
16429 case ISD::AND:
16430 case ISD::OR:
16431 case ISD::XOR:
16432 case ISD::MUL:
16433 case ISD::SETCC:
16434 case ISD::SELECT:
16435 case ISD::SMIN:
16436 case ISD::SMAX:
16437 case ISD::UMIN:
16438 case ISD::UMAX:
16439 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
16440 return Res;
16441 break;
16442 default:
16443 break;
16444 }
16445
16446 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
16447 return SDValue();
16448
16449 switch (N->getOpcode()) {
16450 case ISD::ADD:
16451 return performAddCombine(N, DCI);
16452 case ISD::PTRADD:
16453 return performPtrAddCombine(N, DCI);
16454 case ISD::SUB:
16455 return performSubCombine(N, DCI);
16456 case ISD::UADDO_CARRY:
16457 case ISD::USUBO_CARRY:
16458 return performAddCarrySubCarryCombine(N, DCI);
16459 case ISD::FADD:
16460 return performFAddCombine(N, DCI);
16461 case ISD::FSUB:
16462 return performFSubCombine(N, DCI);
16463 case ISD::FDIV:
16464 return performFDivCombine(N, DCI);
16465 case ISD::FMUL:
16466 return performFMulCombine(N, DCI);
16467 case ISD::SETCC:
16468 return performSetCCCombine(N, DCI);
16469 case ISD::SELECT:
16470 if (auto Res = performSelectCombine(N, DCI))
16471 return Res;
16472 break;
16473 case ISD::FMAXNUM:
16474 case ISD::FMINNUM:
16475 case ISD::FMAXNUM_IEEE:
16476 case ISD::FMINNUM_IEEE:
16477 case ISD::FMAXIMUM:
16478 case ISD::FMINIMUM:
16479 case ISD::FMAXIMUMNUM:
16480 case ISD::FMINIMUMNUM:
16481 case ISD::SMAX:
16482 case ISD::SMIN:
16483 case ISD::UMAX:
16484 case ISD::UMIN:
16485 case AMDGPUISD::FMIN_LEGACY:
16486 case AMDGPUISD::FMAX_LEGACY:
16487 return performMinMaxCombine(N, DCI);
16488 case ISD::FMA:
16489 return performFMACombine(N, DCI);
16490 case ISD::AND:
16491 return performAndCombine(N, DCI);
16492 case ISD::OR:
16493 return performOrCombine(N, DCI);
16494 case ISD::FSHR: {
16495 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16496 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
16497 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
16498 return matchPERM(N, DCI);
16499 }
16500 break;
16501 }
16502 case ISD::XOR:
16503 return performXorCombine(N, DCI);
16504 case ISD::ZERO_EXTEND:
16505 return performZeroExtendCombine(N, DCI);
16506 case ISD::SIGN_EXTEND_INREG:
16507 return performSignExtendInRegCombine(N, DCI);
16508 case AMDGPUISD::FP_CLASS:
16509 return performClassCombine(N, DCI);
16510 case ISD::FCANONICALIZE:
16511 return performFCanonicalizeCombine(N, DCI);
16512 case AMDGPUISD::RCP:
16513 return performRcpCombine(N, DCI);
16514 case ISD::FLDEXP:
16515 case AMDGPUISD::FRACT:
16516 case AMDGPUISD::RSQ:
16517 case AMDGPUISD::RCP_LEGACY:
16518 case AMDGPUISD::RCP_IFLAG:
16519 case AMDGPUISD::RSQ_CLAMP: {
16520 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
16521 SDValue Src = N->getOperand(0);
16522 if (Src.isUndef())
16523 return Src;
16524 break;
16525 }
16526 case ISD::SINT_TO_FP:
16527 case ISD::UINT_TO_FP:
16528 return performUCharToFloatCombine(N, DCI);
16529 case ISD::FCOPYSIGN:
16530 return performFCopySignCombine(N, DCI);
16531 case AMDGPUISD::CVT_F32_UBYTE0:
16532 case AMDGPUISD::CVT_F32_UBYTE1:
16533 case AMDGPUISD::CVT_F32_UBYTE2:
16534 case AMDGPUISD::CVT_F32_UBYTE3:
16535 return performCvtF32UByteNCombine(N, DCI);
16536 case AMDGPUISD::FMED3:
16537 return performFMed3Combine(N, DCI);
16538 case AMDGPUISD::CVT_PKRTZ_F16_F32:
16539 return performCvtPkRTZCombine(N, DCI);
16540 case AMDGPUISD::CLAMP:
16541 return performClampCombine(N, DCI);
16542 case ISD::SCALAR_TO_VECTOR: {
16543 SelectionDAG &DAG = DCI.DAG;
16544 EVT VT = N->getValueType(0);
16545
16546 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
16547 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
16548 SDLoc SL(N);
16549 SDValue Src = N->getOperand(0);
16550 EVT EltVT = Src.getValueType();
16551 if (EltVT != MVT::i16)
16552 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
16553
16554 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
16555 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
16556 }
16557
16558 break;
16559 }
16560 case ISD::EXTRACT_VECTOR_ELT:
16561 return performExtractVectorEltCombine(N, DCI);
16562 case ISD::INSERT_VECTOR_ELT:
16563 return performInsertVectorEltCombine(N, DCI);
16564 case ISD::FP_ROUND:
16565 return performFPRoundCombine(N, DCI);
16566 case ISD::LOAD: {
16567 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
16568 return Widened;
16569 [[fallthrough]];
16570 }
16571 default: {
16572 if (!DCI.isBeforeLegalize()) {
16573 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
16574 return performMemSDNodeCombine(MemNode, DCI);
16575 }
16576
16577 break;
16578 }
16579 }
16580
16581 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
16582}
16583
16584/// Helper function for adjustWritemask
16585static unsigned SubIdx2Lane(unsigned Idx) {
16586 switch (Idx) {
16587 default:
16588 return ~0u;
16589 case AMDGPU::sub0:
16590 return 0;
16591 case AMDGPU::sub1:
16592 return 1;
16593 case AMDGPU::sub2:
16594 return 2;
16595 case AMDGPU::sub3:
16596 return 3;
16597 case AMDGPU::sub4:
16598 return 4; // Possible with TFE/LWE
16599 }
16600}
16601
16602/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
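/// For example, if only the .x and .z results of an image sample with dmask
/// 0xf are extracted, the dmask is reduced to 0x5 and the result type is
/// shrunk to two channels.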
16603SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
16604 SelectionDAG &DAG) const {
16605 unsigned Opcode = Node->getMachineOpcode();
16606
16607 // Subtract 1 because the vdata output is not a MachineSDNode operand.
16608 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
16609 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
16610 return Node; // not implemented for D16
16611
16612 SDNode *Users[5] = {nullptr};
16613 unsigned Lane = 0;
16614 unsigned DmaskIdx =
16615 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
16616 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
16617 unsigned NewDmask = 0;
16618 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
16619 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
16620 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
16621 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
16622 unsigned TFCLane = 0;
16623 bool HasChain = Node->getNumValues() > 1;
16624
16625 if (OldDmask == 0) {
16626 // These are folded out, but on the chance it happens don't assert.
16627 return Node;
16628 }
16629
16630 unsigned OldBitsSet = llvm::popcount(OldDmask);
16631 // Work out which is the TFE/LWE lane if that is enabled.
16632 if (UsesTFC) {
16633 TFCLane = OldBitsSet;
16634 }
16635
16636 // Try to figure out the used register components
16637 for (SDUse &Use : Node->uses()) {
16638
16639 // Don't look at users of the chain.
16640 if (Use.getResNo() != 0)
16641 continue;
16642
16643 SDNode *User = Use.getUser();
16644
16645 // Abort if we can't understand the usage
16646 if (!User->isMachineOpcode() ||
16647 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
16648 return Node;
16649
16650 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
16651 // Note that subregs are packed, i.e. Lane==0 is the first bit set
16652 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
16653 // set, etc.
16654 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
16655 if (Lane == ~0u)
16656 return Node;
16657
16658 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
16659 if (UsesTFC && Lane == TFCLane) {
16660 Users[Lane] = User;
16661 } else {
16662 // Set which texture component corresponds to the lane.
16663 unsigned Comp;
16664 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
16665 Comp = llvm::countr_zero(Dmask);
16666 Dmask &= ~(1 << Comp);
16667 }
16668
16669 // Abort if we have more than one user per component.
16670 if (Users[Lane])
16671 return Node;
16672
16673 Users[Lane] = User;
16674 NewDmask |= 1 << Comp;
16675 }
16676 }
16677
16678 // Don't allow 0 dmask, as hardware assumes one channel enabled.
16679 bool NoChannels = !NewDmask;
16680 if (NoChannels) {
16681 if (!UsesTFC) {
16682 // No uses of the result and not using TFC. Then do nothing.
16683 return Node;
16684 }
16685 // If the original dmask has one channel - then nothing to do
16686 if (OldBitsSet == 1)
16687 return Node;
16688 // Use an arbitrary dmask - required for the instruction to work
16689 NewDmask = 1;
16690 }
16691 // Abort if there's no change
16692 if (NewDmask == OldDmask)
16693 return Node;
16694
16695 unsigned BitsSet = llvm::popcount(NewDmask);
16696
16697 // Check for TFE or LWE - increase the number of channels by one to account
16698 // for the extra return value
16699 // This will need adjustment for D16 if this is also included in
16700 // adjustWriteMask (this function) but at present D16 are excluded.
16701 unsigned NewChannels = BitsSet + UsesTFC;
16702
16703 int NewOpcode =
16704 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
16705 assert(NewOpcode != -1 &&
16706 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
16707 "failed to find equivalent MIMG op");
16708
16709 // Adjust the writemask in the node
16710 SmallVector<SDValue, 12> Ops;
16711 llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
16712 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
16713 llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));
16714
16715 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
16716
16717 MVT ResultVT = NewChannels == 1
16718 ? SVT
16719 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
16720 : NewChannels == 5 ? 8
16721 : NewChannels);
16722 SDVTList NewVTList =
16723 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
16724
16725 MachineSDNode *NewNode =
16726 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
16727
16728 if (HasChain) {
16729 // Update chain.
16730 DAG.setNodeMemRefs(NewNode, Node->memoperands());
16731 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
16732 }
16733
16734 if (NewChannels == 1) {
16735 assert(Node->hasNUsesOfValue(1, 0));
16736 SDNode *Copy =
16737 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
16738 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
16739 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
16740 return nullptr;
16741 }
16742
16743 // Update the users of the node with the new indices
16744 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
16745 SDNode *User = Users[i];
16746 if (!User) {
16747 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
16748 // Users[0] is still nullptr because channel 0 doesn't really have a use.
16749 if (i || !NoChannels)
16750 continue;
16751 } else {
16752 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
16753 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
16754 if (NewUser != User) {
16755 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
16756 DAG.RemoveDeadNode(User);
16757 }
16758 }
16759
16760 switch (Idx) {
16761 default:
16762 break;
16763 case AMDGPU::sub0:
16764 Idx = AMDGPU::sub1;
16765 break;
16766 case AMDGPU::sub1:
16767 Idx = AMDGPU::sub2;
16768 break;
16769 case AMDGPU::sub2:
16770 Idx = AMDGPU::sub3;
16771 break;
16772 case AMDGPU::sub3:
16773 Idx = AMDGPU::sub4;
16774 break;
16775 }
16776 }
16777
16778 DAG.RemoveDeadNode(Node);
16779 return nullptr;
16780}
16781
16782static bool isFrameIndexOp(SDValue Op) {
16783 if (Op.getOpcode() == ISD::AssertZext)
16784 Op = Op.getOperand(0);
16785
16786 return isa<FrameIndexSDNode>(Op);
16787}
16788
16789/// Legalize target independent instructions (e.g. INSERT_SUBREG)
16790/// with frame index operands.
16791/// LLVM assumes that inputs to these instructions are registers.
16792SDNode *
16793SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
16794 SelectionDAG &DAG) const {
16795 if (Node->getOpcode() == ISD::CopyToReg) {
16796 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
16797 SDValue SrcVal = Node->getOperand(2);
16798
16799 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
16800 // to try understanding copies to physical registers.
16801 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
16802 SDLoc SL(Node);
16803 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
16804 SDValue VReg = DAG.getRegister(
16805 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
16806
16807 SDNode *Glued = Node->getGluedNode();
16808 SDValue ToVReg = DAG.getCopyToReg(
16809 Node->getOperand(0), SL, VReg, SrcVal,
16810 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
16811 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
16812 VReg, ToVReg.getValue(1));
16813 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
16814 DAG.RemoveDeadNode(Node);
16815 return ToResultReg.getNode();
16816 }
16817 }
16818
16819 SmallVector<SDValue, 8> Ops;
16820 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
16821 if (!isFrameIndexOp(Node->getOperand(i))) {
16822 Ops.push_back(Node->getOperand(i));
16823 continue;
16824 }
16825
16826 SDLoc DL(Node);
16827 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
16828 Node->getOperand(i).getValueType(),
16829 Node->getOperand(i)),
16830 0));
16831 }
16832
16833 return DAG.UpdateNodeOperands(Node, Ops);
16834}
16835
16836/// Fold the instructions after selecting them.
16837/// Returns null if users were already updated.
16838SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
16839 SelectionDAG &DAG) const {
16840 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16841 unsigned Opcode = Node->getMachineOpcode();
16842
16843 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
16844 !TII->isGather4(Opcode) &&
16845 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
16846 return adjustWritemask(Node, DAG);
16847 }
16848
16849 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
16850 legalizeTargetIndependentNode(Node, DAG);
16851 return Node;
16852 }
16853
16854 switch (Opcode) {
16855 case AMDGPU::V_DIV_SCALE_F32_e64:
16856 case AMDGPU::V_DIV_SCALE_F64_e64: {
16857 // Satisfy the operand register constraint when one of the inputs is
16858 // undefined. Ordinarily each undef value will have its own implicit_def of
16859 // a vreg, so force these to use a single register.
16860 SDValue Src0 = Node->getOperand(1);
16861 SDValue Src1 = Node->getOperand(3);
16862 SDValue Src2 = Node->getOperand(5);
16863
16864 if ((Src0.isMachineOpcode() &&
16865 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
16866 (Src0 == Src1 || Src0 == Src2))
16867 break;
16868
16869 MVT VT = Src0.getValueType().getSimpleVT();
16870 const TargetRegisterClass *RC =
16871 getRegClassFor(VT, Src0.getNode()->isDivergent());
16872
16873 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
16874 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
16875
16876 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
16877 Src0, SDValue());
16878
16879 // src0 must be the same register as src1 or src2, even if the value is
16880 // undefined, so make sure we don't violate this constraint.
16881 if (Src0.isMachineOpcode() &&
16882 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
16883 if (Src1.isMachineOpcode() &&
16884 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
16885 Src0 = Src1;
16886 else if (Src2.isMachineOpcode() &&
16887 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
16888 Src0 = Src2;
16889 else {
16890 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
16891 Src0 = UndefReg;
16892 Src1 = UndefReg;
16893 }
16894 } else
16895 break;
16896
16897 SmallVector<SDValue, 9> Ops(Node->ops());
16898 Ops[1] = Src0;
16899 Ops[3] = Src1;
16900 Ops[5] = Src2;
16901 Ops.push_back(ImpDef.getValue(1));
16902 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
16903 }
16904 default:
16905 break;
16906 }
16907
16908 return Node;
16909}
16910
16911// Any MIMG instructions that use tfe or lwe require an initialization of the
16912// result register that will be written in the case of a memory access failure.
16913// The required code is also added to tie this init code to the result of the
16914// img instruction.
16915void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
16916 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16917 const SIRegisterInfo &TRI = TII->getRegisterInfo();
16918 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
16919 MachineBasicBlock &MBB = *MI.getParent();
16920
16921 int DstIdx =
16922 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
16923 unsigned InitIdx = 0;
16924
16925 if (TII->isImage(MI)) {
16926 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
16927 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
16928 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
16929
16930 if (!TFE && !LWE) // intersect_ray
16931 return;
16932
16933 unsigned TFEVal = TFE ? TFE->getImm() : 0;
16934 unsigned LWEVal = LWE ? LWE->getImm() : 0;
16935 unsigned D16Val = D16 ? D16->getImm() : 0;
16936
16937 if (!TFEVal && !LWEVal)
16938 return;
16939
16940 // At least one of TFE or LWE are non-zero
16941 // We have to insert a suitable initialization of the result value and
16942 // tie this to the dest of the image instruction.
16943
16944 // Calculate which dword we have to initialize to 0.
16945 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
16946
16947 // check that dmask operand is found.
16948 assert(MO_Dmask && "Expected dmask operand in instruction");
16949
16950 unsigned dmask = MO_Dmask->getImm();
16951 // Determine the number of active lanes taking into account the
16952 // Gather4 special case
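 // (Gather4 returns one component from each of four texels, so it always
 // writes four result values regardless of the dmask.)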
16953 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
16954
16955 bool Packed = !Subtarget->hasUnpackedD16VMem();
16956
16957 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
16958
16959 // Abandon attempt if the dst size isn't large enough
16960 // - this is in fact an error but this is picked up elsewhere and
16961 // reported correctly.
16962 uint32_t DstSize =
16963 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
16964 if (DstSize < InitIdx)
16965 return;
16966 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
16967 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
16968 } else {
16969 return;
16970 }
16971
16972 const DebugLoc &DL = MI.getDebugLoc();
16973
16974 // Create a register for the initialization value.
16975 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
16976 unsigned NewDst = 0; // Final initialized value will be in here
16977
16978 // If PRTStrictNull feature is enabled (the default) then initialize
16979 // all the result registers to 0, otherwise just the error indication
16980 // register (VGPRn+1)
16981 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
16982 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
16983
16984 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
16985 for (; SizeLeft; SizeLeft--, CurrIdx++) {
16986 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
16987 // Initialize dword
16988 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
16989 // clang-format off
16990 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
16991 .addImm(0);
16992 // clang-format on
16993 // Insert into the super-reg
16994 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
16995 .addReg(PrevDst)
16996 .addReg(SubReg)
16997 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
16998
16999 PrevDst = NewDst;
17000 }
17001
17002 // Add as an implicit operand
17003 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
17004
17005 // Tie the just added implicit operand to the dst
17006 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
17007}
17008
17009/// Assign the register class depending on the number of
17010/// bits set in the writemask
17011void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
17012 SDNode *Node) const {
17013 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17014
17015 MachineFunction *MF = MI.getParent()->getParent();
17016 MachineRegisterInfo &MRI = MF->getRegInfo();
17017 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
17018
17019 if (TII->isVOP3(MI.getOpcode())) {
17020 // Make sure constant bus requirements are respected.
17021 TII->legalizeOperandsVOP3(MRI, MI);
17022
17023 // Prefer VGPRs over AGPRs in mAI instructions where possible.
17024 // This saves a chain-copy of registers and better balance register
17025 // use between vgpr and agpr as agpr tuples tend to be big.
17026 if (!MI.getDesc().operands().empty()) {
17027 unsigned Opc = MI.getOpcode();
17028 bool HasAGPRs = Info->mayNeedAGPRs();
17029 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17030 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
17031 for (auto I :
17032 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
17033 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
17034 if (I == -1)
17035 break;
17036 if ((I == Src2Idx) && (HasAGPRs))
17037 break;
17038 MachineOperand &Op = MI.getOperand(I);
17039 if (!Op.isReg() || !Op.getReg().isVirtual())
17040 continue;
17041 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
17042 if (!TRI->hasAGPRs(RC))
17043 continue;
17044 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
17045 if (!Src || !Src->isCopy() ||
17046 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
17047 continue;
17048 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
17049 // All uses of agpr64 and agpr32 can also accept vgpr except for
17050 // v_accvgpr_read, but we do not produce agpr reads during selection,
17051 // so no use checks are needed.
17052 MRI.setRegClass(Op.getReg(), NewRC);
17053 }
17054
17055 if (TII->isMAI(MI)) {
17056 // The ordinary src0, src1, src2 were legalized above.
17057 //
17058 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
17059 // as a separate instruction.
17060 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17061 AMDGPU::OpName::scale_src0);
17062 if (Src0Idx != -1) {
17063 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17064 AMDGPU::OpName::scale_src1);
17065 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17066 TII->usesConstantBus(MRI, MI, Src1Idx))
17067 TII->legalizeOpWithMove(MI, Src1Idx);
17068 }
17069 }
17070
17071 if (!HasAGPRs)
17072 return;
17073
17074 // Resolve the rest of AV operands to AGPRs.
17075 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
17076 if (Src2->isReg() && Src2->getReg().isVirtual()) {
17077 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
17078 if (TRI->isVectorSuperClass(RC)) {
17079 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
17080 MRI.setRegClass(Src2->getReg(), NewRC);
17081 if (Src2->isTied())
17082 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
17083 }
17084 }
17085 }
17086 }
17087
17088 return;
17089 }
17090
17091 if (TII->isImage(MI))
17092 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
17093}
17094
17095static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
17096 uint64_t Val) {
17097 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
17098 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
17099}
17100
17101MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
17102 const SDLoc &DL,
17103 SDValue Ptr) const {
17104 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17105
17106 // Build the half of the subregister with the constants before building the
17107 // full 128-bit register. If we are building multiple resource descriptors,
17108 // this will allow CSEing of the 2-component register.
17109 const SDValue Ops0[] = {
17110 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
17111 buildSMovImm32(DAG, DL, 0),
17112 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17113 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
17114 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
17115
17116 SDValue SubRegHi = SDValue(
17117 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
17118
17119 // Combine the constants and the pointer.
17120 const SDValue Ops1[] = {
17121 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
17122 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
17123 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
17124
17125 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
17126}
17127
17128/// Return a resource descriptor with the 'Add TID' bit enabled
17129/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
17130/// of the resource descriptor) to create an offset, which is added to
17131/// the resource pointer.
17132MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
17133 SDValue Ptr, uint32_t RsrcDword1,
17134 uint64_t RsrcDword2And3) const {
17135 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
17136 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
17137 if (RsrcDword1) {
17138 PtrHi =
17139 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
17140 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
17141 0);
17142 }
17143
17144 SDValue DataLo =
17145 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
17146 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
17147
17148 const SDValue Ops[] = {
17149 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
17150 PtrLo,
17151 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17152 PtrHi,
17153 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
17154 DataLo,
17155 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
17156 DataHi,
17157 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
17158
17159 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
17160}
17161
17162//===----------------------------------------------------------------------===//
17163// SI Inline Assembly Support
17164//===----------------------------------------------------------------------===//
17165
17166std::pair<unsigned, const TargetRegisterClass *>
17167SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
17168 StringRef Constraint,
17169 MVT VT) const {
17170 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
17171
17172 const TargetRegisterClass *RC = nullptr;
17173 if (Constraint.size() == 1) {
17174 // Check if we cannot determine the bit size of the given value type. This
17175 // can happen, for example, in this situation where we have an empty struct
17176 // (size 0): `call void asm "", "v"({} poison)`-
17177 if (VT == MVT::Other)
17178 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17179 const unsigned BitWidth = VT.getSizeInBits();
17180 switch (Constraint[0]) {
17181 default:
17182 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17183 case 's':
17184 case 'r':
17185 switch (BitWidth) {
17186 case 16:
17187 RC = &AMDGPU::SReg_32RegClass;
17188 break;
17189 case 64:
17190 RC = &AMDGPU::SGPR_64RegClass;
17191 break;
17192 default:
17193 RC = TRI->getSGPRClassForBitWidth(BitWidth);
17194 if (!RC)
17195 return std::pair(0U, nullptr);
17196 break;
17197 }
17198 break;
17199 case 'v':
17200 switch (BitWidth) {
17201 case 16:
17202 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17203 : &AMDGPU::VGPR_32_Lo256RegClass;
17204 break;
17205 default:
17206 RC = Subtarget->has1024AddressableVGPRs()
17207 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17208 : TRI->getVGPRClassForBitWidth(BitWidth);
17209 if (!RC)
17210 return std::pair(0U, nullptr);
17211 break;
17212 }
17213 break;
17214 case 'a':
17215 if (!Subtarget->hasMAIInsts())
17216 break;
17217 switch (BitWidth) {
17218 case 16:
17219 RC = &AMDGPU::AGPR_32RegClass;
17220 break;
17221 default:
17222 RC = TRI->getAGPRClassForBitWidth(BitWidth);
17223 if (!RC)
17224 return std::pair(0U, nullptr);
17225 break;
17226 }
17227 break;
17228 }
17229 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17230 const unsigned BitWidth = VT.getSizeInBits();
17231 switch (BitWidth) {
17232 case 16:
17233 RC = &AMDGPU::AV_32RegClass;
17234 break;
17235 default:
17236 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17237 if (!RC)
17238 return std::pair(0U, nullptr);
17239 break;
17240 }
17241 }
17242
17243 // We actually support i128, i16 and f16 as inline parameters
17244 // even if they are not reported as legal
17245 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
17246 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
17247 return std::pair(0U, RC);
17248
17249 auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
17250 if (Kind != '\0') {
17251 if (Kind == 'v') {
17252 RC = &AMDGPU::VGPR_32_Lo256RegClass;
17253 } else if (Kind == 's') {
17254 RC = &AMDGPU::SGPR_32RegClass;
17255 } else if (Kind == 'a') {
17256 RC = &AMDGPU::AGPR_32RegClass;
17257 }
17258
17259 if (RC) {
17260 if (NumRegs > 1) {
17261 if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
17262 return std::pair(0U, nullptr);
17263
17264 uint32_t Width = NumRegs * 32;
17265 // Prohibit constraints for register ranges with a width that does not
17266 // match the required type.
17267 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
17268 return std::pair(0U, nullptr);
17269
17270 MCRegister Reg = RC->getRegister(Idx);
17271 if (SIRegisterInfo::isVGPRClass(RC))
17272 RC = TRI->getVGPRClassForBitWidth(Width);
17273 else if (SIRegisterInfo::isSGPRClass(RC))
17274 RC = TRI->getSGPRClassForBitWidth(Width);
17275 else if (SIRegisterInfo::isAGPRClass(RC))
17276 RC = TRI->getAGPRClassForBitWidth(Width);
17277 if (RC) {
17278 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17279 if (!Reg) {
17280 // The register class does not contain the requested register,
17281 // e.g., because it is an SGPR pair that would violate alignment
17282 // requirements.
17283 return std::pair(0U, nullptr);
17284 }
17285 return std::pair(Reg, RC);
17286 }
17287 }
17288
17289 // Check for lossy scalar/vector conversions.
17290 if (VT.isVector() && VT.getSizeInBits() != 32)
17291 return std::pair(0U, nullptr);
17292 if (Idx < RC->getNumRegs())
17293 return std::pair(RC->getRegister(Idx), RC);
17294 return std::pair(0U, nullptr);
17295 }
17296 }
17297
17298 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17299 if (Ret.first)
17300 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
17301
17302 return Ret;
17303}
17304
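// The single-letter constraints 'I', 'J', 'A', 'B' and 'C' and the two-letter
// constraints "DA" and "DB" take immediate operands; checkAsmConstraintVal
// below defines the value range each of them accepts.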
17305static bool isImmConstraint(StringRef Constraint) {
17306 if (Constraint.size() == 1) {
17307 switch (Constraint[0]) {
17308 default:
17309 break;
17310 case 'I':
17311 case 'J':
17312 case 'A':
17313 case 'B':
17314 case 'C':
17315 return true;
17316 }
17317 } else if (Constraint == "DA" || Constraint == "DB") {
17318 return true;
17319 }
17320 return false;
17321}
17322
17323SITargetLowering::ConstraintType
17324SITargetLowering::getConstraintType(StringRef Constraint) const {
17325 if (Constraint.size() == 1) {
17326 switch (Constraint[0]) {
17327 default:
17328 break;
17329 case 's':
17330 case 'v':
17331 case 'a':
17332 return C_RegisterClass;
17333 }
17334 } else if (Constraint.size() == 2) {
17335 if (Constraint == "VA")
17336 return C_RegisterClass;
17337 }
17338 if (isImmConstraint(Constraint)) {
17339 return C_Other;
17340 }
17341 return TargetLowering::getConstraintType(Constraint);
17342}
17343
17344static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
17345 if (Size < 64) {
17346 Val = Val & maskTrailingOnes<uint64_t>(Size);
17347 }
17348 return Val;
17349}
17350
17351void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
17352 StringRef Constraint,
17353 std::vector<SDValue> &Ops,
17354 SelectionDAG &DAG) const {
17355 if (isImmConstraint(Constraint)) {
17356 uint64_t Val;
17357 if (getAsmOperandConstVal(Op, Val) &&
17358 checkAsmConstraintVal(Op, Constraint, Val)) {
17359 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
17360 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
17361 }
17362 } else {
17363 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
17364 }
17365}
17366
17367bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
17368 unsigned Size = Op.getScalarValueSizeInBits();
17369 if (Size > 64)
17370 return false;
17371
17372 if (Size == 16 && !Subtarget->has16BitInsts())
17373 return false;
17374
17375 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
17376 Val = C->getSExtValue();
17377 return true;
17378 }
17379 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
17380 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17381 return true;
17382 }
17383 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
17384 if (Size != 16 || Op.getNumOperands() != 2)
17385 return false;
17386 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17387 return false;
17388 if (ConstantSDNode *C = V->getConstantSplatNode()) {
17389 Val = C->getSExtValue();
17390 return true;
17391 }
17392 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
17393 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17394 return true;
17395 }
17396 }
17397
17398 return false;
17399}
17400
17401bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
17402 uint64_t Val) const {
17403 if (Constraint.size() == 1) {
17404 switch (Constraint[0]) {
17405 case 'I':
17406 return AMDGPU::isInlinableIntLiteral(Val);
17407 case 'J':
17408 return isInt<16>(Val);
17409 case 'A':
17410 return checkAsmConstraintValA(Op, Val);
17411 case 'B':
17412 return isInt<32>(Val);
17413 case 'C':
17414 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
17415 AMDGPU::isInlinableIntLiteral(Val);
17416 default:
17417 break;
17418 }
17419 } else if (Constraint.size() == 2) {
17420 if (Constraint == "DA") {
17421 int64_t HiBits = static_cast<int32_t>(Val >> 32);
17422 int64_t LoBits = static_cast<int32_t>(Val);
17423 return checkAsmConstraintValA(Op, HiBits, 32) &&
17424 checkAsmConstraintValA(Op, LoBits, 32);
17425 }
17426 if (Constraint == "DB") {
17427 return true;
17428 }
17429 }
17430 llvm_unreachable("Invalid asm constraint");
17431}
17432
17433bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
17434 unsigned MaxSize) const {
17435 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
17436 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
17437 if (Size == 16) {
17438 MVT VT = Op.getSimpleValueType();
17439 switch (VT.SimpleTy) {
17440 default:
17441 return false;
17442 case MVT::i16:
17443 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
17444 case MVT::f16:
17445 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
17446 case MVT::bf16:
17447 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
17448 case MVT::v2i16:
17449 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
17450 case MVT::v2f16:
17451 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
17452 case MVT::v2bf16:
17453 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
17454 }
17455 }
17456 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
17457 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
17458 return true;
17459 return false;
17460}
17461
17462static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
17463 switch (UnalignedClassID) {
17464 case AMDGPU::VReg_64RegClassID:
17465 return AMDGPU::VReg_64_Align2RegClassID;
17466 case AMDGPU::VReg_96RegClassID:
17467 return AMDGPU::VReg_96_Align2RegClassID;
17468 case AMDGPU::VReg_128RegClassID:
17469 return AMDGPU::VReg_128_Align2RegClassID;
17470 case AMDGPU::VReg_160RegClassID:
17471 return AMDGPU::VReg_160_Align2RegClassID;
17472 case AMDGPU::VReg_192RegClassID:
17473 return AMDGPU::VReg_192_Align2RegClassID;
17474 case AMDGPU::VReg_224RegClassID:
17475 return AMDGPU::VReg_224_Align2RegClassID;
17476 case AMDGPU::VReg_256RegClassID:
17477 return AMDGPU::VReg_256_Align2RegClassID;
17478 case AMDGPU::VReg_288RegClassID:
17479 return AMDGPU::VReg_288_Align2RegClassID;
17480 case AMDGPU::VReg_320RegClassID:
17481 return AMDGPU::VReg_320_Align2RegClassID;
17482 case AMDGPU::VReg_352RegClassID:
17483 return AMDGPU::VReg_352_Align2RegClassID;
17484 case AMDGPU::VReg_384RegClassID:
17485 return AMDGPU::VReg_384_Align2RegClassID;
17486 case AMDGPU::VReg_512RegClassID:
17487 return AMDGPU::VReg_512_Align2RegClassID;
17488 case AMDGPU::VReg_1024RegClassID:
17489 return AMDGPU::VReg_1024_Align2RegClassID;
17490 case AMDGPU::AReg_64RegClassID:
17491 return AMDGPU::AReg_64_Align2RegClassID;
17492 case AMDGPU::AReg_96RegClassID:
17493 return AMDGPU::AReg_96_Align2RegClassID;
17494 case AMDGPU::AReg_128RegClassID:
17495 return AMDGPU::AReg_128_Align2RegClassID;
17496 case AMDGPU::AReg_160RegClassID:
17497 return AMDGPU::AReg_160_Align2RegClassID;
17498 case AMDGPU::AReg_192RegClassID:
17499 return AMDGPU::AReg_192_Align2RegClassID;
17500 case AMDGPU::AReg_256RegClassID:
17501 return AMDGPU::AReg_256_Align2RegClassID;
17502 case AMDGPU::AReg_512RegClassID:
17503 return AMDGPU::AReg_512_Align2RegClassID;
17504 case AMDGPU::AReg_1024RegClassID:
17505 return AMDGPU::AReg_1024_Align2RegClassID;
17506 default:
17507 return -1;
17508 }
17509}
17510
17511// Figure out which registers should be reserved for stack access. Only after
17512// the function is legalized do we know all of the non-spill stack objects or if
17513// calls are present.
17514void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
17515 MachineRegisterInfo &MRI = MF.getRegInfo();
17516 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
17517 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
17518 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17519 const SIInstrInfo *TII = ST.getInstrInfo();
17520
17521 if (Info->isEntryFunction()) {
17522 // Callable functions have fixed registers used for stack access.
17523 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
17524 }
17525
17526 // TODO: Move this logic to getReservedRegs()
17527 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
17528 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
17529 Register SReg = ST.isWave32()
17530 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
17531 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
17532 &AMDGPU::SGPR_64RegClass);
17533 Info->setSGPRForEXECCopy(SReg);
17534
17535 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
17536 Info->getStackPtrOffsetReg()));
17537 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
17538 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
17539
17540 // We need to worry about replacing the default register with itself in case
17541 // of MIR testcases missing the MFI.
17542 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
17543 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
17544
17545 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
17546 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
17547
17548 Info->limitOccupancy(MF);
17549
17550 if (ST.isWave32() && !MF.empty()) {
17551 for (auto &MBB : MF) {
17552 for (auto &MI : MBB) {
17553 TII->fixImplicitOperands(MI);
17554 }
17555 }
17556 }
17557
17558 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
17559 // classes if required. Ideally the register class constraints would differ
17560 // per-subtarget, but there's no easy way to achieve that right now. This is
17561 // not a problem for VGPRs because the correctly aligned VGPR class is implied
17562 // from using them as the register class for legal types.
17563 if (ST.needsAlignedVGPRs()) {
17564 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
17565 const Register Reg = Register::index2VirtReg(I);
17566 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
17567 if (!RC)
17568 continue;
17569 int NewClassID = getAlignedAGPRClassID(RC->getID());
17570 if (NewClassID != -1)
17571 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
17572 }
17573 }
17574
17575 TargetLoweringBase::finalizeLowering(MF);
17576}
17577
17578void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
17579 KnownBits &Known,
17580 const APInt &DemandedElts,
17581 const SelectionDAG &DAG,
17582 unsigned Depth) const {
17583 Known.resetAll();
17584 unsigned Opc = Op.getOpcode();
17585 switch (Opc) {
17586 case ISD::INTRINSIC_WO_CHAIN: {
17587 unsigned IID = Op.getConstantOperandVal(0);
17588 switch (IID) {
17589 case Intrinsic::amdgcn_mbcnt_lo:
17590 case Intrinsic::amdgcn_mbcnt_hi: {
17591 const GCNSubtarget &ST =
17592 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
17593 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
17594 // most 31 + src1.
17595 Known.Zero.setBitsFrom(
17596 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
17597 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
17598 Known = KnownBits::add(Known, Known2);
17599 return;
17600 }
17601 }
17602 break;
17603 }
17604 }
17605 return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
17606 Op, Known, DemandedElts, DAG, Depth);
17607}
17608
17609void SITargetLowering::computeKnownBitsForFrameIndex(
17610 const int FI, KnownBits &Known, const MachineFunction &MF) const {
17611 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
17612
17613 // Set the high bits to zero based on the maximum allowed scratch size per
17614 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
17615 // calculation won't overflow, so assume the sign bit is never set.
17616 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
17617}
17618
17619static void knownBitsForWorkitemID(const GCNSubtarget &ST,
17620 GISelValueTracking &VT, KnownBits &Known,
17621 unsigned Dim) {
17622 unsigned MaxValue =
17623 ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
17624 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
17625}
17626
17627static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT,
17628 KnownBits &Known, const APInt &DemandedElts,
17629 unsigned BFEWidth, bool SExt, unsigned Depth) {
17630 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
17631 const MachineOperand &Src1 = MI.getOperand(2);
17632
17633 unsigned Src1Cst = 0;
17634 if (Src1.isImm()) {
17635 Src1Cst = Src1.getImm();
17636 } else if (Src1.isReg()) {
17637 auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
17638 if (!Cst)
17639 return;
17640 Src1Cst = Cst->Value.getZExtValue();
17641 } else {
17642 return;
17643 }
17644
17645 // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
17646 // Width is always [22:16].
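 // For example, Src1Cst == 0x00080008 extracts an 8-bit wide field starting
 // at bit 8.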
17647 const unsigned Offset =
17648 Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
17649 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
17650
17651 if (Width >= BFEWidth) // Ill-formed.
17652 return;
17653
17654 VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
17655 Depth + 1);
17656
17657 Known = Known.extractBits(Width, Offset);
17658
17659 if (SExt)
17660 Known = Known.sext(BFEWidth);
17661 else
17662 Known = Known.zext(BFEWidth);
17663}
17664
17665void SITargetLowering::computeKnownBitsForTargetInstr(
17666 GISelValueTracking &VT, Register R, KnownBits &Known,
17667 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
17668 unsigned Depth) const {
17669 Known.resetAll();
17670 const MachineInstr *MI = MRI.getVRegDef(R);
17671 switch (MI->getOpcode()) {
17672 case AMDGPU::S_BFE_I32:
17673 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
17674 /*SExt=*/true, Depth);
17675 case AMDGPU::S_BFE_U32:
17676 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
17677 /*SExt=*/false, Depth);
17678 case AMDGPU::S_BFE_I64:
17679 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
17680 /*SExt=*/true, Depth);
17681 case AMDGPU::S_BFE_U64:
17682 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
17683 /*SExt=*/false, Depth);
17684 case AMDGPU::G_INTRINSIC:
17685 case AMDGPU::G_INTRINSIC_CONVERGENT: {
17686 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
17687 switch (IID) {
17688 case Intrinsic::amdgcn_workitem_id_x:
17689 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
17690 break;
17691 case Intrinsic::amdgcn_workitem_id_y:
17692 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
17693 break;
17694 case Intrinsic::amdgcn_workitem_id_z:
17695 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
17696 break;
17697 case Intrinsic::amdgcn_mbcnt_lo:
17698 case Intrinsic::amdgcn_mbcnt_hi: {
17699 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
17700 // most 31 + src1.
17701 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
17702 ? getSubtarget()->getWavefrontSizeLog2()
17703 : 5);
17704 KnownBits Known2;
17705 VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
17706 Depth + 1);
17707 Known = KnownBits::add(Known, Known2);
17708 break;
17709 }
17710 case Intrinsic::amdgcn_groupstaticsize: {
17711 // We can report everything over the maximum size as 0. We can't report
17712 // based on the actual size because we don't know if it's accurate or not
17713 // at any given point.
17714 Known.Zero.setHighBits(
17715 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
17716 break;
17717 }
17718 }
17719 break;
17720 }
17721 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
17722 Known.Zero.setHighBits(24);
17723 break;
17724 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
17725 Known.Zero.setHighBits(16);
17726 break;
17727 case AMDGPU::G_AMDGPU_SMED3:
17728 case AMDGPU::G_AMDGPU_UMED3: {
17729 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
17730
17731 KnownBits Known2;
17732 VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
17733 if (Known2.isUnknown())
17734 break;
17735
17736 KnownBits Known1;
17737 VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
17738 if (Known1.isUnknown())
17739 break;
17740
17741 KnownBits Known0;
17742 VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
17743 if (Known0.isUnknown())
17744 break;
17745
17746 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
17747 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
17748 Known.One = Known0.One & Known1.One & Known2.One;
17749 break;
17750 }
17751 }
17752}
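As a rough illustration of the amdgcn.groupstaticsize bound handled above, a small standalone sketch (hypothetical 64 KiB addressable LDS size, plain C++20; not part of this file): counting the leading zeros of the maximum LDS size gives the number of high bits that can safely be reported as known zero.

#include <bit>
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t MaxLDSBytes = 65536; // hypothetical addressable LDS size
  const int KnownZeroHighBits = std::countl_zero(MaxLDSBytes); // 15
  // Any group-static size is <= MaxLDSBytes, so its top 15 bits must be zero;
  // this mirrors Known.Zero.setHighBits(countl_zero(...)) above.
  std::printf("high bits known zero: %d\n", KnownZeroHighBits);
  return 0;
}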
17753
17754Align SITargetLowering::computeKnownAlignForTargetInstr(
17755 GISelValueTracking &VT, Register R, const MachineRegisterInfo &MRI,
17756 unsigned Depth) const {
17757 const MachineInstr *MI = MRI.getVRegDef(R);
17758 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
17759 // FIXME: Can this move to generic code? What about the case where the call
17760 // site specifies a lower alignment?
17761 Intrinsic::ID IID = GI->getIntrinsicID();
17763 AttributeList Attrs =
17764 Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
17765 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
17766 return *RetAlign;
17767 }
17768 return Align(1);
17769}
17770
17771Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
17772 const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
17773 const Align CacheLineAlign = Align(64);
17774
17775 // Pre-GFX10 targets did not benefit from loop alignment
17776 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
17777 getSubtarget()->hasInstFwdPrefetchBug())
17778 return PrefAlign;
17779
17780 // On GFX10 the I$ consists of 4 x 64-byte cache lines.
17781 // By default the prefetcher keeps one cache line behind and reads two ahead.
17782 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
17783 // behind and one ahead.
17784 // Therefore we can benefit from aligning loop headers if the loop fits in 192
17785 // bytes. If the loop fits in 64 bytes it always spans no more than two cache
17786 // lines and does not need alignment.
17787 // Otherwise, if the loop is at most 128 bytes we do not need to modify the
17788 // prefetch settings; if it is at most 192 bytes we need two lines behind.
17789
17790 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17791 const MachineBasicBlock *Header = ML->getHeader();
17792 if (Header->getAlignment() != PrefAlign)
17793 return Header->getAlignment(); // Already processed.
17794
17795 unsigned LoopSize = 0;
17796 for (const MachineBasicBlock *MBB : ML->blocks()) {
17797 // If an inner loop block is aligned, assume on average half of the alignment
17798 // size is added as nops.
17799 if (MBB != Header)
17800 LoopSize += MBB->getAlignment().value() / 2;
17801
17802 for (const MachineInstr &MI : *MBB) {
17803 LoopSize += TII->getInstSizeInBytes(MI);
17804 if (LoopSize > 192)
17805 return PrefAlign;
17806 }
17807 }
17808
17809 if (LoopSize <= 64)
17810 return PrefAlign;
17811
17812 if (LoopSize <= 128)
17813 return CacheLineAlign;
17814
17815 // If any of the parent loops is surrounded by prefetch instructions, do not
17816 // insert new ones for the inner loop, which would reset the parent's settings.
17817 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
17818 if (MachineBasicBlock *Exit = P->getExitBlock()) {
17819 auto I = Exit->getFirstNonDebugInstr();
17820 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
17821 return CacheLineAlign;
17822 }
17823 }
17824
17825 MachineBasicBlock *Pre = ML->getLoopPreheader();
17826 MachineBasicBlock *Exit = ML->getExitBlock();
17827
17828 if (Pre && Exit) {
17829 auto PreTerm = Pre->getFirstTerminator();
17830 if (PreTerm == Pre->begin() ||
17831 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
17832 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
17833 .addImm(1); // prefetch 2 lines behind PC
17834
17835 auto ExitHead = Exit->getFirstNonDebugInstr();
17836 if (ExitHead == Exit->end() ||
17837 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
17838 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
17839 .addImm(2); // prefetch 1 line behind PC
17840 }
17841
17842 return CacheLineAlign;
17843}
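A condensed sketch of the size thresholds applied by getPrefLoopAlignment above (illustrative only; "default alignment" stands in for the PrefAlign value returned by the base class):

#include <cstdio>

// Mirrors the decisions above: <= 64 bytes needs no extra alignment,
// <= 128 bytes gets cache-line (64-byte) alignment, <= 192 bytes additionally
// adjusts S_INST_PREFETCH, and anything larger falls back to the default.
static const char *loopAlignChoice(unsigned LoopSizeBytes) {
  if (LoopSizeBytes <= 64)
    return "default alignment";
  if (LoopSizeBytes <= 128)
    return "64-byte alignment";
  if (LoopSizeBytes <= 192)
    return "64-byte alignment + S_INST_PREFETCH adjustment";
  return "default alignment (too large to benefit)";
}

int main() {
  for (unsigned Size : {48u, 96u, 160u, 256u})
    std::printf("%u bytes -> %s\n", Size, loopAlignChoice(Size));
  return 0;
}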
17844
17845LLVM_ATTRIBUTE_UNUSED
17846static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
17847 assert(N->getOpcode() == ISD::CopyFromReg);
17848 do {
17849 // Follow the chain until we find an INLINEASM node.
17850 N = N->getOperand(0).getNode();
17851 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
17852 return true;
17853 } while (N->getOpcode() == ISD::CopyFromReg);
17854 return false;
17855}
17856
17857bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
17858 FunctionLoweringInfo *FLI,
17859 UniformityInfo *UA) const {
17860 switch (N->getOpcode()) {
17861 case ISD::CopyFromReg: {
17862 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
17863 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
17864 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17865 Register Reg = R->getReg();
17866
17867 // FIXME: Why does this need to consider isLiveIn?
17868 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
17869 return !TRI->isSGPRReg(MRI, Reg);
17870
17871 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
17872 return UA->isDivergent(V);
17873
17875 return !TRI->isSGPRReg(MRI, Reg);
17876 }
17877 case ISD::LOAD: {
17878 const LoadSDNode *L = cast<LoadSDNode>(N);
17879 unsigned AS = L->getAddressSpace();
17880 // A flat load may access private memory.
17881 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
17882 }
17883 case ISD::CALLSEQ_END:
17884 return true;
17885 case ISD::INTRINSIC_WO_CHAIN:
17886 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
17887 case ISD::INTRINSIC_W_CHAIN:
17888 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
17907 // Target-specific read-modify-write atomics are sources of divergence.
17908 return true;
17909 default:
17910 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
17911 // Generic read-modify-write atomics are sources of divergence.
17912 return A->readMem() && A->writeMem();
17913 }
17914 return false;
17915 }
17916}
17917
17918bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
17919 EVT VT) const {
17920 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
17921 case MVT::f32:
17922 return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
17923 case MVT::f64:
17924 case MVT::f16:
17925 return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
17926 default:
17927 return false;
17928 }
17929}
17930
17931bool SITargetLowering::denormalsEnabledForType(
17932 LLT Ty, const MachineFunction &MF) const {
17933 switch (Ty.getScalarSizeInBits()) {
17934 case 32:
17935 return !denormalModeIsFlushAllF32(MF);
17936 case 64:
17937 case 16:
17938 return !denormalModeIsFlushAllF64F16(MF);
17939 default:
17940 return false;
17941 }
17942}
17943
17944bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
17945 const APInt &DemandedElts,
17946 const SelectionDAG &DAG,
17947 bool SNaN,
17948 unsigned Depth) const {
17949 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
17950 const MachineFunction &MF = DAG.getMachineFunction();
17951 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
17952
17953 if (Info->getMode().DX10Clamp)
17954 return true; // Clamped to 0.
17955 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
17956 }
17957
17958 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DemandedElts,
17959 DAG, SNaN, Depth);
17960}
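The DX10Clamp reasoning above ("Clamped to 0.") can be seen in a tiny standalone model (illustrative only, not the hardware or codegen implementation): with DX10-style clamping a NaN input is mapped to 0.0, so the clamp result is never NaN regardless of its operand.

#include <algorithm>
#include <cmath>
#include <cstdio>

// Simple model of a DX10-style clamp to [0, 1]: NaN inputs become 0.0f.
static float dx10Clamp(float X) {
  if (std::isnan(X))
    return 0.0f;
  return std::min(std::max(X, 0.0f), 1.0f);
}

int main() {
  const float Nan = std::nanf("");
  std::printf("clamp(NaN) = %f, isnan = %d\n", dx10Clamp(Nan),
              (int)std::isnan(dx10Clamp(Nan))); // clamp(NaN) = 0.000000, isnan = 0
  return 0;
}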
17961
17962// On older subtargets, global FP atomic instructions have a hardcoded FP mode
17963// and do not support FP32 denormals, and only support v2f16/f64 denormals.
17964static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
17965 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
17966 return true;
17967
17968 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
17969 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
17970 if (DenormMode == DenormalMode::getPreserveSign())
17971 return true;
17972
17973 // TODO: Remove this.
17974 return RMW->getFunction()
17975 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
17976 .getValueAsBool();
17977}
17978
17979static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
17980 LLVMContext &Ctx = RMW->getContext();
17981 StringRef MemScope =
17982 Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");
17983
17984 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
17985 << "Hardware instruction generated for atomic "
17986 << RMW->getOperationName(RMW->getOperation())
17987 << " operation at memory scope " << MemScope;
17988}
17989
17990static bool isV2F16OrV2BF16(Type *Ty) {
17991 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
17992 Type *EltTy = VT->getElementType();
17993 return VT->getNumElements() == 2 &&
17994 (EltTy->isHalfTy() || EltTy->isBFloatTy());
17995 }
17996
17997 return false;
17998}
17999
18000static bool isV2F16(Type *Ty) {
18001 auto *VT = dyn_cast<FixedVectorType>(Ty);
18002 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
18003}
18004
18005static bool isV2BF16(Type *Ty) {
18006 auto *VT = dyn_cast<FixedVectorType>(Ty);
18007 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
18008}
18009
18010/// \return true if atomicrmw integer ops work for the type.
18011static bool isAtomicRMWLegalIntTy(Type *Ty) {
18012 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
18013 unsigned BW = IT->getBitWidth();
18014 return BW == 32 || BW == 64;
18015 }
18016
18017 return false;
18018}
18019
18020/// \return true if this atomicrmw xchg type can be selected.
18021static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
18022 Type *Ty = RMW->getType();
18023 if (isAtomicRMWLegalIntTy(Ty))
18024 return true;
18025
18026 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
18027 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
18028 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18029 return BW == 32 || BW == 64;
18030 }
18031
18032 if (Ty->isFloatTy() || Ty->isDoubleTy())
18033 return true;
18034
18035 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
18036 return VT->getNumElements() == 2 &&
18037 VT->getElementType()->getPrimitiveSizeInBits() == 16;
18038 }
18039
18040 return false;
18041}
18042
18043/// \returns true if it's valid to emit a native instruction for \p RMW, based
18044/// on the properties of the target memory.
18045static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
18046 const AtomicRMWInst *RMW,
18047 bool HasSystemScope) {
18048 // The remote/fine-grained access logic is different from the integer
18049 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
18050 // fine-grained access does not work, even for a device local allocation.
18051 //
18052 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
18053 // allocations work.
18054 if (HasSystemScope) {
18055 if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
18056 RMW->hasMetadata("amdgpu.no.remote.memory"))
18057 return true;
18058 if (Subtarget.hasEmulatedSystemScopeAtomics())
18059 return true;
18060 } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18061 return true;
18062
18063 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18064}
18065
18066/// \return Action to perform on AtomicRMWInsts for integer operations.
18067static TargetLowering::AtomicExpansionKind
18068atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
18069 return isAtomicRMWLegalIntTy(RMW->getType())
18070 ? TargetLowering::AtomicExpansionKind::None
18071 : TargetLowering::AtomicExpansionKind::CmpXChg;
18072}
18073
18074/// Return if a flat address space atomicrmw can access private memory.
18075static bool flatInstrMayAccessPrivate(const Instruction *I) {
18076 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18077 return !MD ||
18079}
18080
18088
18089TargetLowering::AtomicExpansionKind
18090SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
18091 unsigned AS = RMW->getPointerAddressSpace();
18092 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
18094
18095 // 64-bit flat atomics that dynamically reside in private memory will silently
18096 // be dropped.
18097 //
18098 // Note that we will emit a new copy of the original atomic in the expansion,
18099 // which will be incrementally relegalized.
18100 const DataLayout &DL = RMW->getFunction()->getDataLayout();
18101 if (AS == AMDGPUAS::FLAT_ADDRESS &&
18102 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18103 flatInstrMayAccessPrivate(RMW))
18104 return AtomicExpansionKind::CustomExpand;
18105
18106 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
18107 OptimizationRemarkEmitter ORE(RMW->getFunction());
18108 ORE.emit([=]() {
18109 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
18110 });
18111 return Kind;
18112 };
18113
18114 auto SSID = RMW->getSyncScopeID();
18115 bool HasSystemScope =
18116 SSID == SyncScope::System ||
18117 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
18118
18119 auto Op = RMW->getOperation();
18120 switch (Op) {
18121 case AtomicRMWInst::Xchg:
18122 // PCIe supports add and xchg for system atomics.
18123 return isAtomicRMWLegalXChgTy(RMW)
18124 ? AtomicExpansionKind::None
18125 : AtomicExpansionKind::CmpXChg;
18126 case AtomicRMWInst::Add:
18127 // PCIe supports add and xchg for system atomics.
18128 return atomicSupportedIfLegalIntType(RMW);
18129 case AtomicRMWInst::Sub:
18130 case AtomicRMWInst::And:
18131 case AtomicRMWInst::Or:
18132 case AtomicRMWInst::Xor:
18133 case AtomicRMWInst::Max:
18134 case AtomicRMWInst::Min:
18141 if (Subtarget->hasEmulatedSystemScopeAtomics())
18142 return atomicSupportedIfLegalIntType(RMW);
18143
18144 // On most subtargets, for atomicrmw operations other than add/xchg,
18145 // whether or not the instructions will behave correctly depends on where
18146 // the address physically resides and what interconnect is used in the
18147 // system configuration. On some targets the instruction will nop,
18148 // and in others synchronization will only occur at degraded device scope.
18149 //
18150 // If the allocation is known local to the device, the instructions should
18151 // work correctly.
18152 if (RMW->hasMetadata("amdgpu.no.remote.memory"))
18153 return atomicSupportedIfLegalIntType(RMW);
18154
18155 // If fine-grained remote memory works at device scope, we don't need to
18156 // do anything.
18157 if (!HasSystemScope &&
18158 Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18159 return atomicSupportedIfLegalIntType(RMW);
18160
18161 // If we are targeting a remote allocated address, it depends what kind of
18162 // allocation the address belongs to.
18163 //
18164 // If the allocation is fine-grained (in host memory, or in PCIe peer
18165 // device memory), the operation will fail depending on the target.
18166 //
18167 // Note fine-grained host memory access does work on APUs or if XGMI is
18168 // used, but we do not know if we are targeting an APU or the system
18169 // configuration from the ISA version/target-cpu.
18170 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
18171 return atomicSupportedIfLegalIntType(RMW);
18172
18173 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
18174 Op == AtomicRMWInst::Xor) {
18175 // Atomic sub/or/xor do not work over PCI express, but atomic add
18176 // does. InstCombine transforms these with 0 to or, so undo that.
18177 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
18178 ConstVal && ConstVal->isNullValue())
18179 return AtomicExpansionKind::Expand;
18180 }
18181
18182 // If the allocation could be in remote, fine-grained memory, the rmw
18183 // instructions may fail. cmpxchg should work, so emit that. On some
18184 // system configurations, PCIe atomics aren't supported so cmpxchg won't
18185 // even work, so you're out of luck anyway.
18186
18187 // In summary:
18188 //
18189 // Cases that may fail:
18190 // - fine-grained pinned host memory
18191 // - fine-grained migratable host memory
18192 // - fine-grained PCIe peer device
18193 //
18194 // Cases that should work, but may be treated overly conservatively.
18195 // - fine-grained host memory on an APU
18196 // - fine-grained XGMI peer device
18197 return AtomicExpansionKind::CmpXChg;
18198 }
18199
18200 return atomicSupportedIfLegalIntType(RMW);
18201 }
18202 case AtomicRMWInst::FAdd: {
18203 Type *Ty = RMW->getType();
18204
18205 // TODO: Handle REGION_ADDRESS
18206 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18207 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
18208 // is fixed to round-to-nearest-even.
18209 //
18210 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
18211 // round-to-nearest-even.
18212 //
18213 // We ignore the rounding mode problem, even in strictfp. The C++ standard
18214 // suggests it is OK if the floating-point mode may not match the calling
18215 // thread.
18216 if (Ty->isFloatTy()) {
18217 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
18218 : AtomicExpansionKind::CmpXChg;
18219 }
18220
18221 if (Ty->isDoubleTy()) {
18222 // Ignores denormal mode, but we don't consider flushing mandatory.
18223 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
18224 : AtomicExpansionKind::CmpXChg;
18225 }
18226
18227 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18228 return AtomicExpansionKind::None;
18229
18230 return AtomicExpansionKind::CmpXChg;
18231 }
18232
18233 // LDS atomics respect the denormal mode from the mode register.
18234 //
18235 // Traditionally f32 global/buffer memory atomics would unconditionally
18236 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
18237 // flush.
18238 //
18239 // On targets with flat atomic fadd, denormals would flush depending on
18240 // whether the target address resides in LDS or global memory. We consider
18241 // this flat-maybe-flush as will-flush.
18242 if (Ty->isFloatTy() &&
18243 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18244 !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
18245 return AtomicExpansionKind::CmpXChg;
18246
18247 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
18248 // safe. The message phrasing also should be better.
18249 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18250 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18251 // gfx942, gfx12
18252 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18253 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18254 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
18255 // gfx90a, gfx942, gfx12
18256 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18257 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18258
18259 // gfx942, gfx12
18260 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18261 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18262 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
18263 // gfx90a, gfx942, gfx12
18264 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18265 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18266
18267 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
18268 // buffer. gfx12 does have the buffer version.
18269 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18270 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18271 }
18272
18273 // global and flat atomic fadd f64: gfx90a, gfx942.
18274 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18275 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18276
18277 if (AS != AMDGPUAS::FLAT_ADDRESS) {
18278 if (Ty->isFloatTy()) {
18279 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
18280 // gfx11+.
18281 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18282 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18283 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
18284 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18285 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18286 } else {
18287 // gfx908
18288 if (RMW->use_empty() &&
18289 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18290 isV2F16(Ty))
18291 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18292 }
18293 }
18294
18295 // flat atomic fadd f32: gfx942, gfx11+.
18296 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
18297 if (Subtarget->hasFlatAtomicFaddF32Inst())
18298 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18299
18300 // If it is in the flat address space and the type is float, we will try to
18301 // expand it if the target supports global and LDS atomic fadd. The
18302 // reason is that the expansion emits an address space check: if the
18303 // address is in the global address space, we emit the global atomic
18304 // fadd; if it is in the shared address space, we emit the LDS atomic
18305 // fadd.
18306 if (Subtarget->hasLDSFPAtomicAddF32()) {
18307 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18308 return AtomicExpansionKind::Expand;
18309 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18310 return AtomicExpansionKind::Expand;
18311 }
18312 }
18313 }
18314
18315 return AtomicExpansionKind::CmpXChg;
18316 }
18317 case AtomicRMWInst::FMin:
18318 case AtomicRMWInst::FMax: {
18319 Type *Ty = RMW->getType();
18320
18321 // LDS float and double fmin/fmax were always supported.
18322 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18323 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
18324 : AtomicExpansionKind::CmpXChg;
18325 }
18326
18327 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18328 // For flat and global cases:
18329 // float, double in gfx7. Manual claims denormal support.
18330 // Removed in gfx8.
18331 // float, double restored in gfx10.
18332 // double removed again in gfx11, so only f32 for gfx11/gfx12.
18333 //
18334 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
18335 // no f32.
18336 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18337 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18338 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18339 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18340 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18341 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
18342 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
18343 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18344 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18345 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18346 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18347 }
18348 }
18349
18350 return AtomicExpansionKind::CmpXChg;
18351 }
18352 case AtomicRMWInst::Nand:
18353 case AtomicRMWInst::FSub:
18354 default:
18355 return AtomicExpansionKind::CmpXChg;
18356 }
18357
18358 llvm_unreachable("covered atomicrmw op switch");
18359}
18360
18367
18374
18375TargetLowering::AtomicExpansionKind
18376SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
18377 unsigned AddrSpace = CmpX->getPointerAddressSpace();
18378 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
18380
18381 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
18382 return AtomicExpansionKind::None;
18383
18384 const DataLayout &DL = CmpX->getDataLayout();
18385
18386 Type *ValTy = CmpX->getNewValOperand()->getType();
18387
18388 // If a 64-bit flat atomic may alias private, we need to avoid using the
18389 // atomic in the private case.
18390 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
18391 : AtomicExpansionKind::None;
18392}
18393
18394const TargetRegisterClass *
18395SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
18396 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
18397 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18398 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18399 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18400 : &AMDGPU::SReg_32RegClass;
18401 if (!TRI->isSGPRClass(RC) && !isDivergent)
18402 return TRI->getEquivalentSGPRClass(RC);
18403 if (TRI->isSGPRClass(RC) && isDivergent)
18404 return TRI->getEquivalentVGPRClass(RC);
18405
18406 return RC;
18407}
18408
18409// FIXME: This is a workaround for DivergenceAnalysis not understanding always
18410// uniform values (as produced by the mask results of control flow intrinsics)
18411// used outside of divergent blocks. The phi users need to also be treated as
18412// always uniform.
18413//
18414// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
18415static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
18416 unsigned WaveSize) {
18417 // FIXME: We assume we never cast the mask results of a control flow
18418 // intrinsic.
18419 // Early exit if the type won't be consistent as a compile time hack.
18420 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
18421 if (!IT || IT->getBitWidth() != WaveSize)
18422 return false;
18423
18424 if (!isa<Instruction>(V))
18425 return false;
18426 if (!Visited.insert(V).second)
18427 return false;
18428 bool Result = false;
18429 for (const auto *U : V->users()) {
18430 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
18431 if (V == U->getOperand(1)) {
18432 switch (Intrinsic->getIntrinsicID()) {
18433 default:
18434 Result = false;
18435 break;
18436 case Intrinsic::amdgcn_if_break:
18437 case Intrinsic::amdgcn_if:
18438 case Intrinsic::amdgcn_else:
18439 Result = true;
18440 break;
18441 }
18442 }
18443 if (V == U->getOperand(0)) {
18444 switch (Intrinsic->getIntrinsicID()) {
18445 default:
18446 Result = false;
18447 break;
18448 case Intrinsic::amdgcn_end_cf:
18449 case Intrinsic::amdgcn_loop:
18450 Result = true;
18451 break;
18452 }
18453 }
18454 } else {
18455 Result = hasCFUser(U, Visited, WaveSize);
18456 }
18457 if (Result)
18458 break;
18459 }
18460 return Result;
18461}
18462
18463bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
18464 const Value *V) const {
18465 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
18466 if (CI->isInlineAsm()) {
18467 // FIXME: This cannot give a correct answer. This should only trigger in
18468 // the case where inline asm returns mixed SGPR and VGPR results, used
18469 // outside the defining block. We don't have a specific result to
18470 // consider, so this assumes if any value is SGPR, the overall register
18471 // also needs to be SGPR.
18472 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
18473 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
18474 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
18475 for (auto &TC : TargetConstraints) {
18476 if (TC.Type == InlineAsm::isOutput) {
18477 ComputeConstraintToUse(TC, SDValue());
18478 const TargetRegisterClass *RC =
18479 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
18480 TC.ConstraintVT)
18481 .second;
18482 if (RC && SIRI->isSGPRClass(RC))
18483 return true;
18484 }
18485 }
18486 }
18487 }
18488 SmallPtrSet<const Value *, 16> Visited;
18489 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
18490}
18491
18492bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
18493 for (SDUse &Use : N->uses()) {
18494 if (MemSDNode *M = dyn_cast<MemSDNode>(Use.getUser())) {
18495 if (getBasePtrIndex(M) == Use.getOperandNo())
18496 return true;
18497 }
18498 }
18499 return false;
18500}
18501
18502bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
18503 SDValue N1) const {
18504 if (!N0.hasOneUse())
18505 return false;
18506 // Take care of the opportunity to keep N0 uniform
18507 if (N0->isDivergent() || !N1->isDivergent())
18508 return true;
18509 // Check if we have a good chance to form the memory access pattern with the
18510 // base and offset
18511 return (DAG.isBaseWithConstantOffset(N0) &&
18512 hasMemSDNodeUser(*N0->user_begin()));
18513}
18514
18515bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
18516 Register N0, Register N1) const {
18517 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
18518}
18519
18522 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
18524 if (I.getMetadata("amdgpu.noclobber"))
18525 Flags |= MONoClobber;
18526 if (I.getMetadata("amdgpu.last.use"))
18527 Flags |= MOLastUse;
18528 return Flags;
18529}
18530
18531bool SITargetLowering::checkForPhysRegDependency(
18532 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
18533 const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const {
18534 if (User->getOpcode() != ISD::CopyToReg)
18535 return false;
18536 if (!Def->isMachineOpcode())
18537 return false;
18538 const MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
18539 if (!MDef)
18540 return false;
18541
18542 unsigned ResNo = User->getOperand(Op).getResNo();
18543 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
18544 return false;
18545 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
18546 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
18547 PhysReg = AMDGPU::SCC;
18548 const TargetRegisterClass *RC =
18549 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
18550 Cost = RC->getCopyCost();
18551 return true;
18552 }
18553 return false;
18554}
18555
18556void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
18557 Instruction *AI) const {
18558 // Given: atomicrmw fadd ptr %addr, float %val ordering
18559 //
18560 // With this expansion we produce the following code:
18561 // [...]
18562 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
18563 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
18564 //
18565 // atomicrmw.shared:
18566 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
18567 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
18568 // float %val ordering
18569 // br label %atomicrmw.phi
18570 //
18571 // atomicrmw.check.private:
18572 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
18573 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
18574 //
18575 // atomicrmw.private:
18576 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
18577 // %loaded.private = load float, ptr addrspace(5) %cast.private
18578 // %val.new = fadd float %loaded.private, %val
18579 // store float %val.new, ptr addrspace(5) %cast.private
18580 // br label %atomicrmw.phi
18581 //
18582 // atomicrmw.global:
18583 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
18584 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
18585 // float %val ordering
18586 // br label %atomicrmw.phi
18587 //
18588 // atomicrmw.phi:
18589 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
18590 // [ %loaded.private, %atomicrmw.private ],
18591 // [ %loaded.global, %atomicrmw.global ]
18592 // br label %atomicrmw.end
18593 //
18594 // atomicrmw.end:
18595 // [...]
18596 //
18597 //
18598 // For 64-bit atomics which may reside in private memory, we perform a simpler
18599 // version that only inserts the private check, and uses the flat operation.
18600
18601 IRBuilder<> Builder(AI);
18602 LLVMContext &Ctx = Builder.getContext();
18603
18604 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
18605 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
18606 : AtomicCmpXchgInst::getPointerOperandIndex();
18607 Value *Addr = AI->getOperand(PtrOpIdx);
18608
18609 /// TODO: Only need to check private, then emit flat-known-not private (no
18610 /// need for shared block, or cast to global).
18611 auto *CX = dyn_cast<AtomicCmpXchgInst>(AI);
18612
18613 Align Alignment;
18614 if (RMW)
18615 Alignment = RMW->getAlign();
18616 else if (CX)
18617 Alignment = CX->getAlign();
18618 else
18619 llvm_unreachable("unhandled atomic operation");
18620
18621 // FullFlatEmulation is true if we need to issue the private, shared, and
18622 // global cases.
18623 //
18624 // If this is false, we are only dealing with the flat-targeting-private case,
18625 // where we only insert a check for private and still use the flat instruction
18626 // for global and shared.
18627
18628 bool FullFlatEmulation =
18629 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
18630 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
18631 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
18632 RMW->getType()->isDoubleTy()));
18633
18634 // If the return value isn't used, do not introduce a false use in the phi.
18635 bool ReturnValueIsUsed = !AI->use_empty();
18636
18637 BasicBlock *BB = Builder.GetInsertBlock();
18638 Function *F = BB->getParent();
18639 BasicBlock *ExitBB =
18640 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
18641 BasicBlock *SharedBB = nullptr;
18642
18643 BasicBlock *CheckPrivateBB = BB;
18644 if (FullFlatEmulation) {
18645 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
18646 CheckPrivateBB =
18647 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
18648 }
18649
18650 BasicBlock *PrivateBB =
18651 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
18652 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
18653 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
18654
18655 std::prev(BB->end())->eraseFromParent();
18656 Builder.SetInsertPoint(BB);
18657
18658 Value *LoadedShared = nullptr;
18659 if (FullFlatEmulation) {
18660 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
18661 {Addr}, nullptr, "is.shared");
18662 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
18663 Builder.SetInsertPoint(SharedBB);
18664 Value *CastToLocal = Builder.CreateAddrSpaceCast(
18665 Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
18666
18667 Instruction *Clone = AI->clone();
18668 Clone->insertInto(SharedBB, SharedBB->end());
18669 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
18670 LoadedShared = Clone;
18671
18672 Builder.CreateBr(PhiBB);
18673 Builder.SetInsertPoint(CheckPrivateBB);
18674 }
18675
18676 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
18677 {Addr}, nullptr, "is.private");
18678 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
18679
18680 Builder.SetInsertPoint(PrivateBB);
18681
18682 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
18683 Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
18684
18685 Value *LoadedPrivate;
18686 if (RMW) {
18687 LoadedPrivate = Builder.CreateAlignedLoad(
18688 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
18689
18690 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
18691 LoadedPrivate, RMW->getValOperand());
18692
18693 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
18694 } else {
18695 auto [ResultLoad, Equal] =
18696 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
18697 CX->getNewValOperand(), CX->getAlign());
18698
18699 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
18700 ResultLoad, 0);
18701 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
18702 }
18703
18704 Builder.CreateBr(PhiBB);
18705
18706 Builder.SetInsertPoint(GlobalBB);
18707
18708 // Continue using a flat instruction if we only emitted the check for private.
18709 Instruction *LoadedGlobal = AI;
18710 if (FullFlatEmulation) {
18711 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
18712 Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
18713 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
18714 }
18715
18716 AI->removeFromParent();
18717 AI->insertInto(GlobalBB, GlobalBB->end());
18718
18719 // The new atomicrmw may go through another round of legalization later.
18720 if (!FullFlatEmulation) {
18721 // We inserted the runtime check already, make sure we do not try to
18722 // re-expand this.
18723 // TODO: Should union with any existing metadata.
18724 MDBuilder MDB(F->getContext());
18725 MDNode *RangeNotPrivate =
18726 MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
18727 APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
18728 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
18729 RangeNotPrivate);
18730 }
18731
18732 Builder.CreateBr(PhiBB);
18733
18734 Builder.SetInsertPoint(PhiBB);
18735
18736 if (ReturnValueIsUsed) {
18737 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
18738 AI->replaceAllUsesWith(Loaded);
18739 if (FullFlatEmulation)
18740 Loaded->addIncoming(LoadedShared, SharedBB);
18741 Loaded->addIncoming(LoadedPrivate, PrivateBB);
18742 Loaded->addIncoming(LoadedGlobal, GlobalBB);
18743 Loaded->takeName(AI);
18744 }
18745
18746 Builder.CreateBr(ExitBB);
18747}
18748
18749static void convertScratchAtomicToFlatAtomic(Instruction *I,
18750 unsigned PtrOpIdx) {
18751 Value *PtrOp = I->getOperand(PtrOpIdx);
18754
18755 Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
18756 Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
18757 I->getIterator());
18758 I->setOperand(PtrOpIdx, ASCast);
18759}
18760
18763
18766
18769 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
18770 ConstVal && ConstVal->isNullValue()) {
18771 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
18773
18774 // We may still need the private-alias-flat handling below.
18775
18776 // TODO: Skip this for cases where we cannot access remote memory.
18777 }
18778 }
18779
18780 // The non-flat expansions should only perform the de-canonicalization of
18781 // identity values.
18782 if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
18783 return;
18784
18785 emitExpandAtomicAddrSpacePredicate(AI);
18786}
18787
18794
18798
18800 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
18801}
18802
18804 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
18805 return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
18806
18808 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
18809}
18810
18811LoadInst *
18812SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
18813 IRBuilder<> Builder(AI);
18814 auto Order = AI->getOrdering();
18815
18816 // The optimization removes the store aspect of the atomicrmw. Therefore, the
18817 // cache must be flushed if the atomic ordering had release semantics. This
18818 // does not necessarily require a fence; a release fence just happens to do
18819 // that flush. Avoid replacing an atomicrmw that has release semantics.
18820 if (isReleaseOrStronger(Order))
18821 return nullptr;
18822
18823 LoadInst *LI = Builder.CreateAlignedLoad(
18824 AI->getType(), AI->getPointerOperand(), AI->getAlign());
18825 LI->setAtomic(Order, AI->getSyncScopeID());
18826 LI->copyMetadata(*AI);
18827 LI->takeName(AI);
18828 AI->replaceAllUsesWith(LI);
18829 AI->eraseFromParent();
18830 return LI;
18831}
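The idea behind lowerIdempotentRMWIntoFencedLoad above, modeled with std::atomic (an illustrative analogy only, not the codegen path): an idempotent read-modify-write such as fetch_or with 0 and a non-release ordering observes the same value an atomic load would, so the store half can be dropped.

#include <atomic>
#include <cstdio>

int main() {
  std::atomic<unsigned> V{42};
  // Idempotent RMW: returns the current value and stores it back unchanged.
  unsigned A = V.fetch_or(0u, std::memory_order_acquire);
  // Equivalent observation without the store aspect.
  unsigned B = V.load(std::memory_order_acquire);
  std::printf("%u %u\n", A, B); // both print 42
  return 0;
}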
static bool isMul(MachineInstr *MI)
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
return SDValue()
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
constexpr LLT S32
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
#define LLVM_ATTRIBUTE_UNUSED
Definition Compiler.h:298
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
dxil translate DXIL Translate Metadata
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
#define P(N)
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1247
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition SIDefines.h:1244
static cl::opt< bool > UseSelectionDAGPTRADD("amdgpu-use-sdag-ptradd", cl::Hidden, cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the " "SelectionDAG ISel"), cl::init(false))
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad, TargetLoweringBase::IntrinsicInfo &Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:167
#define LLVM_DEBUG(...)
Definition Debug.h:119
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
The Input class is used to parse a yaml document into in-memory structs and vectors.
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1120
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:6057
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1497
bool isNegative() const
Definition APFloat.h:1449
bool isNormal() const
Definition APFloat.h:1453
APInt bitcastToAPInt() const
Definition APFloat.h:1353
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1138
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1098
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1079
bool isInfinity() const
Definition APFloat.h:1446
Class for arbitrary precision integers.
Definition APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1385
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:466
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
Definition APInt.h:366
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1237
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1221
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition Function.cpp:339
const Function * getParent() const
Definition Argument.h:44
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v); minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v); maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
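A hedged sketch of the AtomicRMWInst accessors listed above, assuming an IRBuilder already positioned in a basic block and valid Ptr/Val operands; emitAtomicAddSketch is a hypothetical helper, not an API from this file:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Illustrative only: emit an atomicrmw add and query its accessors.
void emitAtomicAddSketch(IRBuilder<> &B, Value *Ptr, Value *Val) {
  AtomicRMWInst *RMW = B.CreateAtomicRMW(
      AtomicRMWInst::Add, Ptr, Val, MaybeAlign(4),
      AtomicOrdering::SequentiallyConsistent);

  AtomicRMWInst::BinOp Op = RMW->getOperation();          // AtomicRMWInst::Add
  unsigned AS = RMW->getPointerAddressSpace();            // address space of Ptr
  StringRef Name = AtomicRMWInst::getOperationName(Op);   // "add"
  (void)AS; (void)Name;
}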
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:472
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
BitVector & set()
Definition BitVector.h:351
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
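A minimal sketch of how the CCState allocation helpers above hand out argument locations. It assumes CCInfo was constructed for the current calling convention and that ArgRegs lists candidate physical argument registers; allocateOneArgSketch is a hypothetical helper:

#include "llvm/CodeGen/CallingConvLower.h"
using namespace llvm;

// Illustrative only: allocate one argument to a register or a stack slot.
void allocateOneArgSketch(CCState &CCInfo, ArrayRef<MCPhysReg> ArgRegs) {
  unsigned FirstFree = CCInfo.getFirstUnallocated(ArgRegs);
  if (FirstFree != ArgRegs.size()) {
    MCRegister Reg = CCInfo.AllocateReg(ArgRegs[FirstFree]); // marks Reg as used
    (void)Reg;
  } else {
    // Out of registers: carve out a 4-byte, 4-aligned stack slot instead.
    int64_t Offset = CCInfo.AllocateStack(4, Align(4));
    (void)Offset;
  }
}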
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ ICMP_NE
not equal
Definition InstrTypes.h:700
bool isSigned() const
Definition InstrTypes.h:932
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:772
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:778
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:214
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isBigEndian() const
Definition DataLayout.h:199
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowering::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
iterator_range< arg_iterator > args()
Definition Function.h:890
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition Function.cpp:803
Argument * getArg(unsigned i) const
Definition Function.h:884
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMadF16() const
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasPrivateSegmentBuffer() const
const MachineFunction & getMachineFunction() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2780
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
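A small sketch (illustrative only) of the LLT helpers above, as used on the GlobalISel side of the backend:

#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;

// Illustrative only: build and query low-level types.
void lltSketch() {
  LLT S32 = LLT::scalar(32);        // 32-bit "bag of bits"
  LLT P1 = LLT::pointer(1, 64);     // 64-bit pointer in address space 1
  unsigned ScalarBits = S32.getScalarSizeInBits();  // 32
  TypeSize PtrBits = P1.getSizeInBits();            // 64 bits
  (void)ScalarBits; (void)PtrBits;
}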
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:33
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96
Metadata node.
Definition Metadata.h:1077
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1445
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
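A brief sketch (illustrative only) of building and querying machine value types with the MVT helpers listed above:

#include "llvm/CodeGenTypes/MachineValueType.h"
using namespace llvm;

// Illustrative only: construct and inspect MVTs.
void mvtSketch() {
  MVT V4F32 = MVT::getVectorVT(MVT::f32, 4);        // <4 x f32>
  MVT I16 = MVT::getIntegerVT(16);                  // i16
  unsigned NumElts = V4F32.getVectorNumElements();  // 4
  MVT Elt = V4F32.getScalarType();                  // MVT::f32
  bool Fits = I16.bitsLE(MVT::i32);                 // true
  (void)NumElts; (void)Elt; (void)Fits;
}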
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
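A hedged sketch of allocating a MachineMemOperand with the flags listed above, using the getMachineMemOperand overload shown in this index. It assumes a MachineFunction and a MachinePointerInfo describing the accessed location; the flag set models an invariant, dereferenceable 32-bit load:

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
using namespace llvm;

// Illustrative only: allocate an MMO for a 4-byte-aligned invariant load.
MachineMemOperand *makeInvariantLoadMMO(MachineFunction &MF,
                                        MachinePointerInfo PtrInfo) {
  auto F = MachineMemOperand::MOLoad |
           MachineMemOperand::MODereferenceable |
           MachineMemOperand::MOInvariant;
  return MF.getMachineMemOperand(PtrInfo, F, LLT::scalar(32), Align(4));
}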
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition ModRef.h:221
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:215
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition ModRef.h:218
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition Register.h:67
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:78
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node can be combined with another node to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store in a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load in a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
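A hedged sketch of composing a small DAG fragment with the SelectionDAG builders listed above, in the way custom lowering hooks typically do. It assumes a live SelectionDAG; buildSelectOnZero is a hypothetical helper, not a function from this file:

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Illustrative only: build "select (x == 0), 1, x" for an integer value X.
SDValue buildSelectOnZero(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
  EVT VT = X.getValueType();
  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue One = DAG.getConstant(1, DL, VT);
  // Compare in the target's boolean type for this value type.
  EVT CCVT = DAG.getTargetLoweringInfo().getSetCCResultType(
      DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue IsZero = DAG.getSetCC(DL, CCVT, X, Zero, ISD::SETEQ);
  return DAG.getSelect(DL, VT, IsZero, One, X);
}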
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition StringRef.h:862
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:154
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
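A small sketch (illustrative only) of the StringSwitch idiom for mapping string literals to values; parseWaveSize and its cases are hypothetical, not names from this file:

#include "llvm/ADT/StringSwitch.h"
using namespace llvm;

// Illustrative only: match a feature string against known literals.
static unsigned parseWaveSize(StringRef S) {
  return StringSwitch<unsigned>(S)
      .Case("wavefrontsize32", 32)
      .Case("wavefrontsize64", 64)
      .Default(0);
}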
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
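A hedged sketch of setTruncStoreAction, again assumed to run inside a TargetLowering subclass constructor with illustrative types:

// Truncating stores of i64 down to narrow memory types are split by the legalizer.
setTruncStoreAction(MVT::i64, MVT::i8, Expand);
setTruncStoreAction(MVT::i64, MVT::i16, Expand);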
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
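AddPromotedToType is normally paired with a Promote action; a sketch under the same constructor assumption as above, with illustrative types:

// Promote i1 SETCC results and record that the promoted type is i32.
setOperationAction(ISD::SETCC, MVT::i1, Promote);
AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);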
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
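A sketch of setTargetDAGCombine, which registers the generic node kinds for which the target's combine callback should be invoked (the node list is illustrative):

// Request target combine callbacks for these generic nodes.
setTargetDAGCombine({ISD::ADD, ISD::FADD, ISD::AND});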
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:420
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition Type.h:258
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:107
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI void set(Value *Val)
Definition Value.h:905
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
const Use & getOperandUse(unsigned i) const
Definition User.h:245
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1101
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:396
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isZero() const
Definition TypeSize.h:154
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:134
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values reserved for floating-point inline constants.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid character code (or 0) in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
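A small sketch of the inline-literal queries; the values are illustrative, and HasInv2Pi models subtargets that additionally accept 1/(2*pi) as an inline constant:

// Integers in [-16, 64] fit the inline constant encoding; others need a literal slot.
bool Inl  = llvm::AMDGPU::isInlinableLiteral32(64, /*HasInv2Pi=*/true);   // true
bool NotI = llvm::AMDGPU::isInlinableLiteral32(100, /*HasInv2Pi=*/true);  // false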
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ GlobalAddress
Definition ISDOpcodes.h:88
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:656
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:773
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:528
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:535
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined; 0 Round to 0; 1 Round to nearest, ties to even; 2 Round to ...
Definition ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
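A short sketch of these condition-code helpers (the condition codes are chosen only for illustration):

// Swapping operands of (X < Y) yields (Y > X); SETLT is a signed comparison.
ISD::CondCode Swapped = ISD::getSetCCSwappedOperands(ISD::SETLT); // ISD::SETGT
bool IsSigned = ISD::isSignedIntSetCC(ISD::SETLT);                // true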
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
Offsets
Offsets in bytes from the start of the input buffer.
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
Definition MathExtras.h:54
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:330
@ Offset
Definition DWP.cpp:477
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition Analysis.cpp:241
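A sketch of the IR-to-ISD predicate mapping, assuming the relevant CodeGen/Analysis.h and IR headers are included; the predicate is chosen for illustration:

// Signed IR predicates map to the signed ISD condition codes.
ISD::CondCode CC = llvm::getICmpCondCode(ICmpInst::ICMP_SLT); // ISD::SETLT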
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:307
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
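A hedged sketch of the BuildMI builder interface; it assumes TII (the SIInstrInfo instance), a basic block MBB, an insertion iterator I, a DebugLoc DL, and a 32-bit SGPR virtual register DstReg are already in scope, so it illustrates the API rather than code from this file:

// Materialize an immediate into an SGPR with a scalar move.
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
    .addImm(42);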
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:853
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
@ Done
Definition Threading.h:60
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition MathExtras.h:232
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:270
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2138
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is congruent to Skew modulo Align.
Definition MathExtras.h:551
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
Definition ModRef.h:296
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:390
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition bit.h:157
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:282
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1734
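A small sketch of the range-based wrapper (the container and predicate are illustrative):

llvm::SmallVector<int, 4> Vals = {1, -2, 3, 4};
// True because the predicate holds for at least one element (-2).
bool HasNegative = llvm::any_of(Vals, [](int V) { return V < 0; });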
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:336
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant bit, stopping at the first 1.
Definition bit.h:203
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition Analysis.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164
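A short worked example of the word-splitting and bit-counting helpers above (the constant is arbitrary):

uint64_t V  = 0x0000000100000010ULL;
uint32_t Hi = llvm::Hi_32(V);           // 0x00000001
uint32_t Lo = llvm::Lo_32(V);           // 0x00000010
bool Pow2   = llvm::isPowerOf2_32(Lo);  // true
unsigned Lg = llvm::Log2_32(Lo);        // 4
unsigned TZ = llvm::countr_zero(Lo);    // 4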
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:399
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
unsigned getUndefRegState(bool B)
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:155
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition MathExtras.h:241
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1760
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1899
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:212
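A brief worked example of the alignment helpers (the values are illustrative):

uint64_t Padded    = llvm::alignTo(10, llvm::Align(8));            // 16
llvm::Align Common = llvm::commonAlignment(llvm::Align(16), 4);    // Align(4)
uint64_t Down      = llvm::alignDown(37, 16);                      // 32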
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:86
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853
#define N
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:304
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
Definition APFloat.cpp:264
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:85
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition SCCPSolver.h:42
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:390
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:295
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:368
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:238
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:113
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:380
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:465
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:407
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:311
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:318
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:251
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:323
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:331
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
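A sketch of a few EVT queries for a hypothetical 4 x f16 vector type, assuming an LLVMContext Ctx is available:

llvm::EVT VT = llvm::EVT::getVectorVT(Ctx, llvm::MVT::f16, 4);
bool IsFP        = VT.isFloatingPoint();                   // true
unsigned NumElts = VT.getVectorNumElements();              // 4
uint64_t Bits    = VT.getSizeInBits().getFixedValue();     // 64
llvm::EVT IntVT  = VT.changeTypeToInteger();               // v4i16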
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
unsigned getOrigArgIndex() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:165
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:218
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:173
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:340
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:241
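A sketch of the KnownBits queries above for a 32-bit value whose upper 24 bits are known to be zero (the setup is illustrative):

llvm::KnownBits Known(32);
Known.Zero.setBitsFrom(8);                    // bits [8, 32) are known zero
unsigned LZ = Known.countMinLeadingZeros();   // 24
llvm::KnownBits Wide = Known.zext(64);        // zero extension preserves the known zeros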
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs