SIISelLowering.cpp
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/IRBuilder.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
42#include "llvm/IR/MDBuilder.h"
45#include "llvm/Support/ModRef.h"
47#include <optional>
48
49using namespace llvm;
50using namespace llvm::SDPatternMatch;
51
52#define DEBUG_TYPE "si-lower"
53
54STATISTIC(NumTailCalls, "Number of tail calls");
55
56static cl::opt<bool>
57 DisableLoopAlignment("amdgpu-disable-loop-alignment",
58 cl::desc("Do not align and prefetch loops"),
59 cl::init(false));
60
62 "amdgpu-use-divergent-register-indexing", cl::Hidden,
63 cl::desc("Use indirect register addressing for divergent indexes"),
64 cl::init(false));
65
66// TODO: This option should be removed once we switch to always using PTRADD in
67// the SelectionDAG.
69 "amdgpu-use-sdag-ptradd", cl::Hidden,
70 cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the "
71 "SelectionDAG ISel"),
72 cl::init(false));
73
74static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
75 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
76 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
77}
78
79static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
80 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
81 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
82}
83
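// Return the first SGPR that the given calling-convention state has not yet
// allocated.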
84static unsigned findFirstFreeSGPR(CCState &CCInfo) {
85 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
86 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
87 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
88 return AMDGPU::SGPR0 + Reg;
89 }
90 }
91 llvm_unreachable("Cannot allocate sgpr");
92}
93
94SITargetLowering::SITargetLowering(const TargetMachine &TM,
95 const GCNSubtarget &STI)
96 : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
97 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
98 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
99
100 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
101 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
102
103 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
104
105 const SIRegisterInfo *TRI = STI.getRegisterInfo();
106 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
107
108 addRegisterClass(MVT::f64, V64RegClass);
109 addRegisterClass(MVT::v2f32, V64RegClass);
110 addRegisterClass(MVT::Untyped, V64RegClass);
111
112 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
113 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
114
115 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
116 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
117
118 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
119 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
120
121 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
122 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
123
124 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
125 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
126
127 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
128 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
129
130 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
131 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
132
133 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
134 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
135
136 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
137 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
138
139 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
140 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
141
142 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
143 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
144
145 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
146 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
147
148 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
149 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
150
151 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
152 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
153
154 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
155 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
156
157 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
158 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
159
160 if (Subtarget->has16BitInsts()) {
161 if (Subtarget->useRealTrue16Insts()) {
162 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
163 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
164 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
165 } else {
166 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
167 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
168 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
169 }
170
171 // Unless there are also VOP3P operations, none of these operations are really legal.
172 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
173 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
174 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
175 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
176 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
177 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
178 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
179 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
180 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
181 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
182 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
183 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
184 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
185 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
186 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
187 }
188
189 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
190 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
191
192 computeRegisterProperties(Subtarget->getRegisterInfo());
193
194 // The boolean content concept here is too inflexible. Compares only ever
195 // really produce a 1-bit result. Any copy/extend from these will turn into a
196 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
197 // it's what most targets use.
200
201 // We need to custom lower vector stores from local memory
202 setOperationAction(ISD::LOAD,
203 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
204 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
205 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
206 MVT::i1, MVT::v32i32},
207 Custom);
208
209 setOperationAction(ISD::STORE,
210 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
211 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
212 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
213 MVT::i1, MVT::v32i32},
214 Custom);
215
216 if (isTypeLegal(MVT::bf16)) {
217 for (unsigned Opc :
219 ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
220 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
221 ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
222 ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
223 ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
224 ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
225 ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
226 ISD::SETCC}) {
227 // FIXME: The promoted to type shouldn't need to be explicit
228 setOperationAction(Opc, MVT::bf16, Promote);
229 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
230 }
231
233
234 setOperationAction(ISD::SELECT, MVT::bf16, Promote);
235 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
236
237 setOperationAction(ISD::FABS, MVT::bf16, Legal);
238 setOperationAction(ISD::FNEG, MVT::bf16, Legal);
240
241 // We only need to custom lower because we can't specify an action for bf16
242 // sources.
245 }
246
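// There are no truncating vector store instructions; mark these combinations
// Expand so the legalizer lowers them.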
247 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
248 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
249 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
250 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
251 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
252 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
253 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
254 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
255 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
256 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
257 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
258 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
259 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
260 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
261 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
262 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
263
264 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
265 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
266 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
267 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
268 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
269 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
270 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
271
272 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
273
276 setOperationAction(ISD::SELECT, MVT::f64, Promote);
277 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
278
279 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
280
281 setOperationAction(ISD::SELECT_CC,
282 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
283
284 setOperationAction(ISD::SETCC, MVT::i1, Promote);
285 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
286 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
287
288 setOperationAction(ISD::TRUNCATE,
289 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
290 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
291 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
292 Expand);
293 setOperationAction(ISD::FP_ROUND,
294 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
295 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
296 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
297 Expand);
298
299 setOperationAction(ISD::SIGN_EXTEND_INREG,
300 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
301 MVT::v3i16, MVT::v4i16, MVT::Other},
302 Custom);
303
304 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
305 setOperationAction(ISD::BR_CC,
306 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
307
309
311
313 Expand);
314
315#if 0
317#endif
318
319 // We only support LOAD/STORE and vector manipulation ops for vectors
320 // with > 4 elements.
321 for (MVT VT :
322 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
323 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
324 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
325 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
326 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
327 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
328 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
329 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
330 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
331 switch (Op) {
332 case ISD::LOAD:
333 case ISD::STORE:
335 case ISD::BITCAST:
336 case ISD::UNDEF:
340 case ISD::IS_FPCLASS:
341 break;
346 break;
347 default:
349 break;
350 }
351 }
352 }
353
354 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
355
356 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
357 // is expanded to avoid having two separate loops in case the index is a VGPR.
358
359 // Most operations are naturally 32-bit vector operations. We only support
360 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
361 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
362 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
363 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
364
365 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
366 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
367
368 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
369 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
370
371 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
372 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
373 }
374
375 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
376 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
377 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
378
379 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
380 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
381
382 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
383 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
384
385 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
386 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
387 }
388
389 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
390 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
391 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
392
393 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
394 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
395
396 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
397 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
398
399 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
400 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
401 }
402
403 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
404 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
405 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
406
407 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
408 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
409
410 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
411 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
412
413 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
414 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
415 }
416
417 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
418 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
419 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
420
421 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
422 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
423
424 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
425 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
426
427 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
428 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
429 }
430
432 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
433 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
434 Custom);
435
436 if (Subtarget->hasPkMovB32()) {
437 // TODO: 16-bit element vectors should be legal with even aligned elements.
438 // TODO: Can be legal with wider source types than the result with
439 // subregister extracts.
440 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
441 }
442
443 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
444 Custom);
445
446 // Avoid stack access for these.
447 // TODO: Generalize to more vector types.
449 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
450 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
451 Custom);
452
453 // Deal with vec3 vector operations when widened to vec4.
455 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
456
457 // Deal with vec5/6/7 vector operations when widened to vec8.
459 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
460 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
461 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
462 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
463 Custom);
464
465 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
466 // and output demarshalling
467 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
468
469 // We can't return success/failure, only the old value,
470 // let LLVM add the comparison
471 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64},
472 Expand);
473
474 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
475
476 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
477
478 // FIXME: This should be narrowed to i32, but that only happens if i64 is
479 // illegal.
480 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
481 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
482
483 // This is s_memtime on SI, and s_memrealtime on VI.
484 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
485
486 if (Subtarget->hasSMemRealTime() ||
487 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
488 setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
489 setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom);
490
491 if (Subtarget->has16BitInsts()) {
492 setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
493 setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
494 } else {
495 setOperationAction(ISD::FSQRT, MVT::f16, Custom);
496 }
497
498 if (Subtarget->hasMadMacF32Insts())
499 setOperationAction(ISD::FMAD, MVT::f32, Legal);
500
501 if (!Subtarget->hasBFI())
502 // fcopysign can be done in a single instruction with BFI.
503 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
504
505 if (!Subtarget->hasBCNT(32))
507
508 if (!Subtarget->hasBCNT(64))
510
511 if (Subtarget->hasFFBH())
513
514 if (Subtarget->hasFFBL())
516
517 // We only really have 32-bit BFE instructions (and 16-bit on VI).
518 //
519 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
520 // effort to match them now. We want this to be false for i64 cases when the
521 // extraction isn't restricted to the upper or lower half. Ideally we would
522 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
523 // span the midpoint are probably relatively rare, so don't worry about them
524 // for now.
525 if (Subtarget->hasBFE())
527
528 // Clamp modifier on add/sub
529 if (Subtarget->hasIntClamp())
531
532 if (Subtarget->hasAddNoCarry())
533 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
534 Legal);
535
537 {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
538 {MVT::f32, MVT::f64}, Custom);
539
540 // These are really only legal for ieee_mode functions. We should be avoiding
541 // them for functions that don't have ieee_mode enabled, so just say they are
542 // legal.
543 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
544 {MVT::f32, MVT::f64}, Legal);
545
546 if (Subtarget->haveRoundOpsF64())
547 setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64,
548 Legal);
549 else
550 setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
551 MVT::f64, Custom);
552
553 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
554 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
555 Legal);
556 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
557
558 setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
560
561 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
562 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
563
564 // Custom lower these because we can't specify a rule based on an illegal
565 // source bf16.
566 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom);
567 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom);
568
569 if (Subtarget->has16BitInsts()) {
572 MVT::i16, Legal);
573
574 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
575
577 MVT::i16, Expand);
578
582 ISD::CTPOP},
583 MVT::i16, Promote);
584
585 setOperationAction(ISD::LOAD, MVT::i16, Custom);
586
587 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
588
589 setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
590 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
591 setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
592 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
593
597
599
600 // F16 - Constant Actions.
603
604 // F16 - Load/Store Actions.
605 setOperationAction(ISD::LOAD, MVT::f16, Promote);
606 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
607 setOperationAction(ISD::STORE, MVT::f16, Promote);
608 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
609
610 // BF16 - Load/Store Actions.
611 setOperationAction(ISD::LOAD, MVT::bf16, Promote);
612 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
613 setOperationAction(ISD::STORE, MVT::bf16, Promote);
614 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
615
616 // F16 - VOP1 Actions.
618 ISD::FSIN, ISD::FROUND},
619 MVT::f16, Custom);
620
621 // BF16 - VOP1 Actions.
622 if (Subtarget->hasBF16TransInsts())
623 setOperationAction({ISD::FCOS, ISD::FSIN, ISD::FDIV}, MVT::bf16, Custom);
624
627
628 // F16 - VOP2 Actions.
629 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
630 Expand);
631 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);
632 setOperationAction(ISD::FFREXP, MVT::f16, Custom);
634
635 // F16 - VOP3 Actions.
637 if (STI.hasMadF16())
639
640 for (MVT VT :
641 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
642 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
643 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
644 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
645 switch (Op) {
646 case ISD::LOAD:
647 case ISD::STORE:
649 case ISD::BITCAST:
650 case ISD::UNDEF:
655 case ISD::IS_FPCLASS:
656 break;
660 break;
661 default:
663 break;
664 }
665 }
666 }
667
668 // v_perm_b32 can handle either of these.
669 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
671
672 // XXX - Do these do anything? Vector constants turn into build_vector.
673 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
674
675 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
676 Legal);
677
678 setOperationAction(ISD::STORE, MVT::v2i16, Promote);
679 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
680 setOperationAction(ISD::STORE, MVT::v2f16, Promote);
681 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
682
683 setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
684 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
685 setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
686 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
687
688 setOperationAction(ISD::AND, MVT::v2i16, Promote);
689 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
690 setOperationAction(ISD::OR, MVT::v2i16, Promote);
691 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
692 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
693 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
694
695 setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
696 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
697 setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
698 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
699 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
700 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
701
702 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
703 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
704 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
705 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
706 setOperationAction(ISD::STORE, MVT::v4bf16, Promote);
707 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
708
709 setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
710 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
711 setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
712 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
713 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
714 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
715
716 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
717 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
718 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
719 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
720
721 setOperationAction(ISD::STORE, MVT::v8i16, Promote);
722 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
723 setOperationAction(ISD::STORE, MVT::v8f16, Promote);
724 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
725 setOperationAction(ISD::STORE, MVT::v8bf16, Promote);
726 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
727
728 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
729 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
730 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
731 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
732 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
733 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
734
735 setOperationAction(ISD::STORE, MVT::v16i16, Promote);
736 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
737 setOperationAction(ISD::STORE, MVT::v16f16, Promote);
738 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
739 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
740 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
741
742 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
743 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
744 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
745 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
746 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
747 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
748
749 setOperationAction(ISD::STORE, MVT::v32i16, Promote);
750 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
751 setOperationAction(ISD::STORE, MVT::v32f16, Promote);
752 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
753 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
754 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
755
757 MVT::v2i32, Expand);
758 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
759
761 MVT::v4i32, Expand);
762
764 MVT::v8i32, Expand);
765
766 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
767 Subtarget->hasVOP3PInsts() ? Legal : Custom);
768
769 setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
770 // This isn't really legal, but this avoids the legalizer unrolling it (and
771 // allows matching fneg (fabs x) patterns)
772 setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);
773
774 // Can do this in one BFI plus a constant materialize.
776 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
777 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
778 MVT::v32f16, MVT::v32bf16},
779 Custom);
780
782 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
783 MVT::f16, Custom);
784 setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
785
786 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
787 ISD::FMAXIMUMNUM},
788 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
789 Custom);
790
791 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
792 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
793 Expand);
794
795 for (MVT Vec16 :
796 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
797 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
800 Vec16, Custom);
802 }
803 }
804
805 if (Subtarget->hasVOP3PInsts()) {
809 MVT::v2i16, Legal);
810
811 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
812 ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
813 MVT::v2f16, Legal);
814
816 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
817
819 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
820 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
821 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
822 Custom);
823
824 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
825 // Split vector operations.
830 VT, Custom);
831
832 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
833 // Split vector operations.
835 VT, Custom);
836
838 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
839 {MVT::v2f16, MVT::v4f16}, Custom);
840
841 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
842 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
843 Custom);
844
845 if (Subtarget->hasPackedFP32Ops()) {
847 MVT::v2f32, Legal);
849 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
850 Custom);
851 }
852 }
853
854 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);
855
856 if (Subtarget->has16BitInsts()) {
857 setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
858 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
859 setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
860 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
861 } else {
862 // Legalization hack.
863 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
864
865 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
866 }
867
869 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
870 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
871 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
872 MVT::v32f16, MVT::v32bf16},
873 Custom);
874
876
877 if (Subtarget->hasVectorMulU64())
879 else if (Subtarget->hasScalarSMulU64())
881
882 if (Subtarget->hasMad64_32())
884
885 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
886 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
887
888 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
889 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
890 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
891 } else {
892 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
893 if (Subtarget->hasMinimum3Maximum3F32())
894 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);
895
896 if (Subtarget->hasMinimum3Maximum3PKF16()) {
897 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal);
898
899 // If only the vector form is available, we need to widen to a vector.
900 if (!Subtarget->hasMinimum3Maximum3F16())
901 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16, Custom);
902 }
903 }
904
905 if (Subtarget->hasVOP3PInsts()) {
906 // We want to break these into v2f16 pieces, not scalarize.
907 setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
908 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
909 Custom);
910 }
911
912 if (Subtarget->hasIntMinMax64())
914 Legal);
915
917 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
918 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
919 MVT::i8},
920 Custom);
921
923 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
924 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
925 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
926 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
927 Custom);
928
930 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
931 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
932 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
933 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
934 Custom);
935
936 setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
938 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
939 setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
940 setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);
941
942 // TODO: Could move this to custom lowering, could benefit from combines on
943 // extract of relevant bits.
944 setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
945
947
948 if (Subtarget->hasBF16ConversionInsts()) {
949 setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
951 }
952
953 if (Subtarget->hasBF16PackedInsts()) {
955 {ISD::FADD, ISD::FMUL, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMA},
956 MVT::v2bf16, Legal);
957 }
958
959 if (Subtarget->hasBF16TransInsts()) {
960 setOperationAction({ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, MVT::bf16, Legal);
961 }
962
963 if (Subtarget->hasCvtPkF16F32Inst()) {
965 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
966 Custom);
967 }
968
970 ISD::PTRADD,
972 ISD::SUB,
974 ISD::MUL,
975 ISD::FADD,
976 ISD::FSUB,
977 ISD::FDIV,
978 ISD::FMUL,
979 ISD::FMINNUM,
980 ISD::FMAXNUM,
981 ISD::FMINNUM_IEEE,
982 ISD::FMAXNUM_IEEE,
983 ISD::FMINIMUM,
984 ISD::FMAXIMUM,
985 ISD::FMINIMUMNUM,
986 ISD::FMAXIMUMNUM,
987 ISD::FMA,
988 ISD::SMIN,
989 ISD::SMAX,
990 ISD::UMIN,
991 ISD::UMAX,
994 ISD::SMIN,
995 ISD::SMAX,
996 ISD::UMIN,
997 ISD::UMAX,
998 ISD::AND,
999 ISD::OR,
1000 ISD::XOR,
1001 ISD::SHL,
1002 ISD::SRL,
1003 ISD::SRA,
1004 ISD::FSHR,
1014
1015 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1017
1018 // All memory operations. Some folding on the pointer operand is done to help
1019 // matching the constant offsets in the addressing modes.
1020 setTargetDAGCombine({ISD::LOAD,
1021 ISD::STORE,
1022 ISD::ATOMIC_LOAD,
1023 ISD::ATOMIC_STORE,
1024 ISD::ATOMIC_CMP_SWAP,
1025 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
1026 ISD::ATOMIC_SWAP,
1027 ISD::ATOMIC_LOAD_ADD,
1028 ISD::ATOMIC_LOAD_SUB,
1029 ISD::ATOMIC_LOAD_AND,
1030 ISD::ATOMIC_LOAD_OR,
1031 ISD::ATOMIC_LOAD_XOR,
1032 ISD::ATOMIC_LOAD_NAND,
1033 ISD::ATOMIC_LOAD_MIN,
1034 ISD::ATOMIC_LOAD_MAX,
1035 ISD::ATOMIC_LOAD_UMIN,
1036 ISD::ATOMIC_LOAD_UMAX,
1037 ISD::ATOMIC_LOAD_FADD,
1038 ISD::ATOMIC_LOAD_FMIN,
1039 ISD::ATOMIC_LOAD_FMAX,
1040 ISD::ATOMIC_LOAD_UINC_WRAP,
1041 ISD::ATOMIC_LOAD_UDEC_WRAP,
1044
1045 // FIXME: In other contexts we pretend this is a per-function property.
1047
1049}
1050
1051const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1052
1053ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
1054 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1055 return RCRegs;
1056}
1057
1058//===----------------------------------------------------------------------===//
1059// TargetLowering queries
1060//===----------------------------------------------------------------------===//
1061
1062// v_mad_mix* support a conversion from f16 to f32.
1063//
1064// There is only one special case where this is also OK to use with denormals
1065// enabled, which we don't currently handle.
1066bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1067 EVT DestVT, EVT SrcVT) const {
1068 return DestVT.getScalarType() == MVT::f32 &&
1069 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1070 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1071 SrcVT.getScalarType() == MVT::f16) ||
1072 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1073 SrcVT.getScalarType() == MVT::bf16)) &&
1074 // TODO: This probably only requires no input flushing?
1075 denormalModeIsFlushAllF32(DAG.getMachineFunction());
1076}
1077
1078bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
1079 LLT DestTy, LLT SrcTy) const {
1080 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1081 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1082 DestTy.getScalarSizeInBits() == 32 &&
1083 SrcTy.getScalarSizeInBits() == 16 &&
1084 // TODO: This probably only requires no input flushing?
1085 denormalModeIsFlushAllF32(*MI.getMF());
1086}
1087
1088bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
1089 // SI has some legal vector types, but no legal vector operations. Say no
1090 // shuffles are legal in order to prefer scalarizing some vector operations.
1091 return false;
1092}
1093
1094MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
1095 CallingConv::ID CC,
1096 EVT VT) const {
1097 if (CC == CallingConv::AMDGPU_KERNEL)
1098 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1099
1100 if (VT.isVector()) {
1101 EVT ScalarVT = VT.getScalarType();
1102 unsigned Size = ScalarVT.getSizeInBits();
1103 if (Size == 16) {
1104 if (Subtarget->has16BitInsts()) {
1105 if (VT.isInteger())
1106 return MVT::v2i16;
1107 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1108 }
1109 return VT.isInteger() ? MVT::i32 : MVT::f32;
1110 }
1111
1112 if (Size < 16)
1113 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1114 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1115 }
1116
1117 if (VT.getSizeInBits() > 32)
1118 return MVT::i32;
1119
1120 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1121}
1122
1123unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
1124 CallingConv::ID CC,
1125 EVT VT) const {
1126 if (CC == CallingConv::AMDGPU_KERNEL)
1127 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1128
1129 if (VT.isVector()) {
1130 unsigned NumElts = VT.getVectorNumElements();
1131 EVT ScalarVT = VT.getScalarType();
1132 unsigned Size = ScalarVT.getSizeInBits();
1133
1134 // FIXME: Should probably promote 8-bit vectors to i16.
1135 if (Size == 16 && Subtarget->has16BitInsts())
1136 return (NumElts + 1) / 2;
1137
1138 if (Size <= 32)
1139 return NumElts;
1140
1141 if (Size > 32)
1142 return NumElts * ((Size + 31) / 32);
1143 } else if (VT.getSizeInBits() > 32)
1144 return (VT.getSizeInBits() + 31) / 32;
1145
1146 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1147}
1148
1149unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
1150 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1151 unsigned &NumIntermediates, MVT &RegisterVT) const {
1152 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1153 unsigned NumElts = VT.getVectorNumElements();
1154 EVT ScalarVT = VT.getScalarType();
1155 unsigned Size = ScalarVT.getSizeInBits();
1156 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1157 // support, but unless we can properly handle 3-vectors, it will still be
1158 // inconsistent.
1159 if (Size == 16 && Subtarget->has16BitInsts()) {
1160 if (ScalarVT == MVT::bf16) {
1161 RegisterVT = MVT::i32;
1162 IntermediateVT = MVT::v2bf16;
1163 } else {
1164 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1165 IntermediateVT = RegisterVT;
1166 }
1167 NumIntermediates = (NumElts + 1) / 2;
1168 return NumIntermediates;
1169 }
1170
1171 if (Size == 32) {
1172 RegisterVT = ScalarVT.getSimpleVT();
1173 IntermediateVT = RegisterVT;
1174 NumIntermediates = NumElts;
1175 return NumIntermediates;
1176 }
1177
1178 if (Size < 16 && Subtarget->has16BitInsts()) {
1179 // FIXME: Should probably form v2i16 pieces
1180 RegisterVT = MVT::i16;
1181 IntermediateVT = ScalarVT;
1182 NumIntermediates = NumElts;
1183 return NumIntermediates;
1184 }
1185
1186 if (Size != 16 && Size <= 32) {
1187 RegisterVT = MVT::i32;
1188 IntermediateVT = ScalarVT;
1189 NumIntermediates = NumElts;
1190 return NumIntermediates;
1191 }
1192
1193 if (Size > 32) {
1194 RegisterVT = MVT::i32;
1195 IntermediateVT = RegisterVT;
1196 NumIntermediates = NumElts * ((Size + 31) / 32);
1197 return NumIntermediates;
1198 }
1199 }
1200
1201 return TargetLowering::getVectorTypeBreakdownForCallingConv(
1202 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1203}
1204
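// Compute the in-memory type for the data of an image/buffer load intrinsic,
// clamping vector results to the number of lanes actually accessed.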
1205static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
1206 const DataLayout &DL, Type *Ty,
1207 unsigned MaxNumLanes) {
1208 assert(MaxNumLanes != 0);
1209
1210 LLVMContext &Ctx = Ty->getContext();
1211 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1212 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1213 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1214 NumElts);
1215 }
1216
1217 return TLI.getValueType(DL, Ty);
1218}
1219
1220// Peek through TFE struct returns to only use the data size.
1221static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
1222 const DataLayout &DL, Type *Ty,
1223 unsigned MaxNumLanes) {
1224 auto *ST = dyn_cast<StructType>(Ty);
1225 if (!ST)
1226 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1227
1228 // TFE intrinsics return an aggregate type.
1229 assert(ST->getNumContainedTypes() == 2 &&
1230 ST->getContainedType(1)->isIntegerTy(32));
1231 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1232}
1233
1234/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1235/// in-memory representation. This return value is a custom type because there
1236/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1237/// could cause issues during codegen, these address space 7 pointers will be
1238/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1239/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1240/// for cost modeling, to work. (This also sets us up decently for doing the
1241/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1242MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
1243 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1244 return MVT::amdgpuBufferFatPointer;
1245 if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1246 DL.getPointerSizeInBits(AS) == 192)
1247 return MVT::amdgpuBufferStridedPointer;
1248 return AMDGPUTargetLowering::getPointerTy(DL, AS);
1249}
1250/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1251/// v8i32 when padding is added.
1252/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1253/// also v8i32 with padding.
1254MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
1255 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1256 DL.getPointerSizeInBits(AS) == 160) ||
1257 (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1258 DL.getPointerSizeInBits(AS) == 192))
1259 return MVT::v8i32;
1260 return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
1261}
1262
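// Width in bits of the memory access performed by the async LDS load/store and
// cooperative atomic intrinsics.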
1263static unsigned getIntrMemWidth(unsigned IntrID) {
1264 switch (IntrID) {
1265 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1266 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1267 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1268 return 8;
1269 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1270 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1271 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1272 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1273 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1274 return 32;
1275 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1276 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1277 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1278 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1279 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1280 return 64;
1281 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1282 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1283 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1284 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1285 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1286 return 128;
1287 default:
1288 llvm_unreachable("Unknown width");
1289 }
1290}
1291
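// Derive memory-operand ordering information and the sync scope ID for the
// cooperative atomic intrinsics from their explicit ordering and scope operands.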
1292static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad,
1294 Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2);
1295 unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
1296 switch (AtomicOrderingCABI(Ord)) {
1299 break;
1302 break;
1305 break;
1306 default:
1308 break;
1309 }
1310
1311 Info.flags =
1313 Info.flags |= MOCooperative;
1314
1315 MDNode *ScopeMD = cast<MDNode>(
1316 cast<MetadataAsValue>(CI.getArgOperand(IsLoad ? 2 : 3))->getMetadata());
1317 StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
1318 Info.ssid = CI.getContext().getOrInsertSyncScopeID(Scope);
1319}
1320
1321bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
1322 const CallInst &CI,
1323 MachineFunction &MF,
1324 unsigned IntrID) const {
1325 Info.flags = MachineMemOperand::MONone;
1326 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1327 Info.flags |= MachineMemOperand::MOInvariant;
1328 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1330 Info.flags |= getTargetMMOFlags(CI);
1331
1332 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1334 AttributeSet Attr =
1336 MemoryEffects ME = Attr.getMemoryEffects();
1337 if (ME.doesNotAccessMemory())
1338 return false;
1339
1340 // TODO: Should images get their own address space?
1341 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1342
1343 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1344 if (RsrcIntr->IsImage) {
1345 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1347 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1348 Info.align.reset();
1349 }
1350
1351 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1352 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1353 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1354 // We conservatively set the memory operand of a buffer intrinsic to the
1355 // base resource pointer, so that we can access alias information about
1356 // those pointers. Cases like "this points at the same value
1357 // but with a different offset" are handled in
1358 // areMemAccessesTriviallyDisjoint.
1359 Info.ptrVal = RsrcArg;
1360 }
1361
1362 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1363 if (!IsSPrefetch) {
1364 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1365 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1366 Info.flags |= MachineMemOperand::MOVolatile;
1367 }
1368
1370 if (ME.onlyReadsMemory()) {
1371 if (RsrcIntr->IsImage) {
1372 unsigned MaxNumLanes = 4;
1373
1374 if (!BaseOpcode->Gather4) {
1375 // If this isn't a gather, we may have excess loaded elements in the
1376 // IR type. Check the dmask for the real number of elements loaded.
1377 unsigned DMask =
1378 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1379 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1380 }
1381
1382 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1383 CI.getType(), MaxNumLanes);
1384 } else {
1385 Info.memVT =
1387 std::numeric_limits<unsigned>::max());
1388 }
1389
1390 // FIXME: What does alignment mean for an image?
1391 Info.opc = ISD::INTRINSIC_W_CHAIN;
1392 Info.flags |= MachineMemOperand::MOLoad;
1393 } else if (ME.onlyWritesMemory()) {
1394 Info.opc = ISD::INTRINSIC_VOID;
1395
1396 Type *DataTy = CI.getArgOperand(0)->getType();
1397 if (RsrcIntr->IsImage) {
1398 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1399 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1400 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1401 DMaskLanes);
1402 } else
1403 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1404
1405 Info.flags |= MachineMemOperand::MOStore;
1406 } else {
1407 // Atomic, NoReturn Sampler or prefetch
1408 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1410 Info.flags |=
1412
1413 if (!IsSPrefetch)
1414 Info.flags |= MachineMemOperand::MOStore;
1415
1416 switch (IntrID) {
1417 default:
1418 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1419 // Fake memory access type for no return sampler intrinsics
1420 Info.memVT = MVT::i32;
1421 } else {
1422 // XXX - Should this be volatile without known ordering?
1423 Info.flags |= MachineMemOperand::MOVolatile;
1424 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1425 }
1426 break;
1427 case Intrinsic::amdgcn_raw_buffer_load_lds:
1428 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1429 case Intrinsic::amdgcn_struct_buffer_load_lds:
1430 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1431 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1432 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1433 Info.ptrVal = CI.getArgOperand(1);
1434 return true;
1435 }
1436 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1437 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1438 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1439 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1440 Info.memVT =
1442 std::numeric_limits<unsigned>::max());
1443 Info.flags &= ~MachineMemOperand::MOStore;
1444 return true;
1445 }
1446 }
1447 }
1448 return true;
1449 }
1450
1451 switch (IntrID) {
1452 case Intrinsic::amdgcn_ds_ordered_add:
1453 case Intrinsic::amdgcn_ds_ordered_swap: {
1454 Info.opc = ISD::INTRINSIC_W_CHAIN;
1455 Info.memVT = MVT::getVT(CI.getType());
1456 Info.ptrVal = CI.getOperand(0);
1457 Info.align.reset();
1459
1460 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1461 if (!Vol->isZero())
1462 Info.flags |= MachineMemOperand::MOVolatile;
1463
1464 return true;
1465 }
1466 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1467 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1468 Info.opc = ISD::INTRINSIC_W_CHAIN;
1469 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1470 Info.ptrVal = nullptr;
1471 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1473 return true;
1474 }
1475 case Intrinsic::amdgcn_ds_append:
1476 case Intrinsic::amdgcn_ds_consume: {
1477 Info.opc = ISD::INTRINSIC_W_CHAIN;
1478 Info.memVT = MVT::getVT(CI.getType());
1479 Info.ptrVal = CI.getOperand(0);
1480 Info.align.reset();
1482
1483 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1484 if (!Vol->isZero())
1485 Info.flags |= MachineMemOperand::MOVolatile;
1486
1487 return true;
1488 }
1489 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1490 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1491 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1494 Info.memVT = MVT::getVT(CI.getType());
1495 Info.ptrVal = CI.getOperand(0);
1496 Info.memVT = MVT::i64;
1497 Info.size = 8;
1498 Info.align.reset();
1500 return true;
1501 }
1502 case Intrinsic::amdgcn_global_atomic_csub: {
1503 Info.opc = ISD::INTRINSIC_W_CHAIN;
1504 Info.memVT = MVT::getVT(CI.getType());
1505 Info.ptrVal = CI.getOperand(0);
1506 Info.align.reset();
1509 return true;
1510 }
1511 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1512 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1513 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1514 Info.opc = ISD::INTRINSIC_W_CHAIN;
1515 Info.memVT =
1516 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1517 ? CI.getType()
1519 ->getElementType(0)); // XXX: what is correct VT?
1520
1521 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1522 Info.align.reset();
1523 Info.flags |=
1525 return true;
1526 }
1527 case Intrinsic::amdgcn_global_atomic_fmin_num:
1528 case Intrinsic::amdgcn_global_atomic_fmax_num:
1529 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1530 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1531 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1532 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1533 Info.opc = ISD::INTRINSIC_W_CHAIN;
1534 Info.memVT = MVT::getVT(CI.getType());
1535 Info.ptrVal = CI.getOperand(0);
1536 Info.align.reset();
1540 return true;
1541 }
1542 case Intrinsic::amdgcn_flat_load_monitor_b32:
1543 case Intrinsic::amdgcn_flat_load_monitor_b64:
1544 case Intrinsic::amdgcn_flat_load_monitor_b128:
1545 case Intrinsic::amdgcn_global_load_monitor_b32:
1546 case Intrinsic::amdgcn_global_load_monitor_b64:
1547 case Intrinsic::amdgcn_global_load_monitor_b128:
1548 case Intrinsic::amdgcn_cluster_load_b32:
1549 case Intrinsic::amdgcn_cluster_load_b64:
1550 case Intrinsic::amdgcn_cluster_load_b128:
1551 case Intrinsic::amdgcn_ds_load_tr6_b96:
1552 case Intrinsic::amdgcn_ds_load_tr4_b64:
1553 case Intrinsic::amdgcn_ds_load_tr8_b64:
1554 case Intrinsic::amdgcn_ds_load_tr16_b128:
1555 case Intrinsic::amdgcn_global_load_tr6_b96:
1556 case Intrinsic::amdgcn_global_load_tr4_b64:
1557 case Intrinsic::amdgcn_global_load_tr_b64:
1558 case Intrinsic::amdgcn_global_load_tr_b128:
1559 case Intrinsic::amdgcn_ds_read_tr4_b64:
1560 case Intrinsic::amdgcn_ds_read_tr6_b96:
1561 case Intrinsic::amdgcn_ds_read_tr8_b64:
1562 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1563 Info.opc = ISD::INTRINSIC_W_CHAIN;
1564 Info.memVT = MVT::getVT(CI.getType());
1565 Info.ptrVal = CI.getOperand(0);
1566 Info.align.reset();
1567 Info.flags |= MachineMemOperand::MOLoad;
1568 return true;
1569 }
1570 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1571 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1572 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1573 Info.opc = ISD::INTRINSIC_W_CHAIN;
1574 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1575 Info.ptrVal = CI.getOperand(0);
1576 Info.align.reset();
1577 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/true, Info);
1578 return true;
1579 }
1580 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1581 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1582 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1583 Info.opc = ISD::INTRINSIC_VOID;
1584 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1585 Info.ptrVal = CI.getArgOperand(0);
1586 Info.align.reset();
1587 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/false, Info);
1588 return true;
1589 }
1590 case Intrinsic::amdgcn_ds_gws_init:
1591 case Intrinsic::amdgcn_ds_gws_barrier:
1592 case Intrinsic::amdgcn_ds_gws_sema_v:
1593 case Intrinsic::amdgcn_ds_gws_sema_br:
1594 case Intrinsic::amdgcn_ds_gws_sema_p:
1595 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1596 Info.opc = ISD::INTRINSIC_VOID;
1597
1598 const GCNTargetMachine &TM =
1599 static_cast<const GCNTargetMachine &>(getTargetMachine());
1600
1602 Info.ptrVal = MFI->getGWSPSV(TM);
1603
1604 // This is an abstract access, but we need to specify a type and size.
1605 Info.memVT = MVT::i32;
1606 Info.size = 4;
1607 Info.align = Align(4);
1608
1609 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1610 Info.flags |= MachineMemOperand::MOLoad;
1611 else
1612 Info.flags |= MachineMemOperand::MOStore;
1613 return true;
1614 }
1615 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1616 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1617 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1618 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1619 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1620 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1621 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1622 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1623 Info.opc = ISD::INTRINSIC_VOID;
1624 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1625 Info.ptrVal = CI.getArgOperand(1);
1627 return true;
1628 }
1629 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1630 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1631 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1632 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1633 Info.opc = ISD::INTRINSIC_VOID;
1634 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1635 Info.ptrVal = CI.getArgOperand(0);
1637 return true;
1638 }
1639 case Intrinsic::amdgcn_load_to_lds:
1640 case Intrinsic::amdgcn_global_load_lds: {
1641 Info.opc = ISD::INTRINSIC_VOID;
1642 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1643 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1644 Info.ptrVal = CI.getArgOperand(1);
1646 return true;
1647 }
1648 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1649 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1650 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1651 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1652 Info.opc = ISD::INTRINSIC_W_CHAIN;
1653
1654 const GCNTargetMachine &TM =
1655 static_cast<const GCNTargetMachine &>(getTargetMachine());
1656
1658 Info.ptrVal = MFI->getGWSPSV(TM);
1659
1660 // This is an abstract access, but we need to specify a type and size.
1661 Info.memVT = MVT::i32;
1662 Info.size = 4;
1663 Info.align = Align(4);
1664
1666 return true;
1667 }
1668 case Intrinsic::amdgcn_s_prefetch_data:
1669 case Intrinsic::amdgcn_flat_prefetch:
1670 case Intrinsic::amdgcn_global_prefetch: {
1671 Info.opc = ISD::INTRINSIC_VOID;
1672 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1673 Info.ptrVal = CI.getArgOperand(0);
1674 Info.flags |= MachineMemOperand::MOLoad;
1675 return true;
1676 }
1677 default:
1678 return false;
1679 }
1680}
1681
1682void SITargetLowering::CollectTargetIntrinsicOperands(
1683 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1685 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1686 // The DAG's ValueType loses the addrspaces.
1687 // Add them as 2 extra Constant operands "from" and "to".
1688 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1689 unsigned DstAS = I.getType()->getPointerAddressSpace();
1690 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1691 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1692 break;
1693 }
1694 default:
1695 break;
1696 }
1697}
1698
1699bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
1700 SmallVectorImpl<Value *> &Ops,
1701 Type *&AccessTy) const {
1702 Value *Ptr = nullptr;
1703 switch (II->getIntrinsicID()) {
1704 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1705 case Intrinsic::amdgcn_cluster_load_b128:
1706 case Intrinsic::amdgcn_cluster_load_b64:
1707 case Intrinsic::amdgcn_cluster_load_b32:
1708 case Intrinsic::amdgcn_ds_append:
1709 case Intrinsic::amdgcn_ds_consume:
1710 case Intrinsic::amdgcn_ds_load_tr8_b64:
1711 case Intrinsic::amdgcn_ds_load_tr16_b128:
1712 case Intrinsic::amdgcn_ds_load_tr4_b64:
1713 case Intrinsic::amdgcn_ds_load_tr6_b96:
1714 case Intrinsic::amdgcn_ds_read_tr4_b64:
1715 case Intrinsic::amdgcn_ds_read_tr6_b96:
1716 case Intrinsic::amdgcn_ds_read_tr8_b64:
1717 case Intrinsic::amdgcn_ds_read_tr16_b64:
1718 case Intrinsic::amdgcn_ds_ordered_add:
1719 case Intrinsic::amdgcn_ds_ordered_swap:
1720 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1721 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1722 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1723 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1724 case Intrinsic::amdgcn_flat_load_monitor_b128:
1725 case Intrinsic::amdgcn_flat_load_monitor_b32:
1726 case Intrinsic::amdgcn_flat_load_monitor_b64:
1727 case Intrinsic::amdgcn_global_atomic_csub:
1728 case Intrinsic::amdgcn_global_atomic_fmax_num:
1729 case Intrinsic::amdgcn_global_atomic_fmin_num:
1730 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1731 case Intrinsic::amdgcn_global_load_monitor_b128:
1732 case Intrinsic::amdgcn_global_load_monitor_b32:
1733 case Intrinsic::amdgcn_global_load_monitor_b64:
1734 case Intrinsic::amdgcn_global_load_tr_b64:
1735 case Intrinsic::amdgcn_global_load_tr_b128:
1736 case Intrinsic::amdgcn_global_load_tr4_b64:
1737 case Intrinsic::amdgcn_global_load_tr6_b96:
1738 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1739 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1740 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1741 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1742 Ptr = II->getArgOperand(0);
1743 break;
1744 case Intrinsic::amdgcn_load_to_lds:
1745 case Intrinsic::amdgcn_global_load_lds:
1746 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1747 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1748 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1749 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1750 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1751 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1752 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1753 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1754 Ptr = II->getArgOperand(1);
1755 break;
1756 default:
1757 return false;
1758 }
1759 AccessTy = II->getType();
1760 Ops.push_back(Ptr);
1761 return true;
1762}
1763
1765 unsigned AddrSpace) const {
1766 if (!Subtarget->hasFlatInstOffsets()) {
1767 // Flat instructions do not have offsets, and only have the register
1768 // address.
1769 return AM.BaseOffs == 0 && AM.Scale == 0;
1770 }
1771
1772 decltype(SIInstrFlags::FLAT) FlatVariant =
1776
1777 return AM.Scale == 0 &&
1778 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1779 AM.BaseOffs, AddrSpace, FlatVariant));
1780}
1781
1783 if (Subtarget->hasFlatGlobalInsts())
1785
1786 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1787 // Assume that we will use FLAT for all global memory accesses
1788 // on VI.
1789 // FIXME: This assumption is currently wrong. On VI we still use
1790 // MUBUF instructions for the r + i addressing mode. As currently
1791 // implemented, the MUBUF instructions only work on buffers < 4GB.
1792 // It may be possible to support > 4GB buffers with MUBUF instructions,
1793 // by setting the stride value in the resource descriptor which would
1794 // increase the size limit to (stride * 4GB). However, this is risky,
1795 // because it has never been validated.
1797 }
1798
1799 return isLegalMUBUFAddressingMode(AM);
1800}
1801
1802bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1803 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1804 // additionally can do r + r + i with addr64. 32-bit has more addressing
1805 // mode options. Depending on the resource constant, it can also do
1806 // (i64 r0) + (i32 r1) * (i14 i).
1807 //
1808 // Private arrays end up using a scratch buffer most of the time, so also
1809 // assume those use MUBUF instructions. Scratch loads / stores are currently
1810 // implemented as mubuf instructions with the offen bit set, so they are
1811 // slightly different from the normal addr64 mode.
1812 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1813 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1814 return false;
1815
1816 // FIXME: Since we can split immediate into soffset and immediate offset,
1817 // would it make sense to allow any immediate?
1818
1819 switch (AM.Scale) {
1820 case 0: // r + i or just i, depending on HasBaseReg.
1821 return true;
1822 case 1:
1823 return true; // We have r + r or r + i.
1824 case 2:
1825 if (AM.HasBaseReg) {
1826 // Reject 2 * r + r.
1827 return false;
1828 }
1829
1830 // Allow 2 * r as r + r,
1831 // and 2 * r + i as r + r + i.
1832 return true;
1833 default: // Don't allow n * r
1834 return false;
1835 }
1836}
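// --- Illustrative sketch (not part of SIISelLowering.cpp) -----------------
// A standalone mirror of the AM.Scale handling above, to make the accepted
// MUBUF address shapes explicit. ToyAddrMode and the 12-bit offset bound are
// simplifications; the real check is TII->isLegalMUBUFImmOffset on the
// TargetLowering::AddrMode struct.
#include <cstdint>

struct ToyAddrMode {
  int64_t BaseOffs = 0;    // immediate offset i
  bool HasBaseReg = false; // base register r present?
  int64_t Scale = 0;       // scaled-index factor
};

static bool toyIsLegalMUBUFMode(const ToyAddrMode &AM) {
  if (AM.BaseOffs < 0 || AM.BaseOffs >= (1 << 12))
    return false; // 12-bit unsigned byte offset (pre-GFX12 encodings).
  switch (AM.Scale) {
  case 0: // r + i, or just i.
  case 1: // r + r, or r + i.
    return true;
  case 2: // 2 * r is expressible as r + r, but 2 * r + r is not.
    return !AM.HasBaseReg;
  default: // No n * r addressing.
    return false;
  }
}
// ---------------------------------------------------------------------------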
1837
1839 const AddrMode &AM, Type *Ty,
1840 unsigned AS,
1841 Instruction *I) const {
1842 // No global is ever allowed as a base.
1843 if (AM.BaseGV)
1844 return false;
1845
1846 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1847 return isLegalGlobalAddressingMode(AM);
1848
1849 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1853 // If the offset isn't a multiple of 4, it probably isn't going to be
1854 // correctly aligned.
1855 // FIXME: Can we get the real alignment here?
1856 if (AM.BaseOffs % 4 != 0)
1857 return isLegalMUBUFAddressingMode(AM);
1858
1859 if (!Subtarget->hasScalarSubwordLoads()) {
1860 // There are no SMRD extloads, so if we have to do a small type access we
1861 // will use a MUBUF load.
1862 // FIXME?: We also need to do this if unaligned, but we don't know the
1863 // alignment here.
1864 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1865 return isLegalGlobalAddressingMode(AM);
1866 }
1867
1868 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1869 // SMRD instructions have an 8-bit, dword offset on SI.
1870 if (!isUInt<8>(AM.BaseOffs / 4))
1871 return false;
1872 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1873 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1874 // in 8-bits, it can use a smaller encoding.
1875 if (!isUInt<32>(AM.BaseOffs / 4))
1876 return false;
1877 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1878 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1879 if (!isUInt<20>(AM.BaseOffs))
1880 return false;
1881 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1882 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1883 // for S_BUFFER_* instructions).
1884 if (!isInt<21>(AM.BaseOffs))
1885 return false;
1886 } else {
1887 // On GFX12, all offsets are signed 24-bit in bytes.
1888 if (!isInt<24>(AM.BaseOffs))
1889 return false;
1890 }
1891
1892 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1894 AM.BaseOffs < 0) {
1895 // Scalar (non-buffer) loads can only use a negative offset if
1896 // soffset+offset is non-negative. Since the compiler can only prove that
1897 // in a few special cases, it is safer to claim that negative offsets are
1898 // not supported.
1899 return false;
1900 }
1901
1902 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1903 return true;
1904
1905 if (AM.Scale == 1 && AM.HasBaseReg)
1906 return true;
1907
1908 return false;
1909 }
1910
1911 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1912 return Subtarget->enableFlatScratch()
1914 : isLegalMUBUFAddressingMode(AM);
1915
1916 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1917 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1918 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1919 // field.
1920 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1921 // an 8-bit dword offset but we don't know the alignment here.
1922 if (!isUInt<16>(AM.BaseOffs))
1923 return false;
1924
1925 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1926 return true;
1927
1928 if (AM.Scale == 1 && AM.HasBaseReg)
1929 return true;
1930
1931 return false;
1932 }
1933
1935 // For an unknown address space, this usually means that this is for some
1936 // reason being used for pure arithmetic, and not based on some addressing
1937 // computation. We don't have instructions that compute pointers with any
1938 // addressing modes, so treat them as having no offset like flat
1939 // instructions.
1941 }
1942
1943 // Assume a user alias of global for unknown address spaces.
1944 return isLegalGlobalAddressingMode(AM);
1945}
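// --- Illustrative sketch (not part of SIISelLowering.cpp) -----------------
// The generation-dependent SMRD/SMEM immediate offset limits checked above,
// collapsed into one standalone helper. ToyGen is a stand-in enum, not the
// real AMDGPUSubtarget::Generation values, and the separate negative-offset
// restriction for scalar loads is ignored here.
#include <cstdint>

enum class ToyGen { SI, CI, VI, GFX9, GFX12 };

static bool toyIsLegalSMRDImmOffset(ToyGen Gen, int64_t ByteOffset) {
  switch (Gen) {
  case ToyGen::SI: // 8-bit unsigned dword offset.
    return ByteOffset >= 0 && (ByteOffset / 4) < (1 << 8);
  case ToyGen::CI: // 32-bit literal dword offset.
    return ByteOffset >= 0 && (ByteOffset / 4) <= INT64_C(0xffffffff);
  case ToyGen::VI: // 20-bit unsigned byte offset.
    return ByteOffset >= 0 && ByteOffset < (1 << 20);
  case ToyGen::GFX9: // Signed 21-bit byte offset.
    return ByteOffset >= -(1 << 20) && ByteOffset < (1 << 20);
  case ToyGen::GFX12: // Signed 24-bit byte offset.
    return ByteOffset >= -(1 << 23) && ByteOffset < (1 << 23);
  }
  return false;
}
// ---------------------------------------------------------------------------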
1946
1948 const MachineFunction &MF) const {
1950 return (MemVT.getSizeInBits() <= 4 * 32);
1951 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1952 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1953 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1954 }
1956 return (MemVT.getSizeInBits() <= 2 * 32);
1957 return true;
1958}
1959
1961 unsigned Size, unsigned AddrSpace, Align Alignment,
1962 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1963 if (IsFast)
1964 *IsFast = 0;
1965
1966 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1967 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1968 // Check if alignment requirements for ds_read/write instructions are
1969 // disabled.
1970 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1971 return false;
1972
1973 Align RequiredAlignment(
1974 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
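// Worked example (added note, not in the original source): Size = 64 bits
// gives divideCeil(64, 8) = 8 bytes and PowerOf2Ceil(8) = Align(8); Size = 96
// gives 12 -> Align(16); Size = 128 gives 16 -> Align(16).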
1975 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1976 Alignment < RequiredAlignment)
1977 return false;
1978
1979 // Either the alignment requirements are "enabled", or there is an
1980 // unaligned-LDS-access hardware bug even though the alignment requirements
1981 // are "disabled". In either case, we need to check for proper alignment
1982 // requirements.
1983 //
1984 switch (Size) {
1985 case 64:
1986 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1987 // address is negative, then the instruction is incorrectly treated as
1988 // out-of-bounds even if base + offsets are in bounds. Split vectorized
1989 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1990 // load later in the SILoadStoreOptimizer.
1991 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1992 return false;
1993
1994 // Accessing 8 bytes via ds_read/write_b64 requires 8-byte alignment, but we
1995 // can do a 4-byte aligned, 8-byte access in a single operation using
1996 // ds_read2/write2_b32 with adjacent offsets.
1997 RequiredAlignment = Align(4);
1998
1999 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2000 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
2001 // ds_write2_b32 depending on the alignment. In either case with either
2002 // alignment there is no faster way of doing this.
2003
2004 // The numbers returned here and below are not additive; they form a 'speed
2005 // rank'. They are just meant to be compared to decide if a certain way
2006 // of lowering an operation is faster than another. For that purpose a
2007 // naturally aligned operation gets its bitsize to indicate that "it
2008 // operates with a speed comparable to an N-bit wide load". With the full
2009 // alignment ds128 is slower than ds96, for example. If underaligned, it
2010 // is comparable to the speed of a single dword access, which would then
2011 // mean 32 < 128 and it is faster to issue a wide load regardless.
2012 // 1 is simply "slow, don't do it". That is, when comparing an aligned load
2013 // to a wider load that will no longer be aligned, the latter is slower.
2014 if (IsFast)
2015 *IsFast = (Alignment >= RequiredAlignment) ? 64
2016 : (Alignment < Align(4)) ? 32
2017 : 1;
2018 return true;
2019 }
2020
2021 break;
2022 case 96:
2023 if (!Subtarget->hasDS96AndDS128())
2024 return false;
2025
2026 // Accessing 12 bytes via ds_read/write_b96 requires 16-byte alignment on
2027 // gfx8 and older.
2028
2029 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2030 // Naturally aligned access is fastest. However, also report it as Fast
2031 // if memory is aligned to less than a DWORD. A narrow load or store will
2032 // be equally slow as a single ds_read_b96/ds_write_b96, but there will
2033 // be more of them, so overall we will pay less penalty issuing a single
2034 // instruction.
2035
2036 // See comment on the values above.
2037 if (IsFast)
2038 *IsFast = (Alignment >= RequiredAlignment) ? 96
2039 : (Alignment < Align(4)) ? 32
2040 : 1;
2041 return true;
2042 }
2043
2044 break;
2045 case 128:
2046 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2047 return false;
2048
2049 // Accessing 16 bytes via ds_read/write_b128 requires 16-byte alignment on
2050 // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
2051 // single operation using ds_read2/write2_b64.
2052 RequiredAlignment = Align(8);
2053
2054 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2055 // Naturally aligned access is fastest. However, also report it as Fast
2056 // if memory is aligned to less than a DWORD. A narrow load or store will
2057 // be equally slow as a single ds_read_b128/ds_write_b128, but there
2058 // will be more of them, so overall we will pay less penalty issuing a
2059 // single instruction.
2060
2061 // See comment on the values above.
2062 if (IsFast)
2063 *IsFast = (Alignment >= RequiredAlignment) ? 128
2064 : (Alignment < Align(4)) ? 32
2065 : 1;
2066 return true;
2067 }
2068
2069 break;
2070 default:
2071 if (Size > 32)
2072 return false;
2073
2074 break;
2075 }
2076
2077 // See comment on the values above.
2079 // Note that we have a single-dword or sub-dword access here, so if
2080 // underaligned it is the slowest possible access, hence the returned value is 0.
2080 if (IsFast)
2081 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2082
2083 return Alignment >= RequiredAlignment ||
2084 Subtarget->hasUnalignedDSAccessEnabled();
2085 }
2086
2087 // FIXME: We have to be conservative here and assume that flat operations
2088 // will access scratch. If we had access to the IR function, then we
2089 // could determine if any private memory was used in the function.
2090 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2091 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2092 bool AlignedBy4 = Alignment >= Align(4);
2093 if (IsFast)
2094 *IsFast = AlignedBy4;
2095
2096 return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
2097 }
2098
2099 // So long as they are correct, wide global memory operations perform better
2100 // than multiple smaller memory ops -- even when misaligned.
2101 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
2102 if (IsFast)
2103 *IsFast = Size;
2104
2105 return Alignment >= Align(4) ||
2106 Subtarget->hasUnalignedBufferAccessEnabled();
2107 }
2108
2109 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
2110 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
2111 // out-of-bounds behavior, but in the edge case where an access starts
2112 // out-of-bounds and then enters in-bounds, the entire access would be treated
2113 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2114 // natural alignment of buffer accesses.
2115 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2116 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2117 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2118 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2119 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
2120 return false;
2121 }
2122
2123 // Values smaller than a dword must be aligned.
2124 if (Size < 32)
2125 return false;
2126
2127 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2128 // byte-address are ignored, thus forcing Dword alignment.
2129 // This applies to private, global, and constant memory.
2130 if (IsFast)
2131 *IsFast = 1;
2132
2133 return Size >= 32 && Alignment >= Align(4);
2134}
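// --- Illustrative sketch (not part of SIISelLowering.cpp) -----------------
// How the "speed rank" written to *IsFast above is meant to be consumed: the
// values (0, 1, 32, 64, 96, 128, ...) are only compared against each other,
// never added. The helper name and parameters below are hypothetical.
static bool toyPreferWideAccess(unsigned WideIsFast, unsigned NarrowIsFast) {
  // A wide access is preferred whenever its rank is at least as good as the
  // narrow alternative, since one wide op replaces several narrow ones.
  return WideIsFast >= NarrowIsFast;
}
// ---------------------------------------------------------------------------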
2135
2137 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2138 unsigned *IsFast) const {
2140 Alignment, Flags, IsFast);
2141}
2142
2144 LLVMContext &Context, const MemOp &Op,
2145 const AttributeList &FuncAttributes) const {
2146 // FIXME: Should account for address space here.
2147
2148 // The default fallback uses the private pointer size as a guess for a type to
2149 // use. Make sure we switch these to 64-bit accesses.
2150
2151 if (Op.size() >= 16 &&
2152 Op.isDstAligned(Align(4))) // XXX: Should only do for global
2153 return MVT::v4i32;
2154
2155 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2156 return MVT::v2i32;
2157
2158 // Use the default.
2159 return MVT::Other;
2160}
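// --- Illustrative sketch (not part of SIISelLowering.cpp) -----------------
// The memcpy/memset type-selection policy above, expressed over plain
// integers: prefer 128-bit chunks for large dword-aligned destinations, then
// 64-bit chunks, otherwise defer to the generic lowering. A sketch only; the
// real hook returns EVTs and, per the FIXME, does not yet consider address
// spaces.
static const char *toyMemOpChunkType(unsigned SizeInBytes, unsigned DstAlign) {
  if (SizeInBytes >= 16 && DstAlign >= 4)
    return "v4i32"; // 128-bit accesses.
  if (SizeInBytes >= 8 && DstAlign >= 4)
    return "v2i32"; // 64-bit accesses.
  return "default"; // MVT::Other -> let target-independent code choose.
}
// ---------------------------------------------------------------------------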
2161
2163 const MemSDNode *MemNode = cast<MemSDNode>(N);
2164 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2165}
2166
2171
2173 unsigned DestAS) const {
2174 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2175 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2176 Subtarget->hasGloballyAddressableScratch()) {
2177 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2178 return false;
2179 }
2180
2181 // Flat -> private/local is a simple truncate.
2182 // Flat -> global is no-op
2183 return true;
2184 }
2185
2186 const GCNTargetMachine &TM =
2187 static_cast<const GCNTargetMachine &>(getTargetMachine());
2188 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2189}
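// --- Illustrative sketch (not part of SIISelLowering.cpp) -----------------
// What "flat -> private/local is a simple truncate" means in terms of raw
// address bits: the 64-bit flat address is narrowed to the 32-bit segment
// offset, while flat -> global keeps all 64 bits. The function name is
// hypothetical.
#include <cstdint>

static uint32_t toyFlatToSegmentOffset(uint64_t FlatAddr) {
  // Drop the high half, which holds the aperture/segment base.
  return static_cast<uint32_t>(FlatAddr);
}
// ---------------------------------------------------------------------------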
2190
2198
2200 Type *Ty) const {
2201 // FIXME: Could be smarter if called for vector constants.
2202 return true;
2203}
2204
2206 unsigned Index) const {
2208 return false;
2209
2210 // TODO: Add more cases that are cheap.
2211 return Index == 0;
2212}
2213
2214bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2215 // TODO: This should be more aggressive, particularly for 16-bit element
2216 // vectors. However, there are some mixed improvements and regressions.
2217 EVT EltTy = VT.getVectorElementType();
2218 return EltTy.getSizeInBits() % 32 == 0;
2219}
2220
2222 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2223 switch (Op) {
2224 case ISD::LOAD:
2225 case ISD::STORE:
2226 return true;
2227 default:
2228 return false;
2229 }
2230 }
2231
2232 // SimplifySetCC uses this function to determine whether or not it should
2233 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2234 if (VT == MVT::i1 && Op == ISD::SETCC)
2235 return false;
2236
2238}
2239
2240SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2241 const SDLoc &SL,
2242 SDValue Chain,
2243 uint64_t Offset) const {
2244 const DataLayout &DL = DAG.getDataLayout();
2248
2249 auto [InputPtrReg, RC, ArgTy] =
2250 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2251
2252 // We may not have the kernarg segment argument if we have no kernel
2253 // arguments.
2254 if (!InputPtrReg)
2255 return DAG.getConstant(Offset, SL, PtrVT);
2256
2258 SDValue BasePtr = DAG.getCopyFromReg(
2259 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2260
2261 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2262}
2263
2264SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2265 const SDLoc &SL) const {
2268 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2269}
2270
2271SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2272 const SDLoc &SL) const {
2273
2275 std::optional<uint32_t> KnownSize =
2277 if (KnownSize.has_value())
2278 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2279 return SDValue();
2280}
2281
2282SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2283 const SDLoc &SL, SDValue Val,
2284 bool Signed,
2285 const ISD::InputArg *Arg) const {
2286 // First, if it is a widened vector, narrow it.
2287 if (VT.isVector() &&
2289 EVT NarrowedVT =
2292 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2293 DAG.getConstant(0, SL, MVT::i32));
2294 }
2295
2296 // Then convert the vector elements or scalar value.
2297 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2298 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2299 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2300 }
2301
2302 if (MemVT.isFloatingPoint())
2303 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2304 else if (Signed)
2305 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2306 else
2307 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2308
2309 return Val;
2310}
2311
2312SDValue SITargetLowering::lowerKernargMemParameter(
2313 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2314 uint64_t Offset, Align Alignment, bool Signed,
2315 const ISD::InputArg *Arg) const {
2316 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2317
2318 // Try to avoid using an extload by loading earlier than the argument address,
2319 // and extracting the relevant bits. The load should hopefully be merged with
2320 // the previous argument.
2321 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2322 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2323 int64_t AlignDownOffset = alignDown(Offset, 4);
2324 int64_t OffsetDiff = Offset - AlignDownOffset;
2325
2326 EVT IntVT = MemVT.changeTypeToInteger();
2327
2328 // TODO: If we passed in the base kernel offset we could have a better
2329 // alignment than 4, but we don't really need it.
2330 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2331 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2334
2335 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2336 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2337
2338 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2339 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2340 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2341
2342 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2343 }
2344
2345 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2346 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2349
2350 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2351 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2352}
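// --- Illustrative sketch (not part of SIISelLowering.cpp) -----------------
// The sub-dword kernarg trick above, replayed on an ordinary little-endian
// byte buffer: load the enclosing aligned dword, shift the byte difference
// away, then truncate to the argument width. Names are hypothetical, and the
// argument is assumed not to straddle a dword boundary (the kernarg layout
// guarantees this for the cases handled above).
#include <cstdint>
#include <cstring>

static uint16_t toyLoadU16Kernarg(const uint8_t *KernargBase, uint64_t Offset) {
  uint64_t AlignDownOffset = Offset & ~uint64_t(3); // alignDown(Offset, 4)
  uint64_t OffsetDiff = Offset - AlignDownOffset;   // 0..3 bytes
  uint32_t Dword;
  std::memcpy(&Dword, KernargBase + AlignDownOffset, sizeof(Dword));
  return static_cast<uint16_t>(Dword >> (OffsetDiff * 8)); // SRL + TRUNCATE
}
// ---------------------------------------------------------------------------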
2353
2354SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2355 CCValAssign &VA, const SDLoc &SL,
2356 SDValue Chain,
2357 const ISD::InputArg &Arg) const {
2358 MachineFunction &MF = DAG.getMachineFunction();
2359 MachineFrameInfo &MFI = MF.getFrameInfo();
2360
2361 if (Arg.Flags.isByVal()) {
2362 unsigned Size = Arg.Flags.getByValSize();
2363 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2364 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2365 }
2366
2367 unsigned ArgOffset = VA.getLocMemOffset();
2368 unsigned ArgSize = VA.getValVT().getStoreSize();
2369
2370 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2371
2372 // Create load nodes to retrieve arguments from the stack.
2373 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2374 SDValue ArgValue;
2375
2376 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2378 MVT MemVT = VA.getValVT();
2379
2380 switch (VA.getLocInfo()) {
2381 default:
2382 break;
2383 case CCValAssign::BCvt:
2384 MemVT = VA.getLocVT();
2385 break;
2386 case CCValAssign::SExt:
2387 ExtType = ISD::SEXTLOAD;
2388 break;
2389 case CCValAssign::ZExt:
2390 ExtType = ISD::ZEXTLOAD;
2391 break;
2392 case CCValAssign::AExt:
2393 ExtType = ISD::EXTLOAD;
2394 break;
2395 }
2396
2397 ArgValue = DAG.getExtLoad(
2398 ExtType, SL, VA.getLocVT(), Chain, FIN,
2400 return ArgValue;
2401}
2402
2403SDValue SITargetLowering::getPreloadedValue(
2404 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2406 const ArgDescriptor *Reg = nullptr;
2407 const TargetRegisterClass *RC;
2408 LLT Ty;
2409
2411 const ArgDescriptor WorkGroupIDX =
2412 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2413 // If GridZ is not programmed in an entry function then the hardware will set
2414 // it to all zeros, so there is no need to mask the GridY value in the low
2415 // order bits.
2416 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2417 AMDGPU::TTMP7,
2418 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2419 const ArgDescriptor WorkGroupIDZ =
2420 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2421 if (Subtarget->hasArchitectedSGPRs() &&
2424 switch (PVID) {
2426 Reg = &WorkGroupIDX;
2427 RC = &AMDGPU::SReg_32RegClass;
2428 Ty = LLT::scalar(32);
2429 break;
2431 Reg = &WorkGroupIDY;
2432 RC = &AMDGPU::SReg_32RegClass;
2433 Ty = LLT::scalar(32);
2434 break;
2436 Reg = &WorkGroupIDZ;
2437 RC = &AMDGPU::SReg_32RegClass;
2438 Ty = LLT::scalar(32);
2439 break;
2440 default:
2441 break;
2442 }
2443 }
2444
2445 if (!Reg)
2446 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2447 if (!Reg) {
2449 // It's possible for a kernarg intrinsic call to appear in a kernel with
2450 // no allocated segment, in which case we do not add the user sgpr
2451 // argument, so just return null.
2452 return DAG.getConstant(0, SDLoc(), VT);
2453 }
2454
2455 // It's undefined behavior if a function marked with the amdgpu-no-*
2456 // attributes uses the corresponding intrinsic.
2457 return DAG.getPOISON(VT);
2458 }
2459
2460 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2461}
2462
2464 CallingConv::ID CallConv,
2465 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2466 FunctionType *FType,
2468 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2469 const ISD::InputArg *Arg = &Ins[I];
2470
2471 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2472 "vector type argument should have been split");
2473
2474 // First check if it's a PS input addr.
2475 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2476 PSInputNum <= 15) {
2477 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2478
2479 // Inconveniently only the first part of the split is marked as isSplit,
2480 // so skip to the end. We only want to increment PSInputNum once for the
2481 // entire split argument.
2482 if (Arg->Flags.isSplit()) {
2483 while (!Arg->Flags.isSplitEnd()) {
2484 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2485 "unexpected vector split in ps argument type");
2486 if (!SkipArg)
2487 Splits.push_back(*Arg);
2488 Arg = &Ins[++I];
2489 }
2490 }
2491
2492 if (SkipArg) {
2493 // We can safely skip PS inputs.
2494 Skipped.set(Arg->getOrigArgIndex());
2495 ++PSInputNum;
2496 continue;
2497 }
2498
2499 Info->markPSInputAllocated(PSInputNum);
2500 if (Arg->Used)
2501 Info->markPSInputEnabled(PSInputNum);
2502
2503 ++PSInputNum;
2504 }
2505
2506 Splits.push_back(*Arg);
2507 }
2508}
2509
2510// Allocate special inputs passed in VGPRs.
2512 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2513 SIMachineFunctionInfo &Info) const {
2514 const LLT S32 = LLT::scalar(32);
2516
2517 if (Info.hasWorkItemIDX()) {
2518 Register Reg = AMDGPU::VGPR0;
2519 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2520
2521 CCInfo.AllocateReg(Reg);
2522 unsigned Mask =
2523 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2524 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2525 }
2526
2527 if (Info.hasWorkItemIDY()) {
2528 assert(Info.hasWorkItemIDX());
2529 if (Subtarget->hasPackedTID()) {
2530 Info.setWorkItemIDY(
2531 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2532 } else {
2533 unsigned Reg = AMDGPU::VGPR1;
2534 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2535
2536 CCInfo.AllocateReg(Reg);
2537 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2538 }
2539 }
2540
2541 if (Info.hasWorkItemIDZ()) {
2542 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2543 if (Subtarget->hasPackedTID()) {
2544 Info.setWorkItemIDZ(
2545 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2546 } else {
2547 unsigned Reg = AMDGPU::VGPR2;
2548 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2549
2550 CCInfo.AllocateReg(Reg);
2551 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2552 }
2553 }
2554}
2555
2556 // Try to allocate a VGPR at the end of the argument list, or if no argument
2557 // VGPRs are left, allocate a stack slot instead.
2558 // If \p Mask is given, it indicates the bitfield position in the register.
2559 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2560static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2561 ArgDescriptor Arg = ArgDescriptor()) {
2562 if (Arg.isSet())
2563 return ArgDescriptor::createArg(Arg, Mask);
2564
2565 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2566 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2567 if (RegIdx == ArgVGPRs.size()) {
2568 // Spill to stack required.
2569 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2570
2571 return ArgDescriptor::createStack(Offset, Mask);
2572 }
2573
2574 unsigned Reg = ArgVGPRs[RegIdx];
2575 Reg = CCInfo.AllocateReg(Reg);
2576 assert(Reg != AMDGPU::NoRegister);
2577
2578 MachineFunction &MF = CCInfo.getMachineFunction();
2579 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2580 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2581 return ArgDescriptor::createRegister(Reg, Mask);
2582}
2583
2585 const TargetRegisterClass *RC,
2586 unsigned NumArgRegs) {
2587 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2588 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2589 if (RegIdx == ArgSGPRs.size())
2590 report_fatal_error("ran out of SGPRs for arguments");
2591
2592 unsigned Reg = ArgSGPRs[RegIdx];
2593 Reg = CCInfo.AllocateReg(Reg);
2594 assert(Reg != AMDGPU::NoRegister);
2595
2596 MachineFunction &MF = CCInfo.getMachineFunction();
2597 MF.addLiveIn(Reg, RC);
2599}
2600
2601// If this has a fixed position, we still should allocate the register in the
2602// CCInfo state. Technically we could get away with this for values passed
2603// outside of the normal argument range.
2605 const TargetRegisterClass *RC,
2606 MCRegister Reg) {
2607 Reg = CCInfo.AllocateReg(Reg);
2608 assert(Reg != AMDGPU::NoRegister);
2609 MachineFunction &MF = CCInfo.getMachineFunction();
2610 MF.addLiveIn(Reg, RC);
2611}
2612
2613static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2614 if (Arg) {
2615 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2616 Arg.getRegister());
2617 } else
2618 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2619}
2620
2621static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2622 if (Arg) {
2623 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2624 Arg.getRegister());
2625 } else
2626 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2627}
2628
2629/// Allocate implicit function VGPR arguments at the end of allocated user
2630/// arguments.
2632 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2633 SIMachineFunctionInfo &Info) const {
2634 const unsigned Mask = 0x3ff;
2635 ArgDescriptor Arg;
2636
2637 if (Info.hasWorkItemIDX()) {
2638 Arg = allocateVGPR32Input(CCInfo, Mask);
2639 Info.setWorkItemIDX(Arg);
2640 }
2641
2642 if (Info.hasWorkItemIDY()) {
2643 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2644 Info.setWorkItemIDY(Arg);
2645 }
2646
2647 if (Info.hasWorkItemIDZ())
2648 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2649}
2650
2651/// Allocate implicit function VGPR arguments in fixed registers.
2653 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2654 SIMachineFunctionInfo &Info) const {
2655 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2656 if (!Reg)
2657 report_fatal_error("failed to allocate VGPR for implicit arguments");
2658
2659 const unsigned Mask = 0x3ff;
2660 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2661 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2662 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2663}
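// --- Illustrative sketch (not part of SIISelLowering.cpp) -----------------
// The packed workitem-ID layout implied by the masks above: X in bits [9:0],
// Y in bits [19:10], and Z in bits [29:20] of a single 32-bit VGPR. The
// struct and function names are hypothetical.
#include <cstdint>

struct ToyWorkItemId {
  uint32_t X, Y, Z;
};

static ToyWorkItemId toyUnpackWorkItemId(uint32_t PackedVGPR) {
  return {PackedVGPR & 0x3ffu, (PackedVGPR >> 10) & 0x3ffu,
          (PackedVGPR >> 20) & 0x3ffu};
}
// ---------------------------------------------------------------------------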
2664
2666 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2667 SIMachineFunctionInfo &Info) const {
2668 auto &ArgInfo = Info.getArgInfo();
2669 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2670
2671 // TODO: Unify handling with private memory pointers.
2672 if (UserSGPRInfo.hasDispatchPtr())
2673 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2674
2675 if (UserSGPRInfo.hasQueuePtr())
2676 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2677
2678 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2679 // constant offset from the kernarg segment.
2680 if (Info.hasImplicitArgPtr())
2681 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2682
2683 if (UserSGPRInfo.hasDispatchID())
2684 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2685
2686 // flat_scratch_init is not applicable for non-kernel functions.
2687
2688 if (Info.hasWorkGroupIDX())
2689 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2690
2691 if (Info.hasWorkGroupIDY())
2692 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2693
2694 if (Info.hasWorkGroupIDZ())
2695 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2696
2697 if (Info.hasLDSKernelId())
2698 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2699}
2700
2701// Allocate special inputs passed in user SGPRs.
2703 MachineFunction &MF,
2704 const SIRegisterInfo &TRI,
2705 SIMachineFunctionInfo &Info) const {
2706 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2707 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2708 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2709 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2710 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2711 }
2712
2713 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2714 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2715 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2716 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2717 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2718 }
2719
2720 if (UserSGPRInfo.hasDispatchPtr()) {
2721 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2722 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2723 CCInfo.AllocateReg(DispatchPtrReg);
2724 }
2725
2726 if (UserSGPRInfo.hasQueuePtr()) {
2727 Register QueuePtrReg = Info.addQueuePtr(TRI);
2728 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2729 CCInfo.AllocateReg(QueuePtrReg);
2730 }
2731
2732 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2734 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2735 CCInfo.AllocateReg(InputPtrReg);
2736
2737 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2738 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2739 }
2740
2741 if (UserSGPRInfo.hasDispatchID()) {
2742 Register DispatchIDReg = Info.addDispatchID(TRI);
2743 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2744 CCInfo.AllocateReg(DispatchIDReg);
2745 }
2746
2747 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2748 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2749 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2750 CCInfo.AllocateReg(FlatScratchInitReg);
2751 }
2752
2753 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2754 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2755 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2756 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2757 }
2758
2759 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2760 // these from the dispatch pointer.
2761}
2762
2763 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2764 // sequential, starting from the first argument.
2766 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2768 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2769 Function &F = MF.getFunction();
2770 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2771 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2772 bool InPreloadSequence = true;
2773 unsigned InIdx = 0;
2774 bool AlignedForImplictArgs = false;
2775 unsigned ImplicitArgOffset = 0;
2776 for (auto &Arg : F.args()) {
2777 if (!InPreloadSequence || !Arg.hasInRegAttr())
2778 break;
2779
2780 unsigned ArgIdx = Arg.getArgNo();
2781 // Don't preload non-original args or parts not in the current preload
2782 // sequence.
2783 if (InIdx < Ins.size() &&
2784 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2785 break;
2786
2787 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2788 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2789 InIdx++) {
2790 assert(ArgLocs[ArgIdx].isMemLoc());
2791 auto &ArgLoc = ArgLocs[InIdx];
2792 const Align KernelArgBaseAlign = Align(16);
2793 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2794 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2795 unsigned NumAllocSGPRs =
2796 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2797
2798 // Fix alignment for hidden arguments.
2799 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2800 if (!AlignedForImplictArgs) {
2801 ImplicitArgOffset =
2802 alignTo(LastExplicitArgOffset,
2803 Subtarget->getAlignmentForImplicitArgPtr()) -
2804 LastExplicitArgOffset;
2805 AlignedForImplictArgs = true;
2806 }
2807 ArgOffset += ImplicitArgOffset;
2808 }
2809
2810 // Arg is preloaded into the previous SGPR.
2811 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2812 assert(InIdx >= 1 && "No previous SGPR");
2813 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2814 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2815 continue;
2816 }
2817
2818 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2819 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
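// Worked example (added note, not in the original source): if the previous
// argument ended at byte offset 8 (LastExplicitArgOffset = 8) and this one
// starts at ArgOffset = 16, then Padding = 8 and PaddingSGPRs = 2 dead SGPRs
// sit between the two preloaded arguments.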
2820 // Check for free user SGPRs for preloading.
2821 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2822 InPreloadSequence = false;
2823 break;
2824 }
2825
2826 // Preload this argument.
2827 const TargetRegisterClass *RC =
2828 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2829 SmallVectorImpl<MCRegister> *PreloadRegs =
2830 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2831
2832 if (PreloadRegs->size() > 1)
2833 RC = &AMDGPU::SGPR_32RegClass;
2834 for (auto &Reg : *PreloadRegs) {
2835 assert(Reg);
2836 MF.addLiveIn(Reg, RC);
2837 CCInfo.AllocateReg(Reg);
2838 }
2839
2840 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2841 }
2842 }
2843}
2844
2846 const SIRegisterInfo &TRI,
2847 SIMachineFunctionInfo &Info) const {
2848 // Always allocate this last since it is a synthetic preload.
2849 if (Info.hasLDSKernelId()) {
2850 Register Reg = Info.addLDSKernelId();
2851 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2852 CCInfo.AllocateReg(Reg);
2853 }
2854}
2855
2856// Allocate special input registers that are initialized per-wave.
2859 CallingConv::ID CallConv,
2860 bool IsShader) const {
2861 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2862 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2863 // Note: user SGPRs are handled by the front-end for graphics shaders.
2864 // Pad up the used user SGPRs with dead inputs.
2865
2866 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2867 // before enabling architected SGPRs for workgroup IDs.
2868 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2869
2870 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2871 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2872 // rely on it to reach 16 since if we end up having no stack usage, it will
2873 // not really be added.
2874 unsigned NumRequiredSystemSGPRs =
2875 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
2876 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
2877 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2878 Register Reg = Info.addReservedUserSGPR();
2879 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2880 CCInfo.AllocateReg(Reg);
2881 }
2882 }
2883
2884 if (!HasArchitectedSGPRs) {
2885 if (Info.hasWorkGroupIDX()) {
2886 Register Reg = Info.addWorkGroupIDX();
2887 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2888 CCInfo.AllocateReg(Reg);
2889 }
2890
2891 if (Info.hasWorkGroupIDY()) {
2892 Register Reg = Info.addWorkGroupIDY();
2893 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2894 CCInfo.AllocateReg(Reg);
2895 }
2896
2897 if (Info.hasWorkGroupIDZ()) {
2898 Register Reg = Info.addWorkGroupIDZ();
2899 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2900 CCInfo.AllocateReg(Reg);
2901 }
2902 }
2903
2904 if (Info.hasWorkGroupInfo()) {
2905 Register Reg = Info.addWorkGroupInfo();
2906 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2907 CCInfo.AllocateReg(Reg);
2908 }
2909
2910 if (Info.hasPrivateSegmentWaveByteOffset()) {
2911 // Scratch wave offset passed in system SGPR.
2912 unsigned PrivateSegmentWaveByteOffsetReg;
2913
2914 if (IsShader) {
2915 PrivateSegmentWaveByteOffsetReg =
2916 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2917
2918 // This is true if the scratch wave byte offset doesn't have a fixed
2919 // location.
2920 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2921 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2922 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2923 }
2924 } else
2925 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2926
2927 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2928 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2929 }
2930
2931 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2932 Info.getNumPreloadedSGPRs() >= 16);
2933}
2934
2936 MachineFunction &MF,
2937 const SIRegisterInfo &TRI,
2939 // Now that we've figured out where the scratch register inputs are, see if
2940 // we should reserve the arguments and use them directly.
2941 MachineFrameInfo &MFI = MF.getFrameInfo();
2942 bool HasStackObjects = MFI.hasStackObjects();
2943 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2944
2945 // Record that we know we have non-spill stack objects so we don't need to
2946 // check all stack objects later.
2947 if (HasStackObjects)
2948 Info.setHasNonSpillStackObjects(true);
2949
2950 // Everything live out of a block is spilled with fast regalloc, so it's
2951 // almost certain that spilling will be required.
2952 if (TM.getOptLevel() == CodeGenOptLevel::None)
2953 HasStackObjects = true;
2954
2955 // For now assume stack access is needed in any callee functions, so we need
2956 // the scratch registers to pass in.
2957 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2958
2959 if (!ST.enableFlatScratch()) {
2960 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2961 // If we have stack objects, we unquestionably need the private buffer
2962 // resource. For the Code Object V2 ABI, this will be the first 4 user
2963 // SGPR inputs. We can reserve those and use them directly.
2964
2965 Register PrivateSegmentBufferReg =
2967 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2968 } else {
2969 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2970 // We tentatively reserve the last registers (skipping the last registers
2971 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
2972 // we'll replace these with the ones immediately after those which were
2973 // really allocated. In the prologue copies will be inserted from the
2974 // argument to these reserved registers.
2975
2976 // Without HSA, relocations are used for the scratch pointer and the
2977 // buffer resource setup is always inserted in the prologue. Scratch wave
2978 // offset is still in an input SGPR.
2979 Info.setScratchRSrcReg(ReservedBufferReg);
2980 }
2981 }
2982
2984
2985 // For entry functions we have to set up the stack pointer if we use it,
2986 // whereas non-entry functions get this "for free". This means there is no
2987 // intrinsic advantage to using S32 over S34 in cases where we do not have
2988 // calls but do need a frame pointer (i.e. if we are requested to have one
2989 // because frame pointer elimination is disabled). To keep things simple we
2990 // only ever use S32 as the call ABI stack pointer, and so using it does not
2991 // imply we need a separate frame pointer.
2992 //
2993 // Try to use s32 as the SP, but move it if it would interfere with input
2994 // arguments. This won't work with calls though.
2995 //
2996 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2997 // registers.
2998 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2999 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3000 } else {
3002
3003 if (MFI.hasCalls())
3004 report_fatal_error("call in graphics shader with too many input SGPRs");
3005
3006 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3007 if (!MRI.isLiveIn(Reg)) {
3008 Info.setStackPtrOffsetReg(Reg);
3009 break;
3010 }
3011 }
3012
3013 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3014 report_fatal_error("failed to find register for SP");
3015 }
3016
3017 // hasFP should be accurate for entry functions even before the frame is
3018 // finalized, because it does not rely on the known stack size, only
3019 // properties like whether variable sized objects are present.
3020 if (ST.getFrameLowering()->hasFP(MF)) {
3021 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3022 }
3023}
3024
3027 return !Info->isEntryFunction();
3028}
3029
3031
3033 MachineBasicBlock *Entry,
3034 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
3036
3037 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3038 if (!IStart)
3039 return;
3040
3041 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3042 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
3043 MachineBasicBlock::iterator MBBI = Entry->begin();
3044 for (const MCPhysReg *I = IStart; *I; ++I) {
3045 const TargetRegisterClass *RC = nullptr;
3046 if (AMDGPU::SReg_64RegClass.contains(*I))
3047 RC = &AMDGPU::SGPR_64RegClass;
3048 else if (AMDGPU::SReg_32RegClass.contains(*I))
3049 RC = &AMDGPU::SGPR_32RegClass;
3050 else
3051 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3052
3053 Register NewVR = MRI->createVirtualRegister(RC);
3054 // Create copy from CSR to a virtual register.
3055 Entry->addLiveIn(*I);
3056 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
3057 .addReg(*I);
3058
3059 // Insert the copy-back instructions right before the terminator.
3060 for (auto *Exit : Exits)
3061 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
3062 TII->get(TargetOpcode::COPY), *I)
3063 .addReg(NewVR);
3064 }
3065}
3066
3068 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3069 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3070 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3072
3074 const Function &Fn = MF.getFunction();
3077 bool IsError = false;
3078
3079 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
3081 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3082 IsError = true;
3083 }
3084
3087 BitVector Skipped(Ins.size());
3088 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3089 *DAG.getContext());
3090
3091 bool IsGraphics = AMDGPU::isGraphics(CallConv);
3092 bool IsKernel = AMDGPU::isKernel(CallConv);
3093 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
3094
3095 if (IsGraphics) {
3096 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3097 assert(!UserSGPRInfo.hasDispatchPtr() &&
3098 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3099 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3100 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3101 (void)UserSGPRInfo;
3102 if (!Subtarget->enableFlatScratch())
3103 assert(!UserSGPRInfo.hasFlatScratchInit());
3104 if ((CallConv != CallingConv::AMDGPU_CS &&
3105 CallConv != CallingConv::AMDGPU_Gfx &&
3106 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3107 !Subtarget->hasArchitectedSGPRs())
3108 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3109 !Info->hasWorkGroupIDZ());
3110 }
3111
3112 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3113
3114 if (CallConv == CallingConv::AMDGPU_PS) {
3115 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3116
3117 // At least one interpolation mode must be enabled or else the GPU will
3118 // hang.
3119 //
3120 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3121 // set PSInputAddr, the user wants to enable some bits after the compilation
3122 // based on run-time states. Since we can't know what the final PSInputEna
3123 // will look like, we shouldn't do anything here; the user should take
3124 // responsibility for the correct programming.
3125 //
3126 // Otherwise, the following restrictions apply:
3127 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3128 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3129 // enabled too.
3130 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3131 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3132 CCInfo.AllocateReg(AMDGPU::VGPR0);
3133 CCInfo.AllocateReg(AMDGPU::VGPR1);
3134 Info->markPSInputAllocated(0);
3135 Info->markPSInputEnabled(0);
3136 }
3137 if (Subtarget->isAmdPalOS()) {
3138 // For isAmdPalOS, the user does not enable some bits after compilation
3139 // based on run-time states; the register values being generated here are
3140 // the final ones set in hardware. Therefore we need to apply the
3141 // workaround to PSInputAddr and PSInputEnable together. (The case where
3142 // a bit is set in PSInputAddr but not PSInputEnable is where the
3143 // frontend set up an input arg for a particular interpolation mode, but
3144 // nothing uses that input arg. Really we should have an earlier pass
3145 // that removes such an arg.)
3146 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3147 if ((PsInputBits & 0x7F) == 0 ||
3148 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3149 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
3150 }
3151 } else if (IsKernel) {
3152 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3153 } else {
3154 Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3155 Ins.end());
3156 }
3157
3158 if (IsKernel)
3159 analyzeFormalArgumentsCompute(CCInfo, Ins);
3160
3161 if (IsEntryFunc) {
3162 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
3163 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
3164 if (IsKernel && Subtarget->hasKernargPreload())
3165 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
3166
3167 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
3168 } else if (!IsGraphics) {
3169 // For the fixed ABI, pass workitem IDs in the last argument register.
3170 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3171
3172 // FIXME: Sink this into allocateSpecialInputSGPRs
3173 if (!Subtarget->enableFlatScratch())
3174 CCInfo.AllocateReg(Info->getScratchRSrcReg());
3175
3176 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3177 }
3178
3179 if (!IsKernel) {
3180 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3181 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3182
3183 // This assumes the registers are allocated by CCInfo in ascending order
3184 // with no gaps.
3185 Info->setNumWaveDispatchSGPRs(
3186 CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3187 Info->setNumWaveDispatchVGPRs(
3188 CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3189 } else if (Info->getNumKernargPreloadedSGPRs()) {
3190 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3191 }
3192
3194
3195 if (IsWholeWaveFunc) {
3197 {MVT::i1, MVT::Other}, Chain);
3198 InVals.push_back(Setup.getValue(0));
3199 Chains.push_back(Setup.getValue(1));
3200 }
3201
3202 // FIXME: This is the minimum kernel argument alignment. We should improve
3203 // this to the maximum alignment of the arguments.
3204 //
3205 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3206 // kern arg offset.
3207 const Align KernelArgBaseAlign = Align(16);
3208
3209 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3210 ++i) {
3211 const ISD::InputArg &Arg = Ins[i];
3212 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3213 InVals.push_back(DAG.getPOISON(Arg.VT));
3214 continue;
3215 }
3216
3217 CCValAssign &VA = ArgLocs[ArgIdx++];
3218 MVT VT = VA.getLocVT();
3219
3220 if (IsEntryFunc && VA.isMemLoc()) {
3221 VT = Ins[i].VT;
3222 EVT MemVT = VA.getLocVT();
3223
3224 const uint64_t Offset = VA.getLocMemOffset();
3225 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
3226
3227 if (Arg.Flags.isByRef()) {
3228 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3229
3230 const GCNTargetMachine &TM =
3231 static_cast<const GCNTargetMachine &>(getTargetMachine());
3232 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3233 Arg.Flags.getPointerAddrSpace())) {
3236 }
3237
3238 InVals.push_back(Ptr);
3239 continue;
3240 }
3241
3242 SDValue NewArg;
3243 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3244 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3245 // In this case the argument is packed into the previous preload SGPR.
3246 int64_t AlignDownOffset = alignDown(Offset, 4);
3247 int64_t OffsetDiff = Offset - AlignDownOffset;
3248 EVT IntVT = MemVT.changeTypeToInteger();
3249
3250 const SIMachineFunctionInfo *Info =
3253 Register Reg =
3254 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3255
3256 assert(Reg);
3257 Register VReg = MRI.getLiveInVirtReg(Reg);
3258 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3259
3260 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3261 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3262
3263 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3264 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3265 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3266 Ins[i].Flags.isSExt(), &Ins[i]);
3267
3268 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3269 } else {
3270 const SIMachineFunctionInfo *Info =
3273 const SmallVectorImpl<MCRegister> &PreloadRegs =
3274 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3275
3276 SDValue Copy;
3277 if (PreloadRegs.size() == 1) {
3278 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3279 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3280 NewArg = DAG.getCopyFromReg(
3281 Chain, DL, VReg,
3283 TRI->getRegSizeInBits(*RC)));
3284
3285 } else {
3286 // If the kernarg alignment does not match the alignment of the SGPR
3287 // tuple RC that can accommodate this argument, it will be built up
3288 // via copies from the individual SGPRs that the argument was
3289 // preloaded to.
3291 for (auto Reg : PreloadRegs) {
3292 Register VReg = MRI.getLiveInVirtReg(Reg);
3293 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3294 Elts.push_back(Copy);
3295 }
3296 NewArg =
3297 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3298 PreloadRegs.size()),
3299 DL, Elts);
3300 }
3301
3302 // If the argument was preloaded to multiple consecutive 32-bit
3303 // registers because of misalignment between addressable SGPR tuples
3304 // and the argument size, we can still assume, because of kernarg
3305 // segment alignment restrictions, that NewArg's size is the same as
3306 // MemVT and just do a bitcast. If MemVT is less than 32 bits we add a
3307 // truncate since we cannot preload to less than a single SGPR and the
3308 // MemVT may be smaller.
3309 EVT MemVTInt =
3311 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3312 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3313
3314 NewArg = DAG.getBitcast(MemVT, NewArg);
3315 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3316 Ins[i].Flags.isSExt(), &Ins[i]);
3317 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3318 }
3319 } else {
3320 // Hidden arguments that are in the kernel signature must be preloaded
3321 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3322 // the argument list and is not preloaded.
3323 if (Arg.isOrigArg()) {
3324 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3325 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3327 *OrigArg->getParent(),
3328 "hidden argument in kernel signature was not preloaded",
3329 DL.getDebugLoc()));
3330 }
3331 }
3332
3333 NewArg =
3334 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3335 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3336 }
3337 Chains.push_back(NewArg.getValue(1));
3338
3339 auto *ParamTy =
3340 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3341 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3342 ParamTy &&
3343 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3344 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3345 // On SI local pointers are just offsets into LDS, so they are always
3346 // less than 16-bits. On CI and newer they could potentially be
3347 // real pointers, so we can't guarantee their size.
3348 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3349 DAG.getValueType(MVT::i16));
3350 }
3351
3352 InVals.push_back(NewArg);
3353 continue;
3354 }
3355 if (!IsEntryFunc && VA.isMemLoc()) {
3356 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3357 InVals.push_back(Val);
3358 if (!Arg.Flags.isByVal())
3359 Chains.push_back(Val.getValue(1));
3360 continue;
3361 }
3362
3363 assert(VA.isRegLoc() && "Parameter must be in a register!");
3364
3365 Register Reg = VA.getLocReg();
3366 const TargetRegisterClass *RC = nullptr;
3367 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3368 RC = &AMDGPU::VGPR_32RegClass;
3369 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3370 RC = &AMDGPU::SGPR_32RegClass;
3371 else
3372 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3373 EVT ValVT = VA.getValVT();
3374
3375 Reg = MF.addLiveIn(Reg, RC);
3376 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3377
3378 if (Arg.Flags.isSRet()) {
3379 // The return object should be reasonably addressable.
3380
3381 // FIXME: This helps when the return is a real sret. If it is an
3382 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3383 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3384 unsigned NumBits =
3386 Val = DAG.getNode(
3387 ISD::AssertZext, DL, VT, Val,
3388 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3389 }
3390
3391 // If this is an 8 or 16-bit value, it is really passed promoted
3392 // to 32 bits. Insert an assert[sz]ext to capture this, then
3393 // truncate to the right size.
3394 switch (VA.getLocInfo()) {
3395 case CCValAssign::Full:
3396 break;
3397 case CCValAssign::BCvt:
3398 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3399 break;
3400 case CCValAssign::SExt:
3401 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, DAG.getValueType(ValVT));
3402 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3403 break;
3404 case CCValAssign::ZExt:
3405 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, DAG.getValueType(ValVT));
3406 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3407 break;
3408 case CCValAssign::AExt:
3409 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3410 break;
3411 default:
3412 llvm_unreachable("Unknown loc info!");
3413 }
3414
3415 InVals.push_back(Val);
3416 }
3417
3418 // Start adding system SGPRs.
3419 if (IsEntryFunc)
3420 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3421
3422 // DAG.getPass() returns nullptr when using new pass manager.
3423 // TODO: Use DAG.getMFAM() to access analysis result.
3424 if (DAG.getPass()) {
3425 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3426 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3427 }
3428
3429 unsigned StackArgSize = CCInfo.getStackSize();
3430 Info->setBytesInStackArgArea(StackArgSize);
3431
3432 return Chains.empty() ? Chain
3433 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3434}
3435
3436// TODO: If return values can't fit in registers, we should return as many as
3437// possible in registers before passing on stack.
3439 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3440 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3441 const Type *RetTy) const {
3442 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3443 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3444 // for shaders. Vector types should be explicitly handled by CC.
3445 if (AMDGPU::isEntryFunctionCC(CallConv))
3446 return true;
3447
3449 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3450 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3451 return false;
3452
3453 // We must use the stack if return would require unavailable registers.
3454 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3455 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3456 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3457 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3458 return false;
3459
3460 return true;
3461}
3462
3463SDValue
3465 bool isVarArg,
3467 const SmallVectorImpl<SDValue> &OutVals,
3468 const SDLoc &DL, SelectionDAG &DAG) const {
3472
3473 if (AMDGPU::isKernel(CallConv)) {
3474 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3475 OutVals, DL, DAG);
3476 }
3477
3478 bool IsShader = AMDGPU::isShader(CallConv);
3479
3480 Info->setIfReturnsVoid(Outs.empty());
3481 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3482
3483 // CCValAssign - represent the assignment of the return value to a location.
3485
3486 // CCState - Info about the registers and stack slots.
3487 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3488 *DAG.getContext());
3489
3490 // Analyze outgoing return values.
3491 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3492
3493 SDValue Glue;
3495 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3496
3497 SDValue ReadFirstLane =
3498 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3499 // Copy the result values into the output registers.
3500 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3501 ++I, ++RealRVLocIdx) {
3502 CCValAssign &VA = RVLocs[I];
3503 assert(VA.isRegLoc() && "Can only return in registers!");
3504 // TODO: Partially return in registers if return values don't fit.
3505 SDValue Arg = OutVals[RealRVLocIdx];
3506
3507 // Copied from other backends.
3508 switch (VA.getLocInfo()) {
3509 case CCValAssign::Full:
3510 break;
3511 case CCValAssign::BCvt:
3512 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3513 break;
3514 case CCValAssign::SExt:
3515 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3516 break;
3517 case CCValAssign::ZExt:
3518 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3519 break;
3520 case CCValAssign::AExt:
3521 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3522 break;
3523 default:
3524 llvm_unreachable("Unknown loc info!");
3525 }
3526 if (TRI->isSGPRPhysReg(VA.getLocReg()))
3528 ReadFirstLane, Arg);
3529 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3530 Glue = Chain.getValue(1);
3531 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3532 }
3533
3534 // FIXME: Does sret work properly?
3535 if (!Info->isEntryFunction()) {
3536 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3537 const MCPhysReg *I =
3538 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3539 if (I) {
3540 for (; *I; ++I) {
3541 if (AMDGPU::SReg_64RegClass.contains(*I))
3542 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3543 else if (AMDGPU::SReg_32RegClass.contains(*I))
3544 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3545 else
3546 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3547 }
3548 }
3549 }
3550
3551 // Update chain and glue.
3552 RetOps[0] = Chain;
3553 if (Glue.getNode())
3554 RetOps.push_back(Glue);
3555
3556 unsigned Opc = AMDGPUISD::ENDPGM;
3557 if (!IsWaveEnd)
3558 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3559 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3561 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3562}
3563
3565 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3566 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3567 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3568 SDValue ThisVal) const {
3569 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3570
3571 // Assign locations to each value returned by this call.
3573 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3574 *DAG.getContext());
3575 CCInfo.AnalyzeCallResult(Ins, RetCC);
3576
3577 // Copy all of the result registers out of their specified physreg.
3578 for (CCValAssign VA : RVLocs) {
3579 SDValue Val;
3580
3581 if (VA.isRegLoc()) {
3582 Val =
3583 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3584 Chain = Val.getValue(1);
3585 InGlue = Val.getValue(2);
3586 } else if (VA.isMemLoc()) {
3587 report_fatal_error("TODO: return values in memory");
3588 } else
3589 llvm_unreachable("unknown argument location type");
3590
3591 switch (VA.getLocInfo()) {
3592 case CCValAssign::Full:
3593 break;
3594 case CCValAssign::BCvt:
3595 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3596 break;
3597 case CCValAssign::ZExt:
3598 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3599 DAG.getValueType(VA.getValVT()));
3600 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3601 break;
3602 case CCValAssign::SExt:
3603 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3604 DAG.getValueType(VA.getValVT()));
3605 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3606 break;
3607 case CCValAssign::AExt:
3608 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3609 break;
3610 default:
3611 llvm_unreachable("Unknown loc info!");
3612 }
3613
3614 InVals.push_back(Val);
3615 }
3616
3617 return Chain;
3618}
3619
3620 // Add code to pass the special inputs required by the features in use,
3621 // separate from the explicit user arguments present in the IR.
3623 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3624 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3625 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3626 // If we don't have a call site, this was a call inserted by
3627 // legalization. These can never use special inputs.
3628 if (!CLI.CB)
3629 return;
3630
3631 SelectionDAG &DAG = CLI.DAG;
3632 const SDLoc &DL = CLI.DL;
3633 const Function &F = DAG.getMachineFunction().getFunction();
3634
3635 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3636 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3637
3638 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3640 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3641 // DAG.getPass() returns nullptr when using new pass manager.
3642 // TODO: Use DAG.getMFAM() to access analysis result.
3643 if (DAG.getPass()) {
3644 auto &ArgUsageInfo =
3646 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3647 }
3648 }
3649
3650 // TODO: Unify with private memory register handling. This is complicated by
3651 // the fact that at least in kernels, the input argument is not necessarily
3652 // in the same location as the input.
3653 // clang-format off
3654 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3656 {AMDGPUFunctionArgInfo::DISPATCH_PTR,     "amdgpu-no-dispatch-ptr"},
3657 {AMDGPUFunctionArgInfo::QUEUE_PTR,        "amdgpu-no-queue-ptr"},
3658 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3659 {AMDGPUFunctionArgInfo::DISPATCH_ID,      "amdgpu-no-dispatch-id"},
3660 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X,   "amdgpu-no-workgroup-id-x"},
3661 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,   "amdgpu-no-workgroup-id-y"},
3662 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,   "amdgpu-no-workgroup-id-z"},
3663 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,    "amdgpu-no-lds-kernel-id"},
3664 };
3665 // clang-format on
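// For example: a call site marked "amdgpu-no-workgroup-id-y" tells us the
// callee never reads the Y workgroup ID, so the loop below neither copies
// WORKGROUP_ID_Y into an outgoing register nor allocates it in CCInfo.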
3666
3667 for (auto [InputID, Attr] : ImplicitAttrs) {
3668 // If the callee does not use the attribute value, skip copying the value.
3669 if (CLI.CB->hasFnAttr(Attr))
3670 continue;
3671
3672 const auto [OutgoingArg, ArgRC, ArgTy] =
3673 CalleeArgInfo->getPreloadedValue(InputID);
3674 if (!OutgoingArg)
3675 continue;
3676
3677 const auto [IncomingArg, IncomingArgRC, Ty] =
3678 CallerArgInfo.getPreloadedValue(InputID);
3679 assert(IncomingArgRC == ArgRC);
3680
3681 // All special arguments are ints for now.
3682 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3683 SDValue InputReg;
3684
3685 if (IncomingArg) {
3686 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3687 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3688 // The implicit arg ptr is special because it doesn't have a corresponding
3689 // input for kernels, and is computed from the kernarg segment pointer.
3690 InputReg = getImplicitArgPtr(DAG, DL);
3691 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3692 std::optional<uint32_t> Id =
3694 if (Id.has_value()) {
3695 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3696 } else {
3697 InputReg = DAG.getPOISON(ArgVT);
3698 }
3699 } else {
3700 // We may have proven the input wasn't needed, although the ABI still
3701 // requires it. We just need to allocate the register appropriately.
3702 InputReg = DAG.getPOISON(ArgVT);
3703 }
3704
3705 if (OutgoingArg->isRegister()) {
3706 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3707 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3708 report_fatal_error("failed to allocate implicit input argument");
3709 } else {
3710 unsigned SpecialArgOffset =
3711 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3712 SDValue ArgStore =
3713 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3714 MemOpChains.push_back(ArgStore);
3715 }
3716 }
3717
3718 // Pack workitem IDs into a single register, or pass them as-is if already
3719 // packed.
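// The packed layout produced below is: bits [9:0] = workitem ID X,
// bits [19:10] = ID Y (shifted left by 10), and bits [29:20] = ID Z (shifted
// left by 20), the same layout assumed for already-packed incoming IDs.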
3720
3721 auto [OutgoingArg, ArgRC, Ty] =
3723 if (!OutgoingArg)
3724 std::tie(OutgoingArg, ArgRC, Ty) =
3726 if (!OutgoingArg)
3727 std::tie(OutgoingArg, ArgRC, Ty) =
3729 if (!OutgoingArg)
3730 return;
3731
3732 const ArgDescriptor *IncomingArgX = std::get<0>(
3734 const ArgDescriptor *IncomingArgY = std::get<0>(
3736 const ArgDescriptor *IncomingArgZ = std::get<0>(
3738
3739 SDValue InputReg;
3740 SDLoc SL;
3741
3742 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3743 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3744 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3745
3746 // If the incoming IDs are not packed, we need to pack them.
3747 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3748 NeedWorkItemIDX) {
3749 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3750 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3751 } else {
3752 InputReg = DAG.getConstant(0, DL, MVT::i32);
3753 }
3754 }
3755
3756 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3757 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3758 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3759 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3760 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3761 InputReg = InputReg.getNode()
3762 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3763 : Y;
3764 }
3765
3766 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3767 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3768 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3769 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3770 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3771 InputReg = InputReg.getNode()
3772 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3773 : Z;
3774 }
3775
3776 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3777 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3778 // We're in a situation where the outgoing function requires the workitem
3779 // ID, but the calling function does not have it (e.g. a graphics function
3780 // calling a C calling convention function). This is illegal, but we need
3781 // to produce something.
3782 InputReg = DAG.getPOISON(MVT::i32);
3783 } else {
3784 // Workitem IDs are already packed; any present incoming argument
3785 // will carry all required fields.
3786 ArgDescriptor IncomingArg =
3787 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3788 : IncomingArgY ? *IncomingArgY
3789 : *IncomingArgZ,
3790 ~0u);
3791 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3792 }
3793 }
3794
3795 if (OutgoingArg->isRegister()) {
3796 if (InputReg)
3797 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3798
3799 CCInfo.AllocateReg(OutgoingArg->getRegister());
3800 } else {
3801 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3802 if (InputReg) {
3803 SDValue ArgStore =
3804 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3805 MemOpChains.push_back(ArgStore);
3806 }
3807 }
3808}
3809
3811 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3813 const SmallVectorImpl<SDValue> &OutVals,
3814 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3815 if (AMDGPU::isChainCC(CalleeCC))
3816 return true;
3817
3818 if (!AMDGPU::mayTailCallThisCC(CalleeCC))
3819 return false;
3820
3821 // For a divergent call target, we need to do a waterfall loop over the
3822 // possible callees, which precludes us from using a simple jump.
3823 if (Callee->isDivergent())
3824 return false;
3825
3827 const Function &CallerF = MF.getFunction();
3828 CallingConv::ID CallerCC = CallerF.getCallingConv();
3830 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3831
3832 // Kernels aren't callable and don't have a live-in return address, so it
3833 // doesn't make sense to do a tail call with entry functions.
3834 if (!CallerPreserved)
3835 return false;
3836
3837 bool CCMatch = CallerCC == CalleeCC;
3838
3840 if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
3841 return true;
3842 return false;
3843 }
3844
3845 // TODO: Can we handle var args?
3846 if (IsVarArg)
3847 return false;
3848
3849 for (const Argument &Arg : CallerF.args()) {
3850 if (Arg.hasByValAttr())
3851 return false;
3852 }
3853
3854 LLVMContext &Ctx = *DAG.getContext();
3855
3856 // Check that the call results are passed in the same way.
3857 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3858 CCAssignFnForCall(CalleeCC, IsVarArg),
3859 CCAssignFnForCall(CallerCC, IsVarArg)))
3860 return false;
3861
3862 // The callee has to preserve all registers the caller needs to preserve.
3863 if (!CCMatch) {
3864 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3865 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3866 return false;
3867 }
3868
3869 // Nothing more to check if the callee is taking no arguments.
3870 if (Outs.empty())
3871 return true;
3872
3874 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3875
3876 // FIXME: We are not allocating special input registers, so we will be
3877 // deciding based on incorrect register assignments.
3878 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3879
3880 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3881 // If the stack arguments for this call do not fit into our own save area then
3882 // the call cannot be made tail.
3883 // TODO: Is this really necessary?
3884 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3885 return false;
3886
3887 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
3888 // FIXME: What about inreg arguments that end up passed in memory?
3889 if (!CCVA.isRegLoc())
3890 continue;
3891
3892 // If we are passing an argument in an SGPR, and the value is divergent,
3893 // this call requires a waterfall loop.
3894 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
3895 LLVM_DEBUG(
3896 dbgs() << "Cannot tail call due to divergent outgoing argument in "
3897 << printReg(CCVA.getLocReg(), TRI) << '\n');
3898 return false;
3899 }
3900 }
3901
3902 const MachineRegisterInfo &MRI = MF.getRegInfo();
3903 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3904}
3905
3907 if (!CI->isTailCall())
3908 return false;
3909
3910 const Function *ParentFn = CI->getParent()->getParent();
3912 return false;
3913 return true;
3914}
3915
3916namespace {
3917// Chain calls have special arguments that we need to handle. These are
3918// tagging along at the end of the arguments list(s), after the SGPR and VGPR
3919// arguments (index 0 and 1 respectively).
3920enum ChainCallArgIdx {
3921 Exec = 2,
3922 Flags,
3923 NumVGPRs,
3924 FallbackExec,
3925 FallbackCallee
3926};
3927} // anonymous namespace
3928
3929// The wave scratch offset register is used as the global base pointer.
3931 SmallVectorImpl<SDValue> &InVals) const {
3932 CallingConv::ID CallConv = CLI.CallConv;
3933 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3934
3935 SelectionDAG &DAG = CLI.DAG;
3936
3937 const SDLoc &DL = CLI.DL;
3938 SDValue Chain = CLI.Chain;
3939 SDValue Callee = CLI.Callee;
3940
3941 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
3942 bool UsesDynamicVGPRs = false;
3943 if (IsChainCallConv) {
3944 // The last arguments should be the value that we need to put in EXEC,
3945 // followed by the flags and any other arguments with special meanings.
3946 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
3947 // we don't treat them like the "real" arguments.
3948 auto RequestedExecIt =
3949 llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
3950 return Arg.OrigArgIndex == 2;
3951 });
3952 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
3953
3954 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
3955 CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
3956 CLI.OutVals.end());
3957 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
3958
3959 assert(CLI.Outs.back().OrigArgIndex < 2 &&
3960 "Haven't popped all the special args");
3961
3962 TargetLowering::ArgListEntry RequestedExecArg =
3963 CLI.Args[ChainCallArgIdx::Exec];
3964 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3965 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3966
3967 // Convert constants into TargetConstants, so they become immediate operands
3968 // instead of being selected into S_MOV.
3969 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
3970 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
3971 ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
3972 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
3973 } else
3974 ChainCallSpecialArgs.push_back(Arg.Node);
3975 };
3976
3977 PushNodeOrTargetConstant(RequestedExecArg);
3978
3979 // Process any other special arguments depending on the value of the flags.
3980 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
3981
3982 const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
3983 if (FlagsValue.isZero()) {
3984 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
3985 return lowerUnhandledCall(CLI, InVals,
3986 "no additional args allowed if flags == 0");
3987 } else if (FlagsValue.isOneBitSet(0)) {
3988 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
3989 return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
3990 }
3991
3992 if (!Subtarget->isWave32()) {
3993 return lowerUnhandledCall(
3994 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
3995 }
3996
3997 UsesDynamicVGPRs = true;
3998 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
3999 CLI.Args.end(), PushNodeOrTargetConstant);
4000 }
4001 }
4002
4004 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
4006 bool &IsTailCall = CLI.IsTailCall;
4007 bool IsVarArg = CLI.IsVarArg;
4008 bool IsSibCall = false;
4010
4011 if (Callee.isUndef() || isNullConstant(Callee)) {
4012 if (!CLI.IsTailCall) {
4013 for (ISD::InputArg &Arg : CLI.Ins)
4014 InVals.push_back(DAG.getPOISON(Arg.VT));
4015 }
4016
4017 return Chain;
4018 }
4019
4020 if (IsVarArg) {
4021 return lowerUnhandledCall(CLI, InVals,
4022 "unsupported call to variadic function ");
4023 }
4024
4025 if (!CLI.CB)
4026 return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
4027
4028 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
4029 return lowerUnhandledCall(CLI, InVals,
4030 "unsupported required tail call to function ");
4031 }
4032
4033 if (IsTailCall) {
4034 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
4035 Outs, OutVals, Ins, DAG);
4036 if (!IsTailCall &&
4037 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
4038 report_fatal_error("failed to perform tail call elimination on a call "
4039 "site marked musttail or on llvm.amdgcn.cs.chain");
4040 }
4041
4042 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4043
4044 // A sibling call is one where we're under the usual C ABI and not planning
4045 // to change that but can still do a tail call:
4046 if (!TailCallOpt && IsTailCall)
4047 IsSibCall = true;
4048
4049 if (IsTailCall)
4050 ++NumTailCalls;
4051 }
4052
4055 SmallVector<SDValue, 8> MemOpChains;
4056
4057 // Analyze operands of the call, assigning locations to each operand.
4059 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4060 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
4061
4062 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
4064 // With a fixed ABI, allocate fixed registers before user arguments.
4065 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
4066 }
4067
4068 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
4069
4070 // Get a count of how many bytes are to be pushed on the stack.
4071 unsigned NumBytes = CCInfo.getStackSize();
4072
4073 if (IsSibCall) {
4074 // Since we're not changing the ABI to make this a tail call, the memory
4075 // operands are already available in the caller's incoming argument space.
4076 NumBytes = 0;
4077 }
4078
4079 // FPDiff is the byte offset of the call's argument area from the callee's.
4080 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4081 // by this amount for a tail call. In a sibling call it must be 0 because the
4082 // caller will deallocate the entire stack and the callee still expects its
4083 // arguments to begin at SP+0. Completely unused for non-tail calls.
4084 int32_t FPDiff = 0;
4085 MachineFrameInfo &MFI = MF.getFrameInfo();
4086 auto *TRI = Subtarget->getRegisterInfo();
4087
4088 // Adjust the stack pointer for the new arguments...
4089 // These operations are automatically eliminated by the prolog/epilog pass
4090 if (!IsSibCall)
4091 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
4092
4093 if (!IsSibCall || IsChainCallConv) {
4094 if (!Subtarget->enableFlatScratch()) {
4095 SmallVector<SDValue, 4> CopyFromChains;
4096
4097 // In the HSA case, this should be an identity copy.
4098 SDValue ScratchRSrcReg =
4099 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
4100 RegsToPass.emplace_back(IsChainCallConv
4101 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4102 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4103 ScratchRSrcReg);
4104 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
4105 Chain = DAG.getTokenFactor(DL, CopyFromChains);
4106 }
4107 }
4108
4109 const unsigned NumSpecialInputs = RegsToPass.size();
4110
4111 MVT PtrVT = MVT::i32;
4112
4113 // Walk the register/memloc assignments, inserting copies/loads.
4114 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4115 CCValAssign &VA = ArgLocs[i];
4116 SDValue Arg = OutVals[i];
4117
4118 // Promote the value if needed.
4119 switch (VA.getLocInfo()) {
4120 case CCValAssign::Full:
4121 break;
4122 case CCValAssign::BCvt:
4123 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4124 break;
4125 case CCValAssign::ZExt:
4126 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4127 break;
4128 case CCValAssign::SExt:
4129 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
4130 break;
4131 case CCValAssign::AExt:
4132 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
4133 break;
4134 case CCValAssign::FPExt:
4135 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
4136 break;
4137 default:
4138 llvm_unreachable("Unknown loc info!");
4139 }
4140
4141 if (VA.isRegLoc()) {
4142 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
4143 } else {
4144 assert(VA.isMemLoc());
4145
4146 SDValue DstAddr;
4147 MachinePointerInfo DstInfo;
4148
4149 unsigned LocMemOffset = VA.getLocMemOffset();
4150 int32_t Offset = LocMemOffset;
4151
4152 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
4153 MaybeAlign Alignment;
4154
4155 if (IsTailCall) {
4156 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4157 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4158 : VA.getValVT().getStoreSize();
4159
4160 // FIXME: We can have better than the minimum required byval alignment.
4161 Alignment =
4162 Flags.isByVal()
4163 ? Flags.getNonZeroByValAlign()
4164 : commonAlignment(Subtarget->getStackAlignment(), Offset);
4165
4166 Offset = Offset + FPDiff;
4167 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
4168
4169 DstAddr = DAG.getFrameIndex(FI, PtrVT);
4170 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4171
4172 // Make sure any stack arguments overlapping with where we're storing
4173 // are loaded before this eventual operation. Otherwise they'll be
4174 // clobbered.
4175
4176 // FIXME: Why is this really necessary? This seems to just result in a
4177 // lot of code to copy the stack arguments and write them back to the
4178 // same locations, which are supposed to be immutable?
4179 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
4180 } else {
4181 // Stores to the argument stack area are relative to the stack pointer.
4182 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
4183 MVT::i32);
4184 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
4185 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
4186 Alignment =
4187 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
4188 }
4189
4190 if (Outs[i].Flags.isByVal()) {
4191 SDValue SizeNode =
4192 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4193 SDValue Cpy =
4194 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
4195 Outs[i].Flags.getNonZeroByValAlign(),
4196 /*isVol = */ false, /*AlwaysInline = */ true,
4197 /*CI=*/nullptr, std::nullopt, DstInfo,
4199
4200 MemOpChains.push_back(Cpy);
4201 } else {
4202 SDValue Store =
4203 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4204 MemOpChains.push_back(Store);
4205 }
4206 }
4207 }
4208
4209 if (!MemOpChains.empty())
4210 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4211
4212 SDValue ReadFirstLaneID =
4213 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4214
4215 SDValue TokenGlue;
4216 if (CLI.ConvergenceControlToken) {
4217 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4219 }
4220
4221 // Build a sequence of copy-to-reg nodes chained together with token chain
4222 // and flag operands which copy the outgoing args into the appropriate regs.
4223 SDValue InGlue;
4224
4225 unsigned ArgIdx = 0;
4226 for (auto [Reg, Val] : RegsToPass) {
4227 if (ArgIdx++ >= NumSpecialInputs &&
4228 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4229 // For chain calls, the inreg arguments are required to be
4230 // uniform. Speculatively insert a readfirstlane in case we cannot prove
4231 // they are uniform.
4232 //
4233 // For other calls, if an inreg argument is known to be uniform,
4234 // speculatively insert a readfirstlane in case it is in a VGPR.
4235 //
4236 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4237 // value, so let that continue to produce invalid code.
4238
4239 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4240 if (TokenGlue)
4241 ReadfirstlaneArgs.push_back(TokenGlue);
4243 ReadfirstlaneArgs);
4244 }
4245
4246 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
4247 InGlue = Chain.getValue(1);
4248 }
4249
4250 // We don't usually want to end the call-sequence here because we would tidy
4251 // the frame up *after* the call. However, in the ABI-changing tail-call case
4252 // we've carefully laid out the parameters so that when sp is reset they'll be
4253 // in the correct location.
4254 if (IsTailCall && !IsSibCall) {
4255 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
4256 InGlue = Chain.getValue(1);
4257 }
4258
4259 std::vector<SDValue> Ops({Chain});
4260
4261 // Add a redundant copy of the callee global which will not be legalized, as
4262 // we need direct access to the callee later.
4264 const GlobalValue *GV = GSD->getGlobal();
4265 Ops.push_back(Callee);
4266 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
4267 } else {
4268 if (IsTailCall) {
4269 // isEligibleForTailCallOptimization considered whether the call target is
4270 // divergent, but we may still end up with a uniform value in a VGPR.
4271 // Insert a readfirstlane just in case.
4272 SDValue ReadFirstLaneID =
4273 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4274
4275 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4276 if (TokenGlue)
4277 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
4278 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
4279 ReadfirstlaneArgs);
4280 }
4281
4282 Ops.push_back(Callee);
4283 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
4284 }
4285
4286 if (IsTailCall) {
4287 // Each tail call may have to adjust the stack by a different amount, so
4288 // this information must travel along with the operation for eventual
4289 // consumption by emitEpilogue.
4290 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4291 }
4292
4293 if (IsChainCallConv)
4294 llvm::append_range(Ops, ChainCallSpecialArgs);
4295
4296 // Add argument registers to the end of the list so that they are known live
4297 // into the call.
4298 for (auto &[Reg, Val] : RegsToPass)
4299 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
4300
4301 // Add a register mask operand representing the call-preserved registers.
4302 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4303 assert(Mask && "Missing call preserved mask for calling convention");
4304 Ops.push_back(DAG.getRegisterMask(Mask));
4305
4306 if (SDValue Token = CLI.ConvergenceControlToken) {
4308 GlueOps.push_back(Token);
4309 if (InGlue)
4310 GlueOps.push_back(InGlue);
4311
4312 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
4313 MVT::Glue, GlueOps),
4314 0);
4315 }
4316
4317 if (InGlue)
4318 Ops.push_back(InGlue);
4319
4320 // If we're doing a tail call, use a TC_RETURN here rather than an
4321 // actual call instruction.
4322 if (IsTailCall) {
4323 MFI.setHasTailCall();
4324 unsigned OPC = AMDGPUISD::TC_RETURN;
4325 switch (CallConv) {
4328 break;
4331 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4333 break;
4334 }
4335
4336 // If the caller is a whole wave function, we need to use a special opcode
4337 // so we can patch up EXEC.
4338 if (Info->isWholeWaveFunction())
4340
4341 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4342 }
4343
4344 // Returns a chain and a flag for retval copy to use.
4345 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4346 Chain = Call.getValue(0);
4347 InGlue = Call.getValue(1);
4348
4349 uint64_t CalleePopBytes = NumBytes;
4350 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4351 if (!Ins.empty())
4352 InGlue = Chain.getValue(1);
4353
4354 // Handle result values, copying them out of physregs into vregs that we
4355 // return.
4356 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4357 InVals, /*IsThisReturn=*/false, SDValue());
4358}
4359
4360 // This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4361 // except for:
4362 // 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4363 // 2. Size scaling, where scaled size = wave-reduction(alloca-size) * wave-size.
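// For illustration, on a wave64 target: a 16-byte per-lane alloca advances the
// stack pointer by 16 << 6 = 1024 bytes, since the scratch stack interleaves
// all 64 lanes of the wave.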
4365 SelectionDAG &DAG) const {
4366 const MachineFunction &MF = DAG.getMachineFunction();
4368
4369 SDLoc dl(Op);
4370 EVT VT = Op.getValueType();
4371 SDValue Chain = Op.getOperand(0);
4372 Register SPReg = Info->getStackPtrOffsetReg();
4373
4374 // Chain the dynamic stack allocation so that it doesn't modify the stack
4375 // pointer when other instructions are using the stack.
4376 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4377
4378 SDValue Size = Op.getOperand(1);
4379 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4380 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4381
4382 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4384 "Stack grows upwards for AMDGPU");
4385
4386 Chain = BaseAddr.getValue(1);
4387 Align StackAlign = TFL->getStackAlign();
4388 if (Alignment > StackAlign) {
4389 uint64_t ScaledAlignment = Alignment.value()
4390 << Subtarget->getWavefrontSizeLog2();
4391 uint64_t StackAlignMask = ScaledAlignment - 1;
4392 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4393 DAG.getConstant(StackAlignMask, dl, VT));
4394 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4395 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4396 }
4397
4398 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4399 SDValue NewSP;
4401 // For a constant-sized alloca, scale the alloca size by the wave size.
4402 SDValue ScaledSize = DAG.getNode(
4403 ISD::SHL, dl, VT, Size,
4404 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4405 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4406 } else {
4407 // For a dynamically sized alloca, perform a wave-wide reduction to get the
4408 // max of the (divergent) alloca size and then scale it by the wave size.
4409 SDValue WaveReduction =
4410 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4411 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4412 Size, DAG.getConstant(0, dl, MVT::i32));
4413 SDValue ScaledSize = DAG.getNode(
4414 ISD::SHL, dl, VT, Size,
4415 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4416 NewSP =
4417 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4418 SDValue ReadFirstLaneID =
4419 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4420 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4421 NewSP);
4422 }
4423
4424 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4425 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4426
4427 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4428}
4429
4431 if (Op.getValueType() != MVT::i32)
4432 return Op; // Defer to cannot select error.
4433
4435 SDLoc SL(Op);
4436
4437 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4438
4439 // Convert from wave uniform to swizzled vector address. This should protect
4440 // from any edge cases where the stacksave result isn't directly used with
4441 // stackrestore.
4442 SDValue VectorAddress =
4443 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4444 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4445}
4446
4448 SelectionDAG &DAG) const {
4449 SDLoc SL(Op);
4450 assert(Op.getValueType() == MVT::i32);
4451
4452 uint32_t BothRoundHwReg =
4454 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4455
4456 SDValue IntrinID =
4457 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4458 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4459 Op.getOperand(0), IntrinID, GetRoundBothImm);
4460
4461 // There are two rounding modes, one for f32 and one for f64/f16. We only
4462 // report in the standard value range if both are the same.
4463 //
4464 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4465 // ties away from zero is not supported, and the other values are rotated by
4466 // 1.
4467 //
4468 // If the two rounding modes are not the same, report a target defined value.
4469
4470 // Mode register rounding mode fields:
4471 //
4472 // [1:0] Single-precision round mode.
4473 // [3:2] Double/Half-precision round mode.
4474 //
4475 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4476 //
4477 //                 Hardware   Spec
4478 //   Toward-0          3        0
4479 //   Nearest Even      0        1
4480 //   +Inf              1        2
4481 //   -Inf              2        3
4482 //   NearestAway0     N/A       4
4483 //
4484 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4485 // table we can index by the raw hardware mode.
4486 //
4487 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
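//
// Worked example: if MODE.fp_round == 0 (both fields round-to-nearest-even),
// the shift amount is 0 and the extracted nibble is 1, the standard FLT_ROUNDS
// "to nearest" value; any mixed mode extracts a nibble >= 4, which becomes a
// target-specific extended value after the +4 offset applied below.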
4488
4489 SDValue BitTable =
4491
4492 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4493 SDValue RoundModeTimesNumBits =
4494 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4495
4496 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4497 // knew only one mode was demanded.
4498 SDValue TableValue =
4499 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4500 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4501
4502 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4503 SDValue TableEntry =
4504 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4505
4506 // There's a gap between the 4-bit encoded table values and the actual enum
4507 // values, so offset if it's an extended value.
4508 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4509 SDValue IsStandardValue =
4510 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4511 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4512 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4513 TableEntry, EnumOffset);
4514
4515 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4516}
4517
4519 SelectionDAG &DAG) const {
4520 SDLoc SL(Op);
4521
4522 SDValue NewMode = Op.getOperand(1);
4523 assert(NewMode.getValueType() == MVT::i32);
4524
4525 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4526 // hardware MODE.fp_round values.
4527 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4528 uint32_t ClampedVal = std::min(
4529 static_cast<uint32_t>(ConstMode->getZExtValue()),
4531 NewMode = DAG.getConstant(
4532 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4533 } else {
4534 // If we know the input can only be one of the supported standard modes in
4535 // the range 0-3, we can use a simplified mapping to hardware values.
4536 KnownBits KB = DAG.computeKnownBits(NewMode);
4537 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4538 // The supported standard values are 0-3. The extended values start at 8. We
4539 // need to offset by 4 if the value is in the extended range.
4540
4541 if (UseReducedTable) {
4542 // Only the low 16 bits of the table (the four standard entries) are needed.
4543 SDValue BitTable = DAG.getConstant(
4544 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4545
4546 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4547 SDValue RoundModeTimesNumBits =
4548 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4549
4550 NewMode =
4551 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4552
4553 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4554 // the table extracted bits into inline immediates.
4555 } else {
4556 // table_index = umin(value, value - 4)
4557 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
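// For example: a standard value of 2 (+inf) gives table_index = umin(2, 2 - 4)
// = 2 since the subtraction wraps as unsigned, while an extended value of 8
// gives table_index = umin(8, 4) = 4, so the extended entries follow the four
// standard ones in the bit table.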
4558 SDValue BitTable =
4560
4561 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4562 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4563 SDValue IndexVal =
4564 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4565
4566 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4567 SDValue RoundModeTimesNumBits =
4568 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4569
4570 SDValue TableValue =
4571 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4572 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4573
4574 // No need to mask out the high bits since the setreg will ignore them
4575 // anyway.
4576 NewMode = TruncTable;
4577 }
4578
4579 // Insert a readfirstlane in case the value is in a VGPR. We could do this
4580 // earlier and keep more operations scalar, but that interferes with
4581 // combining the source.
4582 SDValue ReadFirstLaneID =
4583 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4584 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4585 ReadFirstLaneID, NewMode);
4586 }
4587
4588 // N.B. The setreg will be later folded into s_round_mode on supported
4589 // targets.
4590 SDValue IntrinID =
4591 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4592 uint32_t BothRoundHwReg =
4594 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4595
4596 SDValue SetReg =
4597 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4598 IntrinID, RoundBothImm, NewMode);
4599
4600 return SetReg;
4601}
4602
4604 if (Op->isDivergent() &&
4605 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4606 // Cannot do I$ prefetch with divergent pointer.
4607 return SDValue();
4608
4609 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4613 break;
4615 if (Subtarget->hasSafeSmemPrefetch())
4616 break;
4617 [[fallthrough]];
4618 default:
4619 return SDValue();
4620 }
4621
4622 // I$ prefetch
4623 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4624 return SDValue();
4625
4626 return Op;
4627}
4628
4629 // Work around DAG legality rules that are based only on the result type.
4631 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4632 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4633 EVT SrcVT = Src.getValueType();
4634
4635 if (SrcVT.getScalarType() != MVT::bf16)
4636 return Op;
4637
4638 SDLoc SL(Op);
4639 SDValue BitCast =
4640 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4641
4642 EVT DstVT = Op.getValueType();
4643 if (IsStrict)
4644 llvm_unreachable("Need STRICT_BF16_TO_FP");
4645
4646 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4647}
4648
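// Lowering for reading the 64-bit FP environment: read the MODE and TRAPSTS
// hardware registers and pack them as the low and high halves of the returned
// i64, threading the chain through both reads.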
4650 SDLoc SL(Op);
4651 if (Op.getValueType() != MVT::i64)
4652 return Op;
4653
4654 uint32_t ModeHwReg =
4656 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4657 uint32_t TrapHwReg =
4659 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4660
4661 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4662 SDValue IntrinID =
4663 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4664 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4665 Op.getOperand(0), IntrinID, ModeHwRegImm);
4666 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4667 Op.getOperand(0), IntrinID, TrapHwRegImm);
4668 SDValue TokenReg =
4669 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4670 GetTrapReg.getValue(1));
4671
4672 SDValue CvtPtr =
4673 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4674 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4675
4676 return DAG.getMergeValues({Result, TokenReg}, SL);
4677}
4678
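// Lowering for writing the 64-bit FP environment: split the value into its
// MODE (low half) and TRAPSTS (high half) components, make each wave-uniform
// with readfirstlane, and write them back with s_setreg.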
4680 SDLoc SL(Op);
4681 if (Op.getOperand(1).getValueType() != MVT::i64)
4682 return Op;
4683
4684 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4685 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4686 DAG.getConstant(0, SL, MVT::i32));
4687 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4688 DAG.getConstant(1, SL, MVT::i32));
4689
4690 SDValue ReadFirstLaneID =
4691 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4692 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4693 ReadFirstLaneID, NewModeReg);
4694 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4695 ReadFirstLaneID, NewTrapReg);
4696
4697 unsigned ModeHwReg =
4699 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4700 unsigned TrapHwReg =
4702 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4703
4704 SDValue IntrinID =
4705 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4706 SDValue SetModeReg =
4707 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4708 IntrinID, ModeHwRegImm, NewModeReg);
4709 SDValue SetTrapReg =
4710 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4711 IntrinID, TrapHwRegImm, NewTrapReg);
4712 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4713}
4714
4716 const MachineFunction &MF) const {
4717 const Function &Fn = MF.getFunction();
4718
4720 .Case("m0", AMDGPU::M0)
4721 .Case("exec", AMDGPU::EXEC)
4722 .Case("exec_lo", AMDGPU::EXEC_LO)
4723 .Case("exec_hi", AMDGPU::EXEC_HI)
4724 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4725 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4726 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4727 .Default(Register());
4728 if (!Reg)
4729 return Reg;
4730
4731 if (!Subtarget->hasFlatScrRegister() &&
4732 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4733 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
4734 "\" for subtarget."));
4735 }
4736
4737 switch (Reg) {
4738 case AMDGPU::M0:
4739 case AMDGPU::EXEC_LO:
4740 case AMDGPU::EXEC_HI:
4741 case AMDGPU::FLAT_SCR_LO:
4742 case AMDGPU::FLAT_SCR_HI:
4743 if (VT.getSizeInBits() == 32)
4744 return Reg;
4745 break;
4746 case AMDGPU::EXEC:
4747 case AMDGPU::FLAT_SCR:
4748 if (VT.getSizeInBits() == 64)
4749 return Reg;
4750 break;
4751 default:
4752 llvm_unreachable("missing register type checking");
4753 }
4754
4756 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4757}
4758
4759// If kill is not the last instruction, split the block so kill is always a
4760// proper terminator.
4763 MachineBasicBlock *BB) const {
4764 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
4766 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4767 return SplitBB;
4768}
4769
4770 // Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4771// \p MI will be the only instruction in the loop body block. Otherwise, it will
4772// be the first instruction in the remainder block.
4773//
4774/// \returns { LoopBody, Remainder }
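// Resulting control flow (sketch):
//   MBB -> LoopBB -> RemainderBB
//            ^  |
//            +--+  (LoopBB also branches back to itself)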
4775static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4777 MachineFunction *MF = MBB.getParent();
4779
4780 // To insert the loop we need to split the block. Move everything after this
4781 // point to a new block, and insert a new empty block between the two.
4783 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4785 ++MBBI;
4786
4787 MF->insert(MBBI, LoopBB);
4788 MF->insert(MBBI, RemainderBB);
4789
4790 LoopBB->addSuccessor(LoopBB);
4791 LoopBB->addSuccessor(RemainderBB);
4792
4793 // Move the rest of the block into a new block.
4794 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4795
4796 if (InstInLoop) {
4797 auto Next = std::next(I);
4798
4799 // Move instruction to loop body.
4800 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4801
4802 // Move the rest of the block.
4803 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4804 } else {
4805 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4806 }
4807
4808 MBB.addSuccessor(LoopBB);
4809
4810 return std::pair(LoopBB, RemainderBB);
4811}
4812
4813/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4815 MachineBasicBlock *MBB = MI.getParent();
4817 auto I = MI.getIterator();
4818 auto E = std::next(I);
4819
4820 // clang-format off
4821 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4822 .addImm(0);
4823 // clang-format on
4824
4825 MIBundleBuilder Bundler(*MBB, I, E);
4826 finalizeBundle(*MBB, Bundler.begin());
4827}
4828
4831 MachineBasicBlock *BB) const {
4832 const DebugLoc &DL = MI.getDebugLoc();
4833
4835
4837
4838 // Apparently kill flags are only valid if the def is in the same block?
4839 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4840 Src->setIsKill(false);
4841
4842 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
4843
4844 MachineBasicBlock::iterator I = LoopBB->end();
4845
4846 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4848
4849 // Clear TRAP_STS.MEM_VIOL
4850 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4851 .addImm(0)
4852 .addImm(EncodedReg);
4853
4855
4856 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4857
4858 // Load and check TRAP_STS.MEM_VIOL
4859 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4860 .addImm(EncodedReg);
4861
4862 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4863 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4864 .addReg(Reg, RegState::Kill)
4865 .addImm(0);
4866 // clang-format off
4867 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4868 .addMBB(LoopBB);
4869 // clang-format on
4870
4871 return RemainderBB;
4872}
4873
4874// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4875// wavefront. If the value is uniform and just happens to be in a VGPR, this
4876// will only do one iteration. In the worst case, this will loop 64 times.
4877//
4878// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
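//
// Rough shape of the emitted loop (sketch, not exact MIR):
//   LoopBB:
//     CurrentIdx = V_READFIRSTLANE_B32 Idx       ; pick one lane's index
//     CondReg    = V_CMP_EQ_U32 CurrentIdx, Idx  ; lanes sharing that index
//     NewExec    = S_AND_SAVEEXEC CondReg
//     M0 / SGPRIdxReg = CurrentIdx (+ Offset)
//     ... indexed access runs for the selected lanes ...
//     EXEC = EXEC ^ NewExec                      ; retire those lanes
//     S_CBRANCH_EXECNZ LoopBB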
4881 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4882 const DebugLoc &DL, const MachineOperand &Idx,
4883 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4884 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4885 Register &SGPRIdxReg) {
4886
4887 MachineFunction *MF = OrigBB.getParent();
4888 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4889 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4891
4892 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4893 Register PhiExec = MRI.createVirtualRegister(BoolRC);
4894 Register NewExec = MRI.createVirtualRegister(BoolRC);
4895 Register CurrentIdxReg =
4896 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4897 Register CondReg = MRI.createVirtualRegister(BoolRC);
4898
4899 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4900 .addReg(InitReg)
4901 .addMBB(&OrigBB)
4902 .addReg(ResultReg)
4903 .addMBB(&LoopBB);
4904
4905 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4906 .addReg(InitSaveExecReg)
4907 .addMBB(&OrigBB)
4908 .addReg(NewExec)
4909 .addMBB(&LoopBB);
4910
4911 // Read the next variant <- also loop target.
4912 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4913 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
4914
4915 // Compare the just read M0 value to all possible Idx values.
4916 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4917 .addReg(CurrentIdxReg)
4918 .addReg(Idx.getReg(), 0, Idx.getSubReg());
4919
4920 // Update EXEC, save the original EXEC value to VCC.
4921 BuildMI(LoopBB, I, DL,
4922 TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4923 : AMDGPU::S_AND_SAVEEXEC_B64),
4924 NewExec)
4925 .addReg(CondReg, RegState::Kill);
4926
4927 MRI.setSimpleHint(NewExec, CondReg);
4928
4929 if (UseGPRIdxMode) {
4930 if (Offset == 0) {
4931 SGPRIdxReg = CurrentIdxReg;
4932 } else {
4933 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4934 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4935 .addReg(CurrentIdxReg, RegState::Kill)
4936 .addImm(Offset);
4937 }
4938 } else {
4939 // Move the index into M0.
4940 if (Offset == 0) {
4941 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
4942 .addReg(CurrentIdxReg, RegState::Kill);
4943 } else {
4944 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4945 .addReg(CurrentIdxReg, RegState::Kill)
4946 .addImm(Offset);
4947 }
4948 }
4949
4950 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4951 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4952 MachineInstr *InsertPt =
4953 BuildMI(LoopBB, I, DL,
4954 TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4955 : AMDGPU::S_XOR_B64_term),
4956 Exec)
4957 .addReg(Exec)
4958 .addReg(NewExec);
4959
4960 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4961 // s_cbranch_scc0?
4962
4963 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4964 // clang-format off
4965 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4966 .addMBB(&LoopBB);
4967 // clang-format on
4968
4969 return InsertPt->getIterator();
4970}
4971
4972 // This has slightly sub-optimal regalloc when the source vector is killed by
4973 // the read. The register allocator does not understand that the kill is
4974 // per-workitem, so the vector is kept alive for the whole loop and we end up
4975 // not re-using a subregister from it, using 1 more VGPR than necessary. That
4976 // VGPR was saved when this was expanded after register allocation.
4979 unsigned InitResultReg, unsigned PhiReg, int Offset,
4980 bool UseGPRIdxMode, Register &SGPRIdxReg) {
4981 MachineFunction *MF = MBB.getParent();
4982 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4983 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4985 const DebugLoc &DL = MI.getDebugLoc();
4987
4988 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
4989 Register DstReg = MI.getOperand(0).getReg();
4990 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4991 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
4992 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4993 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4994
4995 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4996
4997 // Save the EXEC mask
4998 // clang-format off
4999 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
5000 .addReg(Exec);
5001 // clang-format on
5002
5003 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
5004
5005 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5006
5007 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
5008 InitResultReg, DstReg, PhiReg, TmpExec,
5009 Offset, UseGPRIdxMode, SGPRIdxReg);
5010
5011 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
5013 ++MBBI;
5014 MF->insert(MBBI, LandingPad);
5015 LoopBB->removeSuccessor(RemainderBB);
5016 LandingPad->addSuccessor(RemainderBB);
5017 LoopBB->addSuccessor(LandingPad);
5018 MachineBasicBlock::iterator First = LandingPad->begin();
5019 // clang-format off
5020 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
5021 .addReg(SaveExec);
5022 // clang-format on
5023
5024 return InsPt;
5025}
5026
5027// Returns subreg index, offset
5028static std::pair<unsigned, int>
5030 const TargetRegisterClass *SuperRC, unsigned VecReg,
5031 int Offset) {
5032 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
5033
5034 // Skip out of bounds offsets, or else we would end up using an undefined
5035 // register.
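// For example, with a 128-bit (4 x 32-bit) vector, Offset 2 yields
// (sub2, 0), i.e. a compile-time subregister index, while Offset 5 is out of
// range and is returned as (sub0, 5) so it stays a dynamic offset.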
5036 if (Offset >= NumElts || Offset < 0)
5037 return std::pair(AMDGPU::sub0, Offset);
5038
5039 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
5040}
5041
5044 int Offset) {
5045 MachineBasicBlock *MBB = MI.getParent();
5046 const DebugLoc &DL = MI.getDebugLoc();
5048
5049 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5050
5051 assert(Idx->getReg() != AMDGPU::NoRegister);
5052
5053 if (Offset == 0) {
5054 // clang-format off
5055 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5056 .add(*Idx);
5057 // clang-format on
5058 } else {
5059 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5060 .add(*Idx)
5061 .addImm(Offset);
5062 }
5063}
5064
5067 int Offset) {
5068 MachineBasicBlock *MBB = MI.getParent();
5069 const DebugLoc &DL = MI.getDebugLoc();
5071
5072 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5073
5074 if (Offset == 0)
5075 return Idx->getReg();
5076
5077 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5078 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
5079 .add(*Idx)
5080 .addImm(Offset);
5081 return Tmp;
5082}
5083
5086 const GCNSubtarget &ST) {
5087 const SIInstrInfo *TII = ST.getInstrInfo();
5088 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5089 MachineFunction *MF = MBB.getParent();
5091
5092 Register Dst = MI.getOperand(0).getReg();
5093 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5094 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5095 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5096
5097 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
5098 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5099
5100 unsigned SubReg;
5101 std::tie(SubReg, Offset) =
5102 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
5103
5104 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5105
5106 // Check for a SGPR index.
5107 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5109 const DebugLoc &DL = MI.getDebugLoc();
5110
5111 if (UseGPRIdxMode) {
5112 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5113 // to avoid interfering with other uses, so probably requires a new
5114 // optimization pass.
5116
5117 const MCInstrDesc &GPRIDXDesc =
5118 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5119 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5120 .addReg(SrcReg)
5121 .addReg(Idx)
5122 .addImm(SubReg);
5123 } else {
5125
5126 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5127 .addReg(SrcReg, 0, SubReg)
5128 .addReg(SrcReg, RegState::Implicit);
5129 }
5130
5131 MI.eraseFromParent();
5132
5133 return &MBB;
5134 }
5135
5136 // Control flow needs to be inserted if indexing with a VGPR.
5137 const DebugLoc &DL = MI.getDebugLoc();
5139
5140 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5141 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5142
5143 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
5144
5145 Register SGPRIdxReg;
5146 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
5147 UseGPRIdxMode, SGPRIdxReg);
5148
5149 MachineBasicBlock *LoopBB = InsPt->getParent();
5150
5151 if (UseGPRIdxMode) {
5152 const MCInstrDesc &GPRIDXDesc =
5153 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5154
5155 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5156 .addReg(SrcReg)
5157 .addReg(SGPRIdxReg)
5158 .addImm(SubReg);
5159 } else {
5160 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5161 .addReg(SrcReg, 0, SubReg)
5162 .addReg(SrcReg, RegState::Implicit);
5163 }
5164
5165 MI.eraseFromParent();
5166
5167 return LoopBB;
5168}
5169
5172 const GCNSubtarget &ST) {
5173 const SIInstrInfo *TII = ST.getInstrInfo();
5174 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5175 MachineFunction *MF = MBB.getParent();
5177
5178 Register Dst = MI.getOperand(0).getReg();
5179 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
5180 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5181 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
5182 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5183 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
5184 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5185
5186 // This can be an immediate, but will be folded later.
5187 assert(Val->getReg());
5188
5189 unsigned SubReg;
5190 std::tie(SubReg, Offset) =
5191 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
5192 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5193
5194 if (Idx->getReg() == AMDGPU::NoRegister) {
5196 const DebugLoc &DL = MI.getDebugLoc();
5197
5198 assert(Offset == 0);
5199
5200 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
5201 .add(*SrcVec)
5202 .add(*Val)
5203 .addImm(SubReg);
5204
5205 MI.eraseFromParent();
5206 return &MBB;
5207 }
5208
5209 // Check for a SGPR index.
5210 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5212 const DebugLoc &DL = MI.getDebugLoc();
5213
5214 if (UseGPRIdxMode) {
5216
5217 const MCInstrDesc &GPRIDXDesc =
5218 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5219 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5220 .addReg(SrcVec->getReg())
5221 .add(*Val)
5222 .addReg(Idx)
5223 .addImm(SubReg);
5224 } else {
5226
5227 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5228 TRI.getRegSizeInBits(*VecRC), 32, false);
5229 BuildMI(MBB, I, DL, MovRelDesc, Dst)
5230 .addReg(SrcVec->getReg())
5231 .add(*Val)
5232 .addImm(SubReg);
5233 }
5234 MI.eraseFromParent();
5235 return &MBB;
5236 }
5237
5238 // Control flow needs to be inserted if indexing with a VGPR.
5239 if (Val->isReg())
5240 MRI.clearKillFlags(Val->getReg());
5241
5242 const DebugLoc &DL = MI.getDebugLoc();
5243
5244 Register PhiReg = MRI.createVirtualRegister(VecRC);
5245
5246 Register SGPRIdxReg;
5247 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5248 UseGPRIdxMode, SGPRIdxReg);
5249 MachineBasicBlock *LoopBB = InsPt->getParent();
5250
5251 if (UseGPRIdxMode) {
5252 const MCInstrDesc &GPRIDXDesc =
5253 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5254
5255 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5256 .addReg(PhiReg)
5257 .add(*Val)
5258 .addReg(SGPRIdxReg)
5259 .addImm(SubReg);
5260 } else {
5261 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5262 TRI.getRegSizeInBits(*VecRC), 32, false);
5263 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5264 .addReg(PhiReg)
5265 .add(*Val)
5266 .addImm(SubReg);
5267 }
5268
5269 MI.eraseFromParent();
5270 return LoopBB;
5271}
5272
5274 MachineBasicBlock *BB) {
5275 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5276 // For GFX12, we emit s_add_u64 and s_sub_u64.
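// A sketch of the pre-GFX12 expansion for a 64-bit add (subtraction uses
// s_sub_u32 / s_subb_u32):
//   s_add_u32  dst.sub0, src0.sub0, src1.sub0  ; sets SCC to the carry-out
//   s_addc_u32 dst.sub1, src0.sub1, src1.sub1  ; consumes SCC
//   REG_SEQUENCE dst, dst.sub0, sub0, dst.sub1, sub1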
5277 MachineFunction *MF = BB->getParent();
5278 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5279 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5281 const DebugLoc &DL = MI.getDebugLoc();
5282 MachineOperand &Dest = MI.getOperand(0);
5283 MachineOperand &Src0 = MI.getOperand(1);
5284 MachineOperand &Src1 = MI.getOperand(2);
5285 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5286 if (ST.hasScalarAddSub64()) {
5287 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5288 // clang-format off
5289 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5290 .add(Src0)
5291 .add(Src1);
5292 // clang-format on
5293 } else {
5294 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5295 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5296
5297 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5298 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5299
5300 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5301 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5302 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5303 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5304
5305 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5306 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5307 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5308 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5309
5310 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5311 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5312 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
5313 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
5314 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5315 .addReg(DestSub0)
5316 .addImm(AMDGPU::sub0)
5317 .addReg(DestSub1)
5318 .addImm(AMDGPU::sub1);
5319 }
5320 MI.eraseFromParent();
5321 return BB;
5322}
5323
5325 switch (Opc) {
5326 case AMDGPU::S_MIN_U32:
5327 return std::numeric_limits<uint32_t>::max();
5328 case AMDGPU::S_MIN_I32:
5329 return std::numeric_limits<int32_t>::max();
5330 case AMDGPU::S_MAX_U32:
5331 return std::numeric_limits<uint32_t>::min();
5332 case AMDGPU::S_MAX_I32:
5333 return std::numeric_limits<int32_t>::min();
5334 case AMDGPU::S_ADD_I32:
5335 case AMDGPU::S_SUB_I32:
5336 case AMDGPU::S_OR_B32:
5337 case AMDGPU::S_XOR_B32:
5338 return std::numeric_limits<uint32_t>::min();
5339 case AMDGPU::S_AND_B32:
5340 return std::numeric_limits<uint32_t>::max();
5341 default:
5342 llvm_unreachable(
5343 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5344 }
5345}
5346
5348 switch (Opc) {
5349 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5350 return std::numeric_limits<uint64_t>::max();
5351 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5352 return std::numeric_limits<int64_t>::max();
5353 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5354 return std::numeric_limits<uint64_t>::min();
5355 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5356 return std::numeric_limits<int64_t>::min();
5357 case AMDGPU::S_ADD_U64_PSEUDO:
5358 case AMDGPU::S_SUB_U64_PSEUDO:
5359 case AMDGPU::S_OR_B64:
5360 case AMDGPU::S_XOR_B64:
5361 return std::numeric_limits<uint64_t>::min();
5362 case AMDGPU::S_AND_B64:
5363 return std::numeric_limits<uint64_t>::max();
5364 default:
5365 llvm_unreachable(
5366 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5367 }
5368}
5369
5370static bool is32bitWaveReduceOperation(unsigned Opc) {
5371 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5372 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5373 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5374 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5375 Opc == AMDGPU::S_XOR_B32;
5376}
5377
5380 const GCNSubtarget &ST,
5381 unsigned Opc) {
5383 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5384 const DebugLoc &DL = MI.getDebugLoc();
5385 const SIInstrInfo *TII = ST.getInstrInfo();
5386
5387 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5388 Register SrcReg = MI.getOperand(1).getReg();
5389 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5390 Register DstReg = MI.getOperand(0).getReg();
5391 MachineBasicBlock *RetBB = nullptr;
5392 if (isSGPR) {
5393 switch (Opc) {
5394 case AMDGPU::S_MIN_U32:
5395 case AMDGPU::S_MIN_I32:
5396 case AMDGPU::S_MAX_U32:
5397 case AMDGPU::S_MAX_I32:
5398 case AMDGPU::S_AND_B32:
5399 case AMDGPU::S_OR_B32: {
5400 // Idempotent operations.
5401 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5402 RetBB = &BB;
5403 break;
5404 }
5405 case AMDGPU::V_CMP_LT_U64_e64: // umin
5406 case AMDGPU::V_CMP_LT_I64_e64: // min
5407 case AMDGPU::V_CMP_GT_U64_e64: // umax
5408 case AMDGPU::V_CMP_GT_I64_e64: // max
5409 case AMDGPU::S_AND_B64:
5410 case AMDGPU::S_OR_B64: {
5411 // Idempotent operations.
5412 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
5413 RetBB = &BB;
5414 break;
5415 }
5416 case AMDGPU::S_XOR_B32:
5417 case AMDGPU::S_XOR_B64:
5418 case AMDGPU::S_ADD_I32:
5419 case AMDGPU::S_ADD_U64_PSEUDO:
5420 case AMDGPU::S_SUB_I32:
5421 case AMDGPU::S_SUB_U64_PSEUDO: {
5422 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5423 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5424 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5425 Register NumActiveLanes =
5426 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5427
5428 bool IsWave32 = ST.isWave32();
5429 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5430 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5431 unsigned BitCountOpc =
5432 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5433
5434 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5435
5436 auto NewAccumulator =
5437 BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
5438 .addReg(ExecMask);
5439
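// For a uniform (SGPR) input, these reductions collapse to arithmetic on the
// active-lane count computed above, e.g. a wave add becomes x * count and a
// wave sub becomes (-x) * count; the xor case depends only on count's parity.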
5440 switch (Opc) {
5441 case AMDGPU::S_XOR_B32:
5442 case AMDGPU::S_XOR_B64: {
5443 // Performing an XOR operation on a uniform value
5444 // depends on the parity of the number of active lanes.
5445 // For even parity the result will be 0; for odd
5446 // parity it will be the same as the input value.
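// e.g. with five active lanes the reduction of a uniform x is
// x ^ x ^ x ^ x ^ x = x, while with four lanes it is 0, i.e.
// result = x * (count & 1).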
5447 Register ParityRegister =
5448 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5449
5450 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5451 .addReg(NewAccumulator->getOperand(0).getReg())
5452 .addImm(1)
5453 .setOperandDead(3); // Dead scc
5454 if (Opc == AMDGPU::S_XOR_B32) {
5455 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5456 .addReg(SrcReg)
5457 .addReg(ParityRegister);
5458 } else {
5459 Register DestSub0 =
5460 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5461 Register DestSub1 =
5462 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5463
5464 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5465 const TargetRegisterClass *SrcSubRC =
5466 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5467
5468 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5469 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5470 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5471 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5472
5473 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5474 .add(Op1L)
5475 .addReg(ParityRegister);
5476
5477 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5478 .add(Op1H)
5479 .addReg(ParityRegister);
5480
5481 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5482 .addReg(DestSub0)
5483 .addImm(AMDGPU::sub0)
5484 .addReg(DestSub1)
5485 .addImm(AMDGPU::sub1);
5486 }
5487 break;
5488 }
5489 case AMDGPU::S_SUB_I32: {
5490 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5491
5492 // Take the negation of the source operand.
5493 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5494 .addImm(0)
5495 .addReg(SrcReg);
5496 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5497 .addReg(NegatedVal)
5498 .addReg(NewAccumulator->getOperand(0).getReg());
5499 break;
5500 }
5501 case AMDGPU::S_ADD_I32: {
5502 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5503 .addReg(SrcReg)
5504 .addReg(NewAccumulator->getOperand(0).getReg());
5505 break;
5506 }
5507 case AMDGPU::S_ADD_U64_PSEUDO:
5508 case AMDGPU::S_SUB_U64_PSEUDO: {
5509 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5510 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5511 Register Op1H_Op0L_Reg =
5512 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5513 Register Op1L_Op0H_Reg =
5514 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5515 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5516 Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5517 Register NegatedValLo =
5518 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5519 Register NegatedValHi =
5520 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5521
5522 const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
5523 const TargetRegisterClass *Src1SubRC =
5524 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5525
5526 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5527 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5528 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5529 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5530
5531 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5532 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5533 .addImm(0)
5534 .addReg(NewAccumulator->getOperand(0).getReg())
5535 .setOperandDead(3); // Dead scc
5536 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5537 .addReg(NegatedValLo)
5538 .addImm(31)
5539 .setOperandDead(3); // Dead scc
5540 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5541 .add(Op1L)
5542 .addReg(NegatedValHi);
5543 }
5544 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5545 ? NegatedValLo
5546 : NewAccumulator->getOperand(0).getReg();
5547 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5548 .add(Op1L)
5549 .addReg(LowOpcode);
5550 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5551 .add(Op1L)
5552 .addReg(LowOpcode);
5553 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5554 .add(Op1H)
5555 .addReg(LowOpcode);
5556
5557 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5558 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5559 .addReg(CarryReg)
5560 .addReg(Op1H_Op0L_Reg)
5561 .setOperandDead(3); // Dead scc
5562
5563 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5564 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5565 .addReg(HiVal)
5566 .addReg(Op1L_Op0H_Reg)
5567 .setOperandDead(3); // Dead scc
5568 }
5569 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5570 .addReg(DestSub0)
5571 .addImm(AMDGPU::sub0)
5572 .addReg(DestSub1)
5573 .addImm(AMDGPU::sub1);
5574 break;
5575 }
5576 }
5577 RetBB = &BB;
5578 }
5579 }
5580 } else {
5581 // TODO: Implement the DPP strategy and switch based on the immediate strategy
5582 // operand. For now, for all the cases (default, Iterative and DPP) we use the
5583 // iterative approach.
5584
5585 // To reduce a VGPR value using the iterative approach, we need to iterate
5586 // over all the active lanes. The lowering consists of a ComputeLoop block
5587 // which iterates over only the active lanes. We use a copy of the EXEC
5588 // register as the induction variable; every iteration clears the handled
5589 // lane's bit with bitset0 so that the next iteration picks the next active lane.
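// A rough sketch of the loop generated below for a 32-bit reduction (the
// 64-bit variants read and combine the two halves of each lane's value):
//
//   active = EXEC; accum = identity
// loop:
//   lane   = s_ff1_i32(active)           ; lowest remaining active lane
//   val    = v_readlane_b32(src, lane)
//   accum  = <op>(accum, val)
//   active = bitset0(active, lane)       ; retire that lane
//   s_cmp_lg active, 0 ; s_cbranch_scc1 loop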
5591 Register SrcReg = MI.getOperand(1).getReg();
5592 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5593
5594 // Create the control flow for the loop:
5595 // split MI's machine basic block to form the compute loop.
5596 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
5597
5598 // Create virtual registers required for lowering.
5599 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5600 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5601 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5602 Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
5603 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5604 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5605 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5606 Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5607 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
5608
5609 bool IsWave32 = ST.isWave32();
5610 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5611 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5612
5613 // Create the initial values of the induction variable (from EXEC) and the
5614 // accumulator, and insert a branch to the newly created ComputeLoop block.
5615 BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
5616 if (is32BitOpc) {
5617 uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc);
5618 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5619 .addImm(IdentityValue);
5620 } else {
5621 uint64_t IdentityValue = getIdentityValueFor64BitWaveReduction(Opc);
5622 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
5623 .addImm(IdentityValue);
5624 }
5625 // clang-format off
5626 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5627 .addMBB(ComputeLoop);
5628 // clang-format on
5629
5630 // Start constructing ComputeLoop
5631 I = ComputeLoop->begin();
5632 auto Accumulator =
5633 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5634 .addReg(IdentityValReg)
5635 .addMBB(&BB);
5636 auto ActiveBits =
5637 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5638 .addReg(LoopIterator)
5639 .addMBB(&BB);
5640
5641 I = ComputeLoop->end();
5642 MachineInstr *NewAccumulator;
5643 // Perform the computations
5644 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5645 BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5646 .addReg(ActiveBitsReg);
5647 if (is32BitOpc) {
5648 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5649 LaneValueReg)
5650 .addReg(SrcReg)
5651 .addReg(FF1Reg);
5652 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5653 .addReg(Accumulator->getOperand(0).getReg())
5654 .addReg(LaneValueReg);
5655 } else {
5656 Register LaneValueLoReg =
5657 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5658 Register LaneValueHiReg =
5659 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5660 Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5661 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5662 const TargetRegisterClass *SrcSubRC =
5663 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5664 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5665 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5666 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5667 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5668 // The lane value input should be in an SGPR.
5669 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5670 LaneValueLoReg)
5671 .add(Op1L)
5672 .addReg(FF1Reg);
5673 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5674 LaneValueHiReg)
5675 .add(Op1H)
5676 .addReg(FF1Reg);
5677 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5678 TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
5679 .addReg(LaneValueLoReg)
5680 .addImm(AMDGPU::sub0)
5681 .addReg(LaneValueHiReg)
5682 .addImm(AMDGPU::sub1);
5683 switch (Opc) {
5684 case AMDGPU::S_OR_B64:
5685 case AMDGPU::S_AND_B64:
5686 case AMDGPU::S_XOR_B64: {
5687 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5688 .addReg(Accumulator->getOperand(0).getReg())
5689 .addReg(LaneValue->getOperand(0).getReg())
5690 .setOperandDead(3); // Dead scc
5691 break;
5692 }
5693 case AMDGPU::V_CMP_GT_I64_e64:
5694 case AMDGPU::V_CMP_GT_U64_e64:
5695 case AMDGPU::V_CMP_LT_I64_e64:
5696 case AMDGPU::V_CMP_LT_U64_e64: {
5697 Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
5698 Register ComparisonResultReg =
5699 MRI.createVirtualRegister(WaveMaskRegClass);
5700 const TargetRegisterClass *VregClass = TRI->getVGPR64Class();
5701 const TargetRegisterClass *VSubRegClass =
5702 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
5703 Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
5704 MachineOperand SrcReg0Sub0 =
5705 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5706 VregClass, AMDGPU::sub0, VSubRegClass);
5707 MachineOperand SrcReg0Sub1 =
5708 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5709 VregClass, AMDGPU::sub1, VSubRegClass);
5710 BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
5711 AccumulatorVReg)
5712 .add(SrcReg0Sub0)
5713 .addImm(AMDGPU::sub0)
5714 .add(SrcReg0Sub1)
5715 .addImm(AMDGPU::sub1);
5716 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
5717 .addReg(LaneValue->getOperand(0).getReg())
5718 .addReg(AccumulatorVReg);
5719
5720 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5721 BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
5722 .addReg(LaneMaskReg)
5723 .addReg(ActiveBitsReg);
5724
5725 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5726 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
5727 .addReg(LaneValue->getOperand(0).getReg())
5728 .addReg(Accumulator->getOperand(0).getReg());
5729 break;
5730 }
5731 case AMDGPU::S_ADD_U64_PSEUDO:
5732 case AMDGPU::S_SUB_U64_PSEUDO: {
5733 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5734 .addReg(Accumulator->getOperand(0).getReg())
5735 .addReg(LaneValue->getOperand(0).getReg());
5736 ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
5737 break;
5738 }
5739 }
5740 }
5741 // Manipulate the iterator to get the next active lane
5742 unsigned BITSETOpc =
5743 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5744 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5745 .addReg(FF1Reg)
5746 .addReg(ActiveBitsReg);
5747
5748 // Add phi nodes
5749 Accumulator.addReg(DstReg).addMBB(ComputeLoop);
5750 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
5751
5752 // Create the loop back-branch.
5753 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5754 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5755 .addReg(NewActiveBitsReg)
5756 .addImm(0);
5757 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5758 .addMBB(ComputeLoop);
5759
5760 RetBB = ComputeEnd;
5761 }
5762 MI.eraseFromParent();
5763 return RetBB;
5764}
5765
5768 MachineBasicBlock *BB) const {
5769
5771 MachineFunction *MF = BB->getParent();
5773
5774 switch (MI.getOpcode()) {
5775 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5776 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5777 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
5778 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
5779 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5780 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
5781 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
5782 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
5783 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5784 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5785 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
5786 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
5787 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5788 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
5789 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
5790 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
5791 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5792 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5793 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
5794 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
5795 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5796 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5797 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
5798 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
5799 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5800 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
5801 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
5802 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
5803 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5804 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
5805 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
5806 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
5807 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5808 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
5809 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
5810 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
5811 case AMDGPU::S_UADDO_PSEUDO:
5812 case AMDGPU::S_USUBO_PSEUDO: {
5813 const DebugLoc &DL = MI.getDebugLoc();
5814 MachineOperand &Dest0 = MI.getOperand(0);
5815 MachineOperand &Dest1 = MI.getOperand(1);
5816 MachineOperand &Src0 = MI.getOperand(2);
5817 MachineOperand &Src1 = MI.getOperand(3);
5818
5819 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5820 ? AMDGPU::S_ADD_I32
5821 : AMDGPU::S_SUB_I32;
5822 // clang-format off
5823 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
5824 .add(Src0)
5825 .add(Src1);
5826 // clang-format on
5827
5828 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
5829 .addImm(1)
5830 .addImm(0);
5831
5832 MI.eraseFromParent();
5833 return BB;
5834 }
5835 case AMDGPU::S_ADD_U64_PSEUDO:
5836 case AMDGPU::S_SUB_U64_PSEUDO: {
5837 return Expand64BitScalarArithmetic(MI, BB);
5838 }
5839 case AMDGPU::V_ADD_U64_PSEUDO:
5840 case AMDGPU::V_SUB_U64_PSEUDO: {
5842 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5843 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5844 const DebugLoc &DL = MI.getDebugLoc();
5845
5846 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5847
5848 MachineOperand &Dest = MI.getOperand(0);
5849 MachineOperand &Src0 = MI.getOperand(1);
5850 MachineOperand &Src1 = MI.getOperand(2);
5851
5852 if (ST.hasAddSubU64Insts()) {
5853 auto I = BuildMI(*BB, MI, DL,
5854 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
5855 : AMDGPU::V_SUB_U64_e64),
5856 Dest.getReg())
5857 .add(Src0)
5858 .add(Src1)
5859 .addImm(0); // clamp
5860 TII->legalizeOperands(*I);
5861 MI.eraseFromParent();
5862 return BB;
5863 }
5864
5865 if (IsAdd && ST.hasLshlAddU64Inst()) {
5866 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
5867 Dest.getReg())
5868 .add(Src0)
5869 .addImm(0)
5870 .add(Src1);
5871 TII->legalizeOperands(*Add);
5872 MI.eraseFromParent();
5873 return BB;
5874 }
5875
5876 const auto *CarryRC = TRI->getWaveMaskRegClass();
5877
5878 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5879 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5880
5881 Register CarryReg = MRI.createVirtualRegister(CarryRC);
5882 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
5883
5884 const TargetRegisterClass *Src0RC = Src0.isReg()
5885 ? MRI.getRegClass(Src0.getReg())
5886 : &AMDGPU::VReg_64RegClass;
5887 const TargetRegisterClass *Src1RC = Src1.isReg()
5888 ? MRI.getRegClass(Src1.getReg())
5889 : &AMDGPU::VReg_64RegClass;
5890
5891 const TargetRegisterClass *Src0SubRC =
5892 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5893 const TargetRegisterClass *Src1SubRC =
5894 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5895
5896 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
5897 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5898 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
5899 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5900
5901 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
5902 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5903 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
5904 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5905
5906 unsigned LoOpc =
5907 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5908 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5909 .addReg(CarryReg, RegState::Define)
5910 .add(SrcReg0Sub0)
5911 .add(SrcReg1Sub0)
5912 .addImm(0); // clamp bit
5913
5914 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5915 MachineInstr *HiHalf =
5916 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5917 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
5918 .add(SrcReg0Sub1)
5919 .add(SrcReg1Sub1)
5920 .addReg(CarryReg, RegState::Kill)
5921 .addImm(0); // clamp bit
5922
5923 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5924 .addReg(DestSub0)
5925 .addImm(AMDGPU::sub0)
5926 .addReg(DestSub1)
5927 .addImm(AMDGPU::sub1);
5928 TII->legalizeOperands(*LoHalf);
5929 TII->legalizeOperands(*HiHalf);
5930 MI.eraseFromParent();
5931 return BB;
5932 }
5933 case AMDGPU::S_ADD_CO_PSEUDO:
5934 case AMDGPU::S_SUB_CO_PSEUDO: {
5935 // This pseudo can only be selected
5936 // from a uniform add/subcarry node, so all the VGPR operands
5937 // are assumed to be splat vectors.
5939 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5940 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5942 const DebugLoc &DL = MI.getDebugLoc();
5943 MachineOperand &Dest = MI.getOperand(0);
5944 MachineOperand &CarryDest = MI.getOperand(1);
5945 MachineOperand &Src0 = MI.getOperand(2);
5946 MachineOperand &Src1 = MI.getOperand(3);
5947 MachineOperand &Src2 = MI.getOperand(4);
5948 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5949 ? AMDGPU::S_ADDC_U32
5950 : AMDGPU::S_SUBB_U32;
5951 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5952 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5953 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5954 .addReg(Src0.getReg());
5955 Src0.setReg(RegOp0);
5956 }
5957 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5958 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5959 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5960 .addReg(Src1.getReg());
5961 Src1.setReg(RegOp1);
5962 }
5963 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5964 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
5965 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5966 .addReg(Src2.getReg());
5967 Src2.setReg(RegOp2);
5968 }
5969
5970 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
5971 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5972 assert(WaveSize == 64 || WaveSize == 32);
5973
5974 if (WaveSize == 64) {
5975 if (ST.hasScalarCompareEq64()) {
5976 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5977 .addReg(Src2.getReg())
5978 .addImm(0);
5979 } else {
5980 const TargetRegisterClass *SubRC =
5981 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5982 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5983 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5984 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5985 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5986 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5987
5988 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5989 .add(Src2Sub0)
5990 .add(Src2Sub1);
5991
5992 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5993 .addReg(Src2_32, RegState::Kill)
5994 .addImm(0);
5995 }
5996 } else {
5997 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5998 .addReg(Src2.getReg())
5999 .addImm(0);
6000 }
6001
6002 // clang-format off
6003 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg())
6004 .add(Src0)
6005 .add(Src1);
6006 // clang-format on
6007
6008 unsigned SelOpc =
6009 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6010
6011 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
6012 .addImm(-1)
6013 .addImm(0);
6014
6015 MI.eraseFromParent();
6016 return BB;
6017 }
6018 case AMDGPU::SI_INIT_M0: {
6019 MachineOperand &M0Init = MI.getOperand(0);
6020 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6021 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6022 AMDGPU::M0)
6023 .add(M0Init);
6024 MI.eraseFromParent();
6025 return BB;
6026 }
6027 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6028 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6029 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6030 TII->get(AMDGPU::S_CMP_EQ_U32))
6031 .addImm(0)
6032 .addImm(0);
6033 return BB;
6034 }
6035 case AMDGPU::GET_GROUPSTATICSIZE: {
6036 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
6037 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
6038 DebugLoc DL = MI.getDebugLoc();
6039 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
6040 .add(MI.getOperand(0))
6041 .addImm(MFI->getLDSSize());
6042 MI.eraseFromParent();
6043 return BB;
6044 }
6045 case AMDGPU::GET_SHADERCYCLESHILO: {
6048 const DebugLoc &DL = MI.getDebugLoc();
6049 // The algorithm is:
6050 //
6051 // hi1 = getreg(SHADER_CYCLES_HI)
6052 // lo1 = getreg(SHADER_CYCLES_LO)
6053 // hi2 = getreg(SHADER_CYCLES_HI)
6054 //
6055 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
6056 // Otherwise there was overflow and the result is hi2:0. In both cases the
6057 // result should represent the actual time at some point during the sequence
6058 // of three getregs.
6059 using namespace AMDGPU::Hwreg;
6060 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6061 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
6062 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6063 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6064 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
6065 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6066 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6067 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
6068 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6069 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
6070 .addReg(RegHi1)
6071 .addReg(RegHi2);
6072 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6073 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
6074 .addReg(RegLo1)
6075 .addImm(0);
6076 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
6077 .add(MI.getOperand(0))
6078 .addReg(RegLo)
6079 .addImm(AMDGPU::sub0)
6080 .addReg(RegHi2)
6081 .addImm(AMDGPU::sub1);
6082 MI.eraseFromParent();
6083 return BB;
6084 }
6085 case AMDGPU::SI_INDIRECT_SRC_V1:
6086 case AMDGPU::SI_INDIRECT_SRC_V2:
6087 case AMDGPU::SI_INDIRECT_SRC_V4:
6088 case AMDGPU::SI_INDIRECT_SRC_V8:
6089 case AMDGPU::SI_INDIRECT_SRC_V9:
6090 case AMDGPU::SI_INDIRECT_SRC_V10:
6091 case AMDGPU::SI_INDIRECT_SRC_V11:
6092 case AMDGPU::SI_INDIRECT_SRC_V12:
6093 case AMDGPU::SI_INDIRECT_SRC_V16:
6094 case AMDGPU::SI_INDIRECT_SRC_V32:
6095 return emitIndirectSrc(MI, *BB, *getSubtarget());
6096 case AMDGPU::SI_INDIRECT_DST_V1:
6097 case AMDGPU::SI_INDIRECT_DST_V2:
6098 case AMDGPU::SI_INDIRECT_DST_V4:
6099 case AMDGPU::SI_INDIRECT_DST_V8:
6100 case AMDGPU::SI_INDIRECT_DST_V9:
6101 case AMDGPU::SI_INDIRECT_DST_V10:
6102 case AMDGPU::SI_INDIRECT_DST_V11:
6103 case AMDGPU::SI_INDIRECT_DST_V12:
6104 case AMDGPU::SI_INDIRECT_DST_V16:
6105 case AMDGPU::SI_INDIRECT_DST_V32:
6106 return emitIndirectDst(MI, *BB, *getSubtarget());
6107 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6108 case AMDGPU::SI_KILL_I1_PSEUDO:
6109 return splitKillBlock(MI, BB);
6110 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6112 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
6113 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6114
6115 Register Dst = MI.getOperand(0).getReg();
6116 const MachineOperand &Src0 = MI.getOperand(1);
6117 const MachineOperand &Src1 = MI.getOperand(2);
6118 const DebugLoc &DL = MI.getDebugLoc();
6119 Register SrcCond = MI.getOperand(3).getReg();
6120
6121 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6122 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6123 const auto *CondRC = TRI->getWaveMaskRegClass();
6124 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
6125
6126 const TargetRegisterClass *Src0RC = Src0.isReg()
6127 ? MRI.getRegClass(Src0.getReg())
6128 : &AMDGPU::VReg_64RegClass;
6129 const TargetRegisterClass *Src1RC = Src1.isReg()
6130 ? MRI.getRegClass(Src1.getReg())
6131 : &AMDGPU::VReg_64RegClass;
6132
6133 const TargetRegisterClass *Src0SubRC =
6134 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6135 const TargetRegisterClass *Src1SubRC =
6136 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6137
6138 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
6139 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6140 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
6141 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6142
6143 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
6144 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6145 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
6146 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6147
6148 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
6149 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
6150 .addImm(0)
6151 .add(Src0Sub0)
6152 .addImm(0)
6153 .add(Src1Sub0)
6154 .addReg(SrcCondCopy);
6155 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
6156 .addImm(0)
6157 .add(Src0Sub1)
6158 .addImm(0)
6159 .add(Src1Sub1)
6160 .addReg(SrcCondCopy);
6161
6162 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
6163 .addReg(DstLo)
6164 .addImm(AMDGPU::sub0)
6165 .addReg(DstHi)
6166 .addImm(AMDGPU::sub1);
6167 MI.eraseFromParent();
6168 return BB;
6169 }
6170 case AMDGPU::SI_BR_UNDEF: {
6172 const DebugLoc &DL = MI.getDebugLoc();
6173 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
6174 .add(MI.getOperand(0));
6175 Br->getOperand(1).setIsUndef(); // read undef SCC
6176 MI.eraseFromParent();
6177 return BB;
6178 }
6179 case AMDGPU::ADJCALLSTACKUP:
6180 case AMDGPU::ADJCALLSTACKDOWN: {
6182 MachineInstrBuilder MIB(*MF, &MI);
6183 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
6184 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
6185 return BB;
6186 }
6187 case AMDGPU::SI_CALL_ISEL: {
6189 const DebugLoc &DL = MI.getDebugLoc();
6190
6191 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
6192
6194 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6195
6196 for (const MachineOperand &MO : MI.operands())
6197 MIB.add(MO);
6198
6199 MIB.cloneMemRefs(MI);
6200 MI.eraseFromParent();
6201 return BB;
6202 }
6203 case AMDGPU::V_ADD_CO_U32_e32:
6204 case AMDGPU::V_SUB_CO_U32_e32:
6205 case AMDGPU::V_SUBREV_CO_U32_e32: {
6206 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
6207 const DebugLoc &DL = MI.getDebugLoc();
6208 unsigned Opc = MI.getOpcode();
6209
6210 bool NeedClampOperand = false;
6211 if (TII->pseudoToMCOpcode(Opc) == -1) {
6212 Opc = AMDGPU::getVOPe64(Opc); // Switch to the VOP3 (e64) encoding.
6213 NeedClampOperand = true;
6214 }
6215
6216 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
6217 if (TII->isVOP3(*I)) {
6218 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
6219 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6220 I.addReg(TRI->getVCC(), RegState::Define);
6221 }
6222 I.add(MI.getOperand(1)).add(MI.getOperand(2));
6223 if (NeedClampOperand)
6224 I.addImm(0); // clamp bit for e64 encoding
6225
6226 TII->legalizeOperands(*I);
6227
6228 MI.eraseFromParent();
6229 return BB;
6230 }
6231 case AMDGPU::V_ADDC_U32_e32:
6232 case AMDGPU::V_SUBB_U32_e32:
6233 case AMDGPU::V_SUBBREV_U32_e32:
6234 // These instructions have an implicit use of vcc which counts towards the
6235 // constant bus limit.
6236 TII->legalizeOperands(MI);
6237 return BB;
6238 case AMDGPU::DS_GWS_INIT:
6239 case AMDGPU::DS_GWS_SEMA_BR:
6240 case AMDGPU::DS_GWS_BARRIER:
6241 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
6242 [[fallthrough]];
6243 case AMDGPU::DS_GWS_SEMA_V:
6244 case AMDGPU::DS_GWS_SEMA_P:
6245 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6246 // An s_waitcnt 0 is required to be the instruction immediately following.
6247 if (getSubtarget()->hasGWSAutoReplay()) {
6249 return BB;
6250 }
6251
6252 return emitGWSMemViolTestLoop(MI, BB);
6253 case AMDGPU::S_SETREG_B32: {
6254 // Try to optimize cases that only set the denormal mode or rounding mode.
6255 //
6256 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
6257 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
6258 // instead.
6259 //
6260 // FIXME: This could be predicated on the immediate, but tablegen doesn't
6261 // allow a no-side-effect instruction in the output of a
6262 // side-effecting pattern.
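// For example, a constant write that covers exactly the four rounding-mode
// bits of the MODE register can be emitted as s_round_mode, and one that
// covers exactly the four denormal-mode bits as s_denorm_mode.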
6263 auto [ID, Offset, Width] =
6264 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
6265 if (ID != AMDGPU::Hwreg::ID_MODE)
6266 return BB;
6267
6268 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
6269 const unsigned SetMask = WidthMask << Offset;
6270
6271 if (getSubtarget()->hasDenormModeInst()) {
6272 unsigned SetDenormOp = 0;
6273 unsigned SetRoundOp = 0;
6274
6275 // The dedicated instructions can only set the whole denorm or round mode
6276 // at once, not a subset of bits in either.
6277 if (SetMask ==
6278 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
6279 // If this fully sets both the round and denorm mode, emit the two
6280 // dedicated instructions for these.
6281 SetRoundOp = AMDGPU::S_ROUND_MODE;
6282 SetDenormOp = AMDGPU::S_DENORM_MODE;
6283 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
6284 SetRoundOp = AMDGPU::S_ROUND_MODE;
6285 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
6286 SetDenormOp = AMDGPU::S_DENORM_MODE;
6287 }
6288
6289 if (SetRoundOp || SetDenormOp) {
6291 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
6292 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6293 unsigned ImmVal = Def->getOperand(1).getImm();
6294 if (SetRoundOp) {
6295 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
6296 .addImm(ImmVal & 0xf);
6297
6298 // If we also have the denorm mode, get just the denorm mode bits.
6299 ImmVal >>= 4;
6300 }
6301
6302 if (SetDenormOp) {
6303 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
6304 .addImm(ImmVal & 0xf);
6305 }
6306
6307 MI.eraseFromParent();
6308 return BB;
6309 }
6310 }
6311 }
6312
6313 // If only FP bits are touched, use the no-side-effects pseudo.
6314 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
6315 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
6316 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
6317
6318 return BB;
6319 }
6320 case AMDGPU::S_INVERSE_BALLOT_U32:
6321 case AMDGPU::S_INVERSE_BALLOT_U64:
6322 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
6323 // necessary. After that they are equivalent to a COPY.
6324 MI.setDesc(TII->get(AMDGPU::COPY));
6325 return BB;
6326 case AMDGPU::ENDPGM_TRAP: {
6327 const DebugLoc &DL = MI.getDebugLoc();
6328 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
6329 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
6330 MI.addOperand(MachineOperand::CreateImm(0));
6331 return BB;
6332 }
6333
6334 // We need a block split to make the real endpgm a terminator. We also don't
6335 // want to break phis in successor blocks, so we can't just delete to the
6336 // end of the block.
6337
6338 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
6340 MF->push_back(TrapBB);
6341 // clang-format off
6342 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
6343 .addImm(0);
6344 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
6345 .addMBB(TrapBB);
6346 // clang-format on
6347
6348 BB->addSuccessor(TrapBB);
6349 MI.eraseFromParent();
6350 return SplitBB;
6351 }
6352 case AMDGPU::SIMULATED_TRAP: {
6353 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6355 MachineBasicBlock *SplitBB =
6356 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6357 MI.eraseFromParent();
6358 return SplitBB;
6359 }
6360 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6361 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6363
6364 // During ISel, it's difficult to propagate the original EXEC mask to use as
6365 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
6366 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
6367 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6368 Register OriginalExec = Setup->getOperand(0).getReg();
6369 MF->getRegInfo().clearKillFlags(OriginalExec);
6370 MI.getOperand(0).setReg(OriginalExec);
6371 return BB;
6372 }
6373 default:
6374 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
6375 if (!MI.mayStore())
6377 return BB;
6378 }
6380 }
6381}
6382
6384 // This currently forces unfolding various combinations of fsub into fma with
6385 // free fneg'd operands. As long as we have fast FMA (controlled by
6386 // isFMAFasterThanFMulAndFAdd), we should perform these.
6387
6388 // When fma is quarter rate, for f64 where add / sub are at best half rate,
6389 // most of these combines appear to be cycle neutral but save on instruction
6390 // count / code size.
6391 return true;
6392}
6393
6395
6397 EVT VT) const {
6398 if (!VT.isVector()) {
6399 return MVT::i1;
6400 }
6401 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
6402}
6403
6405 // TODO: Should i16 be used always if legal? For now it would force VALU
6406 // shifts.
6407 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6408}
6409
6411 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6412 ? Ty.changeElementSize(16)
6413 : Ty.changeElementSize(32);
6414}
6415
6416 // Answering this is somewhat tricky and depends on the specific device, since
6417 // different devices have different rates for fma and for f64 operations.
6418//
6419// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
6420// regardless of which device (although the number of cycles differs between
6421// devices), so it is always profitable for f64.
6422//
6423// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
6424// only on full rate devices. Normally, we should prefer selecting v_mad_f32
6425// which we can always do even without fused FP ops since it returns the same
6426// result as the separate operations and since it is always full
6427// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
6428// however does not support denormals, so we do report fma as faster if we have
6429// a fast fma device and require denormals.
6430//
6432 EVT VT) const {
6433 VT = VT.getScalarType();
6434
6435 switch (VT.getSimpleVT().SimpleTy) {
6436 case MVT::f32: {
6437 // If mad is not available this depends only on if f32 fma is full rate.
6438 if (!Subtarget->hasMadMacF32Insts())
6439 return Subtarget->hasFastFMAF32();
6440
6441 // Otherwise f32 mad is always full rate and returns the same result as
6442 // the separate operations, so it should be preferred over fma.
6443 // However, it does not support denormals.
6445 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6446
6447 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
6448 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6449 }
6450 case MVT::f64:
6451 return true;
6452 case MVT::f16:
6453 case MVT::bf16:
6454 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
6455 default:
6456 break;
6457 }
6458
6459 return false;
6460}
6461
6463 LLT Ty) const {
6464 switch (Ty.getScalarSizeInBits()) {
6465 case 16:
6466 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
6467 case 32:
6468 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
6469 case 64:
6470 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
6471 default:
6472 break;
6473 }
6474
6475 return false;
6476}
6477
6479 if (!Ty.isScalar())
6480 return false;
6481
6482 if (Ty.getScalarSizeInBits() == 16)
6483 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
6484 if (Ty.getScalarSizeInBits() == 32)
6485 return Subtarget->hasMadMacF32Insts() &&
6486 denormalModeIsFlushAllF32(*MI.getMF());
6487
6488 return false;
6489}
6490
6492 const SDNode *N) const {
6493 // TODO: Check future ftz flag
6494 // v_mad_f32/v_mac_f32 do not support denormals.
6495 EVT VT = N->getValueType(0);
6496 if (VT == MVT::f32)
6497 return Subtarget->hasMadMacF32Insts() &&
6499 if (VT == MVT::f16) {
6500 return Subtarget->hasMadF16() &&
6502 }
6503
6504 return false;
6505}
6506
6507//===----------------------------------------------------------------------===//
6508// Custom DAG Lowering Operations
6509//===----------------------------------------------------------------------===//
6510
6511// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6512// wider vector type is legal.
6514 SelectionDAG &DAG) const {
6515 unsigned Opc = Op.getOpcode();
6516 EVT VT = Op.getValueType();
6517 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
6518 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
6519 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6520 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
6521
6522 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
6523
6524 SDLoc SL(Op);
6525 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
6526 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
6527
6528 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6529}
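// For example, an fneg of v4f16 is split into two v2f16 fnegs whose results
// are recombined with CONCAT_VECTORS, instead of being scalarized into four
// f16 operations.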
6530
6531// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6532// wider vector type is legal.
6534 SelectionDAG &DAG) const {
6535 unsigned Opc = Op.getOpcode();
6536 EVT VT = Op.getValueType();
6537 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6538 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6539 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6540 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6541 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6542 VT == MVT::v32bf16);
6543
6544 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
6545 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6546
6547 SDLoc SL(Op);
6548
6549 SDValue OpLo =
6550 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6551 SDValue OpHi =
6552 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6553
6554 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6555}
6556
6558 SelectionDAG &DAG) const {
6559 unsigned Opc = Op.getOpcode();
6560 EVT VT = Op.getValueType();
6561 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6562 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6563 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6564 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6565 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6566 VT == MVT::v32bf16);
6567
6568 SDValue Op0 = Op.getOperand(0);
6569 auto [Lo0, Hi0] = Op0.getValueType().isVector()
6570 ? DAG.SplitVectorOperand(Op.getNode(), 0)
6571 : std::pair(Op0, Op0);
6572
6573 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6574 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
6575
6576 SDLoc SL(Op);
6577 auto ResVT = DAG.GetSplitDestVTs(VT);
6578
6579 SDValue OpLo =
6580 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
6581 SDValue OpHi =
6582 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
6583
6584 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6585}
6586
6588 switch (Op.getOpcode()) {
6589 default:
6591 case ISD::BRCOND:
6592 return LowerBRCOND(Op, DAG);
6593 case ISD::RETURNADDR:
6594 return LowerRETURNADDR(Op, DAG);
6595 case ISD::LOAD: {
6596 SDValue Result = LowerLOAD(Op, DAG);
6597 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6598 "Load should return a value and a chain");
6599 return Result;
6600 }
6601 case ISD::FSQRT: {
6602 EVT VT = Op.getValueType();
6603 if (VT == MVT::f32)
6604 return lowerFSQRTF32(Op, DAG);
6605 if (VT == MVT::f64)
6606 return lowerFSQRTF64(Op, DAG);
6607 return SDValue();
6608 }
6609 case ISD::FSIN:
6610 case ISD::FCOS:
6611 return LowerTrig(Op, DAG);
6612 case ISD::SELECT:
6613 return LowerSELECT(Op, DAG);
6614 case ISD::FDIV:
6615 return LowerFDIV(Op, DAG);
6616 case ISD::FFREXP:
6617 return LowerFFREXP(Op, DAG);
6618 case ISD::ATOMIC_CMP_SWAP:
6619 return LowerATOMIC_CMP_SWAP(Op, DAG);
6620 case ISD::STORE:
6621 return LowerSTORE(Op, DAG);
6622 case ISD::GlobalAddress: {
6623    MachineFunction &MF = DAG.getMachineFunction();
6624    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6625    return LowerGlobalAddress(MFI, Op, DAG);
6626 }
6627  case ISD::INTRINSIC_WO_CHAIN:
6628    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6629  case ISD::INTRINSIC_W_CHAIN:
6630    return LowerINTRINSIC_W_CHAIN(Op, DAG);
6631  case ISD::INTRINSIC_VOID:
6632    return LowerINTRINSIC_VOID(Op, DAG);
6633 case ISD::ADDRSPACECAST:
6634 return lowerADDRSPACECAST(Op, DAG);
6635  case ISD::INSERT_SUBVECTOR:
6636    return lowerINSERT_SUBVECTOR(Op, DAG);
6637  case ISD::INSERT_VECTOR_ELT:
6638    return lowerINSERT_VECTOR_ELT(Op, DAG);
6639  case ISD::EXTRACT_VECTOR_ELT:
6640    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
6641  case ISD::VECTOR_SHUFFLE:
6642    return lowerVECTOR_SHUFFLE(Op, DAG);
6643  case ISD::SCALAR_TO_VECTOR:
6644    return lowerSCALAR_TO_VECTOR(Op, DAG);
6645 case ISD::BUILD_VECTOR:
6646 return lowerBUILD_VECTOR(Op, DAG);
6647 case ISD::FP_ROUND:
6648  case ISD::STRICT_FP_ROUND:
6649    return lowerFP_ROUND(Op, DAG);
6650 case ISD::TRAP:
6651 return lowerTRAP(Op, DAG);
6652 case ISD::DEBUGTRAP:
6653 return lowerDEBUGTRAP(Op, DAG);
6654 case ISD::ABS:
6655 case ISD::FABS:
6656 case ISD::FNEG:
6657 case ISD::FCANONICALIZE:
6658 case ISD::BSWAP:
6659 return splitUnaryVectorOp(Op, DAG);
6660 case ISD::FMINNUM:
6661 case ISD::FMAXNUM:
6662 return lowerFMINNUM_FMAXNUM(Op, DAG);
6663 case ISD::FMINIMUMNUM:
6664 case ISD::FMAXIMUMNUM:
6665 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
6666 case ISD::FMINIMUM:
6667 case ISD::FMAXIMUM:
6668 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
6669 case ISD::FLDEXP:
6670 case ISD::STRICT_FLDEXP:
6671 return lowerFLDEXP(Op, DAG);
6672 case ISD::FMA:
6673 return splitTernaryVectorOp(Op, DAG);
6674 case ISD::FP_TO_SINT:
6675 case ISD::FP_TO_UINT:
6676 return LowerFP_TO_INT(Op, DAG);
6677 case ISD::SHL:
6678 case ISD::SRA:
6679 case ISD::SRL:
6680 case ISD::ADD:
6681 case ISD::SUB:
6682 case ISD::SMIN:
6683 case ISD::SMAX:
6684 case ISD::UMIN:
6685 case ISD::UMAX:
6686 case ISD::FADD:
6687 case ISD::FMUL:
6688 case ISD::FMINNUM_IEEE:
6689 case ISD::FMAXNUM_IEEE:
6690 case ISD::UADDSAT:
6691 case ISD::USUBSAT:
6692 case ISD::SADDSAT:
6693 case ISD::SSUBSAT:
6694 return splitBinaryVectorOp(Op, DAG);
6695 case ISD::FCOPYSIGN:
6696 return lowerFCOPYSIGN(Op, DAG);
6697 case ISD::MUL:
6698 return lowerMUL(Op, DAG);
6699 case ISD::SMULO:
6700 case ISD::UMULO:
6701 return lowerXMULO(Op, DAG);
6702 case ISD::SMUL_LOHI:
6703 case ISD::UMUL_LOHI:
6704 return lowerXMUL_LOHI(Op, DAG);
6705 case ISD::DYNAMIC_STACKALLOC:
6706 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6707 case ISD::STACKSAVE:
6708 return LowerSTACKSAVE(Op, DAG);
6709 case ISD::GET_ROUNDING:
6710 return lowerGET_ROUNDING(Op, DAG);
6711 case ISD::SET_ROUNDING:
6712 return lowerSET_ROUNDING(Op, DAG);
6713 case ISD::PREFETCH:
6714 return lowerPREFETCH(Op, DAG);
6715 case ISD::FP_EXTEND:
6716  case ISD::STRICT_FP_EXTEND:
6717    return lowerFP_EXTEND(Op, DAG);
6718 case ISD::GET_FPENV:
6719 return lowerGET_FPENV(Op, DAG);
6720 case ISD::SET_FPENV:
6721 return lowerSET_FPENV(Op, DAG);
6722 }
6723 return SDValue();
6724}
6725
6726// Used for D16: Casts the result of an instruction into the right vector,
6727// packs values if loads return unpacked values.
6728static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
6729                                       const SDLoc &DL, SelectionDAG &DAG,
6730 bool Unpacked) {
6731 if (!LoadVT.isVector())
6732 return Result;
6733
6734 // Cast back to the original packed type or to a larger type that is a
6735  // multiple of 32 bits for D16. Widening the return type is required for
6736 // legalization.
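  // Editorial example (not part of the original source): a D16 load of v3f16
  // is widened here to v4f16 (one poison element) so the result has a size
  // that is a multiple of 32 bits; with unpacked D16 the hardware instead
  // returns one 32-bit register per element (e.g. v3i32), which the code
  // below truncates element-wise back to i16 before bitcasting to the packed
  // type.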
6737 EVT FittingLoadVT = LoadVT;
6738 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6739    FittingLoadVT =
6740        EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
6741                         LoadVT.getVectorNumElements() + 1);
6742 }
6743
6744 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6745 // Truncate to v2i16/v4i16.
6746 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6747
6748    // Work around the legalizer neither scalarizing the truncate after vector
6749    // op legalization nor creating an intermediate vector truncate.
6750    SmallVector<SDValue, 4> Elts;
6751    DAG.ExtractVectorElements(Result, Elts);
6752 for (SDValue &Elt : Elts)
6753 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6754
6755    // Pad illegal v1i16/v3f16 to v4i16
6756 if ((LoadVT.getVectorNumElements() % 2) == 1)
6757 Elts.push_back(DAG.getPOISON(MVT::i16));
6758
6759 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6760
6761 // Bitcast to original type (v2f16/v4f16).
6762 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6763 }
6764
6765 // Cast back to the original packed type.
6766 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6767}
6768
6769SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6770 SelectionDAG &DAG,
6771                                              ArrayRef<SDValue> Ops,
6772                                              bool IsIntrinsic) const {
6773 SDLoc DL(M);
6774
6775 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6776 EVT LoadVT = M->getValueType(0);
6777
6778 EVT EquivLoadVT = LoadVT;
6779 if (LoadVT.isVector()) {
6780 if (Unpacked) {
6781 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6782 LoadVT.getVectorNumElements());
6783 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
6784 // Widen v3f16 to legal type
6785      EquivLoadVT =
6786          EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
6787                           LoadVT.getVectorNumElements() + 1);
6788 }
6789 }
6790
6791 // Change from v4f16/v2f16 to EquivLoadVT.
6792 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
6793
6794  SDValue Load = DAG.getMemIntrinsicNode(
6795      IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
6796 M->getMemoryVT(), M->getMemOperand());
6797
6798 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
6799
6800 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
6801}
6802
6803SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
6804 SelectionDAG &DAG,
6805 ArrayRef<SDValue> Ops) const {
6806 SDLoc DL(M);
6807 EVT LoadVT = M->getValueType(0);
6808 EVT EltType = LoadVT.getScalarType();
6809 EVT IntVT = LoadVT.changeTypeToInteger();
6810
6811 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
6812
6813 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6814 bool IsTFE = M->getNumValues() == 3;
6815
6816 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
6817                                   : AMDGPUISD::BUFFER_LOAD_FORMAT)
6818                          : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
6819 : AMDGPUISD::BUFFER_LOAD;
6820
6821 if (IsD16) {
6822 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
6823 }
6824
6825 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6826 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
6827 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6828 IsTFE);
6829
6830 if (isTypeLegal(LoadVT)) {
6831 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6832 M->getMemOperand(), DAG);
6833 }
6834
6835 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
6836 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6837 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
6838 M->getMemOperand(), DAG);
6839 return DAG.getMergeValues(
6840 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6841 DL);
6842}
6843
6844static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
6845                                  SelectionDAG &DAG) {
6846 EVT VT = N->getValueType(0);
6847 unsigned CondCode = N->getConstantOperandVal(3);
6848 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6849 return DAG.getPOISON(VT);
6850
6851 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6852
6853 SDValue LHS = N->getOperand(1);
6854 SDValue RHS = N->getOperand(2);
6855
6856 SDLoc DL(N);
6857
6858 EVT CmpVT = LHS.getValueType();
6859 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
6860    unsigned PromoteOp =
6861        ICmpInst::isSigned(IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6862    LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
6863 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
6864 }
6865
6866 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
6867
6868 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6869 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6870
6871 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
6872 DAG.getCondCode(CCOpcode));
6873 if (VT.bitsEq(CCVT))
6874 return SetCC;
6875 return DAG.getZExtOrTrunc(SetCC, DL, VT);
6876}
6877
6878static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
6879                                  SelectionDAG &DAG) {
6880 EVT VT = N->getValueType(0);
6881
6882 unsigned CondCode = N->getConstantOperandVal(3);
6883 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
6884 return DAG.getPOISON(VT);
6885
6886 SDValue Src0 = N->getOperand(1);
6887 SDValue Src1 = N->getOperand(2);
6888 EVT CmpVT = Src0.getValueType();
6889 SDLoc SL(N);
6890
6891 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
6892 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6893 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6894 }
6895
6896 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
6897 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
6898 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6899 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6900 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
6901 DAG.getCondCode(CCOpcode));
6902 if (VT.bitsEq(CCVT))
6903 return SetCC;
6904 return DAG.getZExtOrTrunc(SetCC, SL, VT);
6905}
6906
6907static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
6908                                    SelectionDAG &DAG) {
6909 EVT VT = N->getValueType(0);
6910 SDValue Src = N->getOperand(1);
6911 SDLoc SL(N);
6912
6913 if (Src.getOpcode() == ISD::SETCC) {
6914 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
6915 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
6916 Src.getOperand(1), Src.getOperand(2));
6917 }
6918 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
6919 // (ballot 0) -> 0
6920 if (Arg->isZero())
6921 return DAG.getConstant(0, SL, VT);
6922
6923 // (ballot 1) -> EXEC/EXEC_LO
6924 if (Arg->isOne()) {
6925 Register Exec;
6926 if (VT.getScalarSizeInBits() == 32)
6927 Exec = AMDGPU::EXEC_LO;
6928 else if (VT.getScalarSizeInBits() == 64)
6929 Exec = AMDGPU::EXEC;
6930 else
6931 return SDValue();
6932
6933 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
6934 }
6935 }
6936
6937 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
6938 // ISD::SETNE)
6939 return DAG.getNode(
6940 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
6941 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
6942}
6943
6944static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
6945                           SelectionDAG &DAG) {
6946 EVT VT = N->getValueType(0);
6947 unsigned ValSize = VT.getSizeInBits();
6948 unsigned IID = N->getConstantOperandVal(0);
6949 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6950 IID == Intrinsic::amdgcn_permlanex16;
6951 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6952 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6953 SDLoc SL(N);
6954 MVT IntVT = MVT::getIntegerVT(ValSize);
6955 const GCNSubtarget *ST = TLI.getSubtarget();
6956 unsigned SplitSize = 32;
6957 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
6958 ST->hasDPALU_DPP() &&
6959 AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
6960 SplitSize = 64;
6961
6962 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
6963 SDValue Src2, MVT ValT) -> SDValue {
6964    SmallVector<SDValue, 6> Operands;
6965    switch (IID) {
6966 case Intrinsic::amdgcn_permlane16:
6967 case Intrinsic::amdgcn_permlanex16:
6968 case Intrinsic::amdgcn_update_dpp:
6969 Operands.push_back(N->getOperand(6));
6970 Operands.push_back(N->getOperand(5));
6971 Operands.push_back(N->getOperand(4));
6972 [[fallthrough]];
6973 case Intrinsic::amdgcn_writelane:
6974 Operands.push_back(Src2);
6975 [[fallthrough]];
6976 case Intrinsic::amdgcn_readlane:
6977 case Intrinsic::amdgcn_set_inactive:
6978 case Intrinsic::amdgcn_set_inactive_chain_arg:
6979 case Intrinsic::amdgcn_mov_dpp8:
6980 Operands.push_back(Src1);
6981 [[fallthrough]];
6982 case Intrinsic::amdgcn_readfirstlane:
6983 case Intrinsic::amdgcn_permlane64:
6984 Operands.push_back(Src0);
6985 break;
6986 default:
6987 llvm_unreachable("unhandled lane op");
6988 }
6989
6990 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
6991 std::reverse(Operands.begin(), Operands.end());
6992
6993 if (SDNode *GL = N->getGluedNode()) {
6994 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6995 GL = GL->getOperand(0).getNode();
6996 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6997 SDValue(GL, 0)));
6998 }
6999
7000 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
7001 };
7002
7003 SDValue Src0 = N->getOperand(1);
7004 SDValue Src1, Src2;
7005 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7006 IID == Intrinsic::amdgcn_mov_dpp8 ||
7007 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7008 Src1 = N->getOperand(2);
7009 if (IID == Intrinsic::amdgcn_writelane ||
7010 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7011 Src2 = N->getOperand(3);
7012 }
7013
7014 if (ValSize == SplitSize) {
7015 // Already legal
7016 return SDValue();
7017 }
7018
7019 if (ValSize < 32) {
7020 bool IsFloat = VT.isFloatingPoint();
7021 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
7022 SL, MVT::i32);
7023
7024 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7025 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
7026 SL, MVT::i32);
7027 }
7028
7029 if (IID == Intrinsic::amdgcn_writelane) {
7030 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
7031 SL, MVT::i32);
7032 }
7033
7034 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7035 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
7036 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
7037 }
7038
7039 if (ValSize % SplitSize != 0)
7040 return SDValue();
7041
7042 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
7043 EVT VT = N->getValueType(0);
7044 unsigned NE = VT.getVectorNumElements();
7045 EVT EltVT = VT.getVectorElementType();
7046    SmallVector<SDValue, 8> Scalars;
7047    unsigned NumOperands = N->getNumOperands();
7048 SmallVector<SDValue, 4> Operands(NumOperands);
7049 SDNode *GL = N->getGluedNode();
7050
7051 // only handle convergencectrl_glue
7052 assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7053
7054 for (unsigned i = 0; i != NE; ++i) {
7055 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7056 ++j) {
7057 SDValue Operand = N->getOperand(j);
7058 EVT OperandVT = Operand.getValueType();
7059 if (OperandVT.isVector()) {
7060 // A vector operand; extract a single element.
7061 EVT OperandEltVT = OperandVT.getVectorElementType();
7062 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
7063 Operand, DAG.getVectorIdxConstant(i, SL));
7064 } else {
7065 // A scalar operand; just use it as is.
7066 Operands[j] = Operand;
7067 }
7068 }
7069
7070 if (GL)
7071 Operands[NumOperands - 1] =
7072 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7073 SDValue(GL->getOperand(0).getNode(), 0));
7074
7075 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
7076 }
7077
7078 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
7079 return DAG.getBuildVector(VecVT, SL, Scalars);
7080 };
7081
7082 if (VT.isVector()) {
7083    switch (MVT::SimpleValueType EltTy =
7084                VT.getVectorElementType().getSimpleVT().SimpleTy) {
7085    case MVT::i32:
7086 case MVT::f32:
7087 if (SplitSize == 32) {
7088 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
7089 return unrollLaneOp(LaneOp.getNode());
7090 }
7091 [[fallthrough]];
7092 case MVT::i16:
7093 case MVT::f16:
7094 case MVT::bf16: {
7095 unsigned SubVecNumElt =
7096 SplitSize / VT.getVectorElementType().getSizeInBits();
7097 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
7098      SmallVector<SDValue, 4> Pieces;
7099      SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7100 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7101 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
7102 DAG.getConstant(EltIdx, SL, MVT::i32));
7103
7104 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7105 IsPermLane16)
7106 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
7107 DAG.getConstant(EltIdx, SL, MVT::i32));
7108
7109 if (IID == Intrinsic::amdgcn_writelane)
7110 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
7111 DAG.getConstant(EltIdx, SL, MVT::i32));
7112
7113 Pieces.push_back(
7114 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7115 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7116 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7117 EltIdx += SubVecNumElt;
7118 }
7119 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
7120 }
7121 default:
7122 // Handle all other cases by bitcasting to i32 vectors
7123 break;
7124 }
7125 }
7126
7127 MVT VecVT =
7128 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
7129 Src0 = DAG.getBitcast(VecVT, Src0);
7130
7131 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7132 Src1 = DAG.getBitcast(VecVT, Src1);
7133
7134 if (IID == Intrinsic::amdgcn_writelane)
7135 Src2 = DAG.getBitcast(VecVT, Src2);
7136
7137 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7138 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
7139 return DAG.getBitcast(VT, UnrolledLaneOp);
7140}
7141
7142void SITargetLowering::ReplaceNodeResults(SDNode *N,
7143                                          SmallVectorImpl<SDValue> &Results,
7144                                          SelectionDAG &DAG) const {
7145 switch (N->getOpcode()) {
7146  case ISD::INSERT_VECTOR_ELT: {
7147    if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
7148 Results.push_back(Res);
7149 return;
7150 }
7151  case ISD::EXTRACT_VECTOR_ELT: {
7152    if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
7153 Results.push_back(Res);
7154 return;
7155 }
7156  case ISD::INTRINSIC_WO_CHAIN: {
7157    unsigned IID = N->getConstantOperandVal(0);
7158 switch (IID) {
7159 case Intrinsic::amdgcn_make_buffer_rsrc:
7160 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
7161 return;
7162 case Intrinsic::amdgcn_cvt_pkrtz: {
7163 SDValue Src0 = N->getOperand(1);
7164 SDValue Src1 = N->getOperand(2);
7165 SDLoc SL(N);
7166 SDValue Cvt =
7167 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
7168 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
7169 return;
7170 }
7171 case Intrinsic::amdgcn_cvt_pknorm_i16:
7172 case Intrinsic::amdgcn_cvt_pknorm_u16:
7173 case Intrinsic::amdgcn_cvt_pk_i16:
7174 case Intrinsic::amdgcn_cvt_pk_u16: {
7175 SDValue Src0 = N->getOperand(1);
7176 SDValue Src1 = N->getOperand(2);
7177 SDLoc SL(N);
7178 unsigned Opcode;
7179
7180      if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7181        Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7182      else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7183        Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7184      else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7185        Opcode = AMDGPUISD::CVT_PK_I16_I32;
7186      else
7187        Opcode = AMDGPUISD::CVT_PK_U16_U32;
7188
7189 EVT VT = N->getValueType(0);
7190 if (isTypeLegal(VT))
7191 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
7192 else {
7193 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
7194 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
7195 }
7196 return;
7197 }
7198 case Intrinsic::amdgcn_s_buffer_load: {
7199 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
7200 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
7201 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
7202 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
7203 // s_buffer_load_i8.
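    // Editorial sketch (not part of the original source), for the uniform
    // offset path below: the 8-bit result is materialized as a 32-bit scalar
    // buffer load and truncated, e.g.
    //   (i8 llvm.amdgcn.s.buffer.load) -> (trunc (i32 s_buffer_load_u8 ...))
    // and a later sign-extend combine can rewrite the u8 form into the i8
    // form, as described above.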
7204 if (!Subtarget->hasScalarSubwordLoads())
7205 return;
7206 SDValue Op = SDValue(N, 0);
7207 SDValue Rsrc = Op.getOperand(1);
7208 SDValue Offset = Op.getOperand(2);
7209 SDValue CachePolicy = Op.getOperand(3);
7210 EVT VT = Op.getValueType();
7211 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
7212 SDLoc DL(Op);
7214 const DataLayout &DataLayout = DAG.getDataLayout();
7215 Align Alignment =
7221 VT.getStoreSize(), Alignment);
7222 SDValue LoadVal;
7223 if (!Offset->isDivergent()) {
7224 SDValue Ops[] = {Rsrc, // source register
7225 Offset, CachePolicy};
7226 SDValue BufferLoad =
7228 DAG.getVTList(MVT::i32), Ops, VT, MMO);
7229 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
7230 } else {
7231 SDValue Ops[] = {
7232 DAG.getEntryNode(), // Chain
7233 Rsrc, // rsrc
7234 DAG.getConstant(0, DL, MVT::i32), // vindex
7235 {}, // voffset
7236 {}, // soffset
7237 {}, // offset
7238 CachePolicy, // cachepolicy
7239 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7240 };
7241 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
7242 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
7243 }
7244 Results.push_back(LoadVal);
7245 return;
7246 }
7247 case Intrinsic::amdgcn_dead: {
7248 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
7249 Results.push_back(DAG.getPOISON(N->getValueType(I)));
7250 return;
7251 }
7252 }
7253 break;
7254 }
7255  case ISD::INTRINSIC_W_CHAIN: {
7256    if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
7257 if (Res.getOpcode() == ISD::MERGE_VALUES) {
7258 // FIXME: Hacky
7259 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
7260 Results.push_back(Res.getOperand(I));
7261 }
7262 } else {
7263 Results.push_back(Res);
7264 Results.push_back(Res.getValue(1));
7265 }
7266 return;
7267 }
7268
7269 break;
7270 }
7271 case ISD::SELECT: {
7272 SDLoc SL(N);
7273 EVT VT = N->getValueType(0);
7274 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
7275 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
7276 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
7277
7278 EVT SelectVT = NewVT;
7279 if (NewVT.bitsLT(MVT::i32)) {
7280 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
7281 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
7282 SelectVT = MVT::i32;
7283 }
7284
7285 SDValue NewSelect =
7286 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
7287
7288 if (NewVT != SelectVT)
7289 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
7290 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
7291 return;
7292 }
7293 case ISD::FNEG: {
7294 if (N->getValueType(0) != MVT::v2f16)
7295 break;
7296
7297 SDLoc SL(N);
7298 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7299
7300 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
7301 DAG.getConstant(0x80008000, SL, MVT::i32));
7302 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7303 return;
7304 }
7305 case ISD::FABS: {
7306 if (N->getValueType(0) != MVT::v2f16)
7307 break;
7308
7309 SDLoc SL(N);
7310 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7311
7312 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
7313 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
7314 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7315 return;
7316 }
7317 case ISD::FSQRT: {
7318 if (N->getValueType(0) != MVT::f16)
7319 break;
7320 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
7321 break;
7322 }
7323 default:
7324    AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
7325    break;
7326 }
7327}
7328
7329/// Helper function for LowerBRCOND
7330static SDNode *findUser(SDValue Value, unsigned Opcode) {
7331
7332 for (SDUse &U : Value->uses()) {
7333 if (U.get() != Value)
7334 continue;
7335
7336 if (U.getUser()->getOpcode() == Opcode)
7337 return U.getUser();
7338 }
7339 return nullptr;
7340}
7341
7342unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7343 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
7344 switch (Intr->getConstantOperandVal(1)) {
7345 case Intrinsic::amdgcn_if:
7346 return AMDGPUISD::IF;
7347 case Intrinsic::amdgcn_else:
7348 return AMDGPUISD::ELSE;
7349 case Intrinsic::amdgcn_loop:
7350 return AMDGPUISD::LOOP;
7351 case Intrinsic::amdgcn_end_cf:
7352 llvm_unreachable("should not occur");
7353 default:
7354 return 0;
7355 }
7356 }
7357
7358 // break, if_break, else_break are all only used as inputs to loop, not
7359 // directly as branch conditions.
7360 return 0;
7361}
7362
7369
7370bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
7371 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7372 return false;
7373
7374 // FIXME: Either avoid relying on address space here or change the default
7375 // address space for functions to avoid the explicit check.
7376 return (GV->getValueType()->isFunctionTy() ||
7379}
7380
7381bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
7382 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
7383}
7384
7385bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
7386  if (!GV->hasExternalLinkage())
7387 return true;
7388
7389 const auto OS = getTargetMachine().getTargetTriple().getOS();
7390 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
7391}
7392
7393/// This transforms the control flow intrinsics to get the branch destination as
7394/// the last parameter, and also switches the branch target with BR if the need arises.
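// Editorial sketch (not part of the original source): for example, a
//   brcond (llvm.amdgcn.if ...), %target
// is rewritten into an AMDGPUISD::IF node whose last operand is %target, and
// the surrounding BRCOND/BR nodes are rewired around the new node.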
7395SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
7396 SDLoc DL(BRCOND);
7397
7398 SDNode *Intr = BRCOND.getOperand(1).getNode();
7399 SDValue Target = BRCOND.getOperand(2);
7400 SDNode *BR = nullptr;
7401 SDNode *SetCC = nullptr;
7402
7403 if (Intr->getOpcode() == ISD::SETCC) {
7404 // As long as we negate the condition everything is fine
7405 SetCC = Intr;
7406 Intr = SetCC->getOperand(0).getNode();
7407
7408 } else {
7409 // Get the target from BR if we don't negate the condition
7410 BR = findUser(BRCOND, ISD::BR);
7411 assert(BR && "brcond missing unconditional branch user");
7412 Target = BR->getOperand(1);
7413 }
7414
7415 unsigned CFNode = isCFIntrinsic(Intr);
7416 if (CFNode == 0) {
7417 // This is a uniform branch so we don't need to legalize.
7418 return BRCOND;
7419 }
7420
7421 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
7422                   Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
7423
7424 assert(!SetCC ||
7425 (SetCC->getConstantOperandVal(1) == 1 &&
7426 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
7427 ISD::SETNE));
7428
7429 // operands of the new intrinsic call
7430  SmallVector<SDValue, 8> Ops;
7431  if (HaveChain)
7432 Ops.push_back(BRCOND.getOperand(0));
7433
7434 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
7435 Ops.push_back(Target);
7436
7437 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
7438
7439 // build the new intrinsic call
7440 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
7441
7442 if (!HaveChain) {
7443 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
7444
7445    Result = DAG.getMergeValues(Ops, DL).getNode();
7446  }
7447
7448 if (BR) {
7449 // Give the branch instruction our target
7450 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
7451 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
7452 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
7453 }
7454
7455 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
7456
7457 // Copy the intrinsic results to registers
7458 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
7459 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
7460 if (!CopyToReg)
7461 continue;
7462
7463 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
7464 SDValue(Result, i - 1), SDValue());
7465
7466 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
7467 }
7468
7469 // Remove the old intrinsic from the chain
7470 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
7471 Intr->getOperand(0));
7472
7473 return Chain;
7474}
7475
7476SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
7477 MVT VT = Op.getSimpleValueType();
7478 SDLoc DL(Op);
7479 // Checking the depth
7480 if (Op.getConstantOperandVal(0) != 0)
7481 return DAG.getConstant(0, DL, VT);
7482
7483 MachineFunction &MF = DAG.getMachineFunction();
7484 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7485 // Check for kernel and shader functions
7486 if (Info->isEntryFunction())
7487 return DAG.getConstant(0, DL, VT);
7488
7489 MachineFrameInfo &MFI = MF.getFrameInfo();
7490 // There is a call to @llvm.returnaddress in this function
7491 MFI.setReturnAddressIsTaken(true);
7492
7493 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
7494 // Get the return address reg and mark it as an implicit live-in
7495 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
7496 getRegClassFor(VT, Op.getNode()->isDivergent()));
7497
7498 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
7499}
7500
7501SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
7502 const SDLoc &DL, EVT VT) const {
7503 return Op.getValueType().bitsLE(VT)
7504 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
7505 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
7506 DAG.getTargetConstant(0, DL, MVT::i32));
7507}
7508
7509SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
7510 SelectionDAG &DAG) const {
7511 EVT DstVT = Op.getValueType();
7512 unsigned NumElts = DstVT.getVectorNumElements();
7513 assert(NumElts > 2 && isPowerOf2_32(NumElts));
7514
7515 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
7516
7517 SDLoc DL(Op);
7518 unsigned Opc = Op.getOpcode();
7519 SDValue Flags = Op.getOperand(1);
7520 EVT HalfDstVT =
7521 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
7522 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
7523 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
7524
7525 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
7526}
7527
7528SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
7529 SDValue Src = Op.getOperand(0);
7530 EVT SrcVT = Src.getValueType();
7531 EVT DstVT = Op.getValueType();
7532
7533 if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
7534 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
7535 if (SrcVT.getScalarType() != MVT::f32)
7536 return SDValue();
7537 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
7538 }
7539
7540 if (SrcVT.getScalarType() != MVT::f64)
7541 return Op;
7542
7543 SDLoc DL(Op);
7544 if (DstVT == MVT::f16) {
7545 // TODO: Handle strictfp
7546 if (Op.getOpcode() != ISD::FP_ROUND)
7547 return Op;
7548
7549 if (!Subtarget->has16BitInsts()) {
7550 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
7551 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7552 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7553 }
7554 if (Op->getFlags().hasApproximateFuncs()) {
7555 SDValue Flags = Op.getOperand(1);
7556 SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
7557 return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
7558 }
7559 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
7560 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7561 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7562 }
7563
7564 assert(DstVT.getScalarType() == MVT::bf16 &&
7565 "custom lower FP_ROUND for f16 or bf16");
7566 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
7567
7568 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
7569 // hardware f32 -> bf16 instruction.
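  // Editorial note: rounding f64->f32->bf16 in two ordinary steps can
  // double-round; performing the first step with round-to-odd preserves the
  // inexact/sticky information in the f32, so the final f32->bf16 rounding
  // still yields the correctly rounded result.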
7570 EVT F32VT = SrcVT.isVector() ? SrcVT.changeVectorElementType(MVT::f32) :
7571 MVT::f32;
7572 SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
7573 return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
7574 DAG.getTargetConstant(0, DL, MVT::i32));
7575}
7576
7577SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
7578 SelectionDAG &DAG) const {
7579 EVT VT = Op.getValueType();
7580 const MachineFunction &MF = DAG.getMachineFunction();
7581 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7582 bool IsIEEEMode = Info->getMode().IEEE;
7583
7584 // FIXME: Assert during selection that this is only selected for
7585 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
7586 // mode functions, but this happens to be OK since it's only done in cases
7587  // where it is known that there are no sNaNs.
7588 if (IsIEEEMode)
7589 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
7590
7591 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7592 VT == MVT::v16bf16)
7593 return splitBinaryVectorOp(Op, DAG);
7594 return Op;
7595}
7596
7597SDValue
7598SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
7599 SelectionDAG &DAG) const {
7600 EVT VT = Op.getValueType();
7601 const MachineFunction &MF = DAG.getMachineFunction();
7602 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7603 bool IsIEEEMode = Info->getMode().IEEE;
7604
7605 if (IsIEEEMode)
7606 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
7607
7608 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7609 VT == MVT::v16bf16)
7610 return splitBinaryVectorOp(Op, DAG);
7611 return Op;
7612}
7613
7614SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
7615 SelectionDAG &DAG) const {
7616 EVT VT = Op.getValueType();
7617 if (VT.isVector())
7618 return splitBinaryVectorOp(Op, DAG);
7619
7620 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7621 !Subtarget->hasMinimum3Maximum3F16() &&
7622 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7623 "should not need to widen f16 minimum/maximum to v2f16");
7624
7625 // Widen f16 operation to v2f16
7626
7627 // fminimum f16:x, f16:y ->
7628 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
7629 // (v2f16 (scalar_to_vector y))), 0
7630 SDLoc SL(Op);
7631 SDValue WideSrc0 =
7632 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
7633 SDValue WideSrc1 =
7634 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
7635
7636 SDValue Widened =
7637 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7638
7639 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
7640 DAG.getConstant(0, SL, MVT::i32));
7641}
7642
7643SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
7644 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
7645 EVT VT = Op.getValueType();
7646 assert(VT == MVT::f16);
7647
7648 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
7649 EVT ExpVT = Exp.getValueType();
7650 if (ExpVT == MVT::i16)
7651 return Op;
7652
7653 SDLoc DL(Op);
7654
7655 // Correct the exponent type for f16 to i16.
7656 // Clamp the range of the exponent to the instruction's range.
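  // Editorial note: clamping to [INT16_MIN, INT16_MAX] before truncating is
  // value-preserving here; any exponent outside that range already overflows
  // or underflows every finite f16 value, so e.g. ldexp(x, 100000) and
  // ldexp(x, 32767) produce the same result.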
7657
7658 // TODO: This should be a generic narrowing legalization, and can easily be
7659  // done for GlobalISel.
7660
7661 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
7662 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
7663
7664 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
7665 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
7666
7667 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
7668
7669 if (IsStrict) {
7670 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
7671 {Op.getOperand(0), Op.getOperand(1), TruncExp});
7672 }
7673
7674 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
7675}
7676
7677static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
7678  switch (Op->getOpcode()) {
7679 case ISD::SRA:
7680 case ISD::SMIN:
7681 case ISD::SMAX:
7682 return ISD::SIGN_EXTEND;
7683 case ISD::SRL:
7684 case ISD::UMIN:
7685 case ISD::UMAX:
7686 return ISD::ZERO_EXTEND;
7687 case ISD::ADD:
7688 case ISD::SUB:
7689 case ISD::AND:
7690 case ISD::OR:
7691 case ISD::XOR:
7692 case ISD::SHL:
7693 case ISD::SELECT:
7694 case ISD::MUL:
7695 // operation result won't be influenced by garbage high bits.
7696 // TODO: are all of those cases correct, and are there more?
7697 return ISD::ANY_EXTEND;
7698 case ISD::SETCC: {
7699 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7700    return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7701  }
7702 default:
7703 llvm_unreachable("unexpected opcode!");
7704 }
7705}
7706
7707SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
7708 DAGCombinerInfo &DCI) const {
7709 const unsigned Opc = Op.getOpcode();
7710 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
7711 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
7712 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
7713 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
7714 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
7715
7716 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
7717 : Op->getOperand(0).getValueType();
7718 auto ExtTy = OpTy.changeElementType(MVT::i32);
7719
7720 if (DCI.isBeforeLegalizeOps() ||
7721 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
7722 return SDValue();
7723
7724 auto &DAG = DCI.DAG;
7725
7726 SDLoc DL(Op);
7727 SDValue LHS;
7728 SDValue RHS;
7729 if (Opc == ISD::SELECT) {
7730 LHS = Op->getOperand(1);
7731 RHS = Op->getOperand(2);
7732 } else {
7733 LHS = Op->getOperand(0);
7734 RHS = Op->getOperand(1);
7735 }
7736
7737 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
7738 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
7739
7740 // Special case: for shifts, the RHS always needs a zext.
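  // (Editorial note: an any-extended shift amount has undefined high bits, so
  // the widened i32 shift could see a different, possibly out-of-range,
  // amount; zero-extension preserves the exact shift amount.)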
7741 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
7742 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
7743 else
7744 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
7745
7746  // setcc always returns i1 (or an i1 vector), so there is no need to truncate after.
7747 if (Opc == ISD::SETCC) {
7748 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7749 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
7750 }
7751
7752 // For other ops, we extend the operation's return type as well so we need to
7753 // truncate back to the original type.
7754 SDValue NewVal;
7755 if (Opc == ISD::SELECT)
7756 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
7757 else
7758 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
7759
7760 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
7761}
7762
7763SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
7764 SDValue Mag = Op.getOperand(0);
7765 EVT MagVT = Mag.getValueType();
7766
7767 if (MagVT.getVectorNumElements() > 2)
7768 return splitBinaryVectorOp(Op, DAG);
7769
7770 SDValue Sign = Op.getOperand(1);
7771 EVT SignVT = Sign.getValueType();
7772
7773 if (MagVT == SignVT)
7774 return Op;
7775
7776 // fcopysign v2f16:mag, v2f32:sign ->
7777 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
7778
7779 SDLoc SL(Op);
7780 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
7781 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
7782
7783 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
7784
7785 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
7786}
7787
7788// Custom lowering for vector multiplications and s_mul_u64.
7789SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
7790 EVT VT = Op.getValueType();
7791
7792 // Split vector operands.
7793 if (VT.isVector())
7794 return splitBinaryVectorOp(Op, DAG);
7795
7796  assert(VT == MVT::i64 && "The following code is a special lowering for s_mul_u64");
7797
7798 // There are four ways to lower s_mul_u64:
7799 //
7800 // 1. If all the operands are uniform, then we lower it as it is.
7801 //
7802  // 2. If the operands are divergent, then we have to split s_mul_u64 into 32-bit
7803  // multiplications because there is no vector equivalent of s_mul_u64.
7804 //
7805 // 3. If the cost model decides that it is more efficient to use vector
7806 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
7807 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
7808 //
7809 // 4. If the cost model decides to use vector registers and both of the
7810 // operands are zero-extended/sign-extended from 32-bits, then we split the
7811 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
7812 // possible to check if the operands are zero-extended or sign-extended in
7813 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
7814 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
7815 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
7816 // If the cost model decides that we have to use vector registers, then
7817 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
7818 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
7819 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
7820 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
7821 // SIInstrInfo.cpp .
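  // Editorial illustration (not part of the original source) of the 32-bit
  // decomposition used when the multiply must be split:
  //   a * b = (a.lo * b.lo)
  //         + ((a.lo * b.hi + a.hi * b.lo) << 32)   // modulo 2^64
  // i.e. one full 32x32->64 multiply plus two 32x32->32 multiplies folded
  // into the high half.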
7822
7823 if (Op->isDivergent())
7824 return SDValue();
7825
7826 SDValue Op0 = Op.getOperand(0);
7827 SDValue Op1 = Op.getOperand(1);
7828  // If all the operands are zero-extended to 32 bits, then we replace s_mul_u64
7829  // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
7830  // 32 bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
7831 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
7832 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
7833 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
7834 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
7835 SDLoc SL(Op);
7836 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7837 return SDValue(
7838 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
7839 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
7840 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
7841 if (Op0SignBits >= 33 && Op1SignBits >= 33)
7842 return SDValue(
7843 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
7844 // If all the operands are uniform, then we lower s_mul_u64 as it is.
7845 return Op;
7846}
7847
7848SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
7849 EVT VT = Op.getValueType();
7850 SDLoc SL(Op);
7851 SDValue LHS = Op.getOperand(0);
7852 SDValue RHS = Op.getOperand(1);
7853 bool isSigned = Op.getOpcode() == ISD::SMULO;
7854
7855 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
7856 const APInt &C = RHSC->getAPIntValue();
7857 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
7858 if (C.isPowerOf2()) {
7859      // smulo(x, signed_min) is the same as umulo(x, signed_min).
7860 bool UseArithShift = isSigned && !C.isMinSignedValue();
7861 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
7862 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
7863 SDValue Overflow =
7864 DAG.getSetCC(SL, MVT::i1,
7865 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
7866 Result, ShiftAmt),
7867 LHS, ISD::SETNE);
7868 return DAG.getMergeValues({Result, Overflow}, SL);
7869 }
7870 }
7871
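  // Editorial note: the generic expansion below detects overflow by comparing
  // the high half of the product with what it must be when no overflow
  // occurs: zero for unsigned multiplies, and the sign-extension of the low
  // result (Result >> (BitWidth - 1)) for signed multiplies.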
7872 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
7873 SDValue Top =
7874 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
7875
7876 SDValue Sign = isSigned
7877 ? DAG.getNode(ISD::SRA, SL, VT, Result,
7878 DAG.getConstant(VT.getScalarSizeInBits() - 1,
7879 SL, MVT::i32))
7880 : DAG.getConstant(0, SL, VT);
7881 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
7882
7883 return DAG.getMergeValues({Result, Overflow}, SL);
7884}
7885
7886SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
7887 if (Op->isDivergent()) {
7888 // Select to V_MAD_[IU]64_[IU]32.
7889 return Op;
7890 }
7891 if (Subtarget->hasSMulHi()) {
7892 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
7893 return SDValue();
7894 }
7895 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
7896 // calculate the high part, so we might as well do the whole thing with
7897 // V_MAD_[IU]64_[IU]32.
7898 return Op;
7899}
7900
7901SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
7902 if (!Subtarget->isTrapHandlerEnabled() ||
7903 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7904 return lowerTrapEndpgm(Op, DAG);
7905
7906 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
7907 : lowerTrapHsaQueuePtr(Op, DAG);
7908}
7909
7910SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
7911 SDLoc SL(Op);
7912 SDValue Chain = Op.getOperand(0);
7913 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
7914}
7915
7916SDValue
7917SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
7918 const SDLoc &DL, Align Alignment,
7919 ImplicitParameter Param) const {
7920 MachineFunction &MF = DAG.getMachineFunction();
7921 uint64_t Offset = getImplicitParameterOffset(MF, Param);
7922 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
7923 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
7924 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
7925                     MachineMemOperand::MODereferenceable |
7926                         MachineMemOperand::MOInvariant);
7927}
7928
7929SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
7930 SelectionDAG &DAG) const {
7931 SDLoc SL(Op);
7932 SDValue Chain = Op.getOperand(0);
7933
7934 SDValue QueuePtr;
7935 // For code object version 5, QueuePtr is passed through implicit kernarg.
7936 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7937  if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
7938    QueuePtr =
7939 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
7940 } else {
7941 MachineFunction &MF = DAG.getMachineFunction();
7942 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7943 Register UserSGPR = Info->getQueuePtrUserSGPR();
7944
7945 if (UserSGPR == AMDGPU::NoRegister) {
7946 // We probably are in a function incorrectly marked with
7947 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
7948 // trap, so just use a null pointer.
7949 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
7950 } else {
7951 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
7952 MVT::i64);
7953 }
7954 }
7955
7956 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
7957 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
7958
7959 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
7960 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
7961 ToReg.getValue(1)};
7962 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7963}
7964
7965SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
7966 SDLoc SL(Op);
7967 SDValue Chain = Op.getOperand(0);
7968
7969 // We need to simulate the 's_trap 2' instruction on targets that run in
7970 // PRIV=1 (where it is treated as a nop).
7971 if (Subtarget->hasPrivEnabledTrap2NopBug())
7972 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
7973
7974 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
7975 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7976 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7977}
7978
7979SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
7980 SDLoc SL(Op);
7981 SDValue Chain = Op.getOperand(0);
7982 MachineFunction &MF = DAG.getMachineFunction();
7983
7984 if (!Subtarget->isTrapHandlerEnabled() ||
7985 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7986 LLVMContext &Ctx = MF.getFunction().getContext();
7987 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
7988 "debugtrap handler not supported",
7989 Op.getDebugLoc(), DS_Warning));
7990 return Chain;
7991 }
7992
7993 uint64_t TrapID =
7994 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
7995 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
7996 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7997}
7998
7999SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
8000 SelectionDAG &DAG) const {
8001 if (Subtarget->hasApertureRegs()) {
8002 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
8003 ? AMDGPU::SRC_SHARED_BASE
8004 : AMDGPU::SRC_PRIVATE_BASE;
8005 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8006 !Subtarget->hasGloballyAddressableScratch()) &&
8007 "Cannot use src_private_base with globally addressable scratch!");
8008 // Note: this feature (register) is broken. When used as a 32-bit operand,
8009 // it returns a wrong value (all zeroes?). The real value is in the upper 32
8010 // bits.
8011 //
8012    // To work around the issue, directly emit a 64-bit mov from this register,
8013    // then extract the high bits. Note that this shouldn't even result in a
8014 // shift being emitted and simply become a pair of registers (e.g.):
8015 // s_mov_b64 s[6:7], src_shared_base
8016 // v_mov_b32_e32 v1, s7
8017 //
8018 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
8019 // coalescing would kick in and it would think it's okay to use the "HI"
8020 // subregister directly (instead of extracting the HI 32 bits) which is an
8021 // artificial (unusable) register.
8022 // Register TableGen definitions would need an overhaul to get rid of the
8023 // artificial "HI" aperture registers and prevent this kind of issue from
8024 // happening.
8025 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
8026 DAG.getRegister(ApertureRegNo, MVT::i64));
8027 return DAG.getNode(
8028 ISD::TRUNCATE, DL, MVT::i32,
8029 DAG.getNode(ISD::SRL, DL, MVT::i64,
8030 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
8031 }
8032
8033 // For code object version 5, private_base and shared_base are passed through
8034 // implicit kernargs.
8035 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8036  if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
8037    ImplicitParameter Param =
8038        (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
8039    return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
8040 }
8041
8042 MachineFunction &MF = DAG.getMachineFunction();
8043 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8044 Register UserSGPR = Info->getQueuePtrUserSGPR();
8045 if (UserSGPR == AMDGPU::NoRegister) {
8046 // We probably are in a function incorrectly marked with
8047 // amdgpu-no-queue-ptr. This is undefined.
8048 return DAG.getPOISON(MVT::i32);
8049 }
8050
8051 SDValue QueuePtr =
8052 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
8053
8054 // Offset into amd_queue_t for group_segment_aperture_base_hi /
8055 // private_segment_aperture_base_hi.
8056 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
8057
8058 SDValue Ptr =
8059 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
8060
8061 // TODO: Use custom target PseudoSourceValue.
8062 // TODO: We should use the value from the IR intrinsic call, but it might not
8063 // be available and how do we get it?
8064 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
8065 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
8066 commonAlignment(Align(64), StructOffset),
8067                     MachineMemOperand::MODereferenceable |
8068                         MachineMemOperand::MOInvariant);
8069}
8070
8071/// Return true if the value is a known valid address, such that a null check is
8072/// not necessary.
8073static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
8074                           const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
8075  if (isa<FrameIndexSDNode, GlobalAddressSDNode, BasicBlockSDNode>(Val))
8076    return true;
8077
8078 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
8079 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
8080
8081 // TODO: Search through arithmetic, handle arguments and loads
8082 // marked nonnull.
8083 return false;
8084}
8085
8086SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
8087 SelectionDAG &DAG) const {
8088 SDLoc SL(Op);
8089
8090 const AMDGPUTargetMachine &TM =
8091 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
8092
8093 unsigned DestAS, SrcAS;
8094 SDValue Src;
8095 bool IsNonNull = false;
8096 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
8097 SrcAS = ASC->getSrcAddressSpace();
8098 Src = ASC->getOperand(0);
8099 DestAS = ASC->getDestAddressSpace();
8100 } else {
8101 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
8102 Op.getConstantOperandVal(0) ==
8103 Intrinsic::amdgcn_addrspacecast_nonnull);
8104 Src = Op->getOperand(1);
8105 SrcAS = Op->getConstantOperandVal(2);
8106 DestAS = Op->getConstantOperandVal(3);
8107 IsNonNull = true;
8108 }
8109
8110 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
8111
8112 // flat -> local/private
8113 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
8114 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
8115 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
8116 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8117
8118 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
8119 Subtarget->hasGloballyAddressableScratch()) {
8120 // flat -> private with globally addressable scratch: subtract
8121 // src_flat_scratch_base_lo.
8122 SDValue FlatScratchBaseLo(
8123 DAG.getMachineNode(
8124 AMDGPU::S_MOV_B32, SL, MVT::i32,
8125 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8126 0);
8127 Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
8128 }
8129
8130 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8131 return Ptr;
8132
8133 unsigned NullVal = TM.getNullPointerValue(DestAS);
8134 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8135 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
8136
8137 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
8138 SegmentNullPtr);
8139 }
8140 }
8141
8142 // local/private -> flat
8143 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
8144 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
8145 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
8146 SDValue CvtPtr;
8147 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
8148 Subtarget->hasGloballyAddressableScratch()) {
8149 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
8150 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
8151 SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
8152 SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
8153 ThreadID = DAG.getNode(
8154 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8155 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
8156 AllOnes, ThreadID);
8157 if (Subtarget->isWave64())
8158 ThreadID = DAG.getNode(
8159 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8160 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
8161 AllOnes, ThreadID);
8162 SDValue ShAmt = DAG.getShiftAmountConstant(
8163 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8164 SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
8165 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
8166 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8167 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
8168 // 64-bit hi:lo value.
8169 SDValue FlatScratchBase = {
8170 DAG.getMachineNode(
8171 AMDGPU::S_MOV_B64, SL, MVT::i64,
8172 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8173 0};
8174 CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8175 } else {
8176 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8177 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
8178 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8179 }
8180
8181 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8182 return CvtPtr;
8183
8184 unsigned NullVal = TM.getNullPointerValue(SrcAS);
8185 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8186
8187 SDValue NonNull =
8188 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
8189
8190 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
8191 FlatNullPtr);
8192 }
8193 }
8194
8195 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8196 Op.getValueType() == MVT::i64) {
8197 const SIMachineFunctionInfo *Info =
8198 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
8199 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
8200 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
8201 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
8202 }
8203
8204 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8205 Src.getValueType() == MVT::i64)
8206 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8207
8208 // global <-> flat are no-ops and never emitted.
8209
8210 // Invalid casts are poison.
8211 return DAG.getPOISON(Op->getValueType(0));
8212}
8213
8214// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
8215// the small vector and inserting them into the big vector. That is better than
8216// the default expansion of doing it via a stack slot. Even though the use of
8217// the stack slot would be optimized away afterwards, the stack slot itself
8218// remains.
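// Editorial illustration (not part of the original source): inserting a v2i16
// subvector at even index 2 of a v4i16 vector becomes a single 32-bit element
// insert after bitcasting both vectors to i32 vectors; the general fallback
// below inserts the subvector one element at a time instead.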
8219SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
8220 SelectionDAG &DAG) const {
8221 SDValue Vec = Op.getOperand(0);
8222 SDValue Ins = Op.getOperand(1);
8223 SDValue Idx = Op.getOperand(2);
8224 EVT VecVT = Vec.getValueType();
8225 EVT InsVT = Ins.getValueType();
8226 EVT EltVT = VecVT.getVectorElementType();
8227 unsigned InsNumElts = InsVT.getVectorNumElements();
8228 unsigned IdxVal = Idx->getAsZExtVal();
8229 SDLoc SL(Op);
8230
8231 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
8232 // Insert 32-bit registers at a time.
8233 assert(InsNumElts % 2 == 0 && "expect legal vector types");
8234
8235 unsigned VecNumElts = VecVT.getVectorNumElements();
8236 EVT NewVecVT =
8237 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
8238 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8239                                   : EVT::getVectorVT(*DAG.getContext(),
8240                                                      MVT::i32, InsNumElts / 2);
8241
8242 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
8243 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
8244
8245 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
8246 SDValue Elt;
8247 if (InsNumElts == 2) {
8248 Elt = Ins;
8249 } else {
8250 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
8251 DAG.getConstant(I, SL, MVT::i32));
8252 }
8253 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
8254 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
8255 }
8256
8257 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
8258 }
8259
8260 for (unsigned I = 0; I != InsNumElts; ++I) {
8261 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
8262 DAG.getConstant(I, SL, MVT::i32));
8263 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
8264 DAG.getConstant(IdxVal + I, SL, MVT::i32));
8265 }
8266 return Vec;
8267}
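// Worked example for the 16-bit path above (illustrative values): inserting a
// v2i16 %ins into a v4i16 %vec at index 2 becomes a single 32-bit lane update,
//   %vec32 = bitcast %vec to v2i32
//   %ins32 = bitcast %ins to i32
//   %upd   = insertelement %vec32, %ins32, 1    ; IdxVal / 2 == 1
//   %res   = bitcast %upd to v4i16
// rather than a store/load round-trip through a stack slot.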
8268
8269SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
8270 SelectionDAG &DAG) const {
8271 SDValue Vec = Op.getOperand(0);
8272 SDValue InsVal = Op.getOperand(1);
8273 SDValue Idx = Op.getOperand(2);
8274 EVT VecVT = Vec.getValueType();
8275 EVT EltVT = VecVT.getVectorElementType();
8276 unsigned VecSize = VecVT.getSizeInBits();
8277 unsigned EltSize = EltVT.getSizeInBits();
8278 SDLoc SL(Op);
8279
8280 // Specially handle the case of v4i16 with static indexing.
8281 unsigned NumElts = VecVT.getVectorNumElements();
8282 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
8283 if (NumElts == 4 && EltSize == 16 && KIdx) {
8284 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
8285
8286 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8287 DAG.getConstant(0, SL, MVT::i32));
8288 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8289 DAG.getConstant(1, SL, MVT::i32));
8290
8291 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
8292 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
8293
8294 unsigned Idx = KIdx->getZExtValue();
8295 bool InsertLo = Idx < 2;
8296 SDValue InsHalf = DAG.getNode(
8297 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
8298 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
8299 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8300
8301 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
8302
8303 SDValue Concat =
8304 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
8305 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8306
8307 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
8308 }
8309
8310 // Static indexing does not lower to stack access, and hence there is no need
8311 // for special custom lowering to avoid stack access.
8312 if (isa<ConstantSDNode>(Idx))
8313 return SDValue();
8314
8315 // Avoid stack access for dynamic indexing by custom lowering to
8316 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
8317
8318 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
8319
8320 MVT IntVT = MVT::getIntegerVT(VecSize);
8321
8322 // Convert vector index to bit-index and get the required bit mask.
8323 assert(isPowerOf2_32(EltSize));
8324 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
8325 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8326 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8327 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
8328 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
8329
8330 // 1. Create a congruent vector with the target value in each element.
8331 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
8332 DAG.getSplatBuildVector(VecVT, SL, InsVal));
8333
8334 // 2. Mask off all other indices except the required index within (1).
8335 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
8336
8337 // 3. Mask off the required index within the target vector.
8338 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8339 SDValue RHS =
8340 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
8341
8342 // 4. Get (2) and (3) ORed into the target vector.
8343 SDValue BFI =
8344 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
8345
8346 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
8347}
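// Scalar sketch of the dynamic-index path above, assuming VecSize == 32 and
// EltSize == 16 (the helper name and parameters are illustrative):
//   uint32_t insertDynamic(uint32_t Vec, uint16_t Val, uint32_t Idx) {
//     uint32_t BFM = 0xFFFFu << (Idx * 16);     // field mask at the index
//     uint32_t Ext = 0x00010001u * Val;         // value splatted to each lane
//     return (BFM & Ext) | (~BFM & Vec);        // the v_bfi_b32 pattern
//   }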
8348
8349SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
8350 SelectionDAG &DAG) const {
8351 SDLoc SL(Op);
8352
8353 EVT ResultVT = Op.getValueType();
8354 SDValue Vec = Op.getOperand(0);
8355 SDValue Idx = Op.getOperand(1);
8356 EVT VecVT = Vec.getValueType();
8357 unsigned VecSize = VecVT.getSizeInBits();
8358 EVT EltVT = VecVT.getVectorElementType();
8359
8360 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
8361
8362 // Make sure we do any optimizations that will make it easier to fold
8363 // source modifiers before obscuring it with bit operations.
8364
8365 // XXX - Why doesn't this get called when vector_shuffle is expanded?
8366 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
8367 return Combined;
8368
8369 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8370 SDValue Lo, Hi;
8371 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
8372
8373 if (VecSize == 128) {
8374 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
8375 Lo = DAG.getBitcast(LoVT,
8376 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8377 DAG.getConstant(0, SL, MVT::i32)));
8378 Hi = DAG.getBitcast(HiVT,
8379 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8380 DAG.getConstant(1, SL, MVT::i32)));
8381 } else if (VecSize == 256) {
8382 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
8383 SDValue Parts[4];
8384 for (unsigned P = 0; P < 4; ++P) {
8385 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8386 DAG.getConstant(P, SL, MVT::i32));
8387 }
8388
8389 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8390 Parts[0], Parts[1]));
8391 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8392 Parts[2], Parts[3]));
8393 } else {
8394 assert(VecSize == 512);
8395
8396 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
8397 SDValue Parts[8];
8398 for (unsigned P = 0; P < 8; ++P) {
8399 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8400 DAG.getConstant(P, SL, MVT::i32));
8401 }
8402
8403 Lo = DAG.getBitcast(LoVT,
8404 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8405 Parts[0], Parts[1], Parts[2], Parts[3]));
8406 Hi = DAG.getBitcast(HiVT,
8407 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8408 Parts[4], Parts[5], Parts[6], Parts[7]));
8409 }
8410
8411 EVT IdxVT = Idx.getValueType();
8412 unsigned NElem = VecVT.getVectorNumElements();
8413 assert(isPowerOf2_32(NElem));
8414 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
8415 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
8416 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
8417 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
8418 }
8419
8420 assert(VecSize <= 64);
8421
8422 MVT IntVT = MVT::getIntegerVT(VecSize);
8423
8424 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
8425 SDValue VecBC = peekThroughBitcasts(Vec);
8426 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8427 SDValue Src = VecBC.getOperand(0);
8428 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8429 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
8430 }
8431
8432 unsigned EltSize = EltVT.getSizeInBits();
8433 assert(isPowerOf2_32(EltSize));
8434
8435 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8436
8437 // Convert vector index to bit-index (* EltSize)
8438 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8439
8440 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8441 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
8442
8443 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8444 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
8445 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
8446 }
8447
8448 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
8449}
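// Scalar sketch of the small-vector path above, assuming VecSize == 32 and
// EltSize == 16 (names are illustrative):
//   uint16_t extractDynamic(uint32_t Vec, uint32_t Idx) {
//     return uint16_t(Vec >> (Idx * 16));       // shift lane down, truncate
//   }
// Wider results reuse the same shift and finish with an any-extend, or with a
// bitcast for f16/bf16.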
8450
8451static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
8452 assert(Elt % 2 == 0);
8453 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8454}
8455
8456static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
8457 assert(Elt % 2 == 0);
8458 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8459 !(Mask[Elt + 1] & 1);
8460}
8461
8462SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
8463 SelectionDAG &DAG) const {
8464 SDLoc SL(Op);
8465 EVT ResultVT = Op.getValueType();
8466 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
8467 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
8468 const int NewSrcNumElts = 2;
8469 MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
8470 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
8471
8472 // Break up the shuffle into registers sized pieces.
8473 //
8474 // We're trying to form sub-shuffles that the register allocation pipeline
8475 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
8476 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
8477 // pair of copies into a consecutive register copy, so use the ordinary
8478 // extract_vector_elt lowering unless we can use the shuffle.
8479 //
 8480 // TODO: This is a bit of a hack, and we should probably always use
 8481 // extract_subvector for the largest possible subvector we can (or at least
 8482 // use it for PackVT aligned pieces). However, we have worse support for
 8483 // combines on them and don't directly treat extract_subvector /
 8484 // insert_subvector as legal. The DAG scheduler also ends up doing a worse
 8485 // job with the extract_subvectors.
8486 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
8487
8488 // vector_shuffle <0,1,6,7> lhs, rhs
8489 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
8490 //
8491 // vector_shuffle <6,7,2,3> lhs, rhs
8492 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
8493 //
8494 // vector_shuffle <6,7,0,1> lhs, rhs
8495 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
8496
8497 // Avoid scalarizing when both halves are reading from consecutive elements.
8498
8499 // If we're treating 2 element shuffles as legal, also create odd-to-even
8500 // shuffles of neighboring pairs.
8501 //
8502 // vector_shuffle <3,2,7,6> lhs, rhs
8503 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
8504 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
8505
 8506 SmallVector<SDValue, 16> Pieces;
 8507 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
8508 if (ShouldUseConsecutiveExtract &&
 8509 elementPairIsContiguous(SVN->getMask(), I)) {
 8510 const int Idx = SVN->getMaskElt(I);
8511 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8512 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8513 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
8514 SVN->getOperand(VecIdx),
8515 DAG.getConstant(EltIdx, SL, MVT::i32));
8516 Pieces.push_back(SubVec);
8517 } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
 8518 isOperationLegal(ISD::VECTOR_SHUFFLE, PackVT)) {
 8519 int Idx0 = SVN->getMaskElt(I);
8520 int Idx1 = SVN->getMaskElt(I + 1);
8521
8522 SDValue SrcOp0 = SVN->getOperand(0);
8523 SDValue SrcOp1 = SrcOp0;
8524 if (Idx0 >= SrcNumElts) {
8525 SrcOp0 = SVN->getOperand(1);
8526 Idx0 -= SrcNumElts;
8527 }
8528
8529 if (Idx1 >= SrcNumElts) {
8530 SrcOp1 = SVN->getOperand(1);
8531 Idx1 -= SrcNumElts;
8532 }
8533
8534 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8535 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8536
8537 // Extract nearest even aligned piece.
8538 SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
8539 DAG.getConstant(AlignedIdx0, SL, MVT::i32));
8540 SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
8541 DAG.getConstant(AlignedIdx1, SL, MVT::i32));
8542
8543 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8544 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8545
8546 SDValue Result0 = SubVec0;
8547 SDValue Result1 = SubVec0;
8548
8549 if (SubVec0 != SubVec1) {
8550 NewMaskIdx1 += NewSrcNumElts;
8551 Result1 = SubVec1;
8552 } else {
8553 Result1 = DAG.getPOISON(PackVT);
8554 }
8555
8556 SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
8557 {NewMaskIdx0, NewMaskIdx1});
8558 Pieces.push_back(Shuf);
8559 } else {
8560 const int Idx0 = SVN->getMaskElt(I);
8561 const int Idx1 = SVN->getMaskElt(I + 1);
8562 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8563 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8564 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8565 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8566
8567 SDValue Vec0 = SVN->getOperand(VecIdx0);
8568 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
8569 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
8570
8571 SDValue Vec1 = SVN->getOperand(VecIdx1);
8572 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
8573 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
8574 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
8575 }
8576 }
8577
8578 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
8579}
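// Worked example (illustrative, v4i16 operands): the mask <0,1,6,7> consists
// of two consecutive pairs, so it lowers to
//   concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
// whereas a pair such as <0,5> is neither contiguous nor odd-to-even and falls
// back to two extract_vector_elt nodes re-packed with build_vector.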
8580
8581SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
8582 SelectionDAG &DAG) const {
8583 SDValue SVal = Op.getOperand(0);
8584 EVT ResultVT = Op.getValueType();
8585 EVT SValVT = SVal.getValueType();
8586 SDValue UndefVal = DAG.getPOISON(SValVT);
8587 SDLoc SL(Op);
8588
 8589 SmallVector<SDValue, 16> VElts;
 8590 VElts.push_back(SVal);
8591 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
8592 VElts.push_back(UndefVal);
8593
8594 return DAG.getBuildVector(ResultVT, SL, VElts);
8595}
8596
8597SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
8598 SelectionDAG &DAG) const {
8599 SDLoc SL(Op);
8600 EVT VT = Op.getValueType();
8601
8602 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8603 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
8604
8605 SDValue Lo = Op.getOperand(0);
8606 SDValue Hi = Op.getOperand(1);
8607
8608 // Avoid adding defined bits with the zero_extend.
8609 if (Hi.isUndef()) {
8610 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8611 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
8612 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
8613 }
8614
8615 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
8616 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
8617
8618 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
8619 DAG.getConstant(16, SL, MVT::i32));
8620 if (Lo.isUndef())
8621 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
8622
8623 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8624 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
8625
8626 SDValue Or =
8627 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
8628 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
8629 }
8630
8631 // Split into 2-element chunks.
8632 const unsigned NumParts = VT.getVectorNumElements() / 2;
8633 EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
8634 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
8635
 8636 SmallVector<SDValue, 8> Casts;
 8637 for (unsigned P = 0; P < NumParts; ++P) {
8638 SDValue Vec = DAG.getBuildVector(
8639 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
8640 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
8641 }
8642
8643 SDValue Blend =
8644 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
8645 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
8646}
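// Scalar sketch of the two-element path above, assuming neither operand is
// undef (the helper name is illustrative):
//   uint32_t packHalves(uint16_t Lo, uint16_t Hi) {
//     return uint32_t(Lo) | (uint32_t(Hi) << 16);  // disjoint OR, as emitted
//   }
// Wider vectors are built as one such pack per element pair and concatenated.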
8647
 8648 bool SITargetLowering::isOffsetFoldingLegal(
 8649 const GlobalAddressSDNode *GA) const {
8650 // OSes that use ELF REL relocations (instead of RELA) can only store a
8651 // 32-bit addend in the instruction, so it is not safe to allow offset folding
8652 // which can create arbitrary 64-bit addends. (This is only a problem for
8653 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
8654 // the high 32 bits of the addend.)
8655 //
8656 // This should be kept in sync with how HasRelocationAddend is initialized in
8657 // the constructor of ELFAMDGPUAsmBackend.
8658 if (!Subtarget->isAmdHsaOS())
8659 return false;
8660
8661 // We can fold offsets for anything that doesn't require a GOT relocation.
 8662 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
 8663 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
 8664 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
 8665 !shouldEmitGOTReloc(GA->getGlobal());
 8666}
8667
8668static SDValue
 8669 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
 8670 const SDLoc &DL, int64_t Offset, EVT PtrVT,
8671 unsigned GAFlags = SIInstrInfo::MO_NONE) {
8672 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
8673 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
8674 // lowered to the following code sequence:
8675 //
8676 // For constant address space:
8677 // s_getpc_b64 s[0:1]
8678 // s_add_u32 s0, s0, $symbol
8679 // s_addc_u32 s1, s1, 0
8680 //
8681 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8682 // a fixup or relocation is emitted to replace $symbol with a literal
8683 // constant, which is a pc-relative offset from the encoding of the $symbol
8684 // operand to the global variable.
8685 //
8686 // For global address space:
8687 // s_getpc_b64 s[0:1]
8688 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
8689 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
8690 //
8691 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8692 // fixups or relocations are emitted to replace $symbol@*@lo and
8693 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
8694 // which is a 64-bit pc-relative offset from the encoding of the $symbol
8695 // operand to the global variable.
8696 if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
8697 assert(GAFlags != SIInstrInfo::MO_NONE);
8698
8699 SDValue Ptr =
8700 DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
8701 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
8702 }
8703
8704 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
8705 SDValue PtrHi;
8706 if (GAFlags == SIInstrInfo::MO_NONE)
8707 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
8708 else
8709 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
8710 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
8711}
8712
8713SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
8714 SDValue Op,
8715 SelectionDAG &DAG) const {
8716 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
8717 SDLoc DL(GSD);
8718 EVT PtrVT = Op.getValueType();
8719
8720 const GlobalValue *GV = GSD->getGlobal();
 8721 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
 8722 shouldUseLDSConstAddress(GV)) ||
 8723 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
 8724 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
 8725 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
 8726 GV->hasExternalLinkage()) {
8727 Type *Ty = GV->getValueType();
 8728 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
 8729 // zero-sized type in other languages to declare dynamic shared memory
 8730 // whose size is not known at compile time. Such arrays are allocated
 8731 // by the runtime and placed directly after the statically allocated
 8732 // ones, so they all share the same offset.
8733 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
8734 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
8735 // Adjust alignment for that dynamic shared memory array.
 8736 Function &F = DAG.getMachineFunction().getFunction();
 8737 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
 8738 MFI->setUsesDynamicLDS(true);
8739 return SDValue(
8740 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
8741 }
8742 }
 8743 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
 8744 }
8745
 8746 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
 8747 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
 8748 SIInstrInfo::MO_ABS32_LO);
 8749 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
8750 }
8751
8752 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
8753 if (Subtarget->has64BitLiterals()) {
 8754 SDValue Addr = DAG.getTargetGlobalAddress(
 8755 GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
8756 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
8757 0);
8758 }
8759
8760 SDValue AddrLo = DAG.getTargetGlobalAddress(
8761 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
8762 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
8763
8764 SDValue AddrHi = DAG.getTargetGlobalAddress(
8765 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
8766 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
8767
8768 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
8769 }
8770
8771 if (shouldEmitFixup(GV))
8772 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
8773
8774 if (shouldEmitPCReloc(GV))
8775 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
 8776 SIInstrInfo::MO_REL32);
 8777
8778 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
 8779 SIInstrInfo::MO_GOTPCREL32);
 8780 PointerType *PtrTy =
 8781 PointerType::get(*DAG.getContext(), AMDGPUAS::CONSTANT_ADDRESS);
8782 const DataLayout &DataLayout = DAG.getDataLayout();
8783 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
8784 MachinePointerInfo PtrInfo =
 8785 MachinePointerInfo::getGOT(DAG.getMachineFunction());
 8786
8787 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
 8788 MachineMemOperand::MODereferenceable |
 8789 MachineMemOperand::MOInvariant);
 8790}
8791
 8792 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
 8793 const SDLoc &DL, SDValue V) const {
8794 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
8795 // the destination register.
8796 //
8797 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
8798 // so we will end up with redundant moves to m0.
8799 //
8800 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
8801
8802 // A Null SDValue creates a glue result.
8803 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
8804 V, Chain);
8805 return SDValue(M0, 0);
8806}
8807
8808SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
8809 MVT VT,
8810 unsigned Offset) const {
8811 SDLoc SL(Op);
8812 SDValue Param = lowerKernargMemParameter(
8813 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
8814 // The local size values will have the hi 16-bits as zero.
8815 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
8816 DAG.getValueType(VT));
8817}
8818
 8819 static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
 8820 EVT VT) {
 8821 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
 8822 DAG.getMachineFunction().getFunction(),
 8823 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
8824 return DAG.getPOISON(VT);
8825}
8826
 8827 static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
 8828 EVT VT) {
 8829 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
 8830 DAG.getMachineFunction().getFunction(),
 8831 "intrinsic not supported on subtarget", DL.getDebugLoc()));
8832 return DAG.getPOISON(VT);
8833}
8834
 8835 static SDValue getBuildDwordsVector(SelectionDAG &DAG, const SDLoc &DL,
 8836 ArrayRef<SDValue> Elts) {
8837 assert(!Elts.empty());
8838 MVT Type;
8839 unsigned NumElts = Elts.size();
8840
8841 if (NumElts <= 12) {
8842 Type = MVT::getVectorVT(MVT::f32, NumElts);
8843 } else {
8844 assert(Elts.size() <= 16);
8845 Type = MVT::v16f32;
8846 NumElts = 16;
8847 }
8848
8849 SmallVector<SDValue, 16> VecElts(NumElts);
8850 for (unsigned i = 0; i < Elts.size(); ++i) {
8851 SDValue Elt = Elts[i];
8852 if (Elt.getValueType() != MVT::f32)
8853 Elt = DAG.getBitcast(MVT::f32, Elt);
8854 VecElts[i] = Elt;
8855 }
8856 for (unsigned i = Elts.size(); i < NumElts; ++i)
8857 VecElts[i] = DAG.getPOISON(MVT::f32);
8858
8859 if (NumElts == 1)
8860 return VecElts[0];
8861 return DAG.getBuildVector(Type, DL, VecElts);
8862}
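// Example behaviour (illustrative): five dword operands yield a v5f32
// build_vector; thirteen operands exceed the 12-element cut-off and are padded
// with poison up to v16f32 so the result still maps onto a register tuple.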
8863
8864static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
8865 SDValue Src, int ExtraElts) {
8866 EVT SrcVT = Src.getValueType();
8867
 8868 SmallVector<SDValue, 8> Elts;
 8869
8870 if (SrcVT.isVector())
8871 DAG.ExtractVectorElements(Src, Elts);
8872 else
8873 Elts.push_back(Src);
8874
8875 SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
8876 while (ExtraElts--)
8877 Elts.push_back(Undef);
8878
8879 return DAG.getBuildVector(CastVT, DL, Elts);
8880}
8881
 8882 // Re-construct the required return value for an image load intrinsic.
 8883 // This is more complicated due to the optional use of TexFailCtrl, which
 8884 // means the required return type is an aggregate.
 8885 static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
 8886 ArrayRef<EVT> ResultTypes, bool IsTexFail,
8887 bool Unpacked, bool IsD16, int DMaskPop,
8888 int NumVDataDwords, bool IsAtomicPacked16Bit,
8889 const SDLoc &DL) {
8890 // Determine the required return type. This is the same regardless of
8891 // IsTexFail flag
8892 EVT ReqRetVT = ResultTypes[0];
8893 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
8894 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
8895 ? (ReqRetNumElts + 1) / 2
8896 : ReqRetNumElts;
8897
8898 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
8899
8900 MVT DataDwordVT =
8901 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
8902
8903 MVT MaskPopVT =
8904 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
8905
8906 SDValue Data(Result, 0);
8907 SDValue TexFail;
8908
8909 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
8910 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
8911 if (MaskPopVT.isVector()) {
8912 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
8913 SDValue(Result, 0), ZeroIdx);
8914 } else {
8915 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
8916 SDValue(Result, 0), ZeroIdx);
8917 }
8918 }
8919
8920 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
8921 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
8922 NumDataDwords - MaskPopDwords);
8923
8924 if (IsD16)
8925 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
8926
8927 EVT LegalReqRetVT = ReqRetVT;
8928 if (!ReqRetVT.isVector()) {
8929 if (!Data.getValueType().isInteger())
8930 Data = DAG.getNode(ISD::BITCAST, DL,
8931 Data.getValueType().changeTypeToInteger(), Data);
8932 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
8933 } else {
8934 // We need to widen the return vector to a legal type
8935 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
8936 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
8937 LegalReqRetVT =
 8938 EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
 8939 ReqRetVT.getVectorNumElements() + 1);
8940 }
8941 }
8942 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
8943
8944 if (IsTexFail) {
8945 TexFail =
8946 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
8947 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
8948
8949 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
8950 }
8951
8952 if (Result->getNumValues() == 1)
8953 return Data;
8954
8955 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
8956}
8957
8958static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
8959 SDValue *LWE, bool &IsTexFail) {
8960 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
8961
8962 uint64_t Value = TexFailCtrlConst->getZExtValue();
8963 if (Value) {
8964 IsTexFail = true;
8965 }
8966
8967 SDLoc DL(TexFailCtrlConst);
8968 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
8969 Value &= ~(uint64_t)0x1;
8970 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
8971 Value &= ~(uint64_t)0x2;
8972
8973 return Value == 0;
8974}
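// Assumed TexFailCtrl encoding decoded above: bit 0 -> TFE, bit 1 -> LWE, any
// other set bit is rejected. For example (illustrative):
//   texfailctrl = 0x3 -> TFE = 1, LWE = 1, IsTexFail = true, helper returns true
//   texfailctrl = 0x4 -> unknown bit, helper returns false and the caller keeps
//                        the original intrinsic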
8975
 8976 static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
 8977 MVT PackVectorVT,
8978 SmallVectorImpl<SDValue> &PackedAddrs,
8979 unsigned DimIdx, unsigned EndIdx,
8980 unsigned NumGradients) {
8981 SDLoc DL(Op);
8982 for (unsigned I = DimIdx; I < EndIdx; I++) {
8983 SDValue Addr = Op.getOperand(I);
8984
8985 // Gradients are packed with undef for each coordinate.
8986 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
8987 // 1D: undef,dx/dh; undef,dx/dv
8988 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
8989 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
8990 if (((I + 1) >= EndIdx) ||
8991 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
8992 I == DimIdx + NumGradients - 1))) {
8993 if (Addr.getValueType() != MVT::i16)
8994 Addr = DAG.getBitcast(MVT::i16, Addr);
8995 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
8996 } else {
8997 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
8998 I++;
8999 }
9000 Addr = DAG.getBitcast(MVT::f32, Addr);
9001 PackedAddrs.push_back(Addr);
9002 }
9003}
9004
9005SDValue SITargetLowering::lowerImage(SDValue Op,
 9006 const AMDGPU::ImageDimIntrinsicInfo *Intr,
 9007 SelectionDAG &DAG, bool WithChain) const {
9008 SDLoc DL(Op);
9009 MachineFunction &MF = DAG.getMachineFunction();
9010 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
9011 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
 9012 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
 9013 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
9014 unsigned IntrOpcode = Intr->BaseOpcode;
9015 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
9016 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9017 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9018
9019 SmallVector<EVT, 3> ResultTypes(Op->values());
9020 SmallVector<EVT, 3> OrigResultTypes(Op->values());
9021 bool IsD16 = false;
9022 bool IsG16 = false;
9023 bool IsA16 = false;
9024 SDValue VData;
9025 int NumVDataDwords = 0;
9026 bool AdjustRetType = false;
9027 bool IsAtomicPacked16Bit = false;
9028
9029 // Offset of intrinsic arguments
9030 const unsigned ArgOffset = WithChain ? 2 : 1;
9031
9032 unsigned DMask;
9033 unsigned DMaskLanes = 0;
9034
9035 if (BaseOpcode->Atomic) {
9036 VData = Op.getOperand(2);
9037
9038 IsAtomicPacked16Bit =
9039 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9040 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
9041
9042 bool Is64Bit = VData.getValueSizeInBits() == 64;
9043 if (BaseOpcode->AtomicX2) {
9044 SDValue VData2 = Op.getOperand(3);
9045 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
9046 {VData, VData2});
9047 if (Is64Bit)
9048 VData = DAG.getBitcast(MVT::v4i32, VData);
9049
9050 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9051 DMask = Is64Bit ? 0xf : 0x3;
9052 NumVDataDwords = Is64Bit ? 4 : 2;
9053 } else {
9054 DMask = Is64Bit ? 0x3 : 0x1;
9055 NumVDataDwords = Is64Bit ? 2 : 1;
9056 }
9057 } else {
9058 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
9059 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
9060
9061 if (BaseOpcode->Store) {
9062 VData = Op.getOperand(2);
9063
9064 MVT StoreVT = VData.getSimpleValueType();
9065 if (StoreVT.getScalarType() == MVT::f16) {
9066 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9067 return Op; // D16 is unsupported for this instruction
9068
9069 IsD16 = true;
9070 VData = handleD16VData(VData, DAG, true);
9071 }
9072
9073 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
9074 } else if (!BaseOpcode->NoReturn) {
9075 // Work out the num dwords based on the dmask popcount and underlying type
9076 // and whether packing is supported.
9077 MVT LoadVT = ResultTypes[0].getSimpleVT();
9078 if (LoadVT.getScalarType() == MVT::f16) {
9079 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9080 return Op; // D16 is unsupported for this instruction
9081
9082 IsD16 = true;
9083 }
9084
9085 // Confirm that the return type is large enough for the dmask specified
9086 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
9087 (!LoadVT.isVector() && DMaskLanes > 1))
9088 return Op;
9089
9090 // The sq block of gfx8 and gfx9 do not estimate register use correctly
9091 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
9092 // instructions.
9093 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9094 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9095 NumVDataDwords = (DMaskLanes + 1) / 2;
9096 else
9097 NumVDataDwords = DMaskLanes;
9098
9099 AdjustRetType = true;
9100 }
9101 }
9102
9103 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
 9104 SmallVector<SDValue, 4> VAddrs;
 9105
9106 // Check for 16 bit addresses or derivatives and pack if true.
9107 MVT VAddrVT =
9108 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
9109 MVT VAddrScalarVT = VAddrVT.getScalarType();
9110 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9111 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9112
9113 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
9114 VAddrScalarVT = VAddrVT.getScalarType();
9115 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9116 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9117
9118 // Push back extra arguments.
9119 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
9120 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
9121 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
9122 // Special handling of bias when A16 is on. Bias is of type half but
9123 // occupies full 32-bit.
9124 SDValue Bias = DAG.getBuildVector(
9125 MVT::v2f16, DL,
9126 {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
9127 VAddrs.push_back(Bias);
9128 } else {
9129 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
9130 "Bias needs to be converted to 16 bit in A16 mode");
9131 VAddrs.push_back(Op.getOperand(ArgOffset + I));
9132 }
9133 }
9134
9135 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
9136 // 16 bit gradients are supported, but are tied to the A16 control
9137 // so both gradients and addresses must be 16 bit
9138 LLVM_DEBUG(
9139 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
9140 "require 16 bit args for both gradients and addresses");
9141 return Op;
9142 }
9143
9144 if (IsA16) {
9145 if (!ST->hasA16()) {
9146 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
9147 "support 16 bit addresses\n");
9148 return Op;
9149 }
9150 }
9151
9152 // We've dealt with incorrect input so we know that if IsA16, IsG16
9153 // are set then we have to compress/pack operands (either address,
9154 // gradient or both)
9155 // In the case where a16 and gradients are tied (no G16 support) then we
9156 // have already verified that both IsA16 and IsG16 are true
9157 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
9158 // Activate g16
9159 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
 9160 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
 9161 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
9162 }
9163
9164 // Add gradients (packed or unpacked)
9165 if (IsG16) {
9166 // Pack the gradients
9167 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
9168 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
9169 ArgOffset + Intr->GradientStart,
9170 ArgOffset + Intr->CoordStart, Intr->NumGradients);
9171 } else {
9172 for (unsigned I = ArgOffset + Intr->GradientStart;
9173 I < ArgOffset + Intr->CoordStart; I++)
9174 VAddrs.push_back(Op.getOperand(I));
9175 }
9176
9177 // Add addresses (packed or unpacked)
9178 if (IsA16) {
9179 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
9180 ArgOffset + Intr->CoordStart, VAddrEnd,
9181 0 /* No gradients */);
9182 } else {
9183 // Add uncompressed address
9184 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
9185 VAddrs.push_back(Op.getOperand(I));
9186 }
9187
9188 // If the register allocator cannot place the address registers contiguously
9189 // without introducing moves, then using the non-sequential address encoding
9190 // is always preferable, since it saves VALU instructions and is usually a
9191 // wash in terms of code size or even better.
9192 //
9193 // However, we currently have no way of hinting to the register allocator that
9194 // MIMG addresses should be placed contiguously when it is possible to do so,
9195 // so force non-NSA for the common 2-address case as a heuristic.
9196 //
9197 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
9198 // allocation when possible.
9199 //
9200 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
9201 // set of the remaining addresses.
9202 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
9203 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
9204 const bool UseNSA = ST->hasNSAEncoding() &&
9205 VAddrs.size() >= ST->getNSAThreshold(MF) &&
9206 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
9207 const bool UsePartialNSA =
9208 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
9209
9210 SDValue VAddr;
9211 if (UsePartialNSA) {
9212 VAddr = getBuildDwordsVector(DAG, DL,
9213 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9214 } else if (!UseNSA) {
9215 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
9216 }
9217
9218 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
9219 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
9220 SDValue Unorm;
9221 if (!BaseOpcode->Sampler) {
9222 Unorm = True;
9223 } else {
9224 uint64_t UnormConst =
9225 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
9226
9227 Unorm = UnormConst ? True : False;
9228 }
9229
9230 SDValue TFE;
9231 SDValue LWE;
9232 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
9233 bool IsTexFail = false;
9234 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9235 return Op;
9236
9237 if (IsTexFail) {
9238 if (!DMaskLanes) {
9239 // Expecting to get an error flag since TFC is on - and dmask is 0
9240 // Force dmask to be at least 1 otherwise the instruction will fail
9241 DMask = 0x1;
9242 DMaskLanes = 1;
9243 NumVDataDwords = 1;
9244 }
9245 NumVDataDwords += 1;
9246 AdjustRetType = true;
9247 }
9248
 9249 // Something earlier may have tagged the return type as needing adjustment.
 9250 // This happens if the instruction is a load or has TexFailCtrl flags set.
9251 if (AdjustRetType) {
9252 // NumVDataDwords reflects the true number of dwords required in the return
9253 // type
9254 if (DMaskLanes == 0 && !BaseOpcode->Store) {
9255 // This is a no-op load. This can be eliminated
9256 SDValue Undef = DAG.getPOISON(Op.getValueType());
9257 if (isa<MemSDNode>(Op))
9258 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
9259 return Undef;
9260 }
9261
9262 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
9263 MVT::i32, NumVDataDwords)
9264 : MVT::i32;
9265
9266 ResultTypes[0] = NewVT;
9267 if (ResultTypes.size() == 3) {
9268 // Original result was aggregate type used for TexFailCtrl results
9269 // The actual instruction returns as a vector type which has now been
9270 // created. Remove the aggregate result.
9271 ResultTypes.erase(&ResultTypes[1]);
9272 }
9273 }
9274
9275 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
9276 if (BaseOpcode->Atomic)
9277 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
9278 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
 9279 AMDGPU::CPol::VOLATILE))
 9280 return Op;
9281
 9282 SmallVector<SDValue, 26> Ops;
 9283 if (BaseOpcode->Store || BaseOpcode->Atomic)
9284 Ops.push_back(VData); // vdata
9285 if (UsePartialNSA) {
9286 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
9287 Ops.push_back(VAddr);
9288 } else if (UseNSA)
9289 append_range(Ops, VAddrs);
9290 else
9291 Ops.push_back(VAddr);
9292 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
9293 EVT RsrcVT = Rsrc.getValueType();
9294 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9295 return Op;
9296 Ops.push_back(Rsrc);
9297 if (BaseOpcode->Sampler) {
9298 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
9299 if (Samp.getValueType() != MVT::v4i32)
9300 return Op;
9301 Ops.push_back(Samp);
9302 }
9303 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
9304 if (IsGFX10Plus)
9305 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
9306 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9307 Ops.push_back(Unorm);
9308 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
9309 Ops.push_back(IsA16 && // r128, a16 for gfx9
9310 ST->hasFeature(AMDGPU::FeatureR128A16)
9311 ? True
9312 : False);
9313 if (IsGFX10Plus)
9314 Ops.push_back(IsA16 ? True : False);
9315
9316 if (!Subtarget->hasGFX90AInsts())
9317 Ops.push_back(TFE); // tfe
9318 else if (TFE->getAsZExtVal()) {
9319 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9321 "TFE is not supported on this GPU", DL.getDebugLoc()));
9322 }
9323
9324 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9325 Ops.push_back(LWE); // lwe
9326 if (!IsGFX10Plus)
9327 Ops.push_back(DimInfo->DA ? True : False);
9328 if (BaseOpcode->HasD16)
9329 Ops.push_back(IsD16 ? True : False);
9330 if (isa<MemSDNode>(Op))
9331 Ops.push_back(Op.getOperand(0)); // chain
9332
9333 int NumVAddrDwords =
9334 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
9335 int Opcode = -1;
9336
9337 if (IsGFX12Plus) {
9338 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
9339 NumVDataDwords, NumVAddrDwords);
9340 } else if (IsGFX11Plus) {
9341 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9342 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9343 : AMDGPU::MIMGEncGfx11Default,
9344 NumVDataDwords, NumVAddrDwords);
9345 } else if (IsGFX10Plus) {
9346 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9347 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9348 : AMDGPU::MIMGEncGfx10Default,
9349 NumVDataDwords, NumVAddrDwords);
9350 } else {
9351 if (Subtarget->hasGFX90AInsts()) {
9352 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
9353 NumVDataDwords, NumVAddrDwords);
9354 if (Opcode == -1) {
9355 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9357 "requested image instruction is not supported on this GPU",
9358 DL.getDebugLoc()));
9359
9360 unsigned Idx = 0;
9361 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
9362 for (EVT VT : OrigResultTypes) {
9363 if (VT == MVT::Other)
9364 RetValues[Idx++] = Op.getOperand(0); // Chain
9365 else
9366 RetValues[Idx++] = DAG.getPOISON(VT);
9367 }
9368
9369 return DAG.getMergeValues(RetValues, DL);
9370 }
9371 }
9372 if (Opcode == -1 &&
9373 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9374 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
9375 NumVDataDwords, NumVAddrDwords);
9376 if (Opcode == -1)
9377 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
9378 NumVDataDwords, NumVAddrDwords);
9379 }
9380 if (Opcode == -1)
9381 return Op;
9382
9383 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
9384 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
9385 MachineMemOperand *MemRef = MemOp->getMemOperand();
9386 DAG.setNodeMemRefs(NewNode, {MemRef});
9387 }
9388
9389 if (BaseOpcode->AtomicX2) {
9391 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
9392 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
9393 }
9394 if (BaseOpcode->NoReturn)
9395 return SDValue(NewNode, 0);
9396 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
9397 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9398 NumVDataDwords, IsAtomicPacked16Bit, DL);
9399}
9400
9401SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
9402 SDValue Offset, SDValue CachePolicy,
9403 SelectionDAG &DAG) const {
9404 MachineFunction &MF = DAG.getMachineFunction();
9405
9406 const DataLayout &DataLayout = DAG.getDataLayout();
9407 Align Alignment =
9408 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
9409
9410 MachineMemOperand *MMO = MF.getMachineMemOperand(
9411 MachinePointerInfo(),
 9412 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
 9413 MachineMemOperand::MOInvariant,
 9414 VT.getStoreSize(), Alignment);
9415
9416 if (!Offset->isDivergent()) {
9417 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
9418
9419 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
9420 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
9421 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
9422 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
9423 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9424 SDValue BufferLoad =
 9425 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
 9426 DAG.getVTList(MVT::i32), Ops, VT, MMO);
9427 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
9428 }
9429
9430 // Widen vec3 load to vec4.
9431 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
9432 !Subtarget->hasScalarDwordx3Loads()) {
9433 EVT WidenedVT =
 9434 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
 9435 auto WidenedOp = DAG.getMemIntrinsicNode(
9436 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
9437 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
9438 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
9439 DAG.getVectorIdxConstant(0, DL));
9440 return Subvector;
9441 }
9442
 9443 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
 9444 DAG.getVTList(VT), Ops, VT, MMO);
9445 }
9446
9447 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
9448 // assume that the buffer is unswizzled.
9449 SDValue Ops[] = {
9450 DAG.getEntryNode(), // Chain
9451 Rsrc, // rsrc
9452 DAG.getConstant(0, DL, MVT::i32), // vindex
9453 {}, // voffset
9454 {}, // soffset
9455 {}, // offset
9456 CachePolicy, // cachepolicy
9457 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9458 };
9459 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9460 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
9461 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
9462 }
9463
 9464 SmallVector<SDValue, 4> Loads;
 9465 unsigned NumLoads = 1;
9466 MVT LoadVT = VT.getSimpleVT();
9467 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
9468 assert((LoadVT.getScalarType() == MVT::i32 ||
9469 LoadVT.getScalarType() == MVT::f32));
9470
9471 if (NumElts == 8 || NumElts == 16) {
9472 NumLoads = NumElts / 4;
9473 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
9474 }
9475
9476 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
9477
9478 // Use the alignment to ensure that the required offsets will fit into the
9479 // immediate offsets.
9480 setBufferOffsets(Offset, DAG, &Ops[3],
9481 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
9482
9483 uint64_t InstOffset = Ops[5]->getAsZExtVal();
9484 for (unsigned i = 0; i < NumLoads; ++i) {
9485 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
9486 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
9487 LoadVT, MMO, DAG));
9488 }
9489
9490 if (NumElts == 8 || NumElts == 16)
9491 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
9492
9493 return Loads[0];
9494}
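// Example of the divergent-offset split above (illustrative): a v8f32
// s.buffer.load with a VGPR offset becomes two v4f32 AMDGPUISD::BUFFER_LOAD
// nodes at immediate offsets InstOffset and InstOffset + 16, concatenated back
// into v8f32; the Align(16 * NumLoads) passed to setBufferOffsets keeps both
// immediates representable.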
9495
9496SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
9497 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
9498 if (!Subtarget->hasArchitectedSGPRs())
9499 return {};
9500 SDLoc SL(Op);
9501 MVT VT = MVT::i32;
9502 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
9503 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
9504 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
9505}
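// Equivalent scalar form of the BFE above (illustrative):
//   uint32_t waveIdInGroup(uint32_t TTMP8) { return (TTMP8 >> 25) & 0x1F; }
// i.e. a 5-bit field starting at bit 25, matching TTMP8[29:25].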
9506
9507SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
9508 unsigned Dim,
9509 const ArgDescriptor &Arg) const {
9510 SDLoc SL(Op);
9511 MachineFunction &MF = DAG.getMachineFunction();
9512 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
9513 if (MaxID == 0)
9514 return DAG.getConstant(0, SL, MVT::i32);
9515
9516 // It's undefined behavior if a function marked with the amdgpu-no-*
9517 // attributes uses the corresponding intrinsic.
9518 if (!Arg)
9519 return DAG.getPOISON(Op->getValueType(0));
9520
9521 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
9522 SDLoc(DAG.getEntryNode()), Arg);
9523
9524 // Don't bother inserting AssertZext for packed IDs since we're emitting the
9525 // masking operations anyway.
9526 //
9527 // TODO: We could assert the top bit is 0 for the source copy.
9528 if (Arg.isMasked())
9529 return Val;
9530
9531 // Preserve the known bits after expansion to a copy.
9532 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
9533 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
9534 DAG.getValueType(SmallVT));
9535}
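// Example of the known-bits annotation above (illustrative): with a maximum
// workitem id of 1023, llvm::bit_width(1023) == 10, so the copy is tagged
// AssertZext with an i10 value type and later zero-extensions fold away.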
9536
9537SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
9538 SelectionDAG &DAG) const {
9539 MachineFunction &MF = DAG.getMachineFunction();
9540 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
9541
9542 EVT VT = Op.getValueType();
9543 SDLoc DL(Op);
9544 unsigned IntrinsicID = Op.getConstantOperandVal(0);
9545
9546 // TODO: Should this propagate fast-math-flags?
9547
9548 switch (IntrinsicID) {
9549 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9550 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
9551 return emitNonHSAIntrinsicError(DAG, DL, VT);
9552 return getPreloadedValue(DAG, *MFI, VT,
 9553 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
 9554 }
9555 case Intrinsic::amdgcn_dispatch_ptr:
9556 case Intrinsic::amdgcn_queue_ptr: {
9557 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
9558 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9559 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
9560 DL.getDebugLoc()));
9561 return DAG.getPOISON(VT);
9562 }
9563
9564 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
 9565 ? AMDGPUFunctionArgInfo::DISPATCH_PTR
 9566 : AMDGPUFunctionArgInfo::QUEUE_PTR;
 9567 return getPreloadedValue(DAG, *MFI, VT, RegID);
9568 }
9569 case Intrinsic::amdgcn_implicitarg_ptr: {
9570 if (MFI->isEntryFunction())
9571 return getImplicitArgPtr(DAG, DL);
9572 return getPreloadedValue(DAG, *MFI, VT,
 9573 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
 9574 }
9575 case Intrinsic::amdgcn_kernarg_segment_ptr: {
 9576 if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
 9577 // This only makes sense to call in a kernel, so just lower to null.
9578 return DAG.getConstant(0, DL, VT);
9579 }
9580
9581 return getPreloadedValue(DAG, *MFI, VT,
 9582 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
 9583 }
9584 case Intrinsic::amdgcn_dispatch_id: {
9585 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
9586 }
9587 case Intrinsic::amdgcn_rcp:
9588 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
9589 case Intrinsic::amdgcn_rsq:
9590 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9591 case Intrinsic::amdgcn_rsq_legacy:
9592 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9593 return emitRemovedIntrinsicError(DAG, DL, VT);
9594 return SDValue();
9595 case Intrinsic::amdgcn_rcp_legacy:
9596 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9597 return emitRemovedIntrinsicError(DAG, DL, VT);
9598 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
9599 case Intrinsic::amdgcn_rsq_clamp: {
9600 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9601 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
9602
9603 Type *Type = VT.getTypeForEVT(*DAG.getContext());
9604 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
9605 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
9606
9607 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9608 SDValue Tmp =
9609 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
9610 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
9611 DAG.getConstantFP(Min, DL, VT));
9612 }
9613 case Intrinsic::r600_read_ngroups_x:
9614 if (Subtarget->isAmdHsaOS())
9615 return emitNonHSAIntrinsicError(DAG, DL, VT);
9616
9617 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9619 false);
9620 case Intrinsic::r600_read_ngroups_y:
9621 if (Subtarget->isAmdHsaOS())
9622 return emitNonHSAIntrinsicError(DAG, DL, VT);
9623
9624 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9626 false);
9627 case Intrinsic::r600_read_ngroups_z:
9628 if (Subtarget->isAmdHsaOS())
9629 return emitNonHSAIntrinsicError(DAG, DL, VT);
9630
9631 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9633 false);
9634 case Intrinsic::r600_read_local_size_x:
9635 if (Subtarget->isAmdHsaOS())
9636 return emitNonHSAIntrinsicError(DAG, DL, VT);
9637
9638 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9640 case Intrinsic::r600_read_local_size_y:
9641 if (Subtarget->isAmdHsaOS())
9642 return emitNonHSAIntrinsicError(DAG, DL, VT);
9643
9644 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9646 case Intrinsic::r600_read_local_size_z:
9647 if (Subtarget->isAmdHsaOS())
9648 return emitNonHSAIntrinsicError(DAG, DL, VT);
9649
9650 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9652 case Intrinsic::amdgcn_workgroup_id_x:
9653 return getPreloadedValue(DAG, *MFI, VT,
 9654 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
 9655 case Intrinsic::amdgcn_workgroup_id_y:
9656 return getPreloadedValue(DAG, *MFI, VT,
 9657 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
 9658 case Intrinsic::amdgcn_workgroup_id_z:
9659 return getPreloadedValue(DAG, *MFI, VT,
 9660 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
 9661 case Intrinsic::amdgcn_wave_id:
9662 return lowerWaveID(DAG, Op);
9663 case Intrinsic::amdgcn_lds_kernel_id: {
9664 if (MFI->isEntryFunction())
9665 return getLDSKernelId(DAG, DL);
9666 return getPreloadedValue(DAG, *MFI, VT,
 9667 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
 9668 }
9669 case Intrinsic::amdgcn_workitem_id_x:
9670 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
9671 case Intrinsic::amdgcn_workitem_id_y:
9672 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
9673 case Intrinsic::amdgcn_workitem_id_z:
9674 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
9675 case Intrinsic::amdgcn_wavefrontsize:
9676 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
9677 SDLoc(Op), MVT::i32);
9678 case Intrinsic::amdgcn_s_buffer_load: {
9679 unsigned CPol = Op.getConstantOperandVal(3);
9680 // s_buffer_load, because of how it's optimized, can't be volatile
9681 // so reject ones with the volatile bit set.
9682 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
 9683 ? AMDGPU::CPol::ALL
 9684 : AMDGPU::CPol::ALL_pregfx12))
 9685 return Op;
9686 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
9687 Op.getOperand(3), DAG);
9688 }
9689 case Intrinsic::amdgcn_fdiv_fast:
9690 return lowerFDIV_FAST(Op, DAG);
9691 case Intrinsic::amdgcn_sin:
9692 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
9693
9694 case Intrinsic::amdgcn_cos:
9695 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
9696
9697 case Intrinsic::amdgcn_mul_u24:
9698 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
9699 Op.getOperand(2));
9700 case Intrinsic::amdgcn_mul_i24:
9701 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
9702 Op.getOperand(2));
9703
9704 case Intrinsic::amdgcn_log_clamp: {
9705 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9706 return SDValue();
9707
9708 return emitRemovedIntrinsicError(DAG, DL, VT);
9709 }
9710 case Intrinsic::amdgcn_fract:
9711 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
9712
9713 case Intrinsic::amdgcn_class:
9714 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
9715 Op.getOperand(2));
9716 case Intrinsic::amdgcn_div_fmas:
9717 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
9718 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9719
9720 case Intrinsic::amdgcn_div_fixup:
9721 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
9722 Op.getOperand(2), Op.getOperand(3));
9723
9724 case Intrinsic::amdgcn_div_scale: {
9725 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
9726
9727 // Translate to the operands expected by the machine instruction. The
9728 // first parameter must be the same as the first instruction.
9729 SDValue Numerator = Op.getOperand(1);
9730 SDValue Denominator = Op.getOperand(2);
9731
9732 // Note this order is opposite of the machine instruction's operations,
9733 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
9734 // intrinsic has the numerator as the first operand to match a normal
9735 // division operation.
9736
9737 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
9738
9739 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
9740 Denominator, Numerator);
9741 }
9742 case Intrinsic::amdgcn_icmp: {
9743 // There is a Pat that handles this variant, so return it as-is.
9744 if (Op.getOperand(1).getValueType() == MVT::i1 &&
9745 Op.getConstantOperandVal(2) == 0 &&
9746 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
9747 return Op;
9748 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
9749 }
9750 case Intrinsic::amdgcn_fcmp: {
9751 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
9752 }
9753 case Intrinsic::amdgcn_ballot:
9754 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
9755 case Intrinsic::amdgcn_fmed3:
9756 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
9757 Op.getOperand(2), Op.getOperand(3));
9758 case Intrinsic::amdgcn_fdot2:
9759 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
9760 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9761 case Intrinsic::amdgcn_fmul_legacy:
9762 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
9763 Op.getOperand(2));
9764 case Intrinsic::amdgcn_sffbh:
9765 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
9766 case Intrinsic::amdgcn_sbfe:
9767 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
9768 Op.getOperand(2), Op.getOperand(3));
9769 case Intrinsic::amdgcn_ubfe:
9770 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
9771 Op.getOperand(2), Op.getOperand(3));
9772 case Intrinsic::amdgcn_cvt_pkrtz:
9773 case Intrinsic::amdgcn_cvt_pknorm_i16:
9774 case Intrinsic::amdgcn_cvt_pknorm_u16:
9775 case Intrinsic::amdgcn_cvt_pk_i16:
9776 case Intrinsic::amdgcn_cvt_pk_u16: {
9777 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
9778 EVT VT = Op.getValueType();
9779 unsigned Opcode;
9780
 9781 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
 9782 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
 9783 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
 9784 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
 9785 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
 9786 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
 9787 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
 9788 Opcode = AMDGPUISD::CVT_PK_I16_I32;
 9789 else
 9790 Opcode = AMDGPUISD::CVT_PK_U16_U32;
 9791
9792 if (isTypeLegal(VT))
9793 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
9794
9795 SDValue Node =
9796 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
9797 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
9798 }
9799 case Intrinsic::amdgcn_fmad_ftz:
9800 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
9801 Op.getOperand(2), Op.getOperand(3));
9802
9803 case Intrinsic::amdgcn_if_break:
9804 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
9805 Op->getOperand(1), Op->getOperand(2)),
9806 0);
9807
9808 case Intrinsic::amdgcn_groupstaticsize: {
9809 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
9810 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
9811 return Op;
9812
9813 const Module *M = MF.getFunction().getParent();
9814 const GlobalValue *GV =
9815 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
9816 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
9817 SIInstrInfo::MO_ABS32_LO);
9818 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
9819 }
9820 case Intrinsic::amdgcn_is_shared:
9821 case Intrinsic::amdgcn_is_private: {
9822 SDLoc SL(Op);
9823 SDValue SrcVec =
9824 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
9825 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
9826 DAG.getConstant(1, SL, MVT::i32));
9827
9828 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
9829 ? AMDGPUAS::LOCAL_ADDRESS
9830 : AMDGPUAS::PRIVATE_ADDRESS;
9831 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
9832 Subtarget->hasGloballyAddressableScratch()) {
9833 SDValue FlatScratchBaseHi(
9834 DAG.getMachineNode(
9835 AMDGPU::S_MOV_B32, DL, MVT::i32,
9836 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
9837 0);
9838 // Test bits 63..58 against the aperture address.
9839 return DAG.getSetCC(
9840 SL, MVT::i1,
9841 DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
9842 DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
9843 }
9844
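// Otherwise compare the pointer's high dword with the aperture base of the
// queried address space (LDS for is_shared, scratch for is_private).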
9845 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
9846 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
9847 }
9848 case Intrinsic::amdgcn_perm:
9849 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
9850 Op.getOperand(2), Op.getOperand(3));
9851 case Intrinsic::amdgcn_reloc_constant: {
9852 Module *M = MF.getFunction().getParent();
9853 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
9854 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
9855 auto *RelocSymbol = cast<GlobalVariable>(
9856 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
9857 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
9858 SIInstrInfo::MO_ABS32_LO);
9859 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
9860 }
9861 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
9862 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
9863 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
9864 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
9865 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
9866 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
9867 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
9868 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
9869 if (Op.getOperand(4).getValueType() == MVT::i32)
9870 return SDValue();
9871
9872 SDLoc SL(Op);
9873 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
9874 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
9875 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9876 Op.getOperand(3), IndexKeyi32);
9877 }
9878 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
9879 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
9880 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
9881 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
9882 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
9883 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
9884 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
9885 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
9886 if (Op.getOperand(4).getValueType() == MVT::i64)
9887 return SDValue();
9888
9889 SDLoc SL(Op);
9890 auto IndexKeyi64 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
9891 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
9892 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9893 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
9894 Op.getOperand(6)});
9895 }
9896 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
9897 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
9898 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
9899 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
9900 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
9901 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
9902 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
9903 ? MVT::i64
9904 : MVT::i32;
9905 if (Op.getOperand(6).getValueType() == IndexKeyTy)
9906 return SDValue();
9907
9908 SDLoc SL(Op);
9909 auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
9910 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
9911 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9912 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
9913 IndexKey, Op.getOperand(7),
9914 Op.getOperand(8)}); // No clamp operand
9915 }
9916 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
9917 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
9918 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
9919 if (Op.getOperand(6).getValueType() == MVT::i32)
9920 return SDValue();
9921
9922 SDLoc SL(Op);
9923 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
9924 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
9925 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
9926 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
9927 IndexKeyi32, Op.getOperand(7)});
9928 }
9929 case Intrinsic::amdgcn_addrspacecast_nonnull:
9930 return lowerADDRSPACECAST(Op, DAG);
9931 case Intrinsic::amdgcn_readlane:
9932 case Intrinsic::amdgcn_readfirstlane:
9933 case Intrinsic::amdgcn_writelane:
9934 case Intrinsic::amdgcn_permlane16:
9935 case Intrinsic::amdgcn_permlanex16:
9936 case Intrinsic::amdgcn_permlane64:
9937 case Intrinsic::amdgcn_set_inactive:
9938 case Intrinsic::amdgcn_set_inactive_chain_arg:
9939 case Intrinsic::amdgcn_mov_dpp8:
9940 case Intrinsic::amdgcn_update_dpp:
9941 return lowerLaneOp(*this, Op.getNode(), DAG);
9942 case Intrinsic::amdgcn_dead: {
9943 SmallVector<SDValue, 8> Poisons;
9944 for (const EVT ValTy : Op.getNode()->values())
9945 Poisons.push_back(DAG.getPOISON(ValTy));
9946 return DAG.getMergeValues(Poisons, SDLoc(Op));
9947 }
9948 default:
9949 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9950 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
9951 return lowerImage(Op, ImageDimIntr, DAG, false);
9952
9953 return Op;
9954 }
9955}
9956
9957 // On targets that do not support a constant in the soffset field, turn a zero
9958 // soffset into SGPR_NULL to avoid generating an extra s_mov with zero.
9959 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
9960 const GCNSubtarget *Subtarget) {
9961 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
9962 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
9963 return SOffset;
9964}
9965
9966SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
9967 SelectionDAG &DAG,
9968 unsigned NewOpcode) const {
9969 SDLoc DL(Op);
9970
9971 SDValue VData = Op.getOperand(2);
9972 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9973 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
9974 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
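// e.g. llvm.amdgcn.raw.buffer.atomic.add.i32(%val, %rsrc, %off, %soff, 0)
// maps onto the operand list below, with vindex forced to zero and idxen = 0.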
9975 SDValue Ops[] = {
9976 Op.getOperand(0), // Chain
9977 VData, // vdata
9978 Rsrc, // rsrc
9979 DAG.getConstant(0, DL, MVT::i32), // vindex
9980 VOffset, // voffset
9981 SOffset, // soffset
9982 Offset, // offset
9983 Op.getOperand(6), // cachepolicy
9984 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9985 };
9986
9987 auto *M = cast<MemSDNode>(Op);
9988
9989 EVT MemVT = VData.getValueType();
9990 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
9991 M->getMemOperand());
9992}
9993
9994SDValue
9995SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
9996 unsigned NewOpcode) const {
9997 SDLoc DL(Op);
9998
9999 SDValue VData = Op.getOperand(2);
10000 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10001 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10002 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10003 SDValue Ops[] = {
10004 Op.getOperand(0), // Chain
10005 VData, // vdata
10006 Rsrc, // rsrc
10007 Op.getOperand(4), // vindex
10008 VOffset, // voffset
10009 SOffset, // soffset
10010 Offset, // offset
10011 Op.getOperand(7), // cachepolicy
10012 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10013 };
10014
10015 auto *M = cast<MemSDNode>(Op);
10016
10017 EVT MemVT = VData.getValueType();
10018 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10019 M->getMemOperand());
10020}
10021
10022SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
10023 SelectionDAG &DAG) const {
10024 unsigned IntrID = Op.getConstantOperandVal(1);
10025 SDLoc DL(Op);
10026
10027 switch (IntrID) {
10028 case Intrinsic::amdgcn_ds_ordered_add:
10029 case Intrinsic::amdgcn_ds_ordered_swap: {
10030 MemSDNode *M = cast<MemSDNode>(Op);
10031 SDValue Chain = M->getOperand(0);
10032 SDValue M0 = M->getOperand(2);
10033 SDValue Value = M->getOperand(3);
10034 unsigned IndexOperand = M->getConstantOperandVal(7);
10035 unsigned WaveRelease = M->getConstantOperandVal(8);
10036 unsigned WaveDone = M->getConstantOperandVal(9);
10037
10038 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10039 IndexOperand &= ~0x3f;
10040 unsigned CountDw = 0;
10041
10042 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
10043 CountDw = (IndexOperand >> 24) & 0xf;
10044 IndexOperand &= ~(0xf << 24);
10045
10046 if (CountDw < 1 || CountDw > 4) {
10047 const Function &Fn = DAG.getMachineFunction().getFunction();
10048 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10049 Fn, "ds_ordered_count: dword count must be between 1 and 4",
10050 DL.getDebugLoc()));
10051 CountDw = 1;
10052 }
10053 }
10054
10055 if (IndexOperand) {
10056 const Function &Fn = DAG.getMachineFunction().getFunction();
10057 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10058 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
10059 }
10060
10061 if (WaveDone && !WaveRelease) {
10062 // TODO: Move this to IR verifier
10063 const Function &Fn = DAG.getMachineFunction().getFunction();
10064 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10065 Fn, "ds_ordered_count: wave_done requires wave_release",
10066 DL.getDebugLoc()));
10067 }
10068
10069 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10070 unsigned ShaderType =
10071 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
10072 unsigned Offset0 = OrderedCountIndex << 2;
10073 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10074
10075 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
10076 Offset1 |= (CountDw - 1) << 6;
10077
10078 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
10079 Offset1 |= ShaderType << 2;
10080
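// The resulting 16-bit offset packs: bits 7..0 = ordered-count index * 4,
// bit 8 = wave_release, bit 9 = wave_done, bits 11..10 = shader type
// (pre-GFX11 only), bit 12 = instruction (0 = add, 1 = swap), and
// bits 15..14 = dword count - 1 (GFX10+).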
10081 unsigned Offset = Offset0 | (Offset1 << 8);
10082
10083 SDValue Ops[] = {
10084 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
10085 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
10086 };
10087 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
10088 M->getVTList(), Ops, M->getMemoryVT(),
10089 M->getMemOperand());
10090 }
10091 case Intrinsic::amdgcn_raw_buffer_load:
10092 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10093 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10094 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10095 case Intrinsic::amdgcn_raw_buffer_load_format:
10096 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10097 const bool IsFormat =
10098 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10099 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10100
10101 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10102 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10103 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10104 SDValue Ops[] = {
10105 Op.getOperand(0), // Chain
10106 Rsrc, // rsrc
10107 DAG.getConstant(0, DL, MVT::i32), // vindex
10108 VOffset, // voffset
10109 SOffset, // soffset
10110 Offset, // offset
10111 Op.getOperand(5), // cachepolicy, swizzled buffer
10112 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10113 };
10114
10115 auto *M = cast<MemSDNode>(Op);
10116 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
10117 }
10118 case Intrinsic::amdgcn_struct_buffer_load:
10119 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10120 case Intrinsic::amdgcn_struct_buffer_load_format:
10121 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10122 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10123 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10124 const bool IsFormat =
10125 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10126 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10127
10128 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10129 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10130 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10131 SDValue Ops[] = {
10132 Op.getOperand(0), // Chain
10133 Rsrc, // rsrc
10134 Op.getOperand(3), // vindex
10135 VOffset, // voffset
10136 SOffset, // soffset
10137 Offset, // offset
10138 Op.getOperand(6), // cachepolicy, swizzled buffer
10139 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10140 };
10141
10142 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
10143 }
10144 case Intrinsic::amdgcn_raw_tbuffer_load:
10145 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10146 MemSDNode *M = cast<MemSDNode>(Op);
10147 EVT LoadVT = Op.getValueType();
10148 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10149 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10150 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10151
10152 SDValue Ops[] = {
10153 Op.getOperand(0), // Chain
10154 Rsrc, // rsrc
10155 DAG.getConstant(0, DL, MVT::i32), // vindex
10156 VOffset, // voffset
10157 SOffset, // soffset
10158 Offset, // offset
10159 Op.getOperand(5), // format
10160 Op.getOperand(6), // cachepolicy, swizzled buffer
10161 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10162 };
10163
10164 if (LoadVT.getScalarType() == MVT::f16)
10165 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10166 Ops);
10167 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10168 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10169 DAG);
10170 }
10171 case Intrinsic::amdgcn_struct_tbuffer_load:
10172 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10173 MemSDNode *M = cast<MemSDNode>(Op);
10174 EVT LoadVT = Op.getValueType();
10175 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10176 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10177 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10178
10179 SDValue Ops[] = {
10180 Op.getOperand(0), // Chain
10181 Rsrc, // rsrc
10182 Op.getOperand(3), // vindex
10183 VOffset, // voffset
10184 SOffset, // soffset
10185 Offset, // offset
10186 Op.getOperand(6), // format
10187 Op.getOperand(7), // cachepolicy, swizzled buffer
10188 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10189 };
10190
10191 if (LoadVT.getScalarType() == MVT::f16)
10192 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10193 Ops);
10194 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10195 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10196 DAG);
10197 }
10198 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10199 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10200 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
10201 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10202 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10203 return lowerStructBufferAtomicIntrin(Op, DAG,
10204 AMDGPUISD::BUFFER_ATOMIC_FADD);
10205 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10206 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10207 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
10208 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10209 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10210 return lowerStructBufferAtomicIntrin(Op, DAG,
10211 AMDGPUISD::BUFFER_ATOMIC_FMIN);
10212 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10213 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10214 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
10215 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10216 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10217 return lowerStructBufferAtomicIntrin(Op, DAG,
10218 AMDGPUISD::BUFFER_ATOMIC_FMAX);
10219 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10220 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10221 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
10222 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10223 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10224 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10225 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10226 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10227 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10228 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10229 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10230 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
10231 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10232 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10233 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
10234 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10235 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10236 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
10237 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10238 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10239 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
10240 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10241 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10242 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10243 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10244 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10245 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10246 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10247 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10248 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10249 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10250 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10251 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10252 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10253 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10254 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10255 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10256 return lowerRawBufferAtomicIntrin(Op, DAG,
10257 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10258 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10259 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10260 return lowerStructBufferAtomicIntrin(Op, DAG,
10261 AMDGPUISD::BUFFER_ATOMIC_SWAP);
10262 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10263 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10264 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10265 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10266 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10267 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10268 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10269 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10270 return lowerStructBufferAtomicIntrin(Op, DAG,
10271 AMDGPUISD::BUFFER_ATOMIC_SMIN);
10272 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10273 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10274 return lowerStructBufferAtomicIntrin(Op, DAG,
10275 AMDGPUISD::BUFFER_ATOMIC_UMIN);
10276 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10277 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10278 return lowerStructBufferAtomicIntrin(Op, DAG,
10279 AMDGPUISD::BUFFER_ATOMIC_SMAX);
10280 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10281 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10282 return lowerStructBufferAtomicIntrin(Op, DAG,
10283 AMDGPUISD::BUFFER_ATOMIC_UMAX);
10284 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10285 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10286 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10287 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10288 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10289 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10290 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10291 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10292 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10293 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10294 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10295 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10296 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10297 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10298 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10299 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10300 return lowerStructBufferAtomicIntrin(Op, DAG,
10301 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10302
10303 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10304 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10305 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
10306 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10307 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10308 SDValue Ops[] = {
10309 Op.getOperand(0), // Chain
10310 Op.getOperand(2), // src
10311 Op.getOperand(3), // cmp
10312 Rsrc, // rsrc
10313 DAG.getConstant(0, DL, MVT::i32), // vindex
10314 VOffset, // voffset
10315 SOffset, // soffset
10316 Offset, // offset
10317 Op.getOperand(7), // cachepolicy
10318 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10319 };
10320 EVT VT = Op.getValueType();
10321 auto *M = cast<MemSDNode>(Op);
10322
10323 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10324 Op->getVTList(), Ops, VT,
10325 M->getMemOperand());
10326 }
10327 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10328 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10329 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
10330 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
10331 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
10332 SDValue Ops[] = {
10333 Op.getOperand(0), // Chain
10334 Op.getOperand(2), // src
10335 Op.getOperand(3), // cmp
10336 Rsrc, // rsrc
10337 Op.getOperand(5), // vindex
10338 VOffset, // voffset
10339 SOffset, // soffset
10340 Offset, // offset
10341 Op.getOperand(8), // cachepolicy
10342 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10343 };
10344 EVT VT = Op.getValueType();
10345 auto *M = cast<MemSDNode>(Op);
10346
10347 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10348 Op->getVTList(), Ops, VT,
10349 M->getMemOperand());
10350 }
10351 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10352 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10353 MemSDNode *M = cast<MemSDNode>(Op);
10354 SDValue NodePtr = M->getOperand(2);
10355 SDValue RayExtent = M->getOperand(3);
10356 SDValue InstanceMask = M->getOperand(4);
10357 SDValue RayOrigin = M->getOperand(5);
10358 SDValue RayDir = M->getOperand(6);
10359 SDValue Offsets = M->getOperand(7);
10360 SDValue TDescr = M->getOperand(8);
10361
10362 assert(NodePtr.getValueType() == MVT::i64);
10363 assert(RayDir.getValueType() == MVT::v3f32);
10364
10365 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10366 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10367 return SDValue();
10368 }
10369
10370 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10371 const unsigned NumVDataDwords = 10;
10372 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10373 int Opcode = AMDGPU::getMIMGOpcode(
10374 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10375 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10376 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10377 assert(Opcode != -1);
10378
10379 SmallVector<SDValue, 7> Ops;
10380 Ops.push_back(NodePtr);
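// The ray extent and the instance mask share one v2i32 vaddr operand:
// dword 0 is the f32 extent bitcast to i32, dword 1 the any-extended mask.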
10381 Ops.push_back(DAG.getBuildVector(
10382 MVT::v2i32, DL,
10383 {DAG.getBitcast(MVT::i32, RayExtent),
10384 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10385 Ops.push_back(RayOrigin);
10386 Ops.push_back(RayDir);
10387 Ops.push_back(Offsets);
10388 Ops.push_back(TDescr);
10389 Ops.push_back(M->getChain());
10390
10391 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10392 MachineMemOperand *MemRef = M->getMemOperand();
10393 DAG.setNodeMemRefs(NewNode, {MemRef});
10394 return SDValue(NewNode, 0);
10395 }
10396 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10397 MemSDNode *M = cast<MemSDNode>(Op);
10398 SDValue NodePtr = M->getOperand(2);
10399 SDValue RayExtent = M->getOperand(3);
10400 SDValue RayOrigin = M->getOperand(4);
10401 SDValue RayDir = M->getOperand(5);
10402 SDValue RayInvDir = M->getOperand(6);
10403 SDValue TDescr = M->getOperand(7);
10404
10405 assert(NodePtr.getValueType() == MVT::i32 ||
10406 NodePtr.getValueType() == MVT::i64);
10407 assert(RayDir.getValueType() == MVT::v3f16 ||
10408 RayDir.getValueType() == MVT::v3f32);
10409
10410 if (!Subtarget->hasGFX10_AEncoding()) {
10411 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10412 return SDValue();
10413 }
10414
10415 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
10416 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
10417 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10418 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
10419 const bool Is64 = NodePtr.getValueType() == MVT::i64;
10420 const unsigned NumVDataDwords = 4;
10421 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10422 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10423 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10424 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
10425 IsGFX12Plus;
10426 const unsigned BaseOpcodes[2][2] = {
10427 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10428 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10429 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10430 int Opcode;
10431 if (UseNSA) {
10432 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10433 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10434 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10435 : AMDGPU::MIMGEncGfx10NSA,
10436 NumVDataDwords, NumVAddrDwords);
10437 } else {
10438 assert(!IsGFX12Plus);
10439 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10440 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10441 : AMDGPU::MIMGEncGfx10Default,
10442 NumVDataDwords, NumVAddrDwords);
10443 }
10444 assert(Opcode != -1);
10445
10446 SmallVector<SDValue, 16> Ops;
10447
10448 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
10449 SmallVector<SDValue, 3> Lanes;
10450 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
10451 if (Lanes[0].getValueSizeInBits() == 32) {
10452 for (unsigned I = 0; I < 3; ++I)
10453 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
10454 } else {
10455 if (IsAligned) {
10456 Ops.push_back(DAG.getBitcast(
10457 MVT::i32,
10458 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
10459 Ops.push_back(Lanes[2]);
10460 } else {
10461 SDValue Elt0 = Ops.pop_back_val();
10462 Ops.push_back(DAG.getBitcast(
10463 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
10464 Ops.push_back(DAG.getBitcast(
10465 MVT::i32,
10466 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
10467 }
10468 }
10469 };
10470
10471 if (UseNSA && IsGFX11Plus) {
10472 Ops.push_back(NodePtr);
10473 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10474 Ops.push_back(RayOrigin);
10475 if (IsA16) {
10476 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
10477 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
10478 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
10479 for (unsigned I = 0; I < 3; ++I) {
10480 MergedLanes.push_back(DAG.getBitcast(
10481 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
10482 {DirLanes[I], InvDirLanes[I]})));
10483 }
10484 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
10485 } else {
10486 Ops.push_back(RayDir);
10487 Ops.push_back(RayInvDir);
10488 }
10489 } else {
10490 if (Is64)
10491 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
10492 2);
10493 else
10494 Ops.push_back(NodePtr);
10495
10496 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10497 packLanes(RayOrigin, true);
10498 packLanes(RayDir, true);
10499 packLanes(RayInvDir, false);
10500 }
10501
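// Without NSA the vaddr dwords gathered so far must live in consecutive
// registers, so they are merged into a single build_vector operand below;
// e.g. a 64-bit node pointer with f32 ray data needs 12 vaddr dwords.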
10502 if (!UseNSA) {
10503 // Build a single vector containing all the operands so far prepared.
10504 if (NumVAddrDwords > 12) {
10505 SDValue Undef = DAG.getPOISON(MVT::i32);
10506 Ops.append(16 - Ops.size(), Undef);
10507 }
10508 assert(Ops.size() >= 8 && Ops.size() <= 12);
10509 SDValue MergedOps =
10510 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
10511 Ops.clear();
10512 Ops.push_back(MergedOps);
10513 }
10514
10515 Ops.push_back(TDescr);
10516 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
10517 Ops.push_back(M->getChain());
10518
10519 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10520 MachineMemOperand *MemRef = M->getMemOperand();
10521 DAG.setNodeMemRefs(NewNode, {MemRef});
10522 return SDValue(NewNode, 0);
10523 }
10524 case Intrinsic::amdgcn_global_atomic_fmin_num:
10525 case Intrinsic::amdgcn_global_atomic_fmax_num:
10526 case Intrinsic::amdgcn_flat_atomic_fmin_num:
10527 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10528 MemSDNode *M = cast<MemSDNode>(Op);
10529 SDValue Ops[] = {
10530 M->getOperand(0), // Chain
10531 M->getOperand(2), // Ptr
10532 M->getOperand(3) // Value
10533 };
10534 unsigned Opcode = 0;
10535 switch (IntrID) {
10536 case Intrinsic::amdgcn_global_atomic_fmin_num:
10537 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10538 Opcode = ISD::ATOMIC_LOAD_FMIN;
10539 break;
10540 }
10541 case Intrinsic::amdgcn_global_atomic_fmax_num:
10542 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10543 Opcode = ISD::ATOMIC_LOAD_FMAX;
10544 break;
10545 }
10546 default:
10547 llvm_unreachable("unhandled atomic opcode");
10548 }
10549 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
10550 Ops, M->getMemOperand());
10551 }
10552 case Intrinsic::amdgcn_s_get_barrier_state:
10553 case Intrinsic::amdgcn_s_get_named_barrier_state: {
10554 SDValue Chain = Op->getOperand(0);
10555 SmallVector<SDValue, 2> Ops;
10556 unsigned Opc;
10557
10558 if (isa<ConstantSDNode>(Op->getOperand(2))) {
10559 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
10560 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10561 BarID = (BarID >> 4) & 0x3F;
10562 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10563 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10564 Ops.push_back(K);
10565 Ops.push_back(Chain);
10566 } else {
10567 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10568 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
10569 SDValue M0Val;
10570 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
10571 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10572 M0Val = SDValue(
10573 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10574 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10575 0);
10576 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10577 } else
10578 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
10579 }
10580
10581 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10582 return SDValue(NewMI, 0);
10583 }
10584 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
10585 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
10586 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
10587 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
10588 SDValue Chain = Op->getOperand(0);
10589 SDValue Ptr = Op->getOperand(2);
10590 EVT VT = Op->getValueType(0);
10591 return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
10592 Chain, Ptr, MII->getMemOperand());
10593 }
10594 default:
10595
10596 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10597 AMDGPU::getImageDimIntrinsicInfo(IntrID))
10598 return lowerImage(Op, ImageDimIntr, DAG, true);
10599
10600 return SDValue();
10601 }
10602}
10603
10604// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
10605// dwordx4 if on SI and handle TFE loads.
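// For a TFE load of, say, v2f32 the result is widened to three i32 dwords
// (two data plus one status); dwords 0..1 are split back out and bitcast to
// the requested value type, and dword 2 becomes the status result.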
10606SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
10607 SDVTList VTList,
10608 ArrayRef<SDValue> Ops, EVT MemVT,
10609 MachineMemOperand *MMO,
10610 SelectionDAG &DAG) const {
10611 LLVMContext &C = *DAG.getContext();
10612 MachineFunction &MF = DAG.getMachineFunction();
10613 EVT VT = VTList.VTs[0];
10614
10615 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
10616 bool IsTFE = VTList.NumVTs == 3;
10617 if (IsTFE) {
10618 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
10619 unsigned NumOpDWords = NumValueDWords + 1;
10620 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
10621 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
10622 MachineMemOperand *OpDWordsMMO =
10623 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
10624 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
10625 OpDWordsVT, OpDWordsMMO, DAG);
10626 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10627 DAG.getVectorIdxConstant(NumValueDWords, DL));
10628 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
10629 SDValue ValueDWords =
10630 NumValueDWords == 1
10631 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
10632 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
10633 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
10634 ZeroIdx);
10635 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
10636 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10637 }
10638
10639 if (!Subtarget->hasDwordx3LoadStores() &&
10640 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10641 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
10642 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
10643 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
10644 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
10645 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
10646 WidenedMemVT, WidenedMMO);
10647 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
10648 DAG.getVectorIdxConstant(0, DL));
10649 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
10650 }
10651
10652 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
10653}
10654
10655SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
10656 bool ImageStore) const {
10657 EVT StoreVT = VData.getValueType();
10658
10659 // No change for f16 and legal vector D16 types.
10660 if (!StoreVT.isVector())
10661 return VData;
10662
10663 SDLoc DL(VData);
10664 unsigned NumElements = StoreVT.getVectorNumElements();
10665
10666 if (Subtarget->hasUnpackedD16VMem()) {
10667 // We need to unpack the packed data to store.
10668 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10669 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10670
10671 EVT EquivStoreVT =
10672 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
10673 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
10674 return DAG.UnrollVectorOp(ZExt.getNode());
10675 }
10676
10677 // The sq block of gfx8.1 does not estimate register use correctly for d16
10678 // image store instructions. The data operand is computed as if it were not a
10679 // d16 image instruction.
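// e.g. a v3f16 store on such a subtarget becomes a bitcast to v3i16, the
// pairs {e0,e1} and {e2,poison} packed into two i32 dwords, then padded with
// one more poison dword to match the expected register count.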
10680 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
10681 // Bitcast to i16
10682 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10683 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10684
10685 // Decompose into scalars
10686 SmallVector<SDValue, 4> Elts;
10687 DAG.ExtractVectorElements(IntVData, Elts);
10688
10689 // Group pairs of i16 into v2i16 and bitcast to i32
10690 SmallVector<SDValue, 4> PackedElts;
10691 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
10692 SDValue Pair =
10693 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
10694 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10695 PackedElts.push_back(IntPair);
10696 }
10697 if ((NumElements % 2) == 1) {
10698 // Handle v3i16
10699 unsigned I = Elts.size() / 2;
10700 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
10701 {Elts[I * 2], DAG.getPOISON(MVT::i16)});
10702 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10703 PackedElts.push_back(IntPair);
10704 }
10705
10706 // Pad the remaining dwords with poison values
10707 PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));
10708
10709 // Build final vector
10710 EVT VecVT =
10711 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
10712 return DAG.getBuildVector(VecVT, DL, PackedElts);
10713 }
10714
10715 if (NumElements == 3) {
10716 EVT IntStoreVT =
10717 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
10718 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10719
10720 EVT WidenedStoreVT = EVT::getVectorVT(
10721 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
10722 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
10723 WidenedStoreVT.getStoreSizeInBits());
10724 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
10725 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
10726 }
10727
10728 assert(isTypeLegal(StoreVT));
10729 return VData;
10730}
10731
10732SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
10733 SelectionDAG &DAG) const {
10734 SDLoc DL(Op);
10735 SDValue Chain = Op.getOperand(0);
10736 unsigned IntrinsicID = Op.getConstantOperandVal(1);
10737 MachineFunction &MF = DAG.getMachineFunction();
10738
10739 switch (IntrinsicID) {
10740 case Intrinsic::amdgcn_exp_compr: {
10741 if (!Subtarget->hasCompressedExport()) {
10742 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10743 DAG.getMachineFunction().getFunction(),
10744 "intrinsic not supported on subtarget", DL.getDebugLoc()));
10745 }
10746 SDValue Src0 = Op.getOperand(4);
10747 SDValue Src1 = Op.getOperand(5);
10748 // Hack around illegal type on SI by directly selecting it.
10749 if (isTypeLegal(Src0.getValueType()))
10750 return SDValue();
10751
10752 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
10753 SDValue Undef = DAG.getPOISON(MVT::f32);
10754 const SDValue Ops[] = {
10755 Op.getOperand(2), // tgt
10756 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
10757 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
10758 Undef, // src2
10759 Undef, // src3
10760 Op.getOperand(7), // vm
10761 DAG.getTargetConstant(1, DL, MVT::i1), // compr
10762 Op.getOperand(3), // en
10763 Op.getOperand(0) // Chain
10764 };
10765
10766 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
10767 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
10768 }
10769
10770 case Intrinsic::amdgcn_struct_tbuffer_store:
10771 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
10772 SDValue VData = Op.getOperand(2);
10773 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
10774 if (IsD16)
10775 VData = handleD16VData(VData, DAG);
10776 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10777 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10778 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10779 SDValue Ops[] = {
10780 Chain,
10781 VData, // vdata
10782 Rsrc, // rsrc
10783 Op.getOperand(4), // vindex
10784 VOffset, // voffset
10785 SOffset, // soffset
10786 Offset, // offset
10787 Op.getOperand(7), // format
10788 Op.getOperand(8), // cachepolicy, swizzled buffer
10789 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10790 };
10791 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
10792 : AMDGPUISD::TBUFFER_STORE_FORMAT;
10793 MemSDNode *M = cast<MemSDNode>(Op);
10794 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
10795 M->getMemoryVT(), M->getMemOperand());
10796 }
10797
10798 case Intrinsic::amdgcn_raw_tbuffer_store:
10799 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
10800 SDValue VData = Op.getOperand(2);
10801 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
10802 if (IsD16)
10803 VData = handleD16VData(VData, DAG);
10804 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10805 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10806 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10807 SDValue Ops[] = {
10808 Chain,
10809 VData, // vdata
10810 Rsrc, // rsrc
10811 DAG.getConstant(0, DL, MVT::i32), // vindex
10812 VOffset, // voffset
10813 SOffset, // soffset
10814 Offset, // offset
10815 Op.getOperand(6), // format
10816 Op.getOperand(7), // cachepolicy, swizzled buffer
10817 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10818 };
10819 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
10820 : AMDGPUISD::TBUFFER_STORE_FORMAT;
10821 MemSDNode *M = cast<MemSDNode>(Op);
10822 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
10823 M->getMemoryVT(), M->getMemOperand());
10824 }
10825
10826 case Intrinsic::amdgcn_raw_buffer_store:
10827 case Intrinsic::amdgcn_raw_ptr_buffer_store:
10828 case Intrinsic::amdgcn_raw_buffer_store_format:
10829 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
10830 const bool IsFormat =
10831 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
10832 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
10833
10834 SDValue VData = Op.getOperand(2);
10835 EVT VDataVT = VData.getValueType();
10836 EVT EltType = VDataVT.getScalarType();
10837 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
10838 if (IsD16) {
10839 VData = handleD16VData(VData, DAG);
10840 VDataVT = VData.getValueType();
10841 }
10842
10843 if (!isTypeLegal(VDataVT)) {
10844 VData =
10845 DAG.getNode(ISD::BITCAST, DL,
10846 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
10847 }
10848
10849 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10850 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10851 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10852 SDValue Ops[] = {
10853 Chain,
10854 VData,
10855 Rsrc,
10856 DAG.getConstant(0, DL, MVT::i32), // vindex
10857 VOffset, // voffset
10858 SOffset, // soffset
10859 Offset, // offset
10860 Op.getOperand(6), // cachepolicy, swizzled buffer
10861 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10862 };
10863 unsigned Opc =
10864 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT
10865 : AMDGPUISD::BUFFER_STORE;
10866 MemSDNode *M = cast<MemSDNode>(Op);
10867
10868 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
10869 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
10870 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
10871
10872 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
10873 M->getMemoryVT(), M->getMemOperand());
10874 }
10875
10876 case Intrinsic::amdgcn_struct_buffer_store:
10877 case Intrinsic::amdgcn_struct_ptr_buffer_store:
10878 case Intrinsic::amdgcn_struct_buffer_store_format:
10879 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
10880 const bool IsFormat =
10881 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
10882 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
10883
10884 SDValue VData = Op.getOperand(2);
10885 EVT VDataVT = VData.getValueType();
10886 EVT EltType = VDataVT.getScalarType();
10887 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
10888
10889 if (IsD16) {
10890 VData = handleD16VData(VData, DAG);
10891 VDataVT = VData.getValueType();
10892 }
10893
10894 if (!isTypeLegal(VDataVT)) {
10895 VData =
10896 DAG.getNode(ISD::BITCAST, DL,
10897 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
10898 }
10899
10900 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10901 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10902 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10903 SDValue Ops[] = {
10904 Chain,
10905 VData,
10906 Rsrc,
10907 Op.getOperand(4), // vindex
10908 VOffset, // voffset
10909 SOffset, // soffset
10910 Offset, // offset
10911 Op.getOperand(7), // cachepolicy, swizzled buffer
10912 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10913 };
10914 unsigned Opc =
10915 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT
10916 : AMDGPUISD::BUFFER_STORE;
10917 MemSDNode *M = cast<MemSDNode>(Op);
10918
10919 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
10920 EVT VDataType = VData.getValueType().getScalarType();
10921 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
10922 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
10923
10924 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
10925 M->getMemoryVT(), M->getMemOperand());
10926 }
10927 case Intrinsic::amdgcn_raw_buffer_load_lds:
10928 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
10929 case Intrinsic::amdgcn_struct_buffer_load_lds:
10930 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
10931 if (!Subtarget->hasVMemToLDSLoad())
10932 return SDValue();
10933 unsigned Opc;
10934 bool HasVIndex =
10935 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
10936 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
10937 unsigned OpOffset = HasVIndex ? 1 : 0;
10938 SDValue VOffset = Op.getOperand(5 + OpOffset);
10939 bool HasVOffset = !isNullConstant(VOffset);
10940 unsigned Size = Op->getConstantOperandVal(4);
10941
10942 switch (Size) {
10943 default:
10944 return SDValue();
10945 case 1:
10946 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
10947 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
10948 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
10949 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
10950 break;
10951 case 2:
10952 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
10953 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
10954 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
10955 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
10956 break;
10957 case 4:
10958 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
10959 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
10960 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
10961 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
10962 break;
10963 case 12:
10964 if (!Subtarget->hasLDSLoadB96_B128())
10965 return SDValue();
10966 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
10967 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
10968 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
10969 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
10970 break;
10971 case 16:
10972 if (!Subtarget->hasLDSLoadB96_B128())
10973 return SDValue();
10974 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
10975 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
10976 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
10977 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
10978 break;
10979 }
10980
10981 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
10982
10983 SmallVector<SDValue, 8> Ops;
10984
10985 if (HasVIndex && HasVOffset)
10986 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
10987 {Op.getOperand(5), // VIndex
10988 VOffset}));
10989 else if (HasVIndex)
10990 Ops.push_back(Op.getOperand(5));
10991 else if (HasVOffset)
10992 Ops.push_back(VOffset);
10993
10994 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10995 Ops.push_back(Rsrc);
10996 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
10997 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
10998 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10999 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
11000 Ops.push_back(DAG.getTargetConstant(
11001 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
11002 DL, MVT::i8)); // cpol
11003 Ops.push_back(DAG.getTargetConstant(
11004 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
11005 ? 1
11006 : 0,
11007 DL, MVT::i8)); // swz
11008 Ops.push_back(M0Val.getValue(0)); // Chain
11009 Ops.push_back(M0Val.getValue(1)); // Glue
11010
11011 auto *M = cast<MemSDNode>(Op);
11012 MachineMemOperand *LoadMMO = M->getMemOperand();
11013 // Don't set the offset value here because the pointer points to the base of
11014 // the buffer.
11015 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11016
11017 MachinePointerInfo StorePtrI = LoadPtrI;
11018 LoadPtrI.V = PoisonValue::get(
11019 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
11020 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
11021 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
11022
11023 auto F = LoadMMO->getFlags() &
11024 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
11025 LoadMMO =
11026 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
11027 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11028
11029 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11030 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
11031 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11032
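// The resulting machine node carries two memory operands: a load MMO for the
// buffer (source) side and a store MMO for the LDS destination, since this
// instruction both reads and writes memory.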
11033 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
11034 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11035
11036 return SDValue(Load, 0);
11037 }
11038 // Buffers are handled by LowerBufferFatPointers, and we're going to go
11039 // for "trust me" that the remaining cases are global pointers until
11040 // such time as we can put two mem operands on an intrinsic.
11041 case Intrinsic::amdgcn_load_to_lds:
11042 case Intrinsic::amdgcn_global_load_lds: {
11043 if (!Subtarget->hasVMemToLDSLoad())
11044 return SDValue();
11045
11046 unsigned Opc;
11047 unsigned Size = Op->getConstantOperandVal(4);
11048 switch (Size) {
11049 default:
11050 return SDValue();
11051 case 1:
11052 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11053 break;
11054 case 2:
11055 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11056 break;
11057 case 4:
11058 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11059 break;
11060 case 12:
11061 if (!Subtarget->hasLDSLoadB96_B128())
11062 return SDValue();
11063 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11064 break;
11065 case 16:
11066 if (!Subtarget->hasLDSLoadB96_B128())
11067 return SDValue();
11068 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11069 break;
11070 }
11071
11072 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11073
11074 SmallVector<SDValue, 6> Ops;
11075
11076 SDValue Addr = Op.getOperand(2); // Global ptr
11077 SDValue VOffset;
11078 // Try to split SAddr and VOffset. Global and LDS pointers share the same
11079 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
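// e.g. for (add (i64 sgpr_base), (zext (i32 vgpr_off))) the SADDR form is
// used with Addr = sgpr_base and VOffset = vgpr_off; otherwise the full
// 64-bit address is passed as the vaddr operand.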
11080 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
11081 SDValue LHS = Addr.getOperand(0);
11082 SDValue RHS = Addr.getOperand(1);
11083
11084 if (LHS->isDivergent())
11085 std::swap(LHS, RHS);
11086
11087 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
11088 RHS.getOperand(0).getValueType() == MVT::i32) {
11089 // add (i64 sgpr), (zero_extend (i32 vgpr))
11090 Addr = LHS;
11091 VOffset = RHS.getOperand(0);
11092 }
11093 }
11094
11095 Ops.push_back(Addr);
11096 if (!Addr->isDivergent()) {
11097 Opc = AMDGPU::getGlobalSaddrOp(Opc);
11098 if (!VOffset)
11099 VOffset =
11100 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
11101 DAG.getTargetConstant(0, DL, MVT::i32)),
11102 0);
11103 Ops.push_back(VOffset);
11104 }
11105
11106 Ops.push_back(Op.getOperand(5)); // Offset
11107 Ops.push_back(Op.getOperand(6)); // CPol
11108 Ops.push_back(M0Val.getValue(0)); // Chain
11109 Ops.push_back(M0Val.getValue(1)); // Glue
11110
11111 auto *M = cast<MemSDNode>(Op);
11112 MachineMemOperand *LoadMMO = M->getMemOperand();
11113 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11114 LoadPtrI.Offset = Op->getConstantOperandVal(5);
11115 MachinePointerInfo StorePtrI = LoadPtrI;
11116 LoadPtrI.V = PoisonValue::get(
11117 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
11118 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
11119 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
11120 auto F = LoadMMO->getFlags() &
11121 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
11122 LoadMMO =
11123 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
11124 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11125 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11126 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
11127 LoadMMO->getAAInfo());
11128
11129 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11130 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11131
11132 return SDValue(Load, 0);
11133 }
11134 case Intrinsic::amdgcn_end_cf:
11135 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
11136 Op->getOperand(2), Chain),
11137 0);
11138 case Intrinsic::amdgcn_s_barrier_init:
11139 case Intrinsic::amdgcn_s_barrier_signal_var: {
11140 // These two intrinsics have two operands: the barrier pointer and the member count.
11141 SDValue Chain = Op->getOperand(0);
11142 SmallVector<SDValue, 2> Ops;
11143 SDValue BarOp = Op->getOperand(2);
11144 SDValue CntOp = Op->getOperand(3);
11145 SDValue M0Val;
11146 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11147 ? AMDGPU::S_BARRIER_INIT_M0
11148 : AMDGPU::S_BARRIER_SIGNAL_M0;
11149 // extract the BarrierID from bits 4-9 of BarOp
11150 SDValue BarID;
11151 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11152 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11153 BarID =
11154 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
11155 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11156 0);
11157 // Member count should be put into M0[ShAmt:+6]
11158 // Barrier ID should be put into M0[5:0]
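// i.e. M0 is assembled as (member count << 16) | (bits 9:4 of the barrier
// operand), leaving the 6-bit barrier ID in M0's low bits.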
11159 M0Val =
11160 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
11161 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11162 0);
11163 constexpr unsigned ShAmt = 16;
11164 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
11165 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
11166
11167 M0Val = SDValue(
11168 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
11169
11170 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11171
11172 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11173 return SDValue(NewMI, 0);
11174 }
11175 case Intrinsic::amdgcn_s_barrier_join: {
11176 // This intrinsic has one operand: the barrier pointer.
11177 SDValue Chain = Op->getOperand(0);
11178 SmallVector<SDValue, 2> Ops;
11179 SDValue BarOp = Op->getOperand(2);
11180 unsigned Opc;
11181
11182 if (isa<ConstantSDNode>(BarOp)) {
11183 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
11184 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11185
11186 // extract the BarrierID from bits 4-9 of the immediate
11187 unsigned BarID = (BarVal >> 4) & 0x3F;
11188 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
11189 Ops.push_back(K);
11190 Ops.push_back(Chain);
11191 } else {
11192 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11193
11194 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
11195 SDValue M0Val;
11196 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11197 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11198 M0Val =
11199 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
11200 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11201 0);
11202 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11203 }
11204
11205 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11206 return SDValue(NewMI, 0);
11207 }
11208 case Intrinsic::amdgcn_s_prefetch_data: {
11209 // For a non-global address space, preserve the chain and remove the call.
11210 if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
11211 return Op.getOperand(0);
11212 return Op;
11213 }
11214 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11215 SDValue Ops[] = {
11216 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
11217 Op.getOperand(3), // offset
11218 Op.getOperand(4), // length
11219 };
11220
11221 MemSDNode *M = cast<MemSDNode>(Op);
11222 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
11223 Op->getVTList(), Ops, M->getMemoryVT(),
11224 M->getMemOperand());
11225 }
11226 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11227 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11228 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11229 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11230 SDValue Chain = Op->getOperand(0);
11231 SDValue Ptr = Op->getOperand(2);
11232 SDValue Val = Op->getOperand(3);
11233 return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
11234 Ptr, MII->getMemOperand());
11235 }
11236 default: {
11237 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11238 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
11239 return lowerImage(Op, ImageDimIntr, DAG, true);
11240
11241 return Op;
11242 }
11243 }
11244}
11245
11246// Return whether the operation has NoUnsignedWrap property.
11247static bool isNoUnsignedWrap(SDValue Addr) {
11248 return (Addr.getOpcode() == ISD::ADD &&
11249 Addr->getFlags().hasNoUnsignedWrap()) ||
11250 Addr->getOpcode() == ISD::OR;
11251}
11252
11253 bool SITargetLowering::shouldPreservePtrArith(const Function &F,
11254 EVT PtrVT) const {
11255 return UseSelectionDAGPTRADD && PtrVT == MVT::i64;
11256}
11257
11258// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
11259// offset (the offset that is included in bounds checking and swizzling, to be
11260// split between the instruction's voffset and immoffset fields) and soffset
11261// (the offset that is excluded from bounds checking and swizzling, to go in
11262// the instruction's soffset field). This function takes the first kind of
11263// offset and figures out how to split it between voffset and immoffset.
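// For example, with a 4095-byte immediate limit a combined offset of
// (%x + 4100) is split into voffset = %x + 4096 and immoffset = 4, so the
// large power-of-two add can be CSEd across neighbouring accesses.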
11264std::pair<SDValue, SDValue>
11265SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
11266 SDLoc DL(Offset);
11267 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
11268 SDValue N0 = Offset;
11269 ConstantSDNode *C1 = nullptr;
11270
11271 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
11272 N0 = SDValue();
11273 else if (DAG.isBaseWithConstantOffset(N0)) {
11274 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
11275 // being added, so we can only safely match a 32-bit addition with no
11276 // unsigned overflow.
11277 bool CheckNUW = AMDGPU::isGFX1250(*Subtarget);
11278 if (!CheckNUW || isNoUnsignedWrap(N0)) {
11279 C1 = cast<ConstantSDNode>(N0.getOperand(1));
11280 N0 = N0.getOperand(0);
11281 }
11282 }
11283
11284 if (C1) {
11285 unsigned ImmOffset = C1->getZExtValue();
11286 // If the immediate value is too big for the immoffset field, put only bits
11287 // that would normally fit in the immoffset field. The remaining value that
11288 // is copied/added for the voffset field is a large power of 2, and it
11289 // stands more chance of being CSEd with the copy/add for another similar
11290 // load/store.
11291 // However, do not do that rounding down if that is a negative
11292 // number, as it appears to be illegal to have a negative offset in the
11293 // vgpr, even if adding the immediate offset makes it positive.
11294 unsigned Overflow = ImmOffset & ~MaxImm;
11295 ImmOffset -= Overflow;
11296 if ((int32_t)Overflow < 0) {
11297 Overflow += ImmOffset;
11298 ImmOffset = 0;
11299 }
11300 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
11301 if (Overflow) {
11302 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
11303 if (!N0)
11304 N0 = OverflowVal;
11305 else {
11306 SDValue Ops[] = {N0, OverflowVal};
11307 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
11308 }
11309 }
11310 }
11311 if (!N0)
11312 N0 = DAG.getConstant(0, DL, MVT::i32);
11313 if (!C1)
11314 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
11315 return {N0, SDValue(C1, 0)};
11316}
11317
11318// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
11319// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
11320// pointed to by Offsets.
11321void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
11322 SelectionDAG &DAG, SDValue *Offsets,
11323 Align Alignment) const {
11324 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11325 SDLoc DL(CombinedOffset);
11326 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
11327 uint32_t Imm = C->getZExtValue();
11328 uint32_t SOffset, ImmOffset;
11329 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11330 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
11331 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11332 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11333 return;
11334 }
11335 }
11336 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
11337 SDValue N0 = CombinedOffset.getOperand(0);
11338 SDValue N1 = CombinedOffset.getOperand(1);
11339 uint32_t SOffset, ImmOffset;
11340 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
11341 if (Offset >= 0 &&
11342 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
11343 Offsets[0] = N0;
11344 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11345 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11346 return;
11347 }
11348 }
11349
11350 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11351 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
11352 : DAG.getConstant(0, DL, MVT::i32);
11353
11354 Offsets[0] = CombinedOffset;
11355 Offsets[1] = SOffsetZero;
11356 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
11357}
11358
11359SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11360 SelectionDAG &DAG) const {
11361 if (!MaybePointer.getValueType().isScalarInteger())
11362 return MaybePointer;
11363
11364 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
11365 return Rsrc;
11366}
11367
11368// Wrap a global or flat pointer into a buffer intrinsic using the flags
11369// specified in the intrinsic.
11370SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
11371 SelectionDAG &DAG) const {
11372 SDLoc Loc(Op);
11373
11374 SDValue Pointer = Op->getOperand(1);
11375 SDValue Stride = Op->getOperand(2);
11376 SDValue NumRecords = Op->getOperand(3);
11377 SDValue Flags = Op->getOperand(4);
11378
11379 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11380 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
11381 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
11382 std::optional<uint32_t> ConstStride = std::nullopt;
11383 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
11384 ConstStride = ConstNode->getZExtValue();
11385
11386 SDValue NewHighHalf = Masked;
11387 if (!ConstStride || *ConstStride != 0) {
11388 SDValue ShiftedStride;
11389 if (ConstStride) {
11390 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
11391 } else {
11392 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
11393 ShiftedStride =
11394 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11395 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
11396 }
11397 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
11398 }
11399
11400 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
11401 NewHighHalf, NumRecords, Flags);
11402 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
11403 return RsrcPtr;
11404}
11405
11406// Handle 8 bit and 16 bit buffer loads
11407SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
11408 EVT LoadVT, SDLoc DL,
11409 ArrayRef<SDValue> Ops,
11410 MachineMemOperand *MMO,
11411 bool IsTFE) const {
11412 EVT IntVT = LoadVT.changeTypeToInteger();
11413
11414 if (IsTFE) {
11415 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
11416 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
11417 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
11418 MachineFunction &MF = DAG.getMachineFunction();
11419 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
11420 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
11421 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
11422 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11423 DAG.getConstant(1, DL, MVT::i32));
11424 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11425 DAG.getConstant(0, DL, MVT::i32));
11426 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
11427 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
11428 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
11429 }
11430
11431 unsigned Opc = LoadVT.getScalarType() == MVT::i8
11432 ? AMDGPUISD::BUFFER_LOAD_UBYTE
11433 : AMDGPUISD::BUFFER_LOAD_USHORT;
11434
11435 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
11436 SDValue BufferLoad =
11437 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
11438 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
11439 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
11440
11441 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
11442}
11443
11444// Handle 8 bit and 16 bit buffer stores
11445SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
11446 EVT VDataType, SDLoc DL,
11447 SDValue Ops[],
11448 MemSDNode *M) const {
11449 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11450 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
11451
11452 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
11453 Ops[1] = BufferStoreExt;
11454 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
11455 : AMDGPUISD::BUFFER_STORE_SHORT;
11456 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
11457 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
11458 M->getMemOperand());
11459}
11460
11461 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
11462 SDValue Op, const SDLoc &SL, EVT VT) {
11463 if (VT.bitsLT(Op.getValueType()))
11464 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
11465
11466 switch (ExtType) {
11467 case ISD::SEXTLOAD:
11468 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
11469 case ISD::ZEXTLOAD:
11470 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
11471 case ISD::EXTLOAD:
11472 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
11473 case ISD::NON_EXTLOAD:
11474 return Op;
11475 }
11476
11477 llvm_unreachable("invalid ext type");
11478}
11479
11480// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
11481// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
11482SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
11483 DAGCombinerInfo &DCI) const {
11484 SelectionDAG &DAG = DCI.DAG;
11485 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
11486 return SDValue();
11487
11488 // FIXME: Constant loads should all be marked invariant.
11489 unsigned AS = Ld->getAddressSpace();
11490 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
11491 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
11492 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
11493 return SDValue();
11494
11495 // Don't do this early, since it may interfere with adjacent load merging for
11496 // illegal types. We can avoid losing alignment information for exotic types
11497 // pre-legalize.
11498 EVT MemVT = Ld->getMemoryVT();
11499 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
11500 MemVT.getSizeInBits() >= 32)
11501 return SDValue();
11502
11503 SDLoc SL(Ld);
11504
11505 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
11506 "unexpected vector extload");
11507
11508 // TODO: Drop only high part of range.
11509 SDValue Ptr = Ld->getBasePtr();
11510 SDValue NewLoad = DAG.getLoad(
11511 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
11512 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
11513 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
11514 nullptr); // Drop ranges
11515
11516 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
11517 if (MemVT.isFloatingPoint()) {
11519 "unexpected fp extload");
11520 TruncVT = MemVT.changeTypeToInteger();
11521 }
11522
11523 SDValue Cvt = NewLoad;
11524 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
11525 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
11526 DAG.getValueType(TruncVT));
11527 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
11528 Ld->getExtensionType() == ISD::EXTLOAD) {
11529 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
11530 } else {
11531 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD);
11532 }
11533
11534 EVT VT = Ld->getValueType(0);
11535 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
11536
11537 DCI.AddToWorklist(Cvt.getNode());
11538
11539 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
11540 // the appropriate extension from the 32-bit load.
11541 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
11542 DCI.AddToWorklist(Cvt.getNode());
11543
11544 // Handle conversion back to floating point if necessary.
11545 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
11546
11547 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
11548}
11549
11550 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
11551 const SIMachineFunctionInfo &Info) {
11552 // TODO: Should check if the address can definitely not access stack.
11553 if (Info.isEntryFunction())
11554 return Info.getUserSGPRInfo().hasFlatScratchInit();
11555 return true;
11556}
11557
11558SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
11559 SDLoc DL(Op);
11560 LoadSDNode *Load = cast<LoadSDNode>(Op);
11561 ISD::LoadExtType ExtType = Load->getExtensionType();
11562 EVT MemVT = Load->getMemoryVT();
11563 MachineMemOperand *MMO = Load->getMemOperand();
11564
11565 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
11566 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
11567 return SDValue();
11568
11569 // FIXME: Copied from PPC
11570 // First, load into 32 bits, then truncate to 1 bit.
11571
11572 SDValue Chain = Load->getChain();
11573 SDValue BasePtr = Load->getBasePtr();
11574
11575 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
11576
11577 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
11578 RealMemVT, MMO);
11579
11580 if (!MemVT.isVector()) {
11581 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
11582 NewLD.getValue(1)};
11583
11584 return DAG.getMergeValues(Ops, DL);
11585 }
11586
11587 SmallVector<SDValue, 3> Elts;
11588 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
11589 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
11590 DAG.getConstant(I, DL, MVT::i32));
11591
11592 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
11593 }
11594
11595 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
11596
11597 return DAG.getMergeValues(Ops, DL);
11598 }
11599
11600 if (!MemVT.isVector())
11601 return SDValue();
11602
11603 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
11604 "Custom lowering for non-i32 vectors hasn't been implemented.");
11605
11606 Align Alignment = Load->getAlign();
11607 unsigned AS = Load->getAddressSpace();
11608 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
11609 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
11610 return SplitVectorLoad(Op, DAG);
11611 }
11612
11613 MachineFunction &MF = DAG.getMachineFunction();
11614 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11615 // If there is a possibility that a flat instruction accesses scratch memory
11616 // then we need to use the same legalization rules we use for private.
11617 if (AS == AMDGPUAS::FLAT_ADDRESS &&
11618 !Subtarget->hasMultiDwordFlatScratchAddressing())
11619 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
11620 ? AMDGPUAS::PRIVATE_ADDRESS
11621 : AMDGPUAS::GLOBAL_ADDRESS;
11622
11623 unsigned NumElements = MemVT.getVectorNumElements();
11624
11625 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11626 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
11627 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
11628 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
11629 isMemOpHasNoClobberedMemOperand(Load))) {
11630 if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
11631 Alignment >= Align(4) && NumElements < 32) {
11632 if (MemVT.isPow2VectorType() ||
11633 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
11634 return SDValue();
11635 return WidenOrSplitVectorLoad(Op, DAG);
11636 }
11637 // Non-uniform loads will be selected to MUBUF instructions, so they
11638 // have the same legalization requirements as global and private
11639 // loads.
11640 //
11641 }
11642 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11643 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
11644 AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
11645 if (NumElements > 4)
11646 return SplitVectorLoad(Op, DAG);
11647 // v3 loads not supported on SI.
11648 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11649 return WidenOrSplitVectorLoad(Op, DAG);
11650
11651 // v3 and v4 loads are supported for private and global memory.
11652 return SDValue();
11653 }
11654 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11655 // Depending on the setting of the private_element_size field in the
11656 // resource descriptor, we can only make private accesses up to a certain
11657 // size.
11658 switch (Subtarget->getMaxPrivateElementSize()) {
11659 case 4: {
11660 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
11661 return DAG.getMergeValues({Op0, Op1}, DL);
11662 }
11663 case 8:
11664 if (NumElements > 2)
11665 return SplitVectorLoad(Op, DAG);
11666 return SDValue();
11667 case 16:
11668 // Same as global/flat
11669 if (NumElements > 4)
11670 return SplitVectorLoad(Op, DAG);
11671 // v3 loads not supported on SI.
11672 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11673 return WidenOrSplitVectorLoad(Op, DAG);
11674
11675 return SDValue();
11676 default:
11677 llvm_unreachable("unsupported private_element_size");
11678 }
11679 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11680 unsigned Fast = 0;
11681 auto Flags = Load->getMemOperand()->getFlags();
11682 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
11683 Load->getAlign(), Flags, &Fast) &&
11684 Fast > 1)
11685 return SDValue();
11686
11687 if (MemVT.isVector())
11688 return SplitVectorLoad(Op, DAG);
11689 }
11690
11691 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
11692 MemVT, *Load->getMemOperand())) {
11693 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
11694 return DAG.getMergeValues({Op0, Op1}, DL);
11695 }
11696
11697 return SDValue();
11698}
11699
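// Custom lowering for 64-bit wide selects: bitcast both operands to v2i32 and
// select the low and high halves separately, then reassemble the result.
// Wider (128/256/512-bit) vector selects are instead split with
// splitTernaryVectorOp.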
11700SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
11701 EVT VT = Op.getValueType();
11702 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
11703 VT.getSizeInBits() == 512)
11704 return splitTernaryVectorOp(Op, DAG);
11705
11706 assert(VT.getSizeInBits() == 64);
11707
11708 SDLoc DL(Op);
11709 SDValue Cond = DAG.getFreeze(Op.getOperand(0));
11710
11711 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
11712 SDValue One = DAG.getConstant(1, DL, MVT::i32);
11713
11714 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
11715 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
11716
11717 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
11718 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
11719
11720 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
11721
11722 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
11723 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
11724
11725 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
11726
11727 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
11728 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
11729}
11730
11731// Catch division cases where we can use shortcuts with rcp and rsq
11732// instructions.
11733SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
11734 SelectionDAG &DAG) const {
11735 SDLoc SL(Op);
11736 SDValue LHS = Op.getOperand(0);
11737 SDValue RHS = Op.getOperand(1);
11738 EVT VT = Op.getValueType();
11739 const SDNodeFlags Flags = Op->getFlags();
11740
11741 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
11742
11743 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
11744 // Without !fpmath accuracy information, we can't do more because we don't
11745 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
11746 // f16 is always accurate enough
11747 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
11748 return SDValue();
11749
11750 if (CLHS->isExactlyValue(1.0)) {
11751 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
11752 // the CI documentation have a worst case error of 1 ulp.
11753 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
11754 // use it as long as we aren't trying to use denormals.
11755 //
11756 // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
11757
11758 // 1.0 / sqrt(x) -> rsq(x)
11759
11760 // XXX - Is afn sufficient to do this for f64? The maximum ULP
11761 // error seems really high at 2^29 ULP.
11762 // 1.0 / x -> rcp(x)
11763 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
11764 }
11765
11766 // Same as for 1.0, but expand the sign out of the constant.
11767 if (CLHS->isExactlyValue(-1.0)) {
11768 // -1.0 / x -> rcp (fneg x)
11769 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
11770 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
11771 }
11772 }
11773
11774 // For f16 and bf16 require afn or arcp.
11775 // For f32 require afn.
11776 if (!AllowInaccurateRcp &&
11777 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
11778 return SDValue();
11779
11780 // Turn into multiply by the reciprocal.
11781 // x / y -> x * (1.0 / y)
11782 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
11783 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
11784}
11785
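// Approximate f64 division (requires afn): start from the hardware reciprocal
// of Y, refine it with two Newton-Raphson FMA iterations, then apply a final
// residual-correction FMA to the quotient.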
11786SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
11787 SelectionDAG &DAG) const {
11788 SDLoc SL(Op);
11789 SDValue X = Op.getOperand(0);
11790 SDValue Y = Op.getOperand(1);
11791 EVT VT = Op.getValueType();
11792 const SDNodeFlags Flags = Op->getFlags();
11793
11794 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
11795 if (!AllowInaccurateDiv)
11796 return SDValue();
11797
11798 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
11799 SDValue One = DAG.getConstantFP(1.0, SL, VT);
11800
11801 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
11802 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
11803
11804 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
11805 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
11806 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
11807 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
11808 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
11809 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
11810}
11811
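// Helpers used by the f32 FDIV lowering below: when GlueChain carries a
// (value, chain, glue) triple they emit the glued FMUL_W_CHAIN / FMA_W_CHAIN
// forms so the arithmetic stays ordered relative to the surrounding denormal
// mode changes; otherwise they emit the plain nodes.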
11812static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
11813 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
11814 SDNodeFlags Flags) {
11815 if (GlueChain->getNumValues() <= 1) {
11816 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
11817 }
11818
11819 assert(GlueChain->getNumValues() == 3);
11820
11821 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
11822 switch (Opcode) {
11823 default:
11824 llvm_unreachable("no chain equivalent for opcode");
11825 case ISD::FMUL:
11826 Opcode = AMDGPUISD::FMUL_W_CHAIN;
11827 break;
11828 }
11829
11830 return DAG.getNode(Opcode, SL, VTList,
11831 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
11832 Flags);
11833}
11834
11835static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
11836 EVT VT, SDValue A, SDValue B, SDValue C,
11837 SDValue GlueChain, SDNodeFlags Flags) {
11838 if (GlueChain->getNumValues() <= 1) {
11839 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
11840 }
11841
11842 assert(GlueChain->getNumValues() == 3);
11843
11844 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
11845 switch (Opcode) {
11846 default:
11847 llvm_unreachable("no chain equivalent for opcode");
11848 case ISD::FMA:
11849 Opcode = AMDGPUISD::FMA_W_CHAIN;
11850 break;
11851 }
11852
11853 return DAG.getNode(Opcode, SL, VTList,
11854 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
11855 Flags);
11856}
11857
11858SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
11859 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
11860 return FastLowered;
11861
11862 SDLoc SL(Op);
11863 EVT VT = Op.getValueType();
11864 SDValue LHS = Op.getOperand(0);
11865 SDValue RHS = Op.getOperand(1);
11866
11867 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
11868 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
11869
11870 if (VT == MVT::bf16) {
11871 SDValue ExtDiv =
11872 DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
11873 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
11874 DAG.getTargetConstant(0, SL, MVT::i32));
11875 }
11876
11877 assert(VT == MVT::f16);
11878
11879 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
11880 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
11881 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
11882 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
11883 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
11884 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
11885 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
11886 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
11887 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
11888 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
11889 // q16.u = opx(V_CVT_F16_F32, q32.u);
11890 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
11891
11892 // We will use ISD::FMA on targets that don't support ISD::FMAD.
11893 unsigned FMADOpCode =
11895 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
11896 SDValue Rcp =
11897 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
11898 SDValue Quot =
11899 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
11900 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
11901 Op->getFlags());
11902 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
11903 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
11904 Op->getFlags());
11905 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
11906 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
11907 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
11908 DAG.getConstant(0xff800000, SL, MVT::i32));
11909 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
11910 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
11911 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
11912 DAG.getTargetConstant(0, SL, MVT::i32));
11913 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
11914 Op->getFlags());
11915}
11916
11917// Faster 2.5 ULP division that does not support denormals.
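// When |RHS| exceeds 2^96 the denominator is pre-scaled by 2^-32 so its
// reciprocal stays out of the denormal range that v_rcp_f32 flushes, and the
// quotient is multiplied by the same factor afterwards to compensate.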
11918SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
11919 SDNodeFlags Flags = Op->getFlags();
11920 SDLoc SL(Op);
11921 SDValue LHS = Op.getOperand(1);
11922 SDValue RHS = Op.getOperand(2);
11923
11924 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
11925
11926 const APFloat K0Val(0x1p+96f);
11927 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
11928
11929 const APFloat K1Val(0x1p-32f);
11930 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
11931
11932 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
11933
11934 EVT SetCCVT =
11935 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
11936
11937 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
11938
11939 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
11940
11941 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
11942
11943 // rcp does not support denormals.
11944 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
11945
11946 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
11947
11948 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
11949}
11950
11951// Returns immediate value for setting the F32 denorm mode when using the
11952// S_DENORM_MODE instruction.
11953 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
11954 const SIMachineFunctionInfo *Info,
11955 const GCNSubtarget *ST) {
11956 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
11957 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
11958 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
11959 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
11960}
11961
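// Full-rate f32 division: scale the operands with DIV_SCALE, refine an
// initial RCP estimate with a chained/glued FMA sequence, and produce the
// final result with DIV_FMAS and DIV_FIXUP. If f32 denormals are flushed,
// the FMA sequence is bracketed by mode switches that temporarily enable
// denormals (restoring a dynamically-read mode when required).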
11962SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
11963 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
11964 return FastLowered;
11965
11966 // The selection matcher assumes that anything with a chain selects to a
11967 // mayRaiseFPException machine instruction. Since we're introducing a chain
11968 // here, we need to explicitly report nofpexcept for the regular fdiv
11969 // lowering.
11970 SDNodeFlags Flags = Op->getFlags();
11971 Flags.setNoFPExcept(true);
11972
11973 SDLoc SL(Op);
11974 SDValue LHS = Op.getOperand(0);
11975 SDValue RHS = Op.getOperand(1);
11976
11977 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
11978
11979 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
11980
11981 SDValue DenominatorScaled =
11982 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
11983 SDValue NumeratorScaled =
11984 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
11985
11986 // Denominator is scaled to not be denormal, so using rcp is ok.
11987 SDValue ApproxRcp =
11988 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
11989 SDValue NegDivScale0 =
11990 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
11991
11992 using namespace AMDGPU::Hwreg;
11993 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
11994 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
11995
11996 const MachineFunction &MF = DAG.getMachineFunction();
11997 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
11998 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
11999
12000 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
12001 const bool HasDynamicDenormals =
12002 (DenormMode.Input == DenormalMode::Dynamic) ||
12003 (DenormMode.Output == DenormalMode::Dynamic);
12004
12005 SDValue SavedDenormMode;
12006
12007 if (!PreservesDenormals) {
12008 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
12009 // lowering. The chain dependence is insufficient, and we need glue. We do
12010 // not need the glue variants in a strictfp function.
12011
12012 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12013
12014 SDValue Glue = DAG.getEntryNode();
12015 if (HasDynamicDenormals) {
12016 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
12017 DAG.getVTList(MVT::i32, MVT::Glue),
12018 {BitField, Glue});
12019 SavedDenormMode = SDValue(GetReg, 0);
12020
12021 Glue = DAG.getMergeValues(
12022 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
12023 }
12024
12025 SDNode *EnableDenorm;
12026 if (Subtarget->hasDenormModeInst()) {
12027 const SDValue EnableDenormValue =
12028 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
12029
12030 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
12031 EnableDenormValue)
12032 .getNode();
12033 } else {
12034 const SDValue EnableDenormValue =
12035 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
12036 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12037 {EnableDenormValue, BitField, Glue});
12038 }
12039
12040 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
12041 SDValue(EnableDenorm, 1)};
12042
12043 NegDivScale0 = DAG.getMergeValues(Ops, SL);
12044 }
12045
12046 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
12047 ApproxRcp, One, NegDivScale0, Flags);
12048
12049 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
12050 ApproxRcp, Fma0, Flags);
12051
12052 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
12053 Fma1, Flags);
12054
12055 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
12056 NumeratorScaled, Mul, Flags);
12057
12058 SDValue Fma3 =
12059 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
12060
12061 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
12062 NumeratorScaled, Fma3, Flags);
12063
12064 if (!PreservesDenormals) {
12065 SDNode *DisableDenorm;
12066 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12067 const SDValue DisableDenormValue = getSPDenormModeValue(
12068 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
12069
12070 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12071 DisableDenorm =
12072 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
12073 Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
12074 .getNode();
12075 } else {
12076 assert(HasDynamicDenormals == (bool)SavedDenormMode);
12077 const SDValue DisableDenormValue =
12078 HasDynamicDenormals
12079 ? SavedDenormMode
12080 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
12081
12082 DisableDenorm = DAG.getMachineNode(
12083 AMDGPU::S_SETREG_B32, SL, MVT::Other,
12084 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
12085 }
12086
12087 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
12088 SDValue(DisableDenorm, 0), DAG.getRoot());
12089 DAG.setRoot(OutputChain);
12090 }
12091
12092 SDValue Scale = NumeratorScaled.getValue(1);
12093 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
12094 {Fma4, Fma1, Fma3, Scale}, Flags);
12095
12096 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
12097}
12098
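// f64 division follows the same DIV_SCALE / FMA refinement / DIV_FMAS /
// DIV_FIXUP pattern as the f32 path, plus a workaround for subtargets where
// the DIV_SCALE condition output is not usable.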
12099SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
12100 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
12101 return FastLowered;
12102
12103 SDLoc SL(Op);
12104 SDValue X = Op.getOperand(0);
12105 SDValue Y = Op.getOperand(1);
12106
12107 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
12108
12109 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
12110
12111 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
12112
12113 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
12114
12115 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
12116
12117 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
12118
12119 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
12120
12121 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
12122
12123 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
12124
12125 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
12126 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
12127
12128 SDValue Fma4 =
12129 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
12130
12131 SDValue Scale;
12132
12133 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12134 // Work around a hardware bug on SI where the condition output from div_scale
12135 // is not usable.
12136
12137 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
12138
12139 // Figure out which scale to use for div_fmas.
12140 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
12141 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
12142 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
12143 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
12144
12145 SDValue NumHi =
12146 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
12147 SDValue DenHi =
12148 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
12149
12150 SDValue Scale0Hi =
12151 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
12152 SDValue Scale1Hi =
12153 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
12154
12155 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
12156 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
12157 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
12158 } else {
12159 Scale = DivScale1.getValue(1);
12160 }
12161
12162 SDValue Fmas =
12163 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
12164
12165 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
12166}
12167
12168SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
12169 EVT VT = Op.getValueType();
12170
12171 if (VT == MVT::f32)
12172 return LowerFDIV32(Op, DAG);
12173
12174 if (VT == MVT::f64)
12175 return LowerFDIV64(Op, DAG);
12176
12177 if (VT == MVT::f16 || VT == MVT::bf16)
12178 return LowerFDIV16(Op, DAG);
12179
12180 llvm_unreachable("Unexpected type for fdiv");
12181}
12182
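// Lower FFREXP to the frexp_mant / frexp_exp intrinsics. On subtargets with
// the fract bug these do not give the expected results for infinities and
// NaNs, so for non-finite inputs the original value and a zero exponent are
// selected instead.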
12183SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
12184 SDLoc dl(Op);
12185 SDValue Val = Op.getOperand(0);
12186 EVT VT = Val.getValueType();
12187 EVT ResultExpVT = Op->getValueType(1);
12188 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12189
12190 SDValue Mant = DAG.getNode(
12191 ISD::INTRINSIC_WO_CHAIN, dl, VT,
12192 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
12193
12194 SDValue Exp = DAG.getNode(
12195 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
12196 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
12197
12198 if (Subtarget->hasFractBug()) {
12199 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
12200 SDValue Inf =
12201 DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT);
12202
12203 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
12204 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
12205 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
12206 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
12207 }
12208
12209 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
12210 return DAG.getMergeValues({Mant, CastExp}, dl);
12211}
12212
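// Custom store lowering: i1 stores become truncating i32 stores; vector
// stores are split, scalarized or expanded depending on the address space,
// alignment and the subtarget's private element size, mirroring LowerLOAD.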
12213SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
12214 SDLoc DL(Op);
12215 StoreSDNode *Store = cast<StoreSDNode>(Op);
12216 EVT VT = Store->getMemoryVT();
12217
12218 if (VT == MVT::i1) {
12219 return DAG.getTruncStore(
12220 Store->getChain(), DL,
12221 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
12222 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
12223 }
12224
12225 assert(VT.isVector() &&
12226 Store->getValue().getValueType().getScalarType() == MVT::i32);
12227
12228 unsigned AS = Store->getAddressSpace();
12229 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
12230 Store->getAlign().value() < VT.getStoreSize() &&
12231 VT.getSizeInBits() > 32) {
12232 return SplitVectorStore(Op, DAG);
12233 }
12234
12235 MachineFunction &MF = DAG.getMachineFunction();
12236 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12237 // If there is a possibility that a flat instruction accesses scratch memory
12238 // then we need to use the same legalization rules we use for private.
12239 if (AS == AMDGPUAS::FLAT_ADDRESS &&
12240 !Subtarget->hasMultiDwordFlatScratchAddressing())
12241 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
12242 ? AMDGPUAS::PRIVATE_ADDRESS
12243 : AMDGPUAS::GLOBAL_ADDRESS;
12244
12245 unsigned NumElements = VT.getVectorNumElements();
12246 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
12247 if (NumElements > 4)
12248 return SplitVectorStore(Op, DAG);
12249 // v3 stores not supported on SI.
12250 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12251 return SplitVectorStore(Op, DAG);
12252
12253 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
12254 VT, *Store->getMemOperand()))
12255 return expandUnalignedStore(Store, DAG);
12256
12257 return SDValue();
12258 }
12259 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
12260 switch (Subtarget->getMaxPrivateElementSize()) {
12261 case 4:
12262 return scalarizeVectorStore(Store, DAG);
12263 case 8:
12264 if (NumElements > 2)
12265 return SplitVectorStore(Op, DAG);
12266 return SDValue();
12267 case 16:
12268 if (NumElements > 4 ||
12269 (NumElements == 3 && !Subtarget->enableFlatScratch()))
12270 return SplitVectorStore(Op, DAG);
12271 return SDValue();
12272 default:
12273 llvm_unreachable("unsupported private_element_size");
12274 }
12275 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
12276 unsigned Fast = 0;
12277 auto Flags = Store->getMemOperand()->getFlags();
12278 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
12279 Store->getAlign(), Flags, &Fast) &&
12280 Fast > 1)
12281 return SDValue();
12282
12283 if (VT.isVector())
12284 return SplitVectorStore(Op, DAG);
12285
12286 return expandUnalignedStore(Store, DAG);
12287 }
12288
12289 // Probably an invalid store. If so we'll end up emitting a selection error.
12290 return SDValue();
12291}
12292
12293// Avoid the full correct expansion for f32 sqrt when promoting from f16.
12294SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
12295 SDLoc SL(Op);
12296 assert(!Subtarget->has16BitInsts());
12297 SDNodeFlags Flags = Op->getFlags();
12298 SDValue Ext =
12299 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
12300
12301 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
12302 SDValue Sqrt =
12303 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
12304
12305 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
12306 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
12307}
12308
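// f32 sqrt: inputs below 2^-96 are scaled up by 2^32 and the result scaled
// back down by 2^-16 so the intermediate math avoids denormals. When denormal
// results matter, the sqrt instruction's result is corrected by at most one
// ULP using residual FMAs; otherwise an RSQ estimate is refined instead.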
12309SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
12310 SDLoc DL(Op);
12311 SDNodeFlags Flags = Op->getFlags();
12312 MVT VT = Op.getValueType().getSimpleVT();
12313 const SDValue X = Op.getOperand(0);
12314
12315 if (allowApproxFunc(DAG, Flags)) {
12316 // The instruction is accurate to 1 ulp but ignores denormals.
12317 return DAG.getNode(
12318 ISD::INTRINSIC_WO_CHAIN, DL, VT,
12319 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
12320 }
12321
12322 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
12323 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
12324
12325 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
12326
12327 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
12328
12329 SDValue SqrtX =
12330 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
12331
12332 SDValue SqrtS;
12333 if (needsDenormHandlingF32(DAG, X, Flags)) {
12334 SDValue SqrtID =
12335 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
12336 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
12337
12338 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
12339 SDValue SqrtSNextDownInt =
12340 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12341 DAG.getAllOnesConstant(DL, MVT::i32));
12342 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
12343
12344 SDValue NegSqrtSNextDown =
12345 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
12346
12347 SDValue SqrtVP =
12348 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
12349
12350 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12351 DAG.getConstant(1, DL, MVT::i32));
12352 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
12353
12354 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
12355 SDValue SqrtVS =
12356 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
12357
12358 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
12359 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
12360
12361 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
12362 Flags);
12363
12364 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
12365 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
12366 Flags);
12367 } else {
12368 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
12369
12370 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
12371
12372 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
12373 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
12374 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
12375
12376 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
12377 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
12378 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
12379
12380 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
12381 SDValue SqrtD =
12382 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
12383 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
12384 }
12385
12386 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
12387
12388 SDValue ScaledDown =
12389 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
12390
12391 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
12392 SDValue IsZeroOrInf =
12393 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12394 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12395
12396 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
12397}
12398
12399SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
12400 // For the f64 type, the SQRT and RSQ instructions don't have the required
12401 // precision, so we apply Goldschmidt's algorithm to improve the result:
12402 //
12403 // y0 = rsq(x)
12404 // g0 = x * y0
12405 // h0 = 0.5 * y0
12406 //
12407 // r0 = 0.5 - h0 * g0
12408 // g1 = g0 * r0 + g0
12409 // h1 = h0 * r0 + h0
12410 //
12411 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
12412 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
12413 // h2 = h1 * r1 + h1
12414 //
12415 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
12416 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
12417 //
12418 // sqrt(x) = g3
12419
12420 SDNodeFlags Flags = Op->getFlags();
12421
12422 SDLoc DL(Op);
12423
12424 SDValue X = Op.getOperand(0);
12425 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
12426
12427 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
12428
12429 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
12430
12431 // Scale up input if it is too small.
12432 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
12433 SDValue ScaleUp =
12434 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
12435 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
12436
12437 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
12438
12439 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
12440
12441 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
12442 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
12443
12444 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
12445 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
12446
12447 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
12448
12449 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
12450
12451 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
12452 SDValue SqrtD0 =
12453 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
12454
12455 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
12456
12457 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
12458 SDValue SqrtD1 =
12459 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
12460
12461 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
12462
12463 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
12464 SDValue ScaleDown =
12465 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
12466 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
12467
12468 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
12469 // with finite only or nsz because rsq(+/-0) = +/-inf
12470
12471 // TODO: Check for DAZ and expand to subnormals
12472 SDValue IsZeroOrInf =
12473 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12474 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12475
12476 // If x is +INF, +0, or -0, use its original value
12477 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
12478 Flags);
12479}
12480
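// The hardware SIN/COS instructions take their input in units of full
// revolutions, so the argument is first multiplied by 1/(2*pi); subtargets
// with a reduced input range additionally wrap the product into [0, 1) with
// FRACT before emitting SIN_HW / COS_HW.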
12481SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
12482 SDLoc DL(Op);
12483 EVT VT = Op.getValueType();
12484 SDValue Arg = Op.getOperand(0);
12485 SDValue TrigVal;
12486
12487 // Propagate fast-math flags so that the multiply we introduce can be folded
12488 // if Arg is already the result of a multiply by constant.
12489 auto Flags = Op->getFlags();
12490
12491 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
12492
12493 if (Subtarget->hasTrigReducedRange()) {
12494 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12495 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
12496 } else {
12497 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12498 }
12499
12500 switch (Op.getOpcode()) {
12501 case ISD::FCOS:
12502 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
12503 case ISD::FSIN:
12504 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
12505 default:
12506 llvm_unreachable("Wrong trig opcode");
12507 }
12508}
12509
12510SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
12511 SelectionDAG &DAG) const {
12512 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
12513 assert(AtomicNode->isCompareAndSwap());
12514 unsigned AS = AtomicNode->getAddressSpace();
12515
12516 // No custom lowering required for local address space
12517 if (!AMDGPU::isFlatGlobalAddrSpace(AS))
12518 return Op;
12519
12520 // Non-local address space requires custom lowering for atomic compare
12521 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
12522 SDLoc DL(Op);
12523 SDValue ChainIn = Op.getOperand(0);
12524 SDValue Addr = Op.getOperand(1);
12525 SDValue Old = Op.getOperand(2);
12526 SDValue New = Op.getOperand(3);
12527 EVT VT = Op.getValueType();
12528 MVT SimpleVT = VT.getSimpleVT();
12529 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
12530
12531 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
12532 SDValue Ops[] = {ChainIn, Addr, NewOld};
12533
12534 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
12535 Op->getVTList(), Ops, VT,
12536 AtomicNode->getMemOperand());
12537}
12538
12539//===----------------------------------------------------------------------===//
12540// Custom DAG optimizations
12541//===----------------------------------------------------------------------===//
12542
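// Fold a conversion whose i32 source is known to have its top 24 bits clear
// into CVT_F32_UBYTE0; for f16 results the f32 conversion is rounded
// afterwards.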
12543SDValue
12544SITargetLowering::performUCharToFloatCombine(SDNode *N,
12545 DAGCombinerInfo &DCI) const {
12546 EVT VT = N->getValueType(0);
12547 EVT ScalarVT = VT.getScalarType();
12548 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
12549 return SDValue();
12550
12551 SelectionDAG &DAG = DCI.DAG;
12552 SDLoc DL(N);
12553
12554 SDValue Src = N->getOperand(0);
12555 EVT SrcVT = Src.getValueType();
12556
12557 // TODO: We could try to match extracting the higher bytes, which would be
12558 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
12559 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
12560 // about in practice.
12561 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
12562 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
12563 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
12564 DCI.AddToWorklist(Cvt.getNode());
12565
12566 // For the f16 case, fold to a cast to f32 and then cast back to f16.
12567 if (ScalarVT != MVT::f32) {
12568 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
12569 DAG.getTargetConstant(0, DL, MVT::i32));
12570 }
12571 return Cvt;
12572 }
12573 }
12574
12575 return SDValue();
12576}
12577
12578SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
12579 DAGCombinerInfo &DCI) const {
12580 SDValue MagnitudeOp = N->getOperand(0);
12581 SDValue SignOp = N->getOperand(1);
12582
12583 // The generic combine for fcopysign + fp cast is too conservative with
12584 // vectors, and also gets confused by the splitting we will perform here, so
12585 // peek through FP casts.
12586 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
12587 SignOp.getOpcode() == ISD::FP_ROUND)
12588 SignOp = SignOp.getOperand(0);
12589
12590 SelectionDAG &DAG = DCI.DAG;
12591 SDLoc DL(N);
12592 EVT SignVT = SignOp.getValueType();
12593
12594 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
12595 // lower half with a copy.
12596 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
12597 EVT MagVT = MagnitudeOp.getValueType();
12598
12599 unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
12600
12601 if (MagVT.getScalarType() == MVT::f64) {
12602 EVT F32VT = MagVT.isVector()
12603 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12604 : MVT::v2f32;
12605
12606 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
12607
12608 SmallVector<SDValue, 8> NewElts;
12609 for (unsigned I = 0; I != NumElts; ++I) {
12610 SDValue MagLo =
12611 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
12612 DAG.getConstant(2 * I, DL, MVT::i32));
12613 SDValue MagHi =
12614 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
12615 DAG.getConstant(2 * I + 1, DL, MVT::i32));
12616
12617 SDValue SignOpElt =
12618 MagVT.isVector()
12619 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(),
12620 SignOp, DAG.getConstant(I, DL, MVT::i32))
12621 : SignOp;
12622
12623 SDValue HiOp =
12624 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
12625
12626 SDValue Vector =
12627 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
12628
12629 SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
12630 NewElts.push_back(NewElt);
12631 }
12632
12633 if (NewElts.size() == 1)
12634 return NewElts[0];
12635
12636 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
12637 }
12638
12639 if (SignVT.getScalarType() != MVT::f64)
12640 return SDValue();
12641
12642 // Reduce width of sign operand, we only need the highest bit.
12643 //
12644 // fcopysign f64:x, f64:y ->
12645 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
12646 // TODO: In some cases it might make sense to go all the way to f16.
12647
12648 EVT F32VT = MagVT.isVector()
12649 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12650 : MVT::v2f32;
12651
12652 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
12653
12654 SmallVector<SDValue, 8> F32Signs;
12655 for (unsigned I = 0; I != NumElts; ++I) {
12656 // Take sign from odd elements of cast vector
12657 SDValue SignAsF32 =
12658 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
12659 DAG.getConstant(2 * I + 1, DL, MVT::i32));
12660 F32Signs.push_back(SignAsF32);
12661 }
12662
12663 SDValue NewSign =
12664 NumElts == 1
12665 ? F32Signs.back()
12666 : DAG.getNode(ISD::BUILD_VECTOR, DL,
12667 EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
12668 F32Signs);
12669
12670 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
12671 NewSign);
12672}
12673
12674// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
12675// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
12676// bits
12677
12678// This is a variant of
12679// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
12680//
12681// The normal DAG combiner will do this, but only if the add has one use since
12682// that would increase the number of instructions.
12683//
12684// This prevents us from seeing a constant offset that can be folded into a
12685// memory instruction's addressing mode. If we know the resulting add offset of
12686// a pointer can be folded into an addressing offset, we can replace the pointer
12687// operand with the add of new constant offset. This eliminates one of the uses,
12688// and may allow the remaining use to also be simplified.
12689//
12690SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
12691 EVT MemVT,
12692 DAGCombinerInfo &DCI) const {
12693 SDValue N0 = N->getOperand(0);
12694 SDValue N1 = N->getOperand(1);
12695
12696 // We only do this to handle cases where it's profitable when there are
12697 // multiple uses of the add, so defer to the standard combine.
12698 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
12699 N0->hasOneUse())
12700 return SDValue();
12701
12702 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
12703 if (!CN1)
12704 return SDValue();
12705
12706 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
12707 if (!CAdd)
12708 return SDValue();
12709
12710 SelectionDAG &DAG = DCI.DAG;
12711
12712 if (N0->getOpcode() == ISD::OR &&
12713 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
12714 return SDValue();
12715
12716 // If the resulting offset is too large, we can't fold it into the
12717 // addressing mode offset.
12718 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
12719 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
12720
12721 AddrMode AM;
12722 AM.HasBaseReg = true;
12723 AM.BaseOffs = Offset.getSExtValue();
12724 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
12725 return SDValue();
12726
12727 SDLoc SL(N);
12728 EVT VT = N->getValueType(0);
12729
12730 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
12731 SDValue COffset = DAG.getConstant(Offset, SL, VT);
12732
12733 SDNodeFlags Flags;
12734 Flags.setNoUnsignedWrap(
12735 N->getFlags().hasNoUnsignedWrap() &&
12736 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
12737
12738 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
12739}
12740
12741 /// MemSDNode::getBasePtr() does not work for intrinsics, which need to be
12742 /// offset by the chain and the intrinsic ID. Theoretically we would also need to check the
12743/// specific intrinsic, but they all place the pointer operand first.
12744static unsigned getBasePtrIndex(const MemSDNode *N) {
12745 switch (N->getOpcode()) {
12746 case ISD::STORE:
12747 case ISD::INTRINSIC_W_CHAIN:
12748 case ISD::INTRINSIC_VOID:
12749 return 2;
12750 default:
12751 return 1;
12752 }
12753}
12754
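// For memory nodes whose address is a shl, try to fold a constant offset into
// the addressing mode via performSHLPtrCombine and update the pointer operand
// in place.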
12755SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
12756 DAGCombinerInfo &DCI) const {
12757 SelectionDAG &DAG = DCI.DAG;
12758
12759 unsigned PtrIdx = getBasePtrIndex(N);
12760 SDValue Ptr = N->getOperand(PtrIdx);
12761
12762 // TODO: We could also do this for multiplies.
12763 if (Ptr.getOpcode() == ISD::SHL) {
12764 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
12765 N->getMemoryVT(), DCI);
12766 if (NewPtr) {
12767 SmallVector<SDValue, 8> NewOps(N->ops());
12768
12769 NewOps[PtrIdx] = NewPtr;
12770 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
12771 }
12772 }
12773
12774 return SDValue();
12775}
12776
12777static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
12778 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
12779 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
12780 (Opc == ISD::XOR && Val == 0);
12781}
12782
12783// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
12784// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
12785// integer combine opportunities since most 64-bit operations are decomposed
12786// this way. TODO: We won't want this for SALU especially if it is an inline
12787// immediate.
12788SDValue SITargetLowering::splitBinaryBitConstantOp(
12789 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
12790 const ConstantSDNode *CRHS) const {
12791 uint64_t Val = CRHS->getZExtValue();
12792 uint32_t ValLo = Lo_32(Val);
12793 uint32_t ValHi = Hi_32(Val);
12794 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12795
12796 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
12797 bitOpWithConstantIsReducible(Opc, ValHi)) ||
12798 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
12799 // We have 64-bit scalar and/or/xor, but do not have vector forms.
12800 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
12801 !CRHS->user_begin()->isDivergent())
12802 return SDValue();
12803
12804 // If we need to materialize a 64-bit immediate, it will be split up later
12805 // anyway. Avoid creating the harder to understand 64-bit immediate
12806 // materialization.
12807 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
12808 }
12809
12810 return SDValue();
12811}
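// Illustrative example: for (and x:i64, 0x00000000ffffffff) with a divergent
// user, ValLo = 0xffffffff and ValHi = 0, and both halves are reducible: the
// low 32-bit AND folds to a copy of the low half and the high AND folds to
// zero, so splitting exposes further 32-bit combines. With 64-bit literals and
// a single uniform user, the split is skipped and the SALU form is kept.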
12812
12813bool llvm::isBoolSGPR(SDValue V) {
12814 if (V.getValueType() != MVT::i1)
12815 return false;
12816 switch (V.getOpcode()) {
12817 default:
12818 break;
12819 case ISD::SETCC:
12820 case ISD::IS_FPCLASS:
12821 case AMDGPUISD::FP_CLASS:
12822 return true;
12823 case ISD::AND:
12824 case ISD::OR:
12825 case ISD::XOR:
12826 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
12827 case ISD::SADDO:
12828 case ISD::UADDO:
12829 case ISD::SSUBO:
12830 case ISD::USUBO:
12831 case ISD::SMULO:
12832 case ISD::UMULO:
12833 return V.getResNo() == 1;
12834 case ISD::INTRINSIC_WO_CHAIN: {
12835 unsigned IntrinsicID = V.getConstantOperandVal(0);
12836 switch (IntrinsicID) {
12837 case Intrinsic::amdgcn_is_shared:
12838 case Intrinsic::amdgcn_is_private:
12839 return true;
12840 default:
12841 return false;
12842 }
12843
12844 return false;
12845 }
12846 }
12847 return false;
12848}
12849
12850// If a constant has all zeroes or all ones within each byte return it.
12851// Otherwise return 0.
12852static uint32_t getConstantPermuteMask(uint32_t C) {
12853 // 0xff for any zero byte in the mask
12854 uint32_t ZeroByteMask = 0;
12855 if (!(C & 0x000000ff))
12856 ZeroByteMask |= 0x000000ff;
12857 if (!(C & 0x0000ff00))
12858 ZeroByteMask |= 0x0000ff00;
12859 if (!(C & 0x00ff0000))
12860 ZeroByteMask |= 0x00ff0000;
12861 if (!(C & 0xff000000))
12862 ZeroByteMask |= 0xff000000;
12863 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
12864 if ((NonZeroByteMask & C) != NonZeroByteMask)
12865 return 0; // Partial bytes selected.
12866 return C;
12867}
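// For example: C = 0x00ff00ff has every byte either all-ones or all-zero, so
// it is returned unchanged, while C = 0x00ff1200 has a partial byte (0x12) and
// yields 0.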
12868
12869// Check if a node selects whole bytes from its operand 0 starting at a byte
12870// boundary while masking the rest. Returns the select mask as used by
12871// v_perm_b32, or ~0u if the node does not match.
12872// Note byte select encoding:
12873// value 0-3 selects corresponding source byte;
12874// value 0xc selects zero;
12875// value 0xff selects 0xff.
12876static uint32_t getPermuteMask(SDValue V) {
12877 assert(V.getValueSizeInBits() == 32);
12878
12879 if (V.getNumOperands() != 2)
12880 return ~0;
12881
12882 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
12883 if (!N1)
12884 return ~0;
12885
12886 uint32_t C = N1->getZExtValue();
12887
12888 switch (V.getOpcode()) {
12889 default:
12890 break;
12891 case ISD::AND:
12892 if (uint32_t ConstMask = getConstantPermuteMask(C))
12893 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
12894 break;
12895
12896 case ISD::OR:
12897 if (uint32_t ConstMask = getConstantPermuteMask(C))
12898 return (0x03020100 & ~ConstMask) | ConstMask;
12899 break;
12900
12901 case ISD::SHL:
12902 if (C % 8)
12903 return ~0;
12904
12905 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
12906
12907 case ISD::SRL:
12908 if (C % 8)
12909 return ~0;
12910
12911 return uint32_t(0x0c0c0c0c03020100ull >> C);
12912 }
12913
12914 return ~0;
12915}
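// Worked examples of the encoding above: (and x, 0x0000ffff) keeps bytes 0-1
// and zeroes bytes 2-3, giving the mask 0x0c0c0100; (shl x, 16) gives
// 0x01000c0c (bytes 0-1 of the source moved up, low bytes zero); and
// (srl x, 16) gives 0x0c0c0302 (bytes 2-3 moved down, high bytes zero).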
12916
12917SDValue SITargetLowering::performAndCombine(SDNode *N,
12918 DAGCombinerInfo &DCI) const {
12919 if (DCI.isBeforeLegalize())
12920 return SDValue();
12921
12922 SelectionDAG &DAG = DCI.DAG;
12923 EVT VT = N->getValueType(0);
12924 SDValue LHS = N->getOperand(0);
12925 SDValue RHS = N->getOperand(1);
12926
12927 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12928 if (VT == MVT::i64 && CRHS) {
12929 if (SDValue Split =
12930 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
12931 return Split;
12932 }
12933
12934 if (CRHS && VT == MVT::i32) {
12935 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
12936 // nb = number of trailing zeroes in mask
12937 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
12938 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
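    // Illustrative example: for (and (srl x, 8), 0xff00), Bits = 8, NB = 8 and
    // Shift = 8, so Offset = 16 is byte aligned and the node becomes
    // (shl (AssertZext (bfe_u32 x, 16, 8), i8), 8), which selects bits 16-23
    // of x and places them at bits 8-15, exactly as the original and/srl pair.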
12939 uint64_t Mask = CRHS->getZExtValue();
12940 unsigned Bits = llvm::popcount(Mask);
12941 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
12942 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
12943 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
12944 unsigned Shift = CShift->getZExtValue();
12945 unsigned NB = CRHS->getAPIntValue().countr_zero();
12946 unsigned Offset = NB + Shift;
12947 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
12948 SDLoc SL(N);
12949 SDValue BFE =
12950 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
12951 DAG.getConstant(Offset, SL, MVT::i32),
12952 DAG.getConstant(Bits, SL, MVT::i32));
12953 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
12954 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
12955 DAG.getValueType(NarrowVT));
12956 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
12957 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
12958 return Shl;
12959 }
12960 }
12961 }
12962
12963 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12964 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
12965 isa<ConstantSDNode>(LHS.getOperand(2))) {
12966 uint32_t Sel = getConstantPermuteMask(Mask);
12967 if (!Sel)
12968 return SDValue();
12969
12970 // Select 0xc for all zero bytes
12971 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
12972 SDLoc DL(N);
12973 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12974 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
12975 }
12976 }
12977
12978 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
12979 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
12980 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
12981 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
12982 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
12983
12984 SDValue X = LHS.getOperand(0);
12985 SDValue Y = RHS.getOperand(0);
12986 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
12987 !isTypeLegal(X.getValueType()))
12988 return SDValue();
12989
12990 if (LCC == ISD::SETO) {
12991 if (X != LHS.getOperand(1))
12992 return SDValue();
12993
12994 if (RCC == ISD::SETUNE) {
12995 const ConstantFPSDNode *C1 =
12996 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
12997 if (!C1 || !C1->isInfinity() || C1->isNegative())
12998 return SDValue();
12999
13000 const uint32_t Mask = SIInstrFlags::N_NORMAL |
13001 SIInstrFlags::P_NORMAL | SIInstrFlags::N_ZERO |
13002 SIInstrFlags::P_ZERO | SIInstrFlags::N_SUBNORMAL |
13003 SIInstrFlags::P_SUBNORMAL;
13004
13005 static_assert(
13006 ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
13007 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
13008 0x3ff) == Mask,
13009 "mask not equal");
13010
13011 SDLoc DL(N);
13012 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
13013 DAG.getConstant(Mask, DL, MVT::i32));
13014 }
13015 }
13016 }
13017
13018 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
13019 std::swap(LHS, RHS);
13020
13021 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13022 RHS.hasOneUse()) {
13023 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13024 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
13025 // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
13026 // | n_nan)
13027 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13028 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
13029 (RHS.getOperand(0) == LHS.getOperand(0) &&
13030 LHS.getOperand(0) == LHS.getOperand(1))) {
13031 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
13032 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
13033 : Mask->getZExtValue() & OrdMask;
13034
13035 SDLoc DL(N);
13036 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
13037 DAG.getConstant(NewMask, DL, MVT::i32));
13038 }
13039 }
13040
13041 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
13042 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
13043 // and x, (sext cc from i1) => select cc, x, 0
13044 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
13045 std::swap(LHS, RHS);
13046 if (isBoolSGPR(RHS.getOperand(0)))
13047 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
13048 DAG.getConstant(0, SDLoc(N), MVT::i32));
13049 }
13050
13051 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13052 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13053 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13054 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13055 uint32_t LHSMask = getPermuteMask(LHS);
13056 uint32_t RHSMask = getPermuteMask(RHS);
13057 if (LHSMask != ~0u && RHSMask != ~0u) {
13058 // Canonicalize the expression in an attempt to have fewer unique masks
13059 // and therefore fewer registers used to hold the masks.
13060 if (LHSMask > RHSMask) {
13061 std::swap(LHSMask, RHSMask);
13062 std::swap(LHS, RHS);
13063 }
13064
13065 // Select 0xc for each lane used from source operand. Zero has 0xc mask
13066 // set, 0xff has 0xff in the mask, and actual lanes are in the 0-3 range.
13067 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13068 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13069
13070 // Check if we need to combine values from two sources within a byte.
13071 if (!(LHSUsedLanes & RHSUsedLanes) &&
13072 // If we select high and lower word keep it for SDWA.
13073 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
13074 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13075 // Each byte in each mask is either a selector mask 0-3, or has higher
13076 // bits set in either of the masks, which can be 0xff for 0xff or 0x0c for
13077 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise the
13078 // mask which is not 0xff wins. By ANDing both masks we get a correct
13079 // result, except that 0x0c must be corrected to give 0x0c only.
13080 uint32_t Mask = LHSMask & RHSMask;
13081 for (unsigned I = 0; I < 32; I += 8) {
13082 uint32_t ByteSel = 0xff << I;
13083 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13084 Mask &= (0x0c << I) & 0xffffffff;
13085 }
13086
13087 // Add 4 to each active LHS lane. It will not affect any existing 0xff
13088 // or 0x0c.
13089 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
13090 SDLoc DL(N);
13091
13092 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13093 RHS.getOperand(0),
13094 DAG.getConstant(Sel, DL, MVT::i32));
13095 }
13096 }
13097 }
13098
13099 return SDValue();
13100}
13101
13102// A key component of v_perm is a mapping between byte position of the src
13103// operands, and the byte position of the dest. To provide such, we need: 1. the
13104// node that provides byte x of the dest of the OR, and 2. the byte of the node
13105// used to provide that byte. calculateByteProvider finds which node provides
13106// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
13107// and finds an ultimate src and byte position. For example: the supported
13108// LoadCombine pattern for vector loads is as follows
13109//                              t1
13110//                              or
13111//                            /    \
13112//                          t2      t3
13113//                        zext      shl
13114//                          |      /   \
13115//                         t4    t5     16
13116//                         or   anyext
13117//                       /    \    |
13118//                     t6     t7   t8
13119//                    srl    shl   or
13120//                   /  |   /  \  /  \
13121//                 t9 t10 t11 t12 t13 t14
13122//             trunc*  8 trunc* 8  and  and
13123//                |        |     /  |   |  \
13124//               t15      t16  t17 t18 t19 t20
13125//                            trunc* 255  srl  -256
13126//                              |         /  \
13127//                             t15      t15   16
13128//
13129// *In this example, the truncs are from i32->i16
13130//
13131// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
13132// respectively. calculateSrcByte would find (given node) -> ultimate src &
13133// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
13134// After finding the mapping, we can combine the tree into vperm t15, t16,
13135// 0x05000407
13136
13137// Find the source and byte position from a node.
13138// \p DestByte is the byte position of the dest of the or that the src
13139// ultimately provides. \p SrcIndex is the byte of the src that maps to this
13140// byte of the dest of the or. \p Depth tracks how many recursive iterations we have
13141// performed.
13142static const std::optional<ByteProvider<SDValue>>
13143calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
13144 unsigned Depth = 0) {
13145 // We may need to recursively traverse a series of SRLs
13146 if (Depth >= 6)
13147 return std::nullopt;
13148
13149 if (Op.getValueSizeInBits() < 8)
13150 return std::nullopt;
13151
13152 if (Op.getValueType().isVector())
13153 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13154
13155 switch (Op->getOpcode()) {
13156 case ISD::TRUNCATE: {
13157 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13158 }
13159
13160 case ISD::SIGN_EXTEND:
13161 case ISD::ZERO_EXTEND:
13162 case ISD::SIGN_EXTEND_INREG: {
13163 SDValue NarrowOp = Op->getOperand(0);
13164 auto NarrowVT = NarrowOp.getValueType();
13165 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
13166 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13167 NarrowVT = VTSign->getVT();
13168 }
13169 if (!NarrowVT.isByteSized())
13170 return std::nullopt;
13171 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
13172
13173 if (SrcIndex >= NarrowByteWidth)
13174 return std::nullopt;
13175 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13176 }
13177
13178 case ISD::SRA:
13179 case ISD::SRL: {
13180 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13181 if (!ShiftOp)
13182 return std::nullopt;
13183
13184 uint64_t BitShift = ShiftOp->getZExtValue();
13185
13186 if (BitShift % 8 != 0)
13187 return std::nullopt;
13188
13189 SrcIndex += BitShift / 8;
13190
13191 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13192 }
13193
13194 default: {
13195 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13196 }
13197 }
13198 llvm_unreachable("fully handled switch");
13199}
13200
13201// For a byte position in the result of an Or, traverse the tree and find the
13202// node (and the byte of the node) which ultimately provides this {Or,
13203// BytePosition}. \p Op is the operand we are currently examining. \p Index is
13204// the byte position of the Op that corresponds with the originally requested
13205// byte of the Or. \p Depth tracks how many recursive iterations we have
13206// performed. \p StartingIndex is the originally requested byte of the Or
13207static const std::optional<ByteProvider<SDValue>>
13208calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
13209 unsigned StartingIndex = 0) {
13210 // Finding Src tree of RHS of or typically requires at least 1 additional
13211 // depth
13212 if (Depth > 6)
13213 return std::nullopt;
13214
13215 unsigned BitWidth = Op.getScalarValueSizeInBits();
13216 if (BitWidth % 8 != 0)
13217 return std::nullopt;
13218 if (Index > BitWidth / 8 - 1)
13219 return std::nullopt;
13220
13221 bool IsVec = Op.getValueType().isVector();
13222 switch (Op.getOpcode()) {
13223 case ISD::OR: {
13224 if (IsVec)
13225 return std::nullopt;
13226
13227 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
13228 StartingIndex);
13229 if (!RHS)
13230 return std::nullopt;
13231 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13232 StartingIndex);
13233 if (!LHS)
13234 return std::nullopt;
13235 // A well formed Or will have two ByteProviders for each byte, one of which
13236 // is constant zero
13237 if (!LHS->isConstantZero() && !RHS->isConstantZero())
13238 return std::nullopt;
13239 if (!LHS || LHS->isConstantZero())
13240 return RHS;
13241 if (!RHS || RHS->isConstantZero())
13242 return LHS;
13243 return std::nullopt;
13244 }
13245
13246 case ISD::AND: {
13247 if (IsVec)
13248 return std::nullopt;
13249
13250 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13251 if (!BitMaskOp)
13252 return std::nullopt;
13253
13254 uint32_t BitMask = BitMaskOp->getZExtValue();
13255 // Bits we expect for our StartingIndex
13256 uint32_t IndexMask = 0xFF << (Index * 8);
13257
13258 if ((IndexMask & BitMask) != IndexMask) {
13259 // If the result of the and partially provides the byte, then it
13260 // is not well formatted
13261 if (IndexMask & BitMask)
13262 return std::nullopt;
13263 return ByteProvider<SDValue>::getConstantZero();
13264 }
13265
13266 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
13267 }
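  // Illustrative example for the AND case: with Op = (and x, 0x0000ff00),
  // Index = 1 is fully covered by the mask and is forwarded to
  // calculateSrcByte(x, ..., 1); Index = 0 is fully masked off and is reported
  // as a constant-zero byte; a mask such as 0x0000f000 only partially covers
  // Index = 1, so std::nullopt is returned.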
13268
13269 case ISD::FSHR: {
13270 if (IsVec)
13271 return std::nullopt;
13272
13273 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
13274 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13275 if (!ShiftOp || Op.getValueType().isVector())
13276 return std::nullopt;
13277
13278 uint64_t BitsProvided = Op.getValueSizeInBits();
13279 if (BitsProvided % 8 != 0)
13280 return std::nullopt;
13281
13282 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13283 if (BitShift % 8)
13284 return std::nullopt;
13285
13286 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13287 uint64_t ByteShift = BitShift / 8;
13288
13289 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13290 uint64_t BytesProvided = BitsProvided / 8;
13291 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13292 NewIndex %= BytesProvided;
13293 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
13294 }
13295
13296 case ISD::SRA:
13297 case ISD::SRL: {
13298 if (IsVec)
13299 return std::nullopt;
13300
13301 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13302 if (!ShiftOp)
13303 return std::nullopt;
13304
13305 uint64_t BitShift = ShiftOp->getZExtValue();
13306 if (BitShift % 8)
13307 return std::nullopt;
13308
13309 auto BitsProvided = Op.getScalarValueSizeInBits();
13310 if (BitsProvided % 8 != 0)
13311 return std::nullopt;
13312
13313 uint64_t BytesProvided = BitsProvided / 8;
13314 uint64_t ByteShift = BitShift / 8;
13315 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
13316 // If the byte we are trying to provide (as tracked by index) falls in this
13317 // range, then the SRL provides the byte. The byte of interest of the src of
13318 // the SRL is Index + ByteShift
13319 return BytesProvided - ByteShift > Index
13320 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
13321 Index + ByteShift)
13322 : ByteProvider<SDValue>::getConstantZero();
13323 }
13324
13325 case ISD::SHL: {
13326 if (IsVec)
13327 return std::nullopt;
13328
13329 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13330 if (!ShiftOp)
13331 return std::nullopt;
13332
13333 uint64_t BitShift = ShiftOp->getZExtValue();
13334 if (BitShift % 8 != 0)
13335 return std::nullopt;
13336 uint64_t ByteShift = BitShift / 8;
13337
13338 // If we are shifting by an amount greater than (or equal to)
13339 // the index we are trying to provide, then it provides 0s. If not,
13340 // then these bytes are not definitively 0s, and the corresponding byte
13341 // of interest is Index - ByteShift of the src
13342 return Index < ByteShift
13343 ? ByteProvider<SDValue>::getConstantZero()
13344 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
13345 Depth + 1, StartingIndex);
13346 }
13347 case ISD::ANY_EXTEND:
13348 case ISD::SIGN_EXTEND:
13349 case ISD::ZERO_EXTEND:
13350 case ISD::SIGN_EXTEND_INREG:
13351 case ISD::AssertZext:
13352 case ISD::AssertSext: {
13353 if (IsVec)
13354 return std::nullopt;
13355
13356 SDValue NarrowOp = Op->getOperand(0);
13357 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
13358 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
13359 Op->getOpcode() == ISD::AssertZext ||
13360 Op->getOpcode() == ISD::AssertSext) {
13361 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13362 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13363 }
13364 if (NarrowBitWidth % 8 != 0)
13365 return std::nullopt;
13366 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13367
13368 if (Index >= NarrowByteWidth)
13369 return Op.getOpcode() == ISD::ZERO_EXTEND
13370 ? std::optional<ByteProvider<SDValue>>(
13371 ByteProvider<SDValue>::getConstantZero())
13372 : std::nullopt;
13373 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
13374 }
13375
13376 case ISD::TRUNCATE: {
13377 if (IsVec)
13378 return std::nullopt;
13379
13380 uint64_t NarrowByteWidth = BitWidth / 8;
13381
13382 if (NarrowByteWidth >= Index) {
13383 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13384 StartingIndex);
13385 }
13386
13387 return std::nullopt;
13388 }
13389
13390 case ISD::CopyFromReg: {
13391 if (BitWidth / 8 > Index)
13392 return calculateSrcByte(Op, StartingIndex, Index);
13393
13394 return std::nullopt;
13395 }
13396
13397 case ISD::LOAD: {
13398 auto *L = cast<LoadSDNode>(Op.getNode());
13399
13400 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13401 if (NarrowBitWidth % 8 != 0)
13402 return std::nullopt;
13403 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13404
13405 // If the width of the load does not reach the byte we are trying to provide for
13406 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
13407 // question
13408 if (Index >= NarrowByteWidth) {
13409 return L->getExtensionType() == ISD::ZEXTLOAD
13410 ? std::optional<ByteProvider<SDValue>>(
13411 ByteProvider<SDValue>::getConstantZero())
13412 : std::nullopt;
13413 }
13414
13415 if (NarrowByteWidth > Index) {
13416 return calculateSrcByte(Op, StartingIndex, Index);
13417 }
13418
13419 return std::nullopt;
13420 }
13421
13422 case ISD::BSWAP: {
13423 if (IsVec)
13424 return std::nullopt;
13425
13426 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
13427 Depth + 1, StartingIndex);
13428 }
13429
13430 case ISD::EXTRACT_VECTOR_ELT: {
13431 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13432 if (!IdxOp)
13433 return std::nullopt;
13434 auto VecIdx = IdxOp->getZExtValue();
13435 auto ScalarSize = Op.getScalarValueSizeInBits();
13436 if (ScalarSize < 32)
13437 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13438 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
13439 StartingIndex, Index);
13440 }
13441
13442 case AMDGPUISD::PERM: {
13443 if (IsVec)
13444 return std::nullopt;
13445
13446 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13447 if (!PermMask)
13448 return std::nullopt;
13449
13450 auto IdxMask =
13451 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13452 if (IdxMask > 0x07 && IdxMask != 0x0c)
13453 return std::nullopt;
13454
13455 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13456 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13457
13458 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
13459 : ByteProvider<SDValue>(
13460 ByteProvider<SDValue>::getConstantZero());
13461 }
13462
13463 default: {
13464 return std::nullopt;
13465 }
13466 }
13467
13468 llvm_unreachable("fully handled switch");
13469}
13470
13471// Returns true if the Operand is a scalar that is extended or loaded from 16 bits
13472static bool isExtendedFrom16Bits(SDValue &Operand) {
13473
13474 switch (Operand.getOpcode()) {
13475 case ISD::ANY_EXTEND:
13476 case ISD::SIGN_EXTEND:
13477 case ISD::ZERO_EXTEND: {
13478 auto OpVT = Operand.getOperand(0).getValueType();
13479 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
13480 }
13481 case ISD::LOAD: {
13482 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
13483 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
13484 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
13485 ExtType == ISD::EXTLOAD) {
13486 auto MemVT = L->getMemoryVT();
13487 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
13488 }
13489 return L->getMemoryVT().getSizeInBits() == 16;
13490 }
13491 default:
13492 return false;
13493 }
13494}
13495
13496// Returns true if the mask matches consecutive bytes, and the first byte
13497// begins at an even (16-bit aligned) byte offset from the 0th byte
13498static bool addresses16Bits(int Mask) {
13499 int Low8 = Mask & 0xff;
13500 int Hi8 = (Mask & 0xff00) >> 8;
13501
13502 assert(Low8 < 8 && Hi8 < 8);
13503 // Are the bytes contiguous in the order of increasing addresses.
13504 bool IsConsecutive = (Hi8 - Low8 == 1);
13505 // Is the first byte at a location that is aligned for 16 bit instructions.
13506 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
13507 // In this case, we still need code to extract the 16 bit operand, so it
13508 // is better to use i8 v_perm
13509 bool Is16Aligned = !(Low8 % 2);
13510
13511 return IsConsecutive && Is16Aligned;
13512}
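// For example: a 16-bit select mask of 0x0504 (bytes 4 and 5) or 0x0302
// (bytes 2 and 3) addresses a whole aligned 16-bit half, while 0x0605 is
// consecutive but starts at an odd byte and 0x0704 is not consecutive, so
// both of those return false.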
13513
13514// Do not lower into v_perm if the operands are actually 16 bit
13515// and the selected bits (based on PermMask) correspond with two
13516// easily addressable 16 bit operands.
13517static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
13518 SDValue &OtherOp) {
13519 int Low16 = PermMask & 0xffff;
13520 int Hi16 = (PermMask & 0xffff0000) >> 16;
13521
13522 auto TempOp = peekThroughBitcasts(Op);
13523 auto TempOtherOp = peekThroughBitcasts(OtherOp);
13524
13525 auto OpIs16Bit =
13526 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
13527 if (!OpIs16Bit)
13528 return true;
13529
13530 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13531 isExtendedFrom16Bits(TempOtherOp);
13532 if (!OtherOpIs16Bit)
13533 return true;
13534
13535 // Do we cleanly address both
13536 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
13537}
13538
13539static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
13540 unsigned DWordOffset) {
13541 SDValue Ret;
13542
13543 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
13544 // ByteProvider must be at least 8 bits
13545 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
13546
13547 if (TypeSize <= 32)
13548 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
13549
13550 if (Src.getValueType().isVector()) {
13551 auto ScalarTySize = Src.getScalarValueSizeInBits();
13552 auto ScalarTy = Src.getValueType().getScalarType();
13553 if (ScalarTySize == 32) {
13554 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
13555 DAG.getConstant(DWordOffset, SL, MVT::i32));
13556 }
13557 if (ScalarTySize > 32) {
13558 Ret = DAG.getNode(
13559 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
13560 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
13561 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
13562 if (ShiftVal)
13563 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
13564 DAG.getConstant(ShiftVal, SL, MVT::i32));
13565 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13566 }
13567
13568 assert(ScalarTySize < 32);
13569 auto NumElements = TypeSize / ScalarTySize;
13570 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
13571 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
13572 auto NumElementsIn32 = 32 / ScalarTySize;
13573 auto NumAvailElements = DWordOffset < Trunc32Elements
13574 ? NumElementsIn32
13575 : NumElements - NormalizedTrunc;
13576
13577 SmallVector<SDValue, 4> VecSrcs;
13578 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
13579 NumAvailElements);
13580
13581 Ret = DAG.getBuildVector(
13582 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
13583 VecSrcs);
13584 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13585 }
13586
13587 /// Scalar Type
13588 auto ShiftVal = 32 * DWordOffset;
13589 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
13590 DAG.getConstant(ShiftVal, SL, MVT::i32));
13591 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13592}
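// Illustrative examples: for a v4i32 source, DWordOffset = 2 becomes an
// extract_vector_elt of element 2; for an i64 source, DWordOffset = 1 becomes
// (srl Src, 32) followed by a truncating bitcast to i32; anything 32 bits or
// smaller is just any-extended/truncated to i32 directly.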
13593
13594static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
13595 SelectionDAG &DAG = DCI.DAG;
13596 [[maybe_unused]] EVT VT = N->getValueType(0);
13597 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
13598
13599 // VT is known to be MVT::i32, so we need to provide 4 bytes.
13600 assert(VT == MVT::i32);
13601 for (int i = 0; i < 4; i++) {
13602 // Find the ByteProvider that provides the ith byte of the result of OR
13603 std::optional<ByteProvider<SDValue>> P =
13604 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
13605 // TODO support constantZero
13606 if (!P || P->isConstantZero())
13607 return SDValue();
13608
13609 PermNodes.push_back(*P);
13610 }
13611 if (PermNodes.size() != 4)
13612 return SDValue();
13613
13614 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
13615 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
13616 uint64_t PermMask = 0x00000000;
13617 for (size_t i = 0; i < PermNodes.size(); i++) {
13618 auto PermOp = PermNodes[i];
13619 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
13620 // by sizeof(Src2) = 4
13621 int SrcByteAdjust = 4;
13622
13623 // If the Src uses a byte from a different DWORD, then it corresponds
13624 // with a different source
13625 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
13626 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
13627 if (SecondSrc)
13628 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
13629 ((PermOp.SrcOffset / 4) != SecondSrc->second))
13630 return SDValue();
13631
13632 // Set the index of the second distinct Src node
13633 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
13634 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
13635 SrcByteAdjust = 0;
13636 }
13637 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
13639 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
13640 }
13641 SDLoc DL(N);
13642 SDValue Op = *PermNodes[FirstSrc.first].Src;
13643 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
13644 assert(Op.getValueSizeInBits() == 32);
13645
13646 // Check that we are not just extracting the bytes in order from an op
13647 if (!SecondSrc) {
13648 int Low16 = PermMask & 0xffff;
13649 int Hi16 = (PermMask & 0xffff0000) >> 16;
13650
13651 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
13652 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
13653
13654 // The perm op would really just produce Op. So combine into Op
13655 if (WellFormedLow && WellFormedHi)
13656 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
13657 }
13658
13659 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
13660
13661 if (SecondSrc) {
13662 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
13663 assert(OtherOp.getValueSizeInBits() == 32);
13664 }
13665
13666 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
13667
13668 assert(Op.getValueType().isByteSized() &&
13669 OtherOp.getValueType().isByteSized());
13670
13671 // If the ultimate src is less than 32 bits, then we will only be
13672 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
13673 // CalculateByteProvider would not have returned Op as source if we
13674 // used a byte that is outside its ValueType. Thus, we are free to
13675 // ANY_EXTEND as the extended bits are dont-cares.
13676 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
13677 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
13678
13679 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
13680 DAG.getConstant(PermMask, DL, MVT::i32));
13681 }
13682 return SDValue();
13683}
13684
13685SDValue SITargetLowering::performOrCombine(SDNode *N,
13686 DAGCombinerInfo &DCI) const {
13687 SelectionDAG &DAG = DCI.DAG;
13688 SDValue LHS = N->getOperand(0);
13689 SDValue RHS = N->getOperand(1);
13690
13691 EVT VT = N->getValueType(0);
13692 if (VT == MVT::i1) {
13693 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
13694 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13695 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
13696 SDValue Src = LHS.getOperand(0);
13697 if (Src != RHS.getOperand(0))
13698 return SDValue();
13699
13700 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
13701 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13702 if (!CLHS || !CRHS)
13703 return SDValue();
13704
13705 // Only 10 bits are used.
13706 static const uint32_t MaxMask = 0x3ff;
13707
13708 uint32_t NewMask =
13709 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
13710 SDLoc DL(N);
13711 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
13712 DAG.getConstant(NewMask, DL, MVT::i32));
13713 }
13714
13715 return SDValue();
13716 }
13717
13718 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13719 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
13720 LHS.getOpcode() == AMDGPUISD::PERM &&
13721 isa<ConstantSDNode>(LHS.getOperand(2))) {
13722 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
13723 if (!Sel)
13724 return SDValue();
13725
13726 Sel |= LHS.getConstantOperandVal(2);
13727 SDLoc DL(N);
13728 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13729 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13730 }
13731
13732 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13733 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13734 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13735 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13736
13737 // If all the uses of an or need to extract the individual elements, do not
13738 // attempt to lower into v_perm
13739 auto usesCombinedOperand = [](SDNode *OrUse) {
13740 // If we have any non-vectorized use, then it is a candidate for v_perm
13741 if (OrUse->getOpcode() != ISD::BITCAST ||
13742 !OrUse->getValueType(0).isVector())
13743 return true;
13744
13745 // If we have any non-vectorized use, then it is a candidate for v_perm
13746 for (auto *VUser : OrUse->users()) {
13747 if (!VUser->getValueType(0).isVector())
13748 return true;
13749
13750 // If the use of a vector is a store, then combining via a v_perm
13751 // is beneficial.
13752 // TODO -- whitelist more uses
13753 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
13754 if (VUser->getOpcode() == VectorwiseOp)
13755 return true;
13756 }
13757 return false;
13758 };
13759
13760 if (!any_of(N->users(), usesCombinedOperand))
13761 return SDValue();
13762
13763 uint32_t LHSMask = getPermuteMask(LHS);
13764 uint32_t RHSMask = getPermuteMask(RHS);
13765
13766 if (LHSMask != ~0u && RHSMask != ~0u) {
13767 // Canonicalize the expression in an attempt to have fewer unique masks
13768 // and therefore fewer registers used to hold the masks.
13769 if (LHSMask > RHSMask) {
13770 std::swap(LHSMask, RHSMask);
13771 std::swap(LHS, RHS);
13772 }
13773
13774 // Select 0xc for each lane used from source operand. Zero has 0xc mask
13775 // set, 0xff has 0xff in the mask, and actual lanes are in the 0-3 range.
13776 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13777 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13778
13779 // Check if we need to combine values from two sources within a byte.
13780 if (!(LHSUsedLanes & RHSUsedLanes) &&
13781 // If we select high and lower word keep it for SDWA.
13782 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
13783 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13784 // Kill zero bytes selected by other mask. Zero value is 0xc.
13785 LHSMask &= ~RHSUsedLanes;
13786 RHSMask &= ~LHSUsedLanes;
13787 // Add 4 to each active LHS lane
13788 LHSMask |= LHSUsedLanes & 0x04040404;
13789 // Combine masks
13790 uint32_t Sel = LHSMask | RHSMask;
13791 SDLoc DL(N);
13792
13793 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13794 RHS.getOperand(0),
13795 DAG.getConstant(Sel, DL, MVT::i32));
13796 }
13797 }
13798 if (LHSMask == ~0u || RHSMask == ~0u) {
13799 if (SDValue Perm = matchPERM(N, DCI))
13800 return Perm;
13801 }
13802 }
13803
13804 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
13805 return SDValue();
13806
13807 // TODO: This could be a generic combine with a predicate for extracting the
13808 // high half of an integer being free.
13809
13810 // (or i64:x, (zero_extend i32:y)) ->
13811 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
13812 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
13813 RHS.getOpcode() != ISD::ZERO_EXTEND)
13814 std::swap(LHS, RHS);
13815
13816 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
13817 SDValue ExtSrc = RHS.getOperand(0);
13818 EVT SrcVT = ExtSrc.getValueType();
13819 if (SrcVT == MVT::i32) {
13820 SDLoc SL(N);
13821 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
13822 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
13823
13824 DCI.AddToWorklist(LowOr.getNode());
13825 DCI.AddToWorklist(HiBits.getNode());
13826
13827 SDValue Vec =
13828 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
13829 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
13830 }
13831 }
13832
13833 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
13834 if (CRHS) {
13835 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
13836 N->getOperand(0), CRHS))
13837 return Split;
13838 }
13839
13840 return SDValue();
13841}
13842
13843SDValue SITargetLowering::performXorCombine(SDNode *N,
13844 DAGCombinerInfo &DCI) const {
13845 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
13846 return RV;
13847
13848 SDValue LHS = N->getOperand(0);
13849 SDValue RHS = N->getOperand(1);
13850
13851 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
13852 SelectionDAG &DAG = DCI.DAG;
13853
13854 EVT VT = N->getValueType(0);
13855 if (CRHS && VT == MVT::i64) {
13856 if (SDValue Split =
13857 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
13858 return Split;
13859 }
13860
13861 // Make sure to apply the 64-bit constant splitting fold before trying to fold
13862 // fneg-like xors into 64-bit select.
13863 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
13864 // This looks like an fneg, try to fold as a source modifier.
13865 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
13866 shouldFoldFNegIntoSrc(N, LHS)) {
13867 // xor (select c, a, b), 0x80000000 ->
13868 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
13869 SDLoc DL(N);
13870 SDValue CastLHS =
13871 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
13872 SDValue CastRHS =
13873 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
13874 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
13875 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
13876 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
13877 LHS->getOperand(0), FNegLHS, FNegRHS);
13878 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
13879 }
13880 }
13881
13882 return SDValue();
13883}
13884
13885SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
13886 DAGCombinerInfo &DCI) const {
13887 if (!Subtarget->has16BitInsts() ||
13888 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
13889 return SDValue();
13890
13891 EVT VT = N->getValueType(0);
13892 if (VT != MVT::i32)
13893 return SDValue();
13894
13895 SDValue Src = N->getOperand(0);
13896 if (Src.getValueType() != MVT::i16)
13897 return SDValue();
13898
13899 return SDValue();
13900}
13901
13902SDValue
13903SITargetLowering::performSignExtendInRegCombine(SDNode *N,
13904 DAGCombinerInfo &DCI) const {
13905 SDValue Src = N->getOperand(0);
13906 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
13907
13908 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
13909 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
13910 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
13911 VTSign->getVT() == MVT::i8) ||
13912 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
13913 VTSign->getVT() == MVT::i16))) {
13914 assert(Subtarget->hasScalarSubwordLoads() &&
13915 "s_buffer_load_{u8, i8} are supported "
13916 "in GFX12 (or newer) architectures.");
13917 EVT VT = Src.getValueType();
13918 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
13919 ? AMDGPUISD::SBUFFER_LOAD_BYTE
13920 : AMDGPUISD::SBUFFER_LOAD_SHORT;
13921 SDLoc DL(N);
13922 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
13923 SDValue Ops[] = {
13924 Src.getOperand(0), // source register
13925 Src.getOperand(1), // offset
13926 Src.getOperand(2) // cachePolicy
13927 };
13928 auto *M = cast<MemSDNode>(Src);
13929 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
13930 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
13931 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
13932 return LoadVal;
13933 }
13934 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
13935 VTSign->getVT() == MVT::i8) ||
13936 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
13937 VTSign->getVT() == MVT::i16)) &&
13938 Src.hasOneUse()) {
13939 auto *M = cast<MemSDNode>(Src);
13940 SDValue Ops[] = {Src.getOperand(0), // Chain
13941 Src.getOperand(1), // rsrc
13942 Src.getOperand(2), // vindex
13943 Src.getOperand(3), // voffset
13944 Src.getOperand(4), // soffset
13945 Src.getOperand(5), // offset
13946 Src.getOperand(6), Src.getOperand(7)};
13947 // replace with BUFFER_LOAD_BYTE/SHORT
13948 SDVTList ResList =
13949 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
13950 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
13951 ? AMDGPUISD::BUFFER_LOAD_BYTE
13952 : AMDGPUISD::BUFFER_LOAD_SHORT;
13953 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
13954 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
13955 return DCI.DAG.getMergeValues(
13956 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
13957 }
13958 return SDValue();
13959}
13960
13961SDValue SITargetLowering::performClassCombine(SDNode *N,
13962 DAGCombinerInfo &DCI) const {
13963 SelectionDAG &DAG = DCI.DAG;
13964 SDValue Mask = N->getOperand(1);
13965
13966 // fp_class x, 0 -> false
13967 if (isNullConstant(Mask))
13968 return DAG.getConstant(0, SDLoc(N), MVT::i1);
13969
13970 if (N->getOperand(0).isUndef())
13971 return DAG.getUNDEF(MVT::i1);
13972
13973 return SDValue();
13974}
13975
13976SDValue SITargetLowering::performRcpCombine(SDNode *N,
13977 DAGCombinerInfo &DCI) const {
13978 EVT VT = N->getValueType(0);
13979 SDValue N0 = N->getOperand(0);
13980
13981 if (N0.isUndef()) {
13982 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
13983 SDLoc(N), VT);
13984 }
13985
13986 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
13987 N0.getOpcode() == ISD::SINT_TO_FP)) {
13988 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
13989 N->getFlags());
13990 }
13991
13992 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
13993 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
13994 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
13995 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
13996 N->getFlags());
13997 }
13998
13999 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
14000}
14001
14002bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
14003 unsigned MaxDepth) const {
14004 unsigned Opcode = Op.getOpcode();
14005 if (Opcode == ISD::FCANONICALIZE)
14006 return true;
14007
14008 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14009 const auto &F = CFP->getValueAPF();
14010 if (F.isNaN() && F.isSignaling())
14011 return false;
14012 if (!F.isDenormal())
14013 return true;
14014
14015 DenormalMode Mode =
14016 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
14017 return Mode == DenormalMode::getIEEE();
14018 }
14019
14020 // If source is a result of another standard FP operation it is already in
14021 // canonical form.
14022 if (MaxDepth == 0)
14023 return false;
14024
14025 switch (Opcode) {
14026 // These will flush denorms if required.
14027 case ISD::FADD:
14028 case ISD::FSUB:
14029 case ISD::FMUL:
14030 case ISD::FCEIL:
14031 case ISD::FFLOOR:
14032 case ISD::FMA:
14033 case ISD::FMAD:
14034 case ISD::FSQRT:
14035 case ISD::FDIV:
14036 case ISD::FREM:
14037 case ISD::FP_ROUND:
14038 case ISD::FP_EXTEND:
14039 case ISD::FP16_TO_FP:
14040 case ISD::FP_TO_FP16:
14041 case ISD::BF16_TO_FP:
14042 case ISD::FP_TO_BF16:
14043 case ISD::FLDEXP:
14046 case AMDGPUISD::RCP:
14047 case AMDGPUISD::RSQ:
14051 case AMDGPUISD::LOG:
14052 case AMDGPUISD::EXP:
14056 case AMDGPUISD::FRACT:
14063 case AMDGPUISD::SIN_HW:
14064 case AMDGPUISD::COS_HW:
14065 return true;
14066
14067 // It can/will be lowered or combined as a bit operation.
14068 // Need to check their input recursively to handle.
14069 case ISD::FNEG:
14070 case ISD::FABS:
14071 case ISD::FCOPYSIGN:
14072 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14073
14074 case ISD::AND:
14075 if (Op.getValueType() == MVT::i32) {
14076 // Be careful as we only know it is a bitcast floating point type. It
14077 // could be f32, v2f16, we have no way of knowing. Luckily the constant
14078 // value that we optimize for, which comes up in fp32 to bf16 conversions,
14079 // is valid to optimize for all types.
14080 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
14081 if (RHS->getZExtValue() == 0xffff0000) {
14082 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14083 }
14084 }
14085 }
14086 break;
14087
14088 case ISD::FSIN:
14089 case ISD::FCOS:
14090 case ISD::FSINCOS:
14091 return Op.getValueType().getScalarType() != MVT::f16;
14092
14093 case ISD::FMINNUM:
14094 case ISD::FMAXNUM:
14095 case ISD::FMINNUM_IEEE:
14096 case ISD::FMAXNUM_IEEE:
14097 case ISD::FMINIMUM:
14098 case ISD::FMAXIMUM:
14099 case ISD::FMINIMUMNUM:
14100 case ISD::FMAXIMUMNUM:
14101 case AMDGPUISD::CLAMP:
14102 case AMDGPUISD::FMED3:
14103 case AMDGPUISD::FMAX3:
14104 case AMDGPUISD::FMIN3:
14105 case AMDGPUISD::FMAXIMUM3:
14106 case AMDGPUISD::FMINIMUM3: {
14107 // FIXME: Shouldn't treat the generic operations differently based on these.
14108 // However, we aren't really required to flush the result from
14109 // minnum/maxnum.
14110
14111 // snans will be quieted, so we only need to worry about denormals.
14112 if (Subtarget->supportsMinMaxDenormModes() ||
14113 // FIXME: denormalsEnabledForType is broken for dynamic
14114 denormalsEnabledForType(DAG, Op.getValueType()))
14115 return true;
14116
14117 // Flushing may be required.
14118 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
14119 // targets need to check their input recursively.
14120
14121 // FIXME: Does this apply with clamp? It's implemented with max.
14122 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
14123 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
14124 return false;
14125 }
14126
14127 return true;
14128 }
14129 case ISD::SELECT: {
14130 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
14131 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
14132 }
14133 case ISD::BUILD_VECTOR: {
14134 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
14135 SDValue SrcOp = Op.getOperand(i);
14136 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
14137 return false;
14138 }
14139
14140 return true;
14141 }
14142 case ISD::EXTRACT_VECTOR_ELT:
14143 case ISD::EXTRACT_SUBVECTOR: {
14144 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14145 }
14146 case ISD::INSERT_VECTOR_ELT: {
14147 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
14148 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
14149 }
14150 case ISD::UNDEF:
14151 // Could be anything.
14152 return false;
14153
14154 case ISD::BITCAST:
14155 // TODO: This is incorrect as it loses track of the operand's type. We may
14156 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
14157 // same bits that are canonicalized in one type need not be in the other.
14158 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14159 case ISD::TRUNCATE: {
14160 // Hack round the mess we make when legalizing extract_vector_elt
14161 if (Op.getValueType() == MVT::i16) {
14162 SDValue TruncSrc = Op.getOperand(0);
14163 if (TruncSrc.getValueType() == MVT::i32 &&
14164 TruncSrc.getOpcode() == ISD::BITCAST &&
14165 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
14166 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
14167 }
14168 }
14169 return false;
14170 }
14171 case ISD::INTRINSIC_WO_CHAIN: {
14172 unsigned IntrinsicID = Op.getConstantOperandVal(0);
14173 // TODO: Handle more intrinsics
14174 switch (IntrinsicID) {
14175 case Intrinsic::amdgcn_cvt_pkrtz:
14176 case Intrinsic::amdgcn_cubeid:
14177 case Intrinsic::amdgcn_frexp_mant:
14178 case Intrinsic::amdgcn_fdot2:
14179 case Intrinsic::amdgcn_rcp:
14180 case Intrinsic::amdgcn_rsq:
14181 case Intrinsic::amdgcn_rsq_clamp:
14182 case Intrinsic::amdgcn_rcp_legacy:
14183 case Intrinsic::amdgcn_rsq_legacy:
14184 case Intrinsic::amdgcn_trig_preop:
14185 case Intrinsic::amdgcn_tanh:
14186 case Intrinsic::amdgcn_log:
14187 case Intrinsic::amdgcn_exp2:
14188 case Intrinsic::amdgcn_sqrt:
14189 return true;
14190 default:
14191 break;
14192 }
14193
14194 break;
14195 }
14196 default:
14197 break;
14198 }
14199
14200 // FIXME: denormalsEnabledForType is broken for dynamic
14201 return denormalsEnabledForType(DAG, Op.getValueType()) &&
14202 DAG.isKnownNeverSNaN(Op);
14203}
14204
14205bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
14206 unsigned MaxDepth) const {
14207 const MachineRegisterInfo &MRI = MF.getRegInfo();
14208 MachineInstr *MI = MRI.getVRegDef(Reg);
14209 unsigned Opcode = MI->getOpcode();
14210
14211 if (Opcode == AMDGPU::G_FCANONICALIZE)
14212 return true;
14213
14214 std::optional<FPValueAndVReg> FCR;
14215 // Constant splat (can be padded with undef) or scalar constant.
14217 if (FCR->Value.isSignaling())
14218 return false;
14219 if (!FCR->Value.isDenormal())
14220 return true;
14221
14222 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
14223 return Mode == DenormalMode::getIEEE();
14224 }
14225
14226 if (MaxDepth == 0)
14227 return false;
14228
14229 switch (Opcode) {
14230 case AMDGPU::G_FADD:
14231 case AMDGPU::G_FSUB:
14232 case AMDGPU::G_FMUL:
14233 case AMDGPU::G_FCEIL:
14234 case AMDGPU::G_FFLOOR:
14235 case AMDGPU::G_FRINT:
14236 case AMDGPU::G_FNEARBYINT:
14237 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14238 case AMDGPU::G_INTRINSIC_TRUNC:
14239 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14240 case AMDGPU::G_FMA:
14241 case AMDGPU::G_FMAD:
14242 case AMDGPU::G_FSQRT:
14243 case AMDGPU::G_FDIV:
14244 case AMDGPU::G_FREM:
14245 case AMDGPU::G_FPOW:
14246 case AMDGPU::G_FPEXT:
14247 case AMDGPU::G_FLOG:
14248 case AMDGPU::G_FLOG2:
14249 case AMDGPU::G_FLOG10:
14250 case AMDGPU::G_FPTRUNC:
14251 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14252 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14253 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14254 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14255 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14256 return true;
14257 case AMDGPU::G_FNEG:
14258 case AMDGPU::G_FABS:
14259 case AMDGPU::G_FCOPYSIGN:
14260 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
14261 case AMDGPU::G_FMINNUM:
14262 case AMDGPU::G_FMAXNUM:
14263 case AMDGPU::G_FMINNUM_IEEE:
14264 case AMDGPU::G_FMAXNUM_IEEE:
14265 case AMDGPU::G_FMINIMUM:
14266 case AMDGPU::G_FMAXIMUM:
14267 case AMDGPU::G_FMINIMUMNUM:
14268 case AMDGPU::G_FMAXIMUMNUM: {
14269 if (Subtarget->supportsMinMaxDenormModes() ||
14270 // FIXME: denormalsEnabledForType is broken for dynamic
14271 denormalsEnabledForType(MRI.getType(Reg), MF))
14272 return true;
14273
14274 [[fallthrough]];
14275 }
14276 case AMDGPU::G_BUILD_VECTOR:
14277 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
14278 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
14279 return false;
14280 return true;
14281 case AMDGPU::G_INTRINSIC:
14282 case AMDGPU::G_INTRINSIC_CONVERGENT:
14283 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
14284 case Intrinsic::amdgcn_fmul_legacy:
14285 case Intrinsic::amdgcn_fmad_ftz:
14286 case Intrinsic::amdgcn_sqrt:
14287 case Intrinsic::amdgcn_fmed3:
14288 case Intrinsic::amdgcn_sin:
14289 case Intrinsic::amdgcn_cos:
14290 case Intrinsic::amdgcn_log:
14291 case Intrinsic::amdgcn_exp2:
14292 case Intrinsic::amdgcn_log_clamp:
14293 case Intrinsic::amdgcn_rcp:
14294 case Intrinsic::amdgcn_rcp_legacy:
14295 case Intrinsic::amdgcn_rsq:
14296 case Intrinsic::amdgcn_rsq_clamp:
14297 case Intrinsic::amdgcn_rsq_legacy:
14298 case Intrinsic::amdgcn_div_scale:
14299 case Intrinsic::amdgcn_div_fmas:
14300 case Intrinsic::amdgcn_div_fixup:
14301 case Intrinsic::amdgcn_fract:
14302 case Intrinsic::amdgcn_cvt_pkrtz:
14303 case Intrinsic::amdgcn_cubeid:
14304 case Intrinsic::amdgcn_cubema:
14305 case Intrinsic::amdgcn_cubesc:
14306 case Intrinsic::amdgcn_cubetc:
14307 case Intrinsic::amdgcn_frexp_mant:
14308 case Intrinsic::amdgcn_fdot2:
14309 case Intrinsic::amdgcn_trig_preop:
14310 case Intrinsic::amdgcn_tanh:
14311 return true;
14312 default:
14313 break;
14314 }
14315
14316 [[fallthrough]];
14317 default:
14318 return false;
14319 }
14320
14321 llvm_unreachable("invalid operation");
14322}
14323
14324// Constant fold canonicalize.
14325SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
14326 const SDLoc &SL, EVT VT,
14327 const APFloat &C) const {
14328 // Flush denormals to 0 if not enabled.
14329 if (C.isDenormal()) {
14330 DenormalMode Mode =
14331 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
14332 if (Mode == DenormalMode::getPreserveSign()) {
14333 return DAG.getConstantFP(
14334 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
14335 }
14336
14337 if (Mode != DenormalMode::getIEEE())
14338 return SDValue();
14339 }
14340
14341 if (C.isNaN()) {
14342 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
14343 if (C.isSignaling()) {
14344 // Quiet a signaling NaN.
14345 // FIXME: Is this supposed to preserve payload bits?
14346 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14347 }
14348
14349 // Make sure it is the canonical NaN bitpattern.
14350 //
14351 // TODO: Can we use -1 as the canonical NaN value since it's an inline
14352 // immediate?
14353 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
14354 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14355 }
14356
14357 // Already canonical.
14358 return DAG.getConstantFP(C, SL, VT);
14359}
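// Illustrative behavior of the constant folding above: a signaling NaN
// constant is folded to the canonical quiet NaN; a denormal constant is folded
// to a correctly signed zero when the denormal mode is preserve-sign, kept
// as-is under IEEE, and left alone (no fold) under a dynamic denormal mode.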
14360
14361static bool vectorEltWillFoldAway(SDValue Op) {
14362 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
14363}
14364
14365SDValue
14366SITargetLowering::performFCanonicalizeCombine(SDNode *N,
14367 DAGCombinerInfo &DCI) const {
14368 SelectionDAG &DAG = DCI.DAG;
14369 SDValue N0 = N->getOperand(0);
14370 EVT VT = N->getValueType(0);
14371
14372 // fcanonicalize undef -> qnan
14373 if (N0.isUndef()) {
14374 APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics());
14375 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
14376 }
14377
14378 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
14379 EVT VT = N->getValueType(0);
14380 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
14381 }
14382
14383 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
14384 // (fcanonicalize k)
14385 //
14386 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
14387
14388 // TODO: This could be better with wider vectors that will be split to v2f16,
14389 // and to consider uses since there aren't that many packed operations.
14390 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
14391 isTypeLegal(MVT::v2f16)) {
14392 SDLoc SL(N);
14393 SDValue NewElts[2];
14394 SDValue Lo = N0.getOperand(0);
14395 SDValue Hi = N0.getOperand(1);
14396 EVT EltVT = Lo.getValueType();
14397
14399 for (unsigned I = 0; I != 2; ++I) {
14400 SDValue Op = N0.getOperand(I);
14401 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14402 NewElts[I] =
14403 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14404 } else if (Op.isUndef()) {
14405 // Handled below based on what the other operand is.
14406 NewElts[I] = Op;
14407 } else {
14408 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
14409 }
14410 }
14411
14412 // If one half is undef, and one is constant, prefer a splat vector rather
14413 // than the normal qNaN. If it's a register, prefer 0.0 since that's
14414 // cheaper to use and may be free with a packed operation.
14415 if (NewElts[0].isUndef()) {
14416 if (isa<ConstantFPSDNode>(NewElts[1]))
14417 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
14418 ? NewElts[1]
14419 : DAG.getConstantFP(0.0f, SL, EltVT);
14420 }
14421
14422 if (NewElts[1].isUndef()) {
14423 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
14424 ? NewElts[0]
14425 : DAG.getConstantFP(0.0f, SL, EltVT);
14426 }
14427
14428 return DAG.getBuildVector(VT, SL, NewElts);
14429 }
14430 }
14431
14432 return SDValue();
14433}
14434
14435static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
14436 switch (Opc) {
14437 case ISD::FMAXNUM:
14438 case ISD::FMAXNUM_IEEE:
14439 case ISD::FMAXIMUMNUM:
14440 return AMDGPUISD::FMAX3;
14441 case ISD::FMAXIMUM:
14442 return AMDGPUISD::FMAXIMUM3;
14443 case ISD::SMAX:
14444 return AMDGPUISD::SMAX3;
14445 case ISD::UMAX:
14446 return AMDGPUISD::UMAX3;
14447 case ISD::FMINNUM:
14448 case ISD::FMINNUM_IEEE:
14449 case ISD::FMINIMUMNUM:
14450 return AMDGPUISD::FMIN3;
14451 case ISD::FMINIMUM:
14452 return AMDGPUISD::FMINIMUM3;
14453 case ISD::SMIN:
14454 return AMDGPUISD::SMIN3;
14455 case ISD::UMIN:
14456 return AMDGPUISD::UMIN3;
14457 default:
14458 llvm_unreachable("Not a min/max opcode");
14459 }
14460}
14461
14462SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
14463 const SDLoc &SL, SDValue Src,
14464 SDValue MinVal,
14465 SDValue MaxVal,
14466 bool Signed) const {
14467
14468 // med3 comes from
14469 // min(max(x, K0), K1), K0 < K1
14470 // max(min(x, K0), K1), K1 < K0
14471 //
14472 // "MinVal" and "MaxVal" respectively refer to the rhs of the
14473 // min/max op.
14474 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
14475 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
14476
14477 if (!MinK || !MaxK)
14478 return SDValue();
14479
14480 if (Signed) {
14481 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
14482 return SDValue();
14483 } else {
14484 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
14485 return SDValue();
14486 }
14487
14488 EVT VT = MinK->getValueType(0);
14489 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
14490 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
14491 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
14492
14493 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
14494 // not available, but this is unlikely to be profitable as constants
14495 // will often need to be materialized & extended, especially on
14496 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
14497 return SDValue();
14498}
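// [Editorial sketch -- not part of SIISelLowering.cpp] A standalone
// illustration of the identity performIntMed3ImmCombine relies on: for
// constants K0 < K1, min(max(x, K0), K1) selects the median of {x, K0, K1},
// which is what the s_med3/v_med3 instructions compute. Helper names here are
// hypothetical.
#include <algorithm>
#include <cassert>
#include <cstdint>

static int32_t smed3Ref(int32_t A, int32_t B, int32_t C) {
  int32_t V[3] = {A, B, C};
  std::sort(V, V + 3);
  return V[1]; // the median of the three inputs
}

static void checkIntMed3Identity(int32_t X, int32_t K0, int32_t K1) {
  assert(K0 < K1 && "the combine above requires K0 < K1");
  assert(std::min(std::max(X, K0), K1) == smed3Ref(X, K0, K1));
}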
14499
14500 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
14501 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
14502 return C;
14503
14504 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
14505 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
14506 return C;
14507 }
14508
14509 return nullptr;
14510}
14511
14512SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
14513 const SDLoc &SL, SDValue Op0,
14514 SDValue Op1) const {
14515 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
14516 if (!K1)
14517 return SDValue();
14518
14519 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
14520 if (!K0)
14521 return SDValue();
14522
14523 // Ordered >= (although NaN inputs should have folded away by now).
14524 if (K0->getValueAPF() > K1->getValueAPF())
14525 return SDValue();
14526
14527 // med3 with a nan input acts like
14528 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
14529 //
14530 // So with a signaling NaN input, the result depends on whether the IEEE
14531 // mode bit is enabled or not.
14532 // ieee=1
14533 // s0 snan: yields s2
14534 // s1 snan: yields s2
14535 // s2 snan: qnan
14536
14537 // s0 qnan: min(s1, s2)
14538 // s1 qnan: min(s0, s2)
14539 // s2 qnan: min(s0, s1)
14540
14541 // ieee=0
14542 // s0 snan: min(s1, s2)
14543 // s1 snan: min(s0, s2)
14544 // s2 snan: qnan
14545
14546 // s0 qnan: min(s1, s2)
14547 // s1 qnan: min(s0, s2)
14548 // s2 qnan: min(s0, s1)
14549 const MachineFunction &MF = DAG.getMachineFunction();
14550 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
14551
14552 // TODO: Check IEEE bit enabled. We can form fmed3 with IEEE=0 regardless of
14553 // whether the input is a signaling nan if op0 is fmaximum or fmaximumnum. We
14554 // can only form it from fmaxnum_ieee if IEEE=1.
14555 EVT VT = Op0.getValueType();
14556 if (Info->getMode().DX10Clamp) {
14557 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
14558 // hardware fmed3 behavior converting to a min.
14559 // FIXME: Should this be allowing -0.0?
14560 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
14561 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
14562 }
14563
14564 // med3 for f16 is only available on gfx9+, and not available for v2f16.
14565 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
14566 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
14567 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
14568 // then give the other result, which is different from med3 with a NaN
14569 // input.
14570 SDValue Var = Op0.getOperand(0);
14571 if (!DAG.isKnownNeverSNaN(Var))
14572 return SDValue();
14573
14574 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14575
14576 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
14577 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
14578 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
14579 SDValue(K0, 0), SDValue(K1, 0));
14580 }
14581 }
14582
14583 return SDValue();
14584}
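// [Editorial sketch -- not part of SIISelLowering.cpp] Scalar model of the
// clamp special case above: with K0 == 0.0 and K1 == 1.0 the med3 pattern is a
// [0.0, 1.0] clamp, and under DX10Clamp a NaN input maps to 0.0. This only
// illustrates the intended value semantics; the helper name is hypothetical.
#include <algorithm>
#include <cmath>

static float clampZeroToOneRef(float X) {
  if (std::isnan(X))
    return 0.0f; // DX10Clamp: NaN clamps to 0.0
  return std::min(std::max(X, 0.0f), 1.0f);
}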
14585
14586/// \return true if the subtarget supports minimum3 and maximum3 with the given
14587/// base min/max opcode \p Opc for type \p VT.
14588static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
14589 EVT VT) {
14590 switch (Opc) {
14591 case ISD::FMINNUM:
14592 case ISD::FMAXNUM:
14593 case ISD::FMINNUM_IEEE:
14594 case ISD::FMAXNUM_IEEE:
14595 case ISD::FMINIMUMNUM:
14596 case ISD::FMAXIMUMNUM:
14597 case AMDGPUISD::FMIN_LEGACY:
14598 case AMDGPUISD::FMAX_LEGACY:
14599 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
14600 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
14601 case ISD::FMINIMUM:
14602 case ISD::FMAXIMUM:
14603 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
14604 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
14605 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
14606 case ISD::SMAX:
14607 case ISD::SMIN:
14608 case ISD::UMAX:
14609 case ISD::UMIN:
14610 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
14611 default:
14612 return false;
14613 }
14614
14615 llvm_unreachable("not a min/max opcode");
14616}
14617
14618SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
14619 DAGCombinerInfo &DCI) const {
14620 SelectionDAG &DAG = DCI.DAG;
14621
14622 EVT VT = N->getValueType(0);
14623 unsigned Opc = N->getOpcode();
14624 SDValue Op0 = N->getOperand(0);
14625 SDValue Op1 = N->getOperand(1);
14626
14627 // Only do this if the inner op has one use since this will just increase
14628 // register pressure for no benefit.
14629
14630 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
14631 // max(max(a, b), c) -> max3(a, b, c)
14632 // min(min(a, b), c) -> min3(a, b, c)
14633 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
14634 SDLoc DL(N);
14635 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14636 Op0.getOperand(0), Op0.getOperand(1), Op1);
14637 }
14638
14639 // Try commuted.
14640 // max(a, max(b, c)) -> max3(a, b, c)
14641 // min(a, min(b, c)) -> min3(a, b, c)
14642 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
14643 SDLoc DL(N);
14644 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14645 Op0, Op1.getOperand(0), Op1.getOperand(1));
14646 }
14647 }
14648
14649 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
14650 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
14651 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
14652 if (SDValue Med3 = performIntMed3ImmCombine(
14653 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
14654 return Med3;
14655 }
14656 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
14657 if (SDValue Med3 = performIntMed3ImmCombine(
14658 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
14659 return Med3;
14660 }
14661
14662 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
14663 if (SDValue Med3 = performIntMed3ImmCombine(
14664 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
14665 return Med3;
14666 }
14667 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
14668 if (SDValue Med3 = performIntMed3ImmCombine(
14669 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
14670 return Med3;
14671 }
14672
14673 // if !is_snan(x):
14674 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14675 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14676 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14677 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14678 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
14679 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
14680 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
14681 (Opc == AMDGPUISD::FMIN_LEGACY &&
14682 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
14683 (VT == MVT::f32 || VT == MVT::f64 ||
14684 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
14685 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
14686 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
14687 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
14688 Op0.hasOneUse()) {
14689 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
14690 return Res;
14691 }
14692
14693 // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
14694 // for some types, but at a higher cost since it's implemented with a 3
14695 // operand form.
14696 const SDNodeFlags Flags = N->getFlags();
14697 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
14698 !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
14699 unsigned NewOpc =
14700 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
14701 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
14702 }
14703
14704 return SDValue();
14705}
14706
14707 static bool isClampZeroToOne(SDValue A, SDValue B) {
14708 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
14709 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
14710 // FIXME: Should this be allowing -0.0?
14711 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
14712 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
14713 }
14714 }
14715
14716 return false;
14717}
14718
14719// FIXME: Should only worry about snans for version with chain.
14720SDValue SITargetLowering::performFMed3Combine(SDNode *N,
14721 DAGCombinerInfo &DCI) const {
14722 EVT VT = N->getValueType(0);
14723 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
14724 // NaNs. With a NaN input, the order of the operands may change the result.
14725
14726 SelectionDAG &DAG = DCI.DAG;
14727 SDLoc SL(N);
14728
14729 SDValue Src0 = N->getOperand(0);
14730 SDValue Src1 = N->getOperand(1);
14731 SDValue Src2 = N->getOperand(2);
14732
14733 if (isClampZeroToOne(Src0, Src1)) {
14734 // const_a, const_b, x -> clamp is safe in all cases including signaling
14735 // nans.
14736 // FIXME: Should this be allowing -0.0?
14737 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
14738 }
14739
14740 const MachineFunction &MF = DAG.getMachineFunction();
14741 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
14742
14743 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
14744 // handling no dx10-clamp?
14745 if (Info->getMode().DX10Clamp) {
14746 // If NaNs are clamped to 0, we are free to reorder the inputs.
14747
14748 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
14749 std::swap(Src0, Src1);
14750
14751 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
14752 std::swap(Src1, Src2);
14753
14754 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
14755 std::swap(Src0, Src1);
14756
14757 if (isClampZeroToOne(Src1, Src2))
14758 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
14759 }
14760
14761 return SDValue();
14762}
14763
14764SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
14765 DAGCombinerInfo &DCI) const {
14766 SDValue Src0 = N->getOperand(0);
14767 SDValue Src1 = N->getOperand(1);
14768 if (Src0.isUndef() && Src1.isUndef())
14769 return DCI.DAG.getUNDEF(N->getValueType(0));
14770 return SDValue();
14771}
14772
14773// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
14774// expanded into a set of cmp/select instructions.
14775 bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
14776 unsigned NumElem,
14777 bool IsDivergentIdx,
14778 const GCNSubtarget *Subtarget) {
14779 if (UseDivergentRegisterIndexing)
14780 return false;
14781
14782 unsigned VecSize = EltSize * NumElem;
14783
14784 // Sub-dword vectors of two dwords or less have a better implementation.
14785 if (VecSize <= 64 && EltSize < 32)
14786 return false;
14787
14788 // Always expand the rest of sub-dword instructions, otherwise it will be
14789 // lowered via memory.
14790 if (EltSize < 32)
14791 return true;
14792
14793 // Always do this if var-idx is divergent, otherwise it will become a loop.
14794 if (IsDivergentIdx)
14795 return true;
14796
14797 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
14798 unsigned NumInsts = NumElem /* Number of compares */ +
14799 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
14800
14801 // On some architectures (GFX9) movrel is not available and it's better
14802 // to expand.
14803 if (Subtarget->useVGPRIndexMode())
14804 return NumInsts <= 16;
14805
14806 // If movrel is available, use it instead of expanding for vectors of 8
14807 // elements.
14808 if (Subtarget->hasMovrel())
14809 return NumInsts <= 15;
14810
14811 return true;
14812}
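// [Editorial sketch -- not part of SIISelLowering.cpp] The cost model above in
// plain arithmetic: expanding a dynamic-index access into compare/select needs
// one compare per element plus one v_cndmask_b32 per 32-bit lane of each
// element. The helper name is hypothetical.
static unsigned dynExtExpansionCostRef(unsigned EltSize, unsigned NumElem) {
  unsigned CndMasksPerElt = (EltSize + 31) / 32; // 32-bit lanes per element
  return NumElem /* compares */ + CndMasksPerElt * NumElem /* cndmasks */;
}
// Example: a v8f32 access costs 8 + 1 * 8 = 16 instructions, exactly the
// boundary accepted on the useVGPRIndexMode() path above.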
14813
14814 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
14815 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
14816 if (isa<ConstantSDNode>(Idx))
14817 return false;
14818
14819 SDValue Vec = N->getOperand(0);
14820 EVT VecVT = Vec.getValueType();
14821 EVT EltVT = VecVT.getVectorElementType();
14822 unsigned EltSize = EltVT.getSizeInBits();
14823 unsigned NumElem = VecVT.getVectorNumElements();
14824
14825 return SITargetLowering::shouldExpandVectorDynExt(
14826 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
14827}
14828
14829SDValue
14830SITargetLowering::performExtractVectorEltCombine(SDNode *N,
14831 DAGCombinerInfo &DCI) const {
14832 SDValue Vec = N->getOperand(0);
14833 SelectionDAG &DAG = DCI.DAG;
14834
14835 EVT VecVT = Vec.getValueType();
14836 EVT VecEltVT = VecVT.getVectorElementType();
14837 EVT ResVT = N->getValueType(0);
14838
14839 unsigned VecSize = VecVT.getSizeInBits();
14840 unsigned VecEltSize = VecEltVT.getSizeInBits();
14841
14842 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
14843 allUsesHaveSourceMods(N)) {
14844 SDLoc SL(N);
14845 SDValue Idx = N->getOperand(1);
14846 SDValue Elt =
14847 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
14848 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
14849 }
14850
14851 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
14852 // =>
14853 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
14854 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
14855 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
14856 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
14857 SDLoc SL(N);
14858 SDValue Idx = N->getOperand(1);
14859 unsigned Opc = Vec.getOpcode();
14860
14861 switch (Opc) {
14862 default:
14863 break;
14864 // TODO: Support other binary operations.
14865 case ISD::FADD:
14866 case ISD::FSUB:
14867 case ISD::FMUL:
14868 case ISD::ADD:
14869 case ISD::UMIN:
14870 case ISD::UMAX:
14871 case ISD::SMIN:
14872 case ISD::SMAX:
14873 case ISD::FMAXNUM:
14874 case ISD::FMINNUM:
14875 case ISD::FMAXNUM_IEEE:
14876 case ISD::FMINNUM_IEEE:
14877 case ISD::FMAXIMUM:
14878 case ISD::FMINIMUM: {
14879 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
14880 Vec.getOperand(0), Idx);
14881 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
14882 Vec.getOperand(1), Idx);
14883
14884 DCI.AddToWorklist(Elt0.getNode());
14885 DCI.AddToWorklist(Elt1.getNode());
14886 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
14887 }
14888 }
14889 }
14890
14891 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
14892 if (shouldExpandVectorDynExt(N)) {
14893 SDLoc SL(N);
14894 SDValue Idx = N->getOperand(1);
14895 SDValue V;
14896 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
14897 SDValue IC = DAG.getVectorIdxConstant(I, SL);
14898 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
14899 if (I == 0)
14900 V = Elt;
14901 else
14902 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
14903 }
14904 return V;
14905 }
14906
14907 if (!DCI.isBeforeLegalize())
14908 return SDValue();
14909
14910 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
14911 // elements. This exposes more load reduction opportunities by replacing
14912 // multiple small extract_vector_elements with a single 32-bit extract.
14913 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
14914 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
14915 VecSize > 32 && VecSize % 32 == 0 && Idx) {
14916 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
14917
14918 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
14919 unsigned EltIdx = BitIndex / 32;
14920 unsigned LeftoverBitIdx = BitIndex % 32;
14921 SDLoc SL(N);
14922
14923 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
14924 DCI.AddToWorklist(Cast.getNode());
14925
14926 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
14927 DAG.getConstant(EltIdx, SL, MVT::i32));
14928 DCI.AddToWorklist(Elt.getNode());
14929 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
14930 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
14931 DCI.AddToWorklist(Srl.getNode());
14932
14933 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
14934 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
14935 DCI.AddToWorklist(Trunc.getNode());
14936
14937 if (VecEltVT == ResVT) {
14938 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
14939 }
14940
14941 assert(ResVT.isScalarInteger());
14942 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
14943 }
14944
14945 return SDValue();
14946}
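// [Editorial sketch -- not part of SIISelLowering.cpp] The index arithmetic of
// the sub-dword rewrite above, shown on plain integers: extracting element Idx
// of a byte-sized-element vector is the same as extracting one 32-bit word and
// shifting the wanted bits down. The helper name is hypothetical and assumes
// the sub-dword case handled above (EltSizeInBits is 8 or 16).
#include <cstdint>

static uint32_t extractSubDwordRef(const uint32_t *Words, unsigned Idx,
                                   unsigned EltSizeInBits) {
  unsigned BitIndex = Idx * EltSizeInBits;
  unsigned WordIdx = BitIndex / 32;        // which 32-bit element to extract
  unsigned LeftoverBitIdx = BitIndex % 32; // srl amount within that word
  uint32_t Mask = (1u << EltSizeInBits) - 1;
  return (Words[WordIdx] >> LeftoverBitIdx) & Mask; // truncate to the element
}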
14947
14948SDValue
14949SITargetLowering::performInsertVectorEltCombine(SDNode *N,
14950 DAGCombinerInfo &DCI) const {
14951 SDValue Vec = N->getOperand(0);
14952 SDValue Idx = N->getOperand(2);
14953 EVT VecVT = Vec.getValueType();
14954 EVT EltVT = VecVT.getVectorElementType();
14955
14956 // INSERT_VECTOR_ELT (<n x e>, var-idx)
14957 // => BUILD_VECTOR n x select (e, const-idx)
14958 if (!shouldExpandVectorDynExt(N))
14959 return SDValue();
14960
14961 SelectionDAG &DAG = DCI.DAG;
14962 SDLoc SL(N);
14963 SDValue Ins = N->getOperand(1);
14964 EVT IdxVT = Idx.getValueType();
14965
14966 SmallVector<SDValue, 16> Ops;
14967 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
14968 SDValue IC = DAG.getConstant(I, SL, IdxVT);
14969 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
14970 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
14971 Ops.push_back(V);
14972 }
14973
14974 return DAG.getBuildVector(VecVT, SL, Ops);
14975}
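// [Editorial sketch -- not part of SIISelLowering.cpp] Scalar model of the
// BUILD_VECTOR-of-selects expansion above: every output element compares its
// own constant index against the variable index and picks either the inserted
// value or the old element. The helper name is hypothetical.
static void insertEltViaSelectsRef(float *Vec, unsigned NumElts, float Ins,
                                   unsigned VarIdx) {
  for (unsigned I = 0; I != NumElts; ++I)
    Vec[I] = (I == VarIdx) ? Ins : Vec[I]; // one compare + select per element
}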
14976
14977/// Return the source of an fp_extend from f16 to f32, or a converted FP
14978/// constant.
14979 static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
14980 if (Src.getOpcode() == ISD::FP_EXTEND &&
14981 Src.getOperand(0).getValueType() == MVT::f16) {
14982 return Src.getOperand(0);
14983 }
14984
14985 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
14986 APFloat Val = CFP->getValueAPF();
14987 bool LosesInfo = true;
14988 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
14989 if (!LosesInfo)
14990 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
14991 }
14992
14993 return SDValue();
14994}
14995
14996SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
14997 DAGCombinerInfo &DCI) const {
14998 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
14999 "combine only useful on gfx8");
15000
15001 SDValue TruncSrc = N->getOperand(0);
15002 EVT VT = N->getValueType(0);
15003 if (VT != MVT::f16)
15004 return SDValue();
15005
15006 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
15007 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
15008 return SDValue();
15009
15010 SelectionDAG &DAG = DCI.DAG;
15011 SDLoc SL(N);
15012
15013 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
15014 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
15015 // casting back.
15016
15017 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
15018 // fmin(fmax(a, b), fmax(fmin(a, b), c))
15019 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
15020 if (!A)
15021 return SDValue();
15022
15023 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
15024 if (!B)
15025 return SDValue();
15026
15027 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
15028 if (!C)
15029 return SDValue();
15030
15031 // This changes signaling nan behavior. If an input is a signaling nan, it
15032 // would have been quieted by the fpext originally. We don't care because
15033 // these are unconstrained ops. If we needed to insert quieting canonicalizes
15034 // we would be worse off than just doing the promotion.
15035 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
15036 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
15037 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
15038 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
15039}
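// [Editorial sketch -- not part of SIISelLowering.cpp] For non-NaN inputs the
// min/max expansion built above selects the median of the three operands, and
// since fpext/fptrunc of the selected operand is exact, this matches
// truncating an f32 fmed3 of the extended operands. The helper name is
// hypothetical; the signaling-NaN caveat is discussed in the comment above.
#include <algorithm>

static float fmed3ExpansionRef(float A, float B, float C) {
  float Lo = std::min(A, B);
  float Hi = std::max(A, B);
  return std::min(Hi, std::max(Lo, C)); // fmin(fmax(a, b), fmax(fmin(a, b), c))
}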
15040
15041unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
15042 const SDNode *N0,
15043 const SDNode *N1) const {
15044 EVT VT = N0->getValueType(0);
15045
15046 // Only do this if we are not trying to support denormals. v_mad_f32 does not
15047 // support denormals ever.
15048 if (((VT == MVT::f32 &&
15049 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
15050 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15051 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
15052 isOperationLegal(ISD::FMAD, VT))
15053 return ISD::FMAD;
15054
15055 const TargetOptions &Options = DAG.getTarget().Options;
15056 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
15057 (N0->getFlags().hasAllowContract() &&
15058 N1->getFlags().hasAllowContract())) &&
15059 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
15060 return ISD::FMA;
15061 }
15062
15063 return 0;
15064}
15065
15066// For a reassociatable opcode perform:
15067// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
15068SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
15069 SelectionDAG &DAG) const {
15070 EVT VT = N->getValueType(0);
15071 if (VT != MVT::i32 && VT != MVT::i64)
15072 return SDValue();
15073
15074 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
15075 return SDValue();
15076
15077 unsigned Opc = N->getOpcode();
15078 SDValue Op0 = N->getOperand(0);
15079 SDValue Op1 = N->getOperand(1);
15080
15081 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
15082 return SDValue();
15083
15084 if (Op0->isDivergent())
15085 std::swap(Op0, Op1);
15086
15087 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
15088 return SDValue();
15089
15090 SDValue Op2 = Op1.getOperand(1);
15091 Op1 = Op1.getOperand(0);
15092 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
15093 return SDValue();
15094
15095 if (Op1->isDivergent())
15096 std::swap(Op1, Op2);
15097
15098 SDLoc SL(N);
15099 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
15100 return DAG.getNode(Opc, SL, VT, Add1, Op2);
15101}
15102
15103static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
15104 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
15105 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
15106 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
15107 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
15108 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
15109}
15110
15111// Fold
15112// y = lshr i64 x, 32
15113// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
15114// with Const.hi == -1
15115// To
15116 // res = mad_u64_u32 y.lo, Const.lo, x.lo
15117 static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
15118 SDValue MulLHS, SDValue MulRHS,
15119 SDValue AddRHS) {
15120 if (MulRHS.getOpcode() == ISD::SRL)
15121 std::swap(MulLHS, MulRHS);
15122
15123 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
15124 return SDValue();
15125
15126 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
15127 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
15128 MulLHS.getOperand(0) != AddRHS)
15129 return SDValue();
15130
15131 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(MulRHS);
15132 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
15133 return SDValue();
15134
15135 SDValue ConstMul =
15136 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
15137 return getMad64_32(DAG, SL, MVT::i64,
15138 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
15139 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
15140}
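// [Editorial sketch -- not part of SIISelLowering.cpp] Integer check of the
// fold above: when the high 32 bits of Const are all ones, the i64 expression
// (x >> 32) * Const + x reduces, modulo 2^64, to a single 32x32+64
// multiply-add on the low halves. Names are hypothetical.
#include <cassert>
#include <cstdint>

static uint64_t madU64U32Ref(uint32_t A, uint32_t B, uint64_t C) {
  return static_cast<uint64_t>(A) * B + C; // mad_u64_u32, carry-out ignored
}

static void checkSrlMadFold(uint64_t X, uint32_t ConstLo) {
  uint64_t Const = 0xFFFFFFFF00000000ull | ConstLo; // Const.hi == -1
  uint64_t Ref = (X >> 32) * Const + X;             // original pattern
  uint64_t Folded = madU64U32Ref(static_cast<uint32_t>(X >> 32), ConstLo,
                                 static_cast<uint32_t>(X)); // x.lo zero-extended
  assert(Ref == Folded);
}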
15141
15142// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
15143// multiplies, if any.
15144//
15145// Full 64-bit multiplies that feed into an addition are lowered here instead
15146// of using the generic expansion. The generic expansion ends up with
15147// a tree of ADD nodes that prevents us from using the "add" part of the
15148// MAD instruction. The expansion produced here results in a chain of ADDs
15149// instead of a tree.
15150SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
15151 DAGCombinerInfo &DCI) const {
15152 assert(N->isAnyAdd());
15153
15154 SelectionDAG &DAG = DCI.DAG;
15155 EVT VT = N->getValueType(0);
15156 SDLoc SL(N);
15157 SDValue LHS = N->getOperand(0);
15158 SDValue RHS = N->getOperand(1);
15159
15160 if (VT.isVector())
15161 return SDValue();
15162
15163 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
15164 // result in scalar registers for uniform values.
15165 if (!N->isDivergent() && Subtarget->hasSMulHi())
15166 return SDValue();
15167
15168 unsigned NumBits = VT.getScalarSizeInBits();
15169 if (NumBits <= 32 || NumBits > 64)
15170 return SDValue();
15171
15172 if (LHS.getOpcode() != ISD::MUL) {
15173 assert(RHS.getOpcode() == ISD::MUL);
15174 std::swap(LHS, RHS);
15175 }
15176
15177 // Avoid the fold if it would unduly increase the number of multiplies due to
15178 // multiple uses, except on hardware with full-rate multiply-add (which is
15179 // part of full-rate 64-bit ops).
15180 if (!Subtarget->hasFullRate64Ops()) {
15181 unsigned NumUsers = 0;
15182 for (SDNode *User : LHS->users()) {
15183 // There is a use that does not feed into addition, so the multiply can't
15184 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
15185 if (!User->isAnyAdd())
15186 return SDValue();
15187
15188 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
15189 // MUL + 3xADD + 3xADDC over 3xMAD.
15190 ++NumUsers;
15191 if (NumUsers >= 3)
15192 return SDValue();
15193 }
15194 }
15195
15196 SDValue MulLHS = LHS.getOperand(0);
15197 SDValue MulRHS = LHS.getOperand(1);
15198 SDValue AddRHS = RHS;
15199
15200 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
15201 return FoldedMAD;
15202
15203 // Always check whether operands are small unsigned values, since that
15204 // knowledge is useful in more cases. Check for small signed values only if
15205 // doing so can unlock a shorter code sequence.
15206 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
15207 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
15208
15209 bool MulSignedLo = false;
15210 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
15211 MulSignedLo =
15212 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
15213 }
15214
15215 // The operands and final result all have the same number of bits. If
15216 // operands need to be extended, they can be extended with garbage. The
15217 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
15218 // truncated away in the end.
15219 if (VT != MVT::i64) {
15220 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
15221 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
15222 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
15223 }
15224
15225 // The basic code generated is conceptually straightforward. Pseudo code:
15226 //
15227 // accum = mad_64_32 lhs.lo, rhs.lo, accum
15228 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
15229 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
15230 //
15231 // The second and third lines are optional, depending on whether the factors
15232 // are {sign,zero}-extended or not.
15233 //
15234 // The actual DAG is noisier than the pseudo code, but only due to
15235 // instructions that disassemble values into low and high parts, and
15236 // assemble the final result.
15237 SDValue One = DAG.getConstant(1, SL, MVT::i32);
15238
15239 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
15240 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
15241 SDValue Accum =
15242 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
15243
15244 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
15245 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
15246
15247 if (!MulLHSUnsigned32) {
15248 auto MulLHSHi =
15249 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
15250 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
15251 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15252 }
15253
15254 if (!MulRHSUnsigned32) {
15255 auto MulRHSHi =
15256 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
15257 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
15258 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15259 }
15260
15261 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
15262 Accum = DAG.getBitcast(MVT::i64, Accum);
15263 }
15264
15265 if (VT != MVT::i64)
15266 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
15267 return Accum;
15268}
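// [Editorial sketch -- not part of SIISelLowering.cpp] Plain-integer model of
// the pseudo code above: a full 64-bit multiply-add is rebuilt from one
// mad_u64_u32 on the low halves plus, when needed, two 32-bit high-part
// multiplies added into the upper word. Names are hypothetical; all arithmetic
// is modulo 2^64, matching the truncation at the end of the combine.
#include <cassert>
#include <cstdint>

static uint64_t mulAdd64ViaMad32Ref(uint64_t LHS, uint64_t RHS, uint64_t Acc) {
  uint32_t LHSLo = static_cast<uint32_t>(LHS);
  uint32_t LHSHi = static_cast<uint32_t>(LHS >> 32);
  uint32_t RHSLo = static_cast<uint32_t>(RHS);
  uint32_t RHSHi = static_cast<uint32_t>(RHS >> 32);

  // accum = mad_64_32 lhs.lo, rhs.lo, accum
  uint64_t Accum = static_cast<uint64_t>(LHSLo) * RHSLo + Acc;
  uint32_t AccumLo = static_cast<uint32_t>(Accum);
  uint32_t AccumHi = static_cast<uint32_t>(Accum >> 32);

  AccumHi += LHSHi * RHSLo; // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
  AccumHi += LHSLo * RHSHi; // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi

  uint64_t Result = (static_cast<uint64_t>(AccumHi) << 32) | AccumLo;
  assert(Result == LHS * RHS + Acc); // both sides wrap modulo 2^64
  return Result;
}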
15269
15270SDValue
15271SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
15272 DAGCombinerInfo &DCI) const {
15273 SDValue RHS = N->getOperand(1);
15274 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
15275 if (!CRHS)
15276 return SDValue();
15277
15278 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
15279 // common.
15280 uint64_t Val = CRHS->getZExtValue();
15281 if (countr_zero(Val) >= 32) {
15282 SelectionDAG &DAG = DCI.DAG;
15283 SDLoc SL(N);
15284 SDValue LHS = N->getOperand(0);
15285
15286 // Avoid carry machinery if we know the low half of the add does not
15287 // contribute to the final result.
15288 //
15289 // add i64:x, K if computeTrailingZeros(K) >= 32
15290 // => build_pair (add x.hi, K.hi), x.lo
15291
15292 // Breaking the 64-bit add here with this strange constant is unlikely
15293 // to interfere with addressing mode patterns.
15294
15295 SDValue Hi = getHiHalf64(LHS, DAG);
15296 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
15297 unsigned Opcode = N->getOpcode();
15298 if (Opcode == ISD::PTRADD)
15299 Opcode = ISD::ADD;
15300 SDValue AddHi =
15301 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
15302
15303 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
15304 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
15305 }
15306
15307 return SDValue();
15308}
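// [Editorial sketch -- not part of SIISelLowering.cpp] Why the carry machinery
// can be skipped above, shown for the add case: if the low 32 bits of the
// constant are zero, the low half of the sum is just the low half of the
// variable operand, and only the high halves need an add. The helper name is
// hypothetical.
#include <cassert>
#include <cstdint>

static void checkAddHighHalfOnly(uint64_t X, uint64_t K) {
  assert((K & 0xFFFFFFFFull) == 0 && "only valid when K.lo == 0");
  uint32_t Lo = static_cast<uint32_t>(X); // x.lo is unchanged, no carry out
  uint32_t Hi = static_cast<uint32_t>(X >> 32) +
                static_cast<uint32_t>(K >> 32); // add x.hi, K.hi
  uint64_t BuildPair = (static_cast<uint64_t>(Hi) << 32) | Lo;
  assert(BuildPair == X + K);
}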
15309
15310// Collect the ultimate src of each of the mul node's operands, and confirm
15311 // each operand is 8 bits.
15312static std::optional<ByteProvider<SDValue>>
15313handleMulOperand(const SDValue &MulOperand) {
15314 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
15315 if (!Byte0 || Byte0->isConstantZero()) {
15316 return std::nullopt;
15317 }
15318 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
15319 if (Byte1 && !Byte1->isConstantZero()) {
15320 return std::nullopt;
15321 }
15322 return Byte0;
15323}
15324
15325static unsigned addPermMasks(unsigned First, unsigned Second) {
15326 unsigned FirstCs = First & 0x0c0c0c0c;
15327 unsigned SecondCs = Second & 0x0c0c0c0c;
15328 unsigned FirstNoCs = First & ~0x0c0c0c0c;
15329 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
15330
15331 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
15332 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
15333 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
15334 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
15335
15336 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
15337}
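// [Editorial sketch -- not part of SIISelLowering.cpp] Worked example of the
// mask merge above. In the perm-mask convention used here, a selector byte of
// 0x0c marks a constant-zero byte, so two partial masks that zero each other's
// live byte positions can be merged: 0x0c0c0100 (low two bytes live) combined
// with 0x03020c0c (high two bytes live) gives the identity selector
// 0x03020100. Names are hypothetical.
#include <cassert>
#include <cstdint>

static uint32_t addPermMasksRef(uint32_t First, uint32_t Second) {
  uint32_t FirstCs = First & 0x0c0c0c0cu;
  uint32_t SecondCs = Second & 0x0c0c0c0cu;
  // Keep the live selectors from both sides; a byte stays 0x0c only if it is a
  // constant-zero selector in both masks.
  return (First & ~0x0c0c0c0cu) | (Second & ~0x0c0c0c0cu) | (FirstCs & SecondCs);
}

static void addPermMasksRefExample() {
  assert(addPermMasksRef(0x0c0c0100u, 0x03020c0cu) == 0x03020100u);
}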
15338
15339struct DotSrc {
15340 SDValue SrcOp;
15341 int64_t PermMask;
15342 int64_t DWordOffset;
15343};
15344
15345 static void placeSources(ByteProvider<SDValue> &Src0,
15346 ByteProvider<SDValue> &Src1,
15347 SmallVectorImpl<DotSrc> &Src0s,
15348 SmallVectorImpl<DotSrc> &Src1s, int Step) {
15349
15350 assert(Src0.Src.has_value() && Src1.Src.has_value());
15351 // Src0s and Src1s are empty, just place arbitrarily.
15352 if (Step == 0) {
15353 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
15354 Src0.SrcOffset / 4});
15355 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
15356 Src1.SrcOffset / 4});
15357 return;
15358 }
15359
15360 for (int BPI = 0; BPI < 2; BPI++) {
15361 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
15362 if (BPI == 1) {
15363 BPP = {Src1, Src0};
15364 }
15365 unsigned ZeroMask = 0x0c0c0c0c;
15366 unsigned FMask = 0xFF << (8 * (3 - Step));
15367
15368 unsigned FirstMask =
15369 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15370 unsigned SecondMask =
15371 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15372 // Attempt to find the Src vector which contains our SDValue; if so, add our
15373 // perm mask to the existing one. If we are unable to find a match for the
15374 // first SDValue, attempt to find a match for the second.
15375 int FirstGroup = -1;
15376 for (int I = 0; I < 2; I++) {
15377 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
15378 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
15379 return IterElt.SrcOp == *BPP.first.Src &&
15380 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15381 };
15382
15383 auto *Match = llvm::find_if(Srcs, MatchesFirst);
15384 if (Match != Srcs.end()) {
15385 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
15386 FirstGroup = I;
15387 break;
15388 }
15389 }
15390 if (FirstGroup != -1) {
15391 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
15392 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
15393 return IterElt.SrcOp == *BPP.second.Src &&
15394 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15395 };
15396 auto *Match = llvm::find_if(Srcs, MatchesSecond);
15397 if (Match != Srcs.end()) {
15398 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
15399 } else
15400 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15401 return;
15402 }
15403 }
15404
15405 // If we have made it here, then we could not find a match in Src0s or Src1s
15406 // for either Src0 or Src1, so just place them arbitrarily.
15407
15408 unsigned ZeroMask = 0x0c0c0c0c;
15409 unsigned FMask = 0xFF << (8 * (3 - Step));
15410
15411 Src0s.push_back(
15412 {*Src0.Src,
15413 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15414 Src0.SrcOffset / 4});
15415 Src1s.push_back(
15416 {*Src1.Src,
15417 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15418 Src1.SrcOffset / 4});
15419}
15420
15421 static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
15422 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
15423 bool IsAny) {
15424
15425 // If we just have one source, just permute it accordingly.
15426 if (Srcs.size() == 1) {
15427 auto *Elt = Srcs.begin();
15428 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
15429
15430 // v_perm will produce the original value
15431 if (Elt->PermMask == 0x3020100)
15432 return EltOp;
15433
15434 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15435 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
15436 }
15437
15438 auto *FirstElt = Srcs.begin();
15439 auto *SecondElt = std::next(FirstElt);
15440
15442
15443 // If we have multiple sources in the chain, combine them via perms (using
15444 // calculated perm mask) and Ors.
15445 while (true) {
15446 auto FirstMask = FirstElt->PermMask;
15447 auto SecondMask = SecondElt->PermMask;
15448
15449 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15450 unsigned FirstPlusFour = FirstMask | 0x04040404;
15451 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
15452 // original 0x0C.
15453 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
15454
15455 auto PermMask = addPermMasks(FirstMask, SecondMask);
15456 auto FirstVal =
15457 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15458 auto SecondVal =
15459 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
15460
15461 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
15462 SecondVal,
15463 DAG.getConstant(PermMask, SL, MVT::i32)));
15464
15465 FirstElt = std::next(SecondElt);
15466 if (FirstElt == Srcs.end())
15467 break;
15468
15469 SecondElt = std::next(FirstElt);
15470 // If we only have a FirstElt, then just combine that into the cumulative
15471 // source node.
15472 if (SecondElt == Srcs.end()) {
15473 auto EltOp =
15474 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15475
15476 Perms.push_back(
15477 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15478 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
15479 break;
15480 }
15481 }
15482
15483 assert(Perms.size() == 1 || Perms.size() == 2);
15484 return Perms.size() == 2
15485 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
15486 : Perms[0];
15487}
15488
15489static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
15490 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15491 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15492 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15493 EntryMask += ZeroMask;
15494 }
15495}
15496
15497static bool isMul(const SDValue Op) {
15498 auto Opcode = Op.getOpcode();
15499
15500 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
15501 Opcode == AMDGPUISD::MUL_I24);
15502}
15503
15504static std::optional<bool>
15505 checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
15506 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
15507 const SDValue &S1Op, const SelectionDAG &DAG) {
15508 // If both ops are i8s (pre legalize-dag), then the signedness semantics
15509 // of the dot4 is irrelevant.
15510 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
15511 return false;
15512
15513 auto Known0 = DAG.computeKnownBits(S0Op, 0);
15514 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
15515 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
15516 auto Known1 = DAG.computeKnownBits(S1Op, 0);
15517 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
15518 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
15519
15520 assert(!(S0IsUnsigned && S0IsSigned));
15521 assert(!(S1IsUnsigned && S1IsSigned));
15522
15523 // There are 9 possible permutations of
15524 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
15525
15526 // In two permutations, the sign bits are known to be the same for both Ops,
15527 // so simply return Signed / Unsigned corresponding to the MSB
15528
15529 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
15530 return S0IsSigned;
15531
15532 // In another two permutations, the sign bits are known to be opposite. In
15533 // this case return std::nullopt to indicate a bad match.
15534
15535 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
15536 return std::nullopt;
15537
15538 // In the remaining five permutations, we don't know the value of the sign
15539 // bit for at least one Op. Since we have a valid ByteProvider, we know that
15540 // the upper bits must be extension bits. Thus, the only way for the sign
15541 // bit to be unknown is if it was sign extended from an unknown value, or if
15542 // it was any extended. In either case, it is correct to use the signed
15543 // version of the signedness semantics of dot4.
15544
15545 // In two such permutations, we know the sign bit is set for
15546 // one op, and the other is unknown. It is okay to use the signed version of
15547 // dot4.
15548 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
15549 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
15550 return true;
15551
15552 // In one such permutation, we don't know either of the sign bits. It is okay
15553 // to use the signed version of dot4.
15554 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
15555 return true;
15556
15557 // In two such permutations, we know the sign bit is unset for
15558 // one op, and the other is unknown. Return std::nullopt to indicate a
15559 // bad match.
15560 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
15561 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
15562 return std::nullopt;
15563
15564 llvm_unreachable("Fully covered condition");
15565}
15566
15567SDValue SITargetLowering::performAddCombine(SDNode *N,
15568 DAGCombinerInfo &DCI) const {
15569 SelectionDAG &DAG = DCI.DAG;
15570 EVT VT = N->getValueType(0);
15571 SDLoc SL(N);
15572 SDValue LHS = N->getOperand(0);
15573 SDValue RHS = N->getOperand(1);
15574
15575 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
15576 if (Subtarget->hasMad64_32()) {
15577 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
15578 return Folded;
15579 }
15580 }
15581
15582 if (SDValue V = reassociateScalarOps(N, DAG)) {
15583 return V;
15584 }
15585
15586 if (VT == MVT::i64) {
15587 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15588 return Folded;
15589 }
15590
15591 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
15592 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
15593 SDValue TempNode(N, 0);
15594 std::optional<bool> IsSigned;
15595 SmallVector<DotSrc, 4> Src0s;
15596 SmallVector<DotSrc, 4> Src1s;
15597 SmallVector<SDValue, 4> Src2s;
15598
15599 // Match the v_dot4 tree, while collecting src nodes.
15600 int ChainLength = 0;
15601 for (int I = 0; I < 4; I++) {
15602 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
15603 if (MulIdx == -1)
15604 break;
15605 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
15606 if (!Src0)
15607 break;
15608 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
15609 if (!Src1)
15610 break;
15611
15612 auto IterIsSigned = checkDot4MulSignedness(
15613 TempNode->getOperand(MulIdx), *Src0, *Src1,
15614 TempNode->getOperand(MulIdx)->getOperand(0),
15615 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
15616 if (!IterIsSigned)
15617 break;
15618 if (!IsSigned)
15619 IsSigned = *IterIsSigned;
15620 if (*IterIsSigned != *IsSigned)
15621 break;
15622 placeSources(*Src0, *Src1, Src0s, Src1s, I);
15623 auto AddIdx = 1 - MulIdx;
15624 // Allow the special case where add (add (mul24, 0), mul24) has become
15625 // add (mul24, mul24).
15626 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
15627 Src2s.push_back(TempNode->getOperand(AddIdx));
15628 auto Src0 =
15629 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
15630 if (!Src0)
15631 break;
15632 auto Src1 =
15633 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
15634 if (!Src1)
15635 break;
15636 auto IterIsSigned = checkDot4MulSignedness(
15637 TempNode->getOperand(AddIdx), *Src0, *Src1,
15638 TempNode->getOperand(AddIdx)->getOperand(0),
15639 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
15640 if (!IterIsSigned)
15641 break;
15642 assert(IsSigned);
15643 if (*IterIsSigned != *IsSigned)
15644 break;
15645 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
15646 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
15647 ChainLength = I + 2;
15648 break;
15649 }
15650
15651 TempNode = TempNode->getOperand(AddIdx);
15652 Src2s.push_back(TempNode);
15653 ChainLength = I + 1;
15654 if (TempNode->getNumOperands() < 2)
15655 break;
15656 LHS = TempNode->getOperand(0);
15657 RHS = TempNode->getOperand(1);
15658 }
15659
15660 if (ChainLength < 2)
15661 return SDValue();
15662
15663 // Masks were constructed with the assumption that we would find a chain of
15664 // length 4. If not, then we need to zero out the MSB bits (via a perm mask of
15665 // 0x0c) so they do not affect the dot calculation.
15666 if (ChainLength < 4) {
15667 fixMasks(Src0s, ChainLength);
15668 fixMasks(Src1s, ChainLength);
15669 }
15670
15671 SDValue Src0, Src1;
15672
15673 // If we are just using a single source for both, and have permuted the
15674 // bytes consistently, we can just use the sources without permuting
15675 // (commutation).
15676 bool UseOriginalSrc = false;
15677 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
15678 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
15679 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
15680 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
15681 SmallVector<unsigned, 4> SrcBytes;
15682 auto Src0Mask = Src0s.begin()->PermMask;
15683 SrcBytes.push_back(Src0Mask & 0xFF000000);
15684 bool UniqueEntries = true;
15685 for (auto I = 1; I < 4; I++) {
15686 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
15687
15688 if (is_contained(SrcBytes, NextByte)) {
15689 UniqueEntries = false;
15690 break;
15691 }
15692 SrcBytes.push_back(NextByte);
15693 }
15694
15695 if (UniqueEntries) {
15696 UseOriginalSrc = true;
15697
15698 auto *FirstElt = Src0s.begin();
15699 auto FirstEltOp =
15700 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15701
15702 auto *SecondElt = Src1s.begin();
15703 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
15704 SecondElt->DWordOffset);
15705
15706 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
15707 MVT::getIntegerVT(32));
15708 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
15709 MVT::getIntegerVT(32));
15710 }
15711 }
15712
15713 if (!UseOriginalSrc) {
15714 Src0 = resolveSources(DAG, SL, Src0s, false, true);
15715 Src1 = resolveSources(DAG, SL, Src1s, false, true);
15716 }
15717
15718 assert(IsSigned);
15719 SDValue Src2 =
15720 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
15721
15722 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
15723 : Intrinsic::amdgcn_udot4,
15724 SL, MVT::i64);
15725
15726 assert(!VT.isVector());
15727 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
15728 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
15729
15730 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
15731 }
15732
15733 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
15734 return SDValue();
15735
15736 // add x, zext (setcc) => uaddo_carry x, 0, setcc
15737 // add x, sext (setcc) => usubo_carry x, 0, setcc
15738 unsigned Opc = LHS.getOpcode();
15739 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
15740 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
15741 std::swap(RHS, LHS);
15742
15743 Opc = RHS.getOpcode();
15744 switch (Opc) {
15745 default:
15746 break;
15747 case ISD::ZERO_EXTEND:
15748 case ISD::SIGN_EXTEND:
15749 case ISD::ANY_EXTEND: {
15750 auto Cond = RHS.getOperand(0);
15751 // If this won't be a real VOPC output, we would still need to insert an
15752 // extra instruction anyway.
15753 if (!isBoolSGPR(Cond))
15754 break;
15755 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
15756 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
15757 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
15758 return DAG.getNode(Opc, SL, VTList, Args);
15759 }
15760 case ISD::UADDO_CARRY: {
15761 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
15762 if (!isNullConstant(RHS.getOperand(1)))
15763 break;
15764 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
15765 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
15766 }
15767 }
15768 return SDValue();
15769}
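// [Editorial sketch -- not part of SIISelLowering.cpp] Integer model of the
// setcc-extension folds above: adding a zero-extended i1 is an add-with-carry
// of 0 with the condition as carry-in, and adding a sign-extended i1 (0 or -1)
// is a subtract-with-borrow of 0. The helper name is hypothetical.
#include <cassert>
#include <cstdint>

static void checkSetCCAddFolds(uint32_t X, bool Cond) {
  uint32_t ZExt = Cond ? 1u : 0u;                 // zext (setcc)
  uint32_t SExt = Cond ? ~0u : 0u;                // sext (setcc)
  uint32_t AddCarry = X + 0u + (Cond ? 1u : 0u);  // uaddo_carry x, 0, cc
  uint32_t SubBorrow = X - 0u - (Cond ? 1u : 0u); // usubo_carry x, 0, cc
  assert(X + ZExt == AddCarry);
  assert(X + SExt == SubBorrow);
}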
15770
15771SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
15772 DAGCombinerInfo &DCI) const {
15773 SelectionDAG &DAG = DCI.DAG;
15774 SDLoc DL(N);
15775 EVT VT = N->getValueType(0);
15776 SDValue N0 = N->getOperand(0);
15777 SDValue N1 = N->getOperand(1);
15778
15779 // The following folds transform PTRADDs into regular arithmetic in cases
15780 // where the PTRADD wouldn't be folded as an immediate offset into memory
15781 // instructions anyway. They are target-specific in that other targets might
15782 // prefer to not lose information about the pointer arithmetic.
15783
15784 // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
15785 // Adapted from DAGCombiner::visitADDLikeCommutative.
15786 SDValue V, K;
15787 if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
15788 SDNodeFlags ShlFlags = N1->getFlags();
15789 // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
15790 // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
15791 // preserved.
15792 SDNodeFlags NewShlFlags =
15793 ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
15794 ? SDNodeFlags::NoSignedWrap
15795 : SDNodeFlags();
15796 SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
15797 DCI.AddToWorklist(Inner.getNode());
15798 return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
15799 }
15800
15801 // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
15802 // performAddCombine.
15803 if (N1.getOpcode() == ISD::MUL) {
15804 if (Subtarget->hasMad64_32()) {
15805 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
15806 return Folded;
15807 }
15808 }
15809
15810 // If the 32 low bits of the constant are all zero, there is nothing to fold
15811 // into an immediate offset, so it's better to eliminate the unnecessary
15812 // addition for the lower 32 bits than to preserve the PTRADD.
15813 // Analogous to a fold in performAddCombine.
15814 if (VT == MVT::i64) {
15815 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15816 return Folded;
15817 }
15818
15819 if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) {
15820 // Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c) v) with
15821 // global address GA and constant c, such that c can be folded into GA.
15822 SDValue GAValue = N0.getOperand(0);
15823 if (const GlobalAddressSDNode *GA =
15824 dyn_cast<GlobalAddressSDNode>(GAValue)) {
15825 if (DCI.isBeforeLegalizeOps() && isOffsetFoldingLegal(GA)) {
15826 // If both additions in the original were NUW, reassociation preserves
15827 // that.
15828 SDNodeFlags Flags =
15829 (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
15830 SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags);
15831 DCI.AddToWorklist(Inner.getNode());
15832 return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags);
15833 }
15834 }
15835 }
15836
15837 if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
15838 return SDValue();
15839
15840 // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
15841 // y is not, and (add y, z) is used only once.
15842 // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
15843 // z is not, and (add y, z) is used only once.
15844 // The goal is to move constant offsets to the outermost ptradd, to create
15845 // more opportunities to fold offsets into memory instructions.
15846 // Together with the generic combines in DAGCombiner.cpp, this also
15847 // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
15848 //
15849 // This transform is here instead of in the general DAGCombiner as it can
15850 // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
15851 // AArch64's CPA.
15852 SDValue X = N0;
15853 SDValue Y = N1.getOperand(0);
15854 SDValue Z = N1.getOperand(1);
15855 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
15856 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
15857
15858 // If both additions in the original were NUW, reassociation preserves that.
15859 SDNodeFlags ReassocFlags =
15860 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
15861
15862 if (ZIsConstant != YIsConstant) {
15863 if (YIsConstant)
15864 std::swap(Y, Z);
15865 SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
15866 DCI.AddToWorklist(Inner.getNode());
15867 return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags);
15868 }
15869
15870 // If one of Y and Z is constant, they have been handled above. If both were
15871 // constant, the addition would have been folded in SelectionDAG::getNode
15872 // already. This ensures that the generic DAG combines won't undo the
15873 // following reassociation.
15874 assert(!YIsConstant && !ZIsConstant);
15875
15876 if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) {
15877 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
15878 // y are uniform and z isn't.
15879 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
15880 // z are uniform and y isn't.
15881 // The goal is to push uniform operands up in the computation, so that they
15882 // can be handled with scalar operations. We can't use reassociateScalarOps
15883 // for this since it requires two identical commutative operations to
15884 // reassociate.
15885 if (Y->isDivergent())
15886 std::swap(Y, Z);
15887 SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
15888 DCI.AddToWorklist(UniformInner.getNode());
15889 return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
15890 }
15891
15892 return SDValue();
15893}
15894
15895SDValue SITargetLowering::performSubCombine(SDNode *N,
15896 DAGCombinerInfo &DCI) const {
15897 SelectionDAG &DAG = DCI.DAG;
15898 EVT VT = N->getValueType(0);
15899
15900 if (VT == MVT::i64) {
15901 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15902 return Folded;
15903 }
15904
15905 if (VT != MVT::i32)
15906 return SDValue();
15907
15908 SDLoc SL(N);
15909 SDValue LHS = N->getOperand(0);
15910 SDValue RHS = N->getOperand(1);
15911
15912 // sub x, zext (setcc) => usubo_carry x, 0, setcc
15913 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
15914 unsigned Opc = RHS.getOpcode();
15915 switch (Opc) {
15916 default:
15917 break;
15918 case ISD::ZERO_EXTEND:
15919 case ISD::SIGN_EXTEND:
15920 case ISD::ANY_EXTEND: {
15921 auto Cond = RHS.getOperand(0);
15922 // If this won't be a real VOPC output, we would still need to insert an
15923 // extra instruction anyway.
15924 if (!isBoolSGPR(Cond))
15925 break;
15926 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
15927 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
15928 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
15929 return DAG.getNode(Opc, SL, VTList, Args);
15930 }
15931 }
15932
15933 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
15934 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
15935 if (!isNullConstant(LHS.getOperand(1)))
15936 return SDValue();
15937 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
15938 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
15939 }
15940 return SDValue();
15941}
15942
15943SDValue
15944SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
15945 DAGCombinerInfo &DCI) const {
15946
15947 if (N->getValueType(0) != MVT::i32)
15948 return SDValue();
15949
15950 if (!isNullConstant(N->getOperand(1)))
15951 return SDValue();
15952
15953 SelectionDAG &DAG = DCI.DAG;
15954 SDValue LHS = N->getOperand(0);
15955
15956 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
15957 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
15958 unsigned LHSOpc = LHS.getOpcode();
15959 unsigned Opc = N->getOpcode();
15960 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
15961 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
15962 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
15963 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
15964 }
15965 return SDValue();
15966}
15967
15968SDValue SITargetLowering::performFAddCombine(SDNode *N,
15969 DAGCombinerInfo &DCI) const {
15970 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
15971 return SDValue();
15972
15973 SelectionDAG &DAG = DCI.DAG;
15974 EVT VT = N->getValueType(0);
15975
15976 SDLoc SL(N);
15977 SDValue LHS = N->getOperand(0);
15978 SDValue RHS = N->getOperand(1);
15979
15980 // These should really be instruction patterns, but writing patterns with
15981 // source modifiers is a pain.
15982
15983 // fadd (fadd (a, a), b) -> mad 2.0, a, b
15984 if (LHS.getOpcode() == ISD::FADD) {
15985 SDValue A = LHS.getOperand(0);
15986 if (A == LHS.getOperand(1)) {
15987 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
15988 if (FusedOp != 0) {
15989 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
15990 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
15991 }
15992 }
15993 }
15994
15995 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
15996 if (RHS.getOpcode() == ISD::FADD) {
15997 SDValue A = RHS.getOperand(0);
15998 if (A == RHS.getOperand(1)) {
15999 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16000 if (FusedOp != 0) {
16001 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16002 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
16003 }
16004 }
16005 }
16006
16007 return SDValue();
16008}
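// [Editorial sketch -- not part of SIISelLowering.cpp] The arithmetic behind
// the fadd folds above: (a + a) + b has the same value as 2.0 * a + b, so when
// getFusedOpcode picks FMAD or FMA the whole expression becomes one fused op
// with the inline constant 2.0. The helper name is hypothetical and models the
// identity only.
static float fadd2Ref(float A, float B) {
  return 2.0f * A + B; // same value as (A + A) + B
}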
16009
16010SDValue SITargetLowering::performFSubCombine(SDNode *N,
16011 DAGCombinerInfo &DCI) const {
16012 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16013 return SDValue();
16014
16015 SelectionDAG &DAG = DCI.DAG;
16016 SDLoc SL(N);
16017 EVT VT = N->getValueType(0);
16018 assert(!VT.isVector());
16019
16020 // Try to get the fneg to fold into the source modifier. This undoes generic
16021 // DAG combines and folds them into the mad.
16022 //
16023 // Only do this if we are not trying to support denormals. v_mad_f32 does
16024 // not support denormals ever.
16025 SDValue LHS = N->getOperand(0);
16026 SDValue RHS = N->getOperand(1);
16027 if (LHS.getOpcode() == ISD::FADD) {
16028 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
16029 SDValue A = LHS.getOperand(0);
16030 if (A == LHS.getOperand(1)) {
16031 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16032 if (FusedOp != 0) {
16033 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16034 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
16035
16036 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
16037 }
16038 }
16039 }
16040
16041 if (RHS.getOpcode() == ISD::FADD) {
16042 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
16043
16044 SDValue A = RHS.getOperand(0);
16045 if (A == RHS.getOperand(1)) {
16046 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16047 if (FusedOp != 0) {
16048 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
16049 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
16050 }
16051 }
16052 }
16053
16054 return SDValue();
16055}
16056
16057SDValue SITargetLowering::performFDivCombine(SDNode *N,
16058 DAGCombinerInfo &DCI) const {
16059 SelectionDAG &DAG = DCI.DAG;
16060 SDLoc SL(N);
16061 EVT VT = N->getValueType(0);
16062 if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts())
16063 return SDValue();
16064
16065 SDValue LHS = N->getOperand(0);
16066 SDValue RHS = N->getOperand(1);
16067
16068 SDNodeFlags Flags = N->getFlags();
16069 SDNodeFlags RHSFlags = RHS->getFlags();
16070 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
16071 !RHS->hasOneUse())
16072 return SDValue();
16073
16074 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
16075 bool IsNegative = false;
16076 if (CLHS->isExactlyValue(1.0) ||
16077 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16078 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
16079 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
16080 if (RHS.getOpcode() == ISD::FSQRT) {
16081 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
16082 SDValue Rsq =
16083 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
16084 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
16085 }
16086 }
16087 }
16088
16089 return SDValue();
16090}
16091
16092SDValue SITargetLowering::performFMulCombine(SDNode *N,
16093 DAGCombinerInfo &DCI) const {
16094 SelectionDAG &DAG = DCI.DAG;
16095 EVT VT = N->getValueType(0);
16096 EVT ScalarVT = VT.getScalarType();
16097 EVT IntVT = VT.changeElementType(MVT::i32);
16098
16099 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
16100 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16101 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
16102 return SDValue();
16103 }
16104
16105 SDValue LHS = N->getOperand(0);
16106 SDValue RHS = N->getOperand(1);
16107
16108 // It is cheaper to realize i32 inline constants than to materialize
16109 // f16 or f64 (or even non-inline f32) values; this is possible via
16110 // ldexp, as shown below:
16111 //
16112 // Given : A = 2^a & B = 2^b ; where a and b are integers.
16113 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
16114 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
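// Illustrative example (constants chosen for this note): with A = 8.0 (2^3)
// and B = 0.5 (2^-1),
//   fmul x, (select y, 8.0, 0.5)  -->  fldexp x, (select i32 y, 3, -1)
// so neither FP constant has to be materialized; only small i32 exponents,
// which are inline constants, remain.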
16115 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16116 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
16117 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
16118 if (!TrueNode)
16119 return SDValue();
16120 const ConstantFPSDNode *FalseNode =
16121 isConstOrConstSplatFP(RHS.getOperand(2));
16122 if (!FalseNode)
16123 return SDValue();
16124
16125 if (TrueNode->isNegative() != FalseNode->isNegative())
16126 return SDValue();
16127
16128 // For f32, only non-inline constants should be transformed.
16129 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16130 if (ScalarVT == MVT::f32 &&
16131 TII->isInlineConstant(TrueNode->getValueAPF()) &&
16132 TII->isInlineConstant(FalseNode->getValueAPF()))
16133 return SDValue();
16134
16135 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
16136 if (TrueNodeExpVal == INT_MIN)
16137 return SDValue();
16138 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
16139 if (FalseNodeExpVal == INT_MIN)
16140 return SDValue();
16141
16142 SDLoc SL(N);
16143 SDValue SelectNode =
16144 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
16145 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
16146 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
16147
16148 LHS = TrueNode->isNegative()
16149 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
16150 : LHS;
16151
16152 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
16153 }
16154
16155 return SDValue();
16156}
16157
16158SDValue SITargetLowering::performFMACombine(SDNode *N,
16159 DAGCombinerInfo &DCI) const {
16160 SelectionDAG &DAG = DCI.DAG;
16161 EVT VT = N->getValueType(0);
16162 SDLoc SL(N);
16163
16164 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16165 return SDValue();
16166
16167 // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
16168 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
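// Put differently: when S0 and S1 are the f16 elements of a pair of v2f16
// vectors, the expression is S0.x*S1.x + S0.y*S1.y + z, i.e. a two-element
// dot product accumulated into f32, which is what the FDOT2 node computes.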
16169 SDValue Op1 = N->getOperand(0);
16170 SDValue Op2 = N->getOperand(1);
16171 SDValue FMA = N->getOperand(2);
16172
16173 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
16174 Op2.getOpcode() != ISD::FP_EXTEND)
16175 return SDValue();
16176
16177 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
16178 // regardless of the denorm mode setting. Therefore,
16179 // fp-contract is sufficient to allow generating fdot2.
16180 const TargetOptions &Options = DAG.getTarget().Options;
16181 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
16182 (N->getFlags().hasAllowContract() &&
16183 FMA->getFlags().hasAllowContract())) {
16184 Op1 = Op1.getOperand(0);
16185 Op2 = Op2.getOperand(0);
16186 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16187 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16188 return SDValue();
16189
16190 SDValue Vec1 = Op1.getOperand(0);
16191 SDValue Idx1 = Op1.getOperand(1);
16192 SDValue Vec2 = Op2.getOperand(0);
16193
16194 SDValue FMAOp1 = FMA.getOperand(0);
16195 SDValue FMAOp2 = FMA.getOperand(1);
16196 SDValue FMAAcc = FMA.getOperand(2);
16197
16198 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
16199 FMAOp2.getOpcode() != ISD::FP_EXTEND)
16200 return SDValue();
16201
16202 FMAOp1 = FMAOp1.getOperand(0);
16203 FMAOp2 = FMAOp2.getOperand(0);
16204 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16205 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16206 return SDValue();
16207
16208 SDValue Vec3 = FMAOp1.getOperand(0);
16209 SDValue Vec4 = FMAOp2.getOperand(0);
16210 SDValue Idx2 = FMAOp1.getOperand(1);
16211
16212 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
16213 // Idx1 and Idx2 cannot be the same.
16214 Idx1 == Idx2)
16215 return SDValue();
16216
16217 if (Vec1 == Vec2 || Vec3 == Vec4)
16218 return SDValue();
16219
16220 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
16221 return SDValue();
16222
16223 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16224 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
16225 DAG.getTargetConstant(0, SL, MVT::i1));
16226 }
16227 }
16228 return SDValue();
16229}
16230
16231SDValue SITargetLowering::performSetCCCombine(SDNode *N,
16232 DAGCombinerInfo &DCI) const {
16233 SelectionDAG &DAG = DCI.DAG;
16234 SDLoc SL(N);
16235
16236 SDValue LHS = N->getOperand(0);
16237 SDValue RHS = N->getOperand(1);
16238 EVT VT = LHS.getValueType();
16239 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
16240
16241 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
16242 if (!CRHS) {
16243 CRHS = dyn_cast<ConstantSDNode>(LHS);
16244 if (CRHS) {
16245 std::swap(LHS, RHS);
16246 CC = getSetCCSwappedOperands(CC);
16247 }
16248 }
16249
16250 if (CRHS) {
16251 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
16252 isBoolSGPR(LHS.getOperand(0))) {
16253 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
16254 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
16255 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
16256 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
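// Worked case (illustrative): for cc = 1, (sext i1 1) is -1, so
// "setcc ..., -1, ne" yields 0, which equals !cc; for cc = 0 the sext is 0
// and the same compare yields 1 == !cc, hence the xor with -1 below.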
16257 if ((CRHS->isAllOnes() &&
16258 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
16259 (CRHS->isZero() &&
16260 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
16261 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16262 DAG.getAllOnesConstant(SL, MVT::i1));
16263 if ((CRHS->isAllOnes() &&
16264 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
16265 (CRHS->isZero() &&
16266 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
16267 return LHS.getOperand(0);
16268 }
16269
16270 const APInt &CRHSVal = CRHS->getAPIntValue();
16271 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
16272 LHS.getOpcode() == ISD::SELECT &&
16273 isa<ConstantSDNode>(LHS.getOperand(1)) &&
16274 isa<ConstantSDNode>(LHS.getOperand(2)) &&
16275 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
16276 isBoolSGPR(LHS.getOperand(0))) {
16277 // Given CT != FT:
16278 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
16279 // setcc (select cc, CT, CF), CF, ne => cc
16280 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
16281 // setcc (select cc, CT, CF), CT, eq => cc
16282 const APInt &CT = LHS.getConstantOperandAPInt(1);
16283 const APInt &CF = LHS.getConstantOperandAPInt(2);
16284
16285 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
16286 (CT == CRHSVal && CC == ISD::SETNE))
16287 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16288 DAG.getAllOnesConstant(SL, MVT::i1));
16289 if ((CF == CRHSVal && CC == ISD::SETNE) ||
16290 (CT == CRHSVal && CC == ISD::SETEQ))
16291 return LHS.getOperand(0);
16292 }
16293 }
16294
16295 if (VT != MVT::f32 && VT != MVT::f64 &&
16296 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16297 return SDValue();
16298
16299 // Match isinf/isfinite pattern
16300 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
16301 // (fcmp one (fabs x), inf) -> (fp_class x,
16302 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
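// Illustrative check: for x = -inf, fabs(x) is +inf, so the oeq compare is
// true, and fp_class x with (p_infinity | n_infinity) is also true since the
// class test looks at x itself rather than |x|. A NaN input makes both the
// ordered compare and either class mask false.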
16303 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
16304 LHS.getOpcode() == ISD::FABS) {
16305 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
16306 if (!CRHS)
16307 return SDValue();
16308
16309 const APFloat &APF = CRHS->getValueAPF();
16310 if (APF.isInfinity() && !APF.isNegative()) {
16311 const unsigned IsInfMask =
16312 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
16313 const unsigned IsFiniteMask =
16314 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
16315 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
16316 SIInstrFlags::P_SUBNORMAL;
16317 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
16318 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
16319 DAG.getConstant(Mask, SL, MVT::i32));
16320 }
16321 }
16322
16323 return SDValue();
16324}
16325
16326SDValue
16327SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
16328 DAGCombinerInfo &DCI) const {
16329 SelectionDAG &DAG = DCI.DAG;
16330 SDLoc SL(N);
16331 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
16332
16333 SDValue Src = N->getOperand(0);
16334 SDValue Shift = N->getOperand(0);
16335
16336 // TODO: Extend type shouldn't matter (assuming legal types).
16337 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
16338 Shift = Shift.getOperand(0);
16339
16340 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
16341 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
16342 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
16343 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
16344 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
16345 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
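// Worked example (illustrative): for cvt_f32_ubyte1 (shl x, 8), Offset is 1,
// so ShiftOffset starts at 8 and the shift amount of 8 brings it back to 0;
// byte 1 of (x << 8) is byte 0 of x, giving cvt_f32_ubyte0 x.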
16346 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
16347 SDValue Shifted = DAG.getZExtOrTrunc(
16348 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
16349
16350 unsigned ShiftOffset = 8 * Offset;
16351 if (Shift.getOpcode() == ISD::SHL)
16352 ShiftOffset -= C->getZExtValue();
16353 else
16354 ShiftOffset += C->getZExtValue();
16355
16356 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16357 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
16358 MVT::f32, Shifted);
16359 }
16360 }
16361 }
16362
16363 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16364 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
16365 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
16366 // We simplified Src. If this node is not dead, visit it again so it is
16367 // folded properly.
16368 if (N->getOpcode() != ISD::DELETED_NODE)
16369 DCI.AddToWorklist(N);
16370 return SDValue(N, 0);
16371 }
16372
16373 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
16374 if (SDValue DemandedSrc =
16375 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
16376 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
16377
16378 return SDValue();
16379}
16380
16381SDValue SITargetLowering::performClampCombine(SDNode *N,
16382 DAGCombinerInfo &DCI) const {
16383 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
16384 if (!CSrc)
16385 return SDValue();
16386
16387 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16388 const APFloat &F = CSrc->getValueAPF();
16389 APFloat Zero = APFloat::getZero(F.getSemantics());
16390 if (F < Zero ||
16391 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16392 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
16393 }
16394
16395 APFloat One(F.getSemantics(), "1.0");
16396 if (F > One)
16397 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
16398
16399 return SDValue(CSrc, 0);
16400}
16401
16402SDValue SITargetLowering::performSelectCombine(SDNode *N,
16403 DAGCombinerInfo &DCI) const {
16404
16405 // Try to fold CMP + SELECT patterns with shared constants (both FP and
16406 // integer).
16407 // Detect when CMP and SELECT use the same constant and fold them to avoid
16408 // loading the constant twice. Specifically handles patterns like:
16409 // %cmp = icmp eq i32 %val, 4242
16410 // %sel = select i1 %cmp, i32 4242, i32 %other
16411 // It can be optimized to reuse %val instead of 4242 in select.
16412 SDValue Cond = N->getOperand(0);
16413 SDValue TrueVal = N->getOperand(1);
16414 SDValue FalseVal = N->getOperand(2);
16415
16416 // Check if condition is a comparison.
16417 if (Cond.getOpcode() != ISD::SETCC)
16418 return SDValue();
16419
16420 SDValue LHS = Cond.getOperand(0);
16421 SDValue RHS = Cond.getOperand(1);
16422 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
16423
16424 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
16425 bool isInteger = LHS.getValueType().isInteger();
16426
16427 // Handle simple floating-point and integer types only.
16428 if (!isFloatingPoint && !isInteger)
16429 return SDValue();
16430
16431 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
16432 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
16433 if (!isEquality && !isNonEquality)
16434 return SDValue();
16435
16436 SDValue ArgVal, ConstVal;
16437 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
16438 (isInteger && isa<ConstantSDNode>(RHS))) {
16439 ConstVal = RHS;
16440 ArgVal = LHS;
16441 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
16442 (isInteger && isa<ConstantSDNode>(LHS))) {
16443 ConstVal = LHS;
16444 ArgVal = RHS;
16445 } else {
16446 return SDValue();
16447 }
16448
16449 // Skip optimization for inlinable immediates.
16450 if (isFloatingPoint) {
16451 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
16452 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
16453 return SDValue();
16454 } else {
16455 if (AMDGPU::isInlinableIntLiteral(
16456 cast<ConstantSDNode>(ConstVal)->getSExtValue()))
16457 return SDValue();
16458 }
16459
16460 // For equality and non-equality comparisons, patterns:
16461 // select (setcc x, const), const, y -> select (setcc x, const), x, y
16462 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
16463 if (!(isEquality && TrueVal == ConstVal) &&
16464 !(isNonEquality && FalseVal == ConstVal))
16465 return SDValue();
16466
16467 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
16468 SDValue SelectRHS =
16469 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
16470 return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
16471 SelectLHS, SelectRHS);
16472}
16473
16474SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
16475 DAGCombinerInfo &DCI) const {
16476 switch (N->getOpcode()) {
16477 case ISD::ADD:
16478 case ISD::SUB:
16479 case ISD::SHL:
16480 case ISD::SRL:
16481 case ISD::SRA:
16482 case ISD::AND:
16483 case ISD::OR:
16484 case ISD::XOR:
16485 case ISD::MUL:
16486 case ISD::SETCC:
16487 case ISD::SELECT:
16488 case ISD::SMIN:
16489 case ISD::SMAX:
16490 case ISD::UMIN:
16491 case ISD::UMAX:
16492 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
16493 return Res;
16494 break;
16495 default:
16496 break;
16497 }
16498
16499 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
16500 return SDValue();
16501
16502 switch (N->getOpcode()) {
16503 case ISD::ADD:
16504 return performAddCombine(N, DCI);
16505 case ISD::PTRADD:
16506 return performPtrAddCombine(N, DCI);
16507 case ISD::SUB:
16508 return performSubCombine(N, DCI);
16509 case ISD::UADDO_CARRY:
16510 case ISD::USUBO_CARRY:
16511 return performAddCarrySubCarryCombine(N, DCI);
16512 case ISD::FADD:
16513 return performFAddCombine(N, DCI);
16514 case ISD::FSUB:
16515 return performFSubCombine(N, DCI);
16516 case ISD::FDIV:
16517 return performFDivCombine(N, DCI);
16518 case ISD::FMUL:
16519 return performFMulCombine(N, DCI);
16520 case ISD::SETCC:
16521 return performSetCCCombine(N, DCI);
16522 case ISD::SELECT:
16523 if (auto Res = performSelectCombine(N, DCI))
16524 return Res;
16525 break;
16526 case ISD::FMAXNUM:
16527 case ISD::FMINNUM:
16528 case ISD::FMAXNUM_IEEE:
16529 case ISD::FMINNUM_IEEE:
16530 case ISD::FMAXIMUM:
16531 case ISD::FMINIMUM:
16532 case ISD::FMAXIMUMNUM:
16533 case ISD::FMINIMUMNUM:
16534 case ISD::SMAX:
16535 case ISD::SMIN:
16536 case ISD::UMAX:
16537 case ISD::UMIN:
16538 case AMDGPUISD::FMIN_LEGACY:
16539 case AMDGPUISD::FMAX_LEGACY:
16540 return performMinMaxCombine(N, DCI);
16541 case ISD::FMA:
16542 return performFMACombine(N, DCI);
16543 case ISD::AND:
16544 return performAndCombine(N, DCI);
16545 case ISD::OR:
16546 return performOrCombine(N, DCI);
16547 case ISD::FSHR: {
16548 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16549 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
16550 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
16551 return matchPERM(N, DCI);
16552 }
16553 break;
16554 }
16555 case ISD::XOR:
16556 return performXorCombine(N, DCI);
16557 case ISD::ZERO_EXTEND:
16558 return performZeroExtendCombine(N, DCI);
16559 case ISD::SIGN_EXTEND_INREG:
16560 return performSignExtendInRegCombine(N, DCI);
16561 case AMDGPUISD::FP_CLASS:
16562 return performClassCombine(N, DCI);
16563 case ISD::FCANONICALIZE:
16564 return performFCanonicalizeCombine(N, DCI);
16565 case AMDGPUISD::RCP:
16566 return performRcpCombine(N, DCI);
16567 case ISD::FLDEXP:
16568 case AMDGPUISD::FRACT:
16569 case AMDGPUISD::RSQ:
16570 case AMDGPUISD::RCP_LEGACY:
16571 case AMDGPUISD::RCP_IFLAG:
16572 case AMDGPUISD::RSQ_CLAMP: {
16573 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
16574 SDValue Src = N->getOperand(0);
16575 if (Src.isUndef())
16576 return Src;
16577 break;
16578 }
16579 case ISD::SINT_TO_FP:
16580 case ISD::UINT_TO_FP:
16581 return performUCharToFloatCombine(N, DCI);
16582 case ISD::FCOPYSIGN:
16583 return performFCopySignCombine(N, DCI);
16584 case AMDGPUISD::CVT_F32_UBYTE0:
16585 case AMDGPUISD::CVT_F32_UBYTE1:
16586 case AMDGPUISD::CVT_F32_UBYTE2:
16587 case AMDGPUISD::CVT_F32_UBYTE3:
16588 return performCvtF32UByteNCombine(N, DCI);
16589 case AMDGPUISD::FMED3:
16590 return performFMed3Combine(N, DCI);
16591 case AMDGPUISD::CVT_PKRTZ_F16_F32:
16592 return performCvtPkRTZCombine(N, DCI);
16593 case AMDGPUISD::CLAMP:
16594 return performClampCombine(N, DCI);
16595 case ISD::SCALAR_TO_VECTOR: {
16596 SelectionDAG &DAG = DCI.DAG;
16597 EVT VT = N->getValueType(0);
16598
16599 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
16600 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
16601 SDLoc SL(N);
16602 SDValue Src = N->getOperand(0);
16603 EVT EltVT = Src.getValueType();
16604 if (EltVT != MVT::i16)
16605 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
16606
16607 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
16608 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
16609 }
16610
16611 break;
16612 }
16613 case ISD::EXTRACT_VECTOR_ELT:
16614 return performExtractVectorEltCombine(N, DCI);
16615 case ISD::INSERT_VECTOR_ELT:
16616 return performInsertVectorEltCombine(N, DCI);
16617 case ISD::FP_ROUND:
16618 return performFPRoundCombine(N, DCI);
16619 case ISD::LOAD: {
16620 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
16621 return Widened;
16622 [[fallthrough]];
16623 }
16624 default: {
16625 if (!DCI.isBeforeLegalize()) {
16626 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
16627 return performMemSDNodeCombine(MemNode, DCI);
16628 }
16629
16630 break;
16631 }
16632 }
16633
16634 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
16635}
16636
16637/// Helper function for adjustWritemask
16638static unsigned SubIdx2Lane(unsigned Idx) {
16639 switch (Idx) {
16640 default:
16641 return ~0u;
16642 case AMDGPU::sub0:
16643 return 0;
16644 case AMDGPU::sub1:
16645 return 1;
16646 case AMDGPU::sub2:
16647 return 2;
16648 case AMDGPU::sub3:
16649 return 3;
16650 case AMDGPU::sub4:
16651 return 4; // Possible with TFE/LWE
16652 }
16653}
16654
16655/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
16656SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
16657 SelectionDAG &DAG) const {
16658 unsigned Opcode = Node->getMachineOpcode();
16659
16660 // Subtract 1 because the vdata output is not a MachineSDNode operand.
16661 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
16662 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
16663 return Node; // not implemented for D16
16664
16665 SDNode *Users[5] = {nullptr};
16666 unsigned Lane = 0;
16667 unsigned DmaskIdx =
16668 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
16669 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
16670 unsigned NewDmask = 0;
16671 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
16672 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
16673 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
16674 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
16675 unsigned TFCLane = 0;
16676 bool HasChain = Node->getNumValues() > 1;
16677
16678 if (OldDmask == 0) {
16679 // These are folded out, but on the chance it happens don't assert.
16680 return Node;
16681 }
16682
16683 unsigned OldBitsSet = llvm::popcount(OldDmask);
16684 // Work out which is the TFE/LWE lane if that is enabled.
16685 if (UsesTFC) {
16686 TFCLane = OldBitsSet;
16687 }
16688
16689 // Try to figure out the used register components
16690 for (SDUse &Use : Node->uses()) {
16691
16692 // Don't look at users of the chain.
16693 if (Use.getResNo() != 0)
16694 continue;
16695
16696 SDNode *User = Use.getUser();
16697
16698 // Abort if we can't understand the usage
16699 if (!User->isMachineOpcode() ||
16700 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
16701 return Node;
16702
16703 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
16704 // Note that subregs are packed, i.e. Lane==0 is the first bit set
16705 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
16706 // set, etc.
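// Illustrative example: with OldDmask = 0b1011, lanes 0, 1 and 2 of the
// result correspond to dmask components 0, 1 and 3 (X, Y and W); the loop
// below recovers that component index (Comp) from the lane number.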
16707 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
16708 if (Lane == ~0u)
16709 return Node;
16710
16711 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
16712 if (UsesTFC && Lane == TFCLane) {
16713 Users[Lane] = User;
16714 } else {
16715 // Set which texture component corresponds to the lane.
16716 unsigned Comp;
16717 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
16718 Comp = llvm::countr_zero(Dmask);
16719 Dmask &= ~(1 << Comp);
16720 }
16721
16722 // Abort if we have more than one user per component.
16723 if (Users[Lane])
16724 return Node;
16725
16726 Users[Lane] = User;
16727 NewDmask |= 1 << Comp;
16728 }
16729 }
16730
16731 // Don't allow 0 dmask, as hardware assumes one channel enabled.
16732 bool NoChannels = !NewDmask;
16733 if (NoChannels) {
16734 if (!UsesTFC) {
16735 // No uses of the result and not using TFC. Then do nothing.
16736 return Node;
16737 }
16738 // If the original dmask has one channel - then nothing to do
16739 if (OldBitsSet == 1)
16740 return Node;
16741 // Use an arbitrary dmask - required for the instruction to work
16742 NewDmask = 1;
16743 }
16744 // Abort if there's no change
16745 if (NewDmask == OldDmask)
16746 return Node;
16747
16748 unsigned BitsSet = llvm::popcount(NewDmask);
16749
16750 // Check for TFE or LWE - increase the number of channels by one to account
16751 // for the extra return value
16752 // This will need adjustment for D16 if this is also included in
16753 // adjustWriteMask (this function), but at present D16 is excluded.
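// Illustrative example: a dmask with two bits set plus TFE gives
// NewChannels = 3: two data dwords plus one extra dword for the TFE/LWE
// status write.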
16754 unsigned NewChannels = BitsSet + UsesTFC;
16755
16756 int NewOpcode =
16757 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
16758 assert(NewOpcode != -1 &&
16759 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
16760 "failed to find equivalent MIMG op");
16761
16762 // Adjust the writemask in the node
16763 SmallVector<SDValue, 12> Ops;
16764 llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
16765 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
16766 llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));
16767
16768 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
16769
16770 MVT ResultVT = NewChannels == 1
16771 ? SVT
16772 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
16773 : NewChannels == 5 ? 8
16774 : NewChannels);
16775 SDVTList NewVTList =
16776 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
16777
16778 MachineSDNode *NewNode =
16779 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
16780
16781 if (HasChain) {
16782 // Update chain.
16783 DAG.setNodeMemRefs(NewNode, Node->memoperands());
16784 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
16785 }
16786
16787 if (NewChannels == 1) {
16788 assert(Node->hasNUsesOfValue(1, 0));
16789 SDNode *Copy =
16790 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
16791 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
16792 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
16793 return nullptr;
16794 }
16795
16796 // Update the users of the node with the new indices
16797 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
16798 SDNode *User = Users[i];
16799 if (!User) {
16800 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
16801 // Users[0] is still nullptr because channel 0 doesn't really have a use.
16802 if (i || !NoChannels)
16803 continue;
16804 } else {
16805 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
16806 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
16807 if (NewUser != User) {
16808 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
16809 DAG.RemoveDeadNode(User);
16810 }
16811 }
16812
16813 switch (Idx) {
16814 default:
16815 break;
16816 case AMDGPU::sub0:
16817 Idx = AMDGPU::sub1;
16818 break;
16819 case AMDGPU::sub1:
16820 Idx = AMDGPU::sub2;
16821 break;
16822 case AMDGPU::sub2:
16823 Idx = AMDGPU::sub3;
16824 break;
16825 case AMDGPU::sub3:
16826 Idx = AMDGPU::sub4;
16827 break;
16828 }
16829 }
16830
16831 DAG.RemoveDeadNode(Node);
16832 return nullptr;
16833}
16834
16835static bool isFrameIndexOp(SDValue Op) {
16836 if (Op.getOpcode() == ISD::AssertZext)
16837 Op = Op.getOperand(0);
16838
16839 return isa<FrameIndexSDNode>(Op);
16840}
16841
16842/// Legalize target independent instructions (e.g. INSERT_SUBREG)
16843/// with frame index operands.
16844/// LLVM assumes that the inputs to these instructions are registers.
16845SDNode *
16846SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
16847 SelectionDAG &DAG) const {
16848 if (Node->getOpcode() == ISD::CopyToReg) {
16849 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
16850 SDValue SrcVal = Node->getOperand(2);
16851
16852 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
16853 // to try understanding copies to physical registers.
16854 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
16855 SDLoc SL(Node);
16856 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
16857 SDValue VReg = DAG.getRegister(
16858 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
16859
16860 SDNode *Glued = Node->getGluedNode();
16861 SDValue ToVReg = DAG.getCopyToReg(
16862 Node->getOperand(0), SL, VReg, SrcVal,
16863 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
16864 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
16865 VReg, ToVReg.getValue(1));
16866 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
16867 DAG.RemoveDeadNode(Node);
16868 return ToResultReg.getNode();
16869 }
16870 }
16871
16872 SmallVector<SDValue, 8> Ops;
16873 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
16874 if (!isFrameIndexOp(Node->getOperand(i))) {
16875 Ops.push_back(Node->getOperand(i));
16876 continue;
16877 }
16878
16879 SDLoc DL(Node);
16880 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
16881 Node->getOperand(i).getValueType(),
16882 Node->getOperand(i)),
16883 0));
16884 }
16885
16886 return DAG.UpdateNodeOperands(Node, Ops);
16887}
16888
16889/// Fold the instructions after selecting them.
16890/// Returns null if users were already updated.
16891SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
16892 SelectionDAG &DAG) const {
16893 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16894 unsigned Opcode = Node->getMachineOpcode();
16895
16896 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
16897 !TII->isGather4(Opcode) &&
16898 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
16899 return adjustWritemask(Node, DAG);
16900 }
16901
16902 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
16903 legalizeTargetIndependentNode(Node, DAG);
16904 return Node;
16905 }
16906
16907 switch (Opcode) {
16908 case AMDGPU::V_DIV_SCALE_F32_e64:
16909 case AMDGPU::V_DIV_SCALE_F64_e64: {
16910 // Satisfy the operand register constraint when one of the inputs is
16911 // undefined. Ordinarily each undef value will have its own implicit_def of
16912 // a vreg, so force these to use a single register.
16913 SDValue Src0 = Node->getOperand(1);
16914 SDValue Src1 = Node->getOperand(3);
16915 SDValue Src2 = Node->getOperand(5);
16916
16917 if ((Src0.isMachineOpcode() &&
16918 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
16919 (Src0 == Src1 || Src0 == Src2))
16920 break;
16921
16922 MVT VT = Src0.getValueType().getSimpleVT();
16923 const TargetRegisterClass *RC =
16924 getRegClassFor(VT, Src0.getNode()->isDivergent());
16925
16926 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
16927 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
16928
16929 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
16930 Src0, SDValue());
16931
16932 // src0 must be the same register as src1 or src2, even if the value is
16933 // undefined, so make sure we don't violate this constraint.
16934 if (Src0.isMachineOpcode() &&
16935 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
16936 if (Src1.isMachineOpcode() &&
16937 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
16938 Src0 = Src1;
16939 else if (Src2.isMachineOpcode() &&
16940 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
16941 Src0 = Src2;
16942 else {
16943 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
16944 Src0 = UndefReg;
16945 Src1 = UndefReg;
16946 }
16947 } else
16948 break;
16949
16950 SmallVector<SDValue, 9> Ops(Node->ops());
16951 Ops[1] = Src0;
16952 Ops[3] = Src1;
16953 Ops[5] = Src2;
16954 Ops.push_back(ImpDef.getValue(1));
16955 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
16956 }
16957 default:
16958 break;
16959 }
16960
16961 return Node;
16962}
16963
16964// Any MIMG instructions that use tfe or lwe require an initialization of the
16965// result register that will be written in the case of a memory access failure.
16966// The required code is also added to tie this init code to the result of the
16967// img instruction.
16968void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
16969 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16970 const SIRegisterInfo &TRI = TII->getRegisterInfo();
16971 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
16972 MachineBasicBlock &MBB = *MI.getParent();
16973
16974 int DstIdx =
16975 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
16976 unsigned InitIdx = 0;
16977
16978 if (TII->isImage(MI)) {
16979 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
16980 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
16981 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
16982
16983 if (!TFE && !LWE) // intersect_ray
16984 return;
16985
16986 unsigned TFEVal = TFE ? TFE->getImm() : 0;
16987 unsigned LWEVal = LWE ? LWE->getImm() : 0;
16988 unsigned D16Val = D16 ? D16->getImm() : 0;
16989
16990 if (!TFEVal && !LWEVal)
16991 return;
16992
16993 // At least one of TFE or LWE are non-zero
16994 // We have to insert a suitable initialization of the result value and
16995 // tie this to the dest of the image instruction.
16996
16997 // Calculate which dword we have to initialize to 0.
16998 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
16999
17000 // check that dmask operand is found.
17001 assert(MO_Dmask && "Expected dmask operand in instruction");
17002
17003 unsigned dmask = MO_Dmask->getImm();
17004 // Determine the number of active lanes taking into account the
17005 // Gather4 special case
17006 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
17007
17008 bool Packed = !Subtarget->hasUnpackedD16VMem();
17009
17010 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
17011
17012 // Abandon attempt if the dst size isn't large enough
17013 // - this is in fact an error but this is picked up elsewhere and
17014 // reported correctly.
17015 uint32_t DstSize =
17016 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17017 if (DstSize < InitIdx)
17018 return;
17019 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
17020 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17021 } else {
17022 return;
17023 }
17024
17025 const DebugLoc &DL = MI.getDebugLoc();
17026
17027 // Create a register for the initialization value.
17028 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
17029 unsigned NewDst = 0; // Final initialized value will be in here
17030
17031 // If PRTStrictNull feature is enabled (the default) then initialize
17032 // all the result registers to 0, otherwise just the error indication
17033 // register (VGPRn+1)
17034 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17035 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
17036
17037 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
17038 for (; SizeLeft; SizeLeft--, CurrIdx++) {
17039 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
17040 // Initialize dword
17041 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
17042 // clang-format off
17043 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
17044 .addImm(0);
17045 // clang-format on
17046 // Insert into the super-reg
17047 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
17048 .addReg(PrevDst)
17049 .addReg(SubReg)
17050 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
17051
17052 PrevDst = NewDst;
17053 }
17054
17055 // Add as an implicit operand
17056 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
17057
17058 // Tie the just added implicit operand to the dst
17059 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
17060}
17061
17062/// Assign the register class depending on the number of
17063/// bits set in the writemask
17064void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
17065 SDNode *Node) const {
17066 const SIInstrInfo *TII = Subtarget->getInstrInfo();
17067
17068 MachineFunction *MF = MI.getParent()->getParent();
17069 MachineRegisterInfo &MRI = MF->getRegInfo();
17070 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
17071
17072 if (TII->isVOP3(MI.getOpcode())) {
17073 // Make sure constant bus requirements are respected.
17074 TII->legalizeOperandsVOP3(MRI, MI);
17075
17076 // Prefer VGPRs over AGPRs in mAI instructions where possible.
17077 // This saves a chain-copy of registers and better balance register
17078 // use between vgpr and agpr as agpr tuples tend to be big.
17079 if (!MI.getDesc().operands().empty()) {
17080 unsigned Opc = MI.getOpcode();
17081 bool HasAGPRs = Info->mayNeedAGPRs();
17082 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17083 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
17084 for (auto I :
17085 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
17086 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
17087 if (I == -1)
17088 break;
17089 if ((I == Src2Idx) && (HasAGPRs))
17090 break;
17091 MachineOperand &Op = MI.getOperand(I);
17092 if (!Op.isReg() || !Op.getReg().isVirtual())
17093 continue;
17094 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
17095 if (!TRI->hasAGPRs(RC))
17096 continue;
17097 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
17098 if (!Src || !Src->isCopy() ||
17099 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
17100 continue;
17101 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
17102 // All uses of agpr64 and agpr32 can also accept vgpr except for
17103 // v_accvgpr_read, but we do not produce agpr reads during selection,
17104 // so no use checks are needed.
17105 MRI.setRegClass(Op.getReg(), NewRC);
17106 }
17107
17108 if (TII->isMAI(MI)) {
17109 // The ordinary src0, src1, src2 were legalized above.
17110 //
17111 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
17112 // as a separate instruction.
17113 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17114 AMDGPU::OpName::scale_src0);
17115 if (Src0Idx != -1) {
17116 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17117 AMDGPU::OpName::scale_src1);
17118 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17119 TII->usesConstantBus(MRI, MI, Src1Idx))
17120 TII->legalizeOpWithMove(MI, Src1Idx);
17121 }
17122 }
17123
17124 if (!HasAGPRs)
17125 return;
17126
17127 // Resolve the rest of AV operands to AGPRs.
17128 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
17129 if (Src2->isReg() && Src2->getReg().isVirtual()) {
17130 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
17131 if (TRI->isVectorSuperClass(RC)) {
17132 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
17133 MRI.setRegClass(Src2->getReg(), NewRC);
17134 if (Src2->isTied())
17135 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
17136 }
17137 }
17138 }
17139 }
17140
17141 return;
17142 }
17143
17144 if (TII->isImage(MI))
17145 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
17146}
17147
17148static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
17149 uint64_t Val) {
17150 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
17151 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
17152}
17153
17154MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
17155 const SDLoc &DL,
17156 SDValue Ptr) const {
17157 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17158
17159 // Build the half of the subregister with the constants before building the
17160 // full 128-bit register. If we are building multiple resource descriptors,
17161 // this will allow CSEing of the 2-component register.
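// Resulting v4i32 layout as assembled below (illustrative summary): dwords
// 0-1 hold the 64-bit pointer, dword 2 is 0, and dword 3 is the high half of
// getDefaultRsrcDataFormat(); only the constant upper half is shared.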
17162 const SDValue Ops0[] = {
17163 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
17164 buildSMovImm32(DAG, DL, 0),
17165 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17166 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
17167 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
17168
17169 SDValue SubRegHi = SDValue(
17170 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
17171
17172 // Combine the constants and the pointer.
17173 const SDValue Ops1[] = {
17174 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
17175 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
17176 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
17177
17178 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
17179}
17180
17181/// Return a resource descriptor with the 'Add TID' bit enabled
17182/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
17183/// of the resource descriptor) to create an offset, which is added to
17184/// the resource pointer.
17185MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
17186 SDValue Ptr, uint32_t RsrcDword1,
17187 uint64_t RsrcDword2And3) const {
17188 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
17189 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
17190 if (RsrcDword1) {
17191 PtrHi =
17192 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
17193 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
17194 0);
17195 }
17196
17197 SDValue DataLo =
17198 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
17199 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
17200
17201 const SDValue Ops[] = {
17202 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
17203 PtrLo,
17204 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17205 PtrHi,
17206 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
17207 DataLo,
17208 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
17209 DataHi,
17210 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
17211
17212 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
17213}
17214
17215//===----------------------------------------------------------------------===//
17216// SI Inline Assembly Support
17217//===----------------------------------------------------------------------===//
17218
17219std::pair<unsigned, const TargetRegisterClass *>
17220SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
17221 StringRef Constraint,
17222 MVT VT) const {
17223 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
17224
17225 const TargetRegisterClass *RC = nullptr;
17226 if (Constraint.size() == 1) {
17227 // Check if we cannot determine the bit size of the given value type. This
17228 // can happen, for example, in this situation where we have an empty struct
17229 // (size 0): `call void asm "", "v"({} poison)`-
17230 if (VT == MVT::Other)
17231 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17232 const unsigned BitWidth = VT.getSizeInBits();
17233 switch (Constraint[0]) {
17234 default:
17235 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17236 case 's':
17237 case 'r':
17238 switch (BitWidth) {
17239 case 16:
17240 RC = &AMDGPU::SReg_32RegClass;
17241 break;
17242 case 64:
17243 RC = &AMDGPU::SGPR_64RegClass;
17244 break;
17245 default:
17246 RC = TRI->getSGPRClassForBitWidth(BitWidth);
17247 if (!RC)
17248 return std::pair(0U, nullptr);
17249 break;
17250 }
17251 break;
17252 case 'v':
17253 switch (BitWidth) {
17254 case 16:
17255 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17256 : &AMDGPU::VGPR_32_Lo256RegClass;
17257 break;
17258 default:
17259 RC = Subtarget->has1024AddressableVGPRs()
17260 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17261 : TRI->getVGPRClassForBitWidth(BitWidth);
17262 if (!RC)
17263 return std::pair(0U, nullptr);
17264 break;
17265 }
17266 break;
17267 case 'a':
17268 if (!Subtarget->hasMAIInsts())
17269 break;
17270 switch (BitWidth) {
17271 case 16:
17272 RC = &AMDGPU::AGPR_32RegClass;
17273 break;
17274 default:
17275 RC = TRI->getAGPRClassForBitWidth(BitWidth);
17276 if (!RC)
17277 return std::pair(0U, nullptr);
17278 break;
17279 }
17280 break;
17281 }
17282 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17283 const unsigned BitWidth = VT.getSizeInBits();
17284 switch (BitWidth) {
17285 case 16:
17286 RC = &AMDGPU::AV_32RegClass;
17287 break;
17288 default:
17289 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17290 if (!RC)
17291 return std::pair(0U, nullptr);
17292 break;
17293 }
17294 }
17295
17296 // We actually support i128, i16 and f16 as inline parameters
17297 // even if they are not reported as legal
17298 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
17299 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
17300 return std::pair(0U, RC);
17301
17302 auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
17303 if (Kind != '\0') {
17304 if (Kind == 'v') {
17305 RC = &AMDGPU::VGPR_32_Lo256RegClass;
17306 } else if (Kind == 's') {
17307 RC = &AMDGPU::SGPR_32RegClass;
17308 } else if (Kind == 'a') {
17309 RC = &AMDGPU::AGPR_32RegClass;
17310 }
17311
17312 if (RC) {
17313 if (NumRegs > 1) {
17314 if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
17315 return std::pair(0U, nullptr);
17316
17317 uint32_t Width = NumRegs * 32;
17318 // Prohibit constraints for register ranges with a width that does not
17319 // match the required type.
17320 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
17321 return std::pair(0U, nullptr);
17322
17323 MCRegister Reg = RC->getRegister(Idx);
17324 if (SIRegisterInfo::isVGPRClass(RC))
17325 RC = TRI->getVGPRClassForBitWidth(Width);
17326 else if (SIRegisterInfo::isSGPRClass(RC))
17327 RC = TRI->getSGPRClassForBitWidth(Width);
17328 else if (SIRegisterInfo::isAGPRClass(RC))
17329 RC = TRI->getAGPRClassForBitWidth(Width);
17330 if (RC) {
17331 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17332 if (!Reg) {
17333 // The register class does not contain the requested register,
17334 // e.g., because it is an SGPR pair that would violate alignment
17335 // requirements.
17336 return std::pair(0U, nullptr);
17337 }
17338 return std::pair(Reg, RC);
17339 }
17340 }
17341
17342 // Check for lossy scalar/vector conversions.
17343 if (VT.isVector() && VT.getSizeInBits() != 32)
17344 return std::pair(0U, nullptr);
17345 if (Idx < RC->getNumRegs())
17346 return std::pair(RC->getRegister(Idx), RC);
17347 return std::pair(0U, nullptr);
17348 }
17349 }
17350
17351 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17352 if (Ret.first)
17353 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
17354
17355 return Ret;
17356}
17357
17358static bool isImmConstraint(StringRef Constraint) {
17359 if (Constraint.size() == 1) {
17360 switch (Constraint[0]) {
17361 default:
17362 break;
17363 case 'I':
17364 case 'J':
17365 case 'A':
17366 case 'B':
17367 case 'C':
17368 return true;
17369 }
17370 } else if (Constraint == "DA" || Constraint == "DB") {
17371 return true;
17372 }
17373 return false;
17374}
17375
17376SITargetLowering::ConstraintType
17377SITargetLowering::getConstraintType(StringRef Constraint) const {
17378 if (Constraint.size() == 1) {
17379 switch (Constraint[0]) {
17380 default:
17381 break;
17382 case 's':
17383 case 'v':
17384 case 'a':
17385 return C_RegisterClass;
17386 }
17387 } else if (Constraint.size() == 2) {
17388 if (Constraint == "VA")
17389 return C_RegisterClass;
17390 }
17391 if (isImmConstraint(Constraint)) {
17392 return C_Other;
17393 }
17394 return TargetLowering::getConstraintType(Constraint);
17395}
17396
17397static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
17399 Val = Val & maskTrailingOnes<uint64_t>(Size);
17400 }
17401 return Val;
17402}
17403
17404void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
17405 StringRef Constraint,
17406 std::vector<SDValue> &Ops,
17407 SelectionDAG &DAG) const {
17408 if (isImmConstraint(Constraint)) {
17409 uint64_t Val;
17410 if (getAsmOperandConstVal(Op, Val) &&
17411 checkAsmConstraintVal(Op, Constraint, Val)) {
17412 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
17413 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
17414 }
17415 } else {
17416 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
17417 }
17418}
17419
17420bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
17421 unsigned Size = Op.getScalarValueSizeInBits();
17422 if (Size > 64)
17423 return false;
17424
17425 if (Size == 16 && !Subtarget->has16BitInsts())
17426 return false;
17427
17428 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
17429 Val = C->getSExtValue();
17430 return true;
17431 }
17432 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
17433 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17434 return true;
17435 }
17436 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
17437 if (Size != 16 || Op.getNumOperands() != 2)
17438 return false;
17439 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17440 return false;
17441 if (ConstantSDNode *C = V->getConstantSplatNode()) {
17442 Val = C->getSExtValue();
17443 return true;
17444 }
17445 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
17446 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17447 return true;
17448 }
17449 }
17450
17451 return false;
17452}
17453
17454bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
17455 uint64_t Val) const {
17456 if (Constraint.size() == 1) {
17457 switch (Constraint[0]) {
17458 case 'I':
17459 return AMDGPU::isInlinableIntLiteral(Val);
17460 case 'J':
17461 return isInt<16>(Val);
17462 case 'A':
17463 return checkAsmConstraintValA(Op, Val);
17464 case 'B':
17465 return isInt<32>(Val);
17466 case 'C':
17467 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
17468 AMDGPU::isInlinableIntLiteral(Val);
17469 default:
17470 break;
17471 }
17472 } else if (Constraint.size() == 2) {
17473 if (Constraint == "DA") {
17474 int64_t HiBits = static_cast<int32_t>(Val >> 32);
17475 int64_t LoBits = static_cast<int32_t>(Val);
17476 return checkAsmConstraintValA(Op, HiBits, 32) &&
17477 checkAsmConstraintValA(Op, LoBits, 32);
17478 }
17479 if (Constraint == "DB") {
17480 return true;
17481 }
17482 }
17483 llvm_unreachable("Invalid asm constraint");
17484}
17485
17486bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
17487 unsigned MaxSize) const {
17488 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
17489 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
17490 if (Size == 16) {
17491 MVT VT = Op.getSimpleValueType();
17492 switch (VT.SimpleTy) {
17493 default:
17494 return false;
17495 case MVT::i16:
17496 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
17497 case MVT::f16:
17498 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
17499 case MVT::bf16:
17500 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
17501 case MVT::v2i16:
17502 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
17503 case MVT::v2f16:
17504 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
17505 case MVT::v2bf16:
17506 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
17507 }
17508 }
17509 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
17510 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
17511 return true;
17512 return false;
17513}
17514
17515static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
17516 switch (UnalignedClassID) {
17517 case AMDGPU::VReg_64RegClassID:
17518 return AMDGPU::VReg_64_Align2RegClassID;
17519 case AMDGPU::VReg_96RegClassID:
17520 return AMDGPU::VReg_96_Align2RegClassID;
17521 case AMDGPU::VReg_128RegClassID:
17522 return AMDGPU::VReg_128_Align2RegClassID;
17523 case AMDGPU::VReg_160RegClassID:
17524 return AMDGPU::VReg_160_Align2RegClassID;
17525 case AMDGPU::VReg_192RegClassID:
17526 return AMDGPU::VReg_192_Align2RegClassID;
17527 case AMDGPU::VReg_224RegClassID:
17528 return AMDGPU::VReg_224_Align2RegClassID;
17529 case AMDGPU::VReg_256RegClassID:
17530 return AMDGPU::VReg_256_Align2RegClassID;
17531 case AMDGPU::VReg_288RegClassID:
17532 return AMDGPU::VReg_288_Align2RegClassID;
17533 case AMDGPU::VReg_320RegClassID:
17534 return AMDGPU::VReg_320_Align2RegClassID;
17535 case AMDGPU::VReg_352RegClassID:
17536 return AMDGPU::VReg_352_Align2RegClassID;
17537 case AMDGPU::VReg_384RegClassID:
17538 return AMDGPU::VReg_384_Align2RegClassID;
17539 case AMDGPU::VReg_512RegClassID:
17540 return AMDGPU::VReg_512_Align2RegClassID;
17541 case AMDGPU::VReg_1024RegClassID:
17542 return AMDGPU::VReg_1024_Align2RegClassID;
17543 case AMDGPU::AReg_64RegClassID:
17544 return AMDGPU::AReg_64_Align2RegClassID;
17545 case AMDGPU::AReg_96RegClassID:
17546 return AMDGPU::AReg_96_Align2RegClassID;
17547 case AMDGPU::AReg_128RegClassID:
17548 return AMDGPU::AReg_128_Align2RegClassID;
17549 case AMDGPU::AReg_160RegClassID:
17550 return AMDGPU::AReg_160_Align2RegClassID;
17551 case AMDGPU::AReg_192RegClassID:
17552 return AMDGPU::AReg_192_Align2RegClassID;
17553 case AMDGPU::AReg_256RegClassID:
17554 return AMDGPU::AReg_256_Align2RegClassID;
17555 case AMDGPU::AReg_512RegClassID:
17556 return AMDGPU::AReg_512_Align2RegClassID;
17557 case AMDGPU::AReg_1024RegClassID:
17558 return AMDGPU::AReg_1024_Align2RegClassID;
17559 default:
17560 return -1;
17561 }
17562}
17563
17564// Figure out which registers should be reserved for stack access. Only after
17565// the function is legalized do we know all of the non-spill stack objects or if
17566// calls are present.
17567void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
17568 MachineRegisterInfo &MRI = MF.getRegInfo();
17569 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
17570 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
17571 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17572 const SIInstrInfo *TII = ST.getInstrInfo();
17573
17574 if (Info->isEntryFunction()) {
17575 // Callable functions have fixed registers used for stack access.
17576 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
17577 }
17578
17579 // TODO: Move this logic to getReservedRegs()
17580 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
17581 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
17582 Register SReg = ST.isWave32()
17583 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
17584 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
17585 &AMDGPU::SGPR_64RegClass);
17586 Info->setSGPRForEXECCopy(SReg);
17587
17588 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
17589 Info->getStackPtrOffsetReg()));
17590 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
17591 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
17592
17593 // We need to worry about replacing the default register with itself in case
17594 // of MIR testcases missing the MFI.
17595 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
17596 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
17597
17598 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
17599 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
17600
17601 Info->limitOccupancy(MF);
17602
17603 if (ST.isWave32() && !MF.empty()) {
17604 for (auto &MBB : MF) {
17605 for (auto &MI : MBB) {
17606 TII->fixImplicitOperands(MI);
17607 }
17608 }
17609 }
17610
17611 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
17612 // classes if required. Ideally the register class constraints would differ
17613 // per-subtarget, but there's no easy way to achieve that right now. This is
17614 // not a problem for VGPRs because the correctly aligned VGPR class is implied
17615 // from using them as the register class for legal types.
17616 if (ST.needsAlignedVGPRs()) {
17617 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
17618 const Register Reg = Register::index2VirtReg(I);
17619 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
17620 if (!RC)
17621 continue;
17622 int NewClassID = getAlignedAGPRClassID(RC->getID());
17623 if (NewClassID != -1)
17624 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
17625 }
17626 }
17627
17628 TargetLoweringBase::finalizeLowering(MF);
17629}
17630
17631void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
17632 KnownBits &Known,
17633 const APInt &DemandedElts,
17634 const SelectionDAG &DAG,
17635 unsigned Depth) const {
17636 Known.resetAll();
17637 unsigned Opc = Op.getOpcode();
17638 switch (Opc) {
17639 case ISD::INTRINSIC_WO_CHAIN: {
17640 unsigned IID = Op.getConstantOperandVal(0);
17641 switch (IID) {
17642 case Intrinsic::amdgcn_mbcnt_lo:
17643 case Intrinsic::amdgcn_mbcnt_hi: {
17644 const GCNSubtarget &ST =
17645 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
17646 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
17647 // most 31 + src1.
17648 Known.Zero.setBitsFrom(
17649 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
17650 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
17651 Known = KnownBits::add(Known, Known2);
17652 return;
17653 }
17654 }
17655 break;
17656 }
17657 }
17658 return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
17659 Op, Known, DemandedElts, DAG, Depth);
17660}
17661
17662void SITargetLowering::computeKnownBitsForFrameIndex(
17663 const int FI, KnownBits &Known, const MachineFunction &MF) const {
17664 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
17665
17666 // Set the high bits to zero based on the maximum allowed scratch size per
17667 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
17668 // calculation won't overflow, so assume the sign bit is never set.
17669 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
17670}
17671
17672static void knownBitsForWorkitemID(const GCNSubtarget &ST,
17673 GISelValueTracking &VT, KnownBits &Known,
17674 unsigned Dim) {
17675 unsigned MaxValue =
17676 ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
17677 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
17678}
17679
17680static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT,
17681 KnownBits &Known, const APInt &DemandedElts,
17682 unsigned BFEWidth, bool SExt, unsigned Depth) {
17683 const MachineRegisterInfo &MRI = VT.getMachineFunction().getRegInfo();
17684 const MachineOperand &Src1 = MI.getOperand(2);
17685
17686 unsigned Src1Cst = 0;
17687 if (Src1.isImm()) {
17688 Src1Cst = Src1.getImm();
17689 } else if (Src1.isReg()) {
17690 auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
17691 if (!Cst)
17692 return;
17693 Src1Cst = Cst->Value.getZExtValue();
17694 } else {
17695 return;
17696 }
17697
17698 // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
17699 // Width is always [22:16].
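// Worked example (illustrative): for S_BFE_U32 with Src1 = 0x00080008 the
// offset is 8 and the width is 8, so the known bits of bits [15:8] of the
// source are extracted and then zero-extended back to 32 bits.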
17700 const unsigned Offset =
17701 Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
17702 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
17703
17704 if (Width >= BFEWidth) // Ill-formed.
17705 return;
17706
17707 VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
17708 Depth + 1);
17709
17710 Known = Known.extractBits(Width, Offset);
17711
17712 if (SExt)
17713 Known = Known.sext(BFEWidth);
17714 else
17715 Known = Known.zext(BFEWidth);
17716}
17717
17718void SITargetLowering::computeKnownBitsForTargetInstr(
17719 GISelValueTracking &VT, Register R, KnownBits &Known,
17720 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
17721 unsigned Depth) const {
17722 Known.resetAll();
17723 const MachineInstr *MI = MRI.getVRegDef(R);
17724 switch (MI->getOpcode()) {
17725 case AMDGPU::S_BFE_I32:
17726 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
17727 /*SExt=*/true, Depth);
17728 case AMDGPU::S_BFE_U32:
17729 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
17730 /*SExt=*/false, Depth);
17731 case AMDGPU::S_BFE_I64:
17732 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
17733 /*SExt=*/true, Depth);
17734 case AMDGPU::S_BFE_U64:
17735 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
17736 /*SExt=*/false, Depth);
17737 case AMDGPU::G_INTRINSIC:
17738 case AMDGPU::G_INTRINSIC_CONVERGENT: {
17739 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
17740 switch (IID) {
17741 case Intrinsic::amdgcn_workitem_id_x:
17742 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
17743 break;
17744 case Intrinsic::amdgcn_workitem_id_y:
17745 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
17746 break;
17747 case Intrinsic::amdgcn_workitem_id_z:
17748 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
17749 break;
17750 case Intrinsic::amdgcn_mbcnt_lo:
17751 case Intrinsic::amdgcn_mbcnt_hi: {
17752 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
17753 // most 31 + src1.
17754 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
17755 ? getSubtarget()->getWavefrontSizeLog2()
17756 : 5);
17757 KnownBits Known2;
17758 VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
17759 Depth + 1);
17760 Known = KnownBits::add(Known, Known2);
17761 break;
17762 }
17763 case Intrinsic::amdgcn_groupstaticsize: {
17764 // We can report everything over the maximum size as 0. We can't report
17765 // based on the actual size because we don't know if it's accurate or not
17766 // at any given point.
17767 Known.Zero.setHighBits(
17768 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
17769 break;
17770 }
17771 }
17772 break;
17773 }
17774 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
17775 Known.Zero.setHighBits(24);
17776 break;
17777 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
17778 Known.Zero.setHighBits(16);
17779 break;
17780 case AMDGPU::G_AMDGPU_SMED3:
17781 case AMDGPU::G_AMDGPU_UMED3: {
17782 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
17783
17784 KnownBits Known2;
17785 VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
17786 if (Known2.isUnknown())
17787 break;
17788
17789 KnownBits Known1;
17790 VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
17791 if (Known1.isUnknown())
17792 break;
17793
17794 KnownBits Known0;
17795 VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
17796 if (Known0.isUnknown())
17797 break;
17798
17799 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
17800 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
17801 Known.One = Known0.One & Known1.One & Known2.One;
17802 break;
17803 }
17804 }
17805}
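The med3 case above relies on the fact that the median of three values is always one of those values, so any bit known identically in all three inputs is known in the result. A minimal sketch of that rule (helper name illustrative):

#include "llvm/Support/KnownBits.h"

static llvm::KnownBits med3KnownBits(const llvm::KnownBits &K0,
                                     const llvm::KnownBits &K1,
                                     const llvm::KnownBits &K2) {
  llvm::KnownBits Result(K0.getBitWidth());
  Result.Zero = K0.Zero & K1.Zero & K2.Zero; // zero only where all agree
  Result.One = K0.One & K1.One & K2.One;     // one only where all agree
  return Result;
}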
17806
17809 unsigned Depth) const {
17810 const MachineInstr *MI = MRI.getVRegDef(R);
17811 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
17812 // FIXME: Can this move to generic code? What about the case where the call
17813 // site specifies a lower alignment?
17814 Intrinsic::ID IID = GI->getIntrinsicID();
17816 AttributeList Attrs =
17817 Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
17818 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
17819 return *RetAlign;
17820 }
17821 return Align(1);
17822}
17823
17824 Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
17825 const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
17826 const Align CacheLineAlign = Align(64);
17827
17828 // Pre-GFX10 targets did not benefit from loop alignment.
17829 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
17830 getSubtarget()->hasInstFwdPrefetchBug())
17831 return PrefAlign;
17832
17833 // On GFX10 the I$ consists of 4 x 64-byte cache lines.
17834 // By default the prefetcher keeps one cache line behind and reads two ahead.
17835 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
17836 // behind and one ahead.
17837 // Therefore we can benefit from aligning loop headers if the loop fits in 192
17838 // bytes. If the loop fits in 64 bytes it always spans no more than two cache
17839 // lines and does not need an alignment.
17840 // Otherwise, if the loop is at most 128 bytes, we do not need to modify the
17841 // prefetch settings; if it is at most 192 bytes, we need two lines behind.
17842
17844 const MachineBasicBlock *Header = ML->getHeader();
17845 if (Header->getAlignment() != PrefAlign)
17846 return Header->getAlignment(); // Already processed.
17847
17848 unsigned LoopSize = 0;
17849 for (const MachineBasicBlock *MBB : ML->blocks()) {
17850 // If an inner loop block is aligned, assume that on average half of the
17851 // alignment size is added as nops.
17852 if (MBB != Header)
17853 LoopSize += MBB->getAlignment().value() / 2;
17854
17855 for (const MachineInstr &MI : *MBB) {
17856 LoopSize += TII->getInstSizeInBytes(MI);
17857 if (LoopSize > 192)
17858 return PrefAlign;
17859 }
17860 }
17861
17862 if (LoopSize <= 64)
17863 return PrefAlign;
17864
17865 if (LoopSize <= 128)
17866 return CacheLineAlign;
17867
17868 // If any of the parent loops is surrounded by prefetch instructions, do not
17869 // insert new ones for the inner loop, as that would reset the parent's settings.
17870 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
17871 if (MachineBasicBlock *Exit = P->getExitBlock()) {
17872 auto I = Exit->getFirstNonDebugInstr();
17873 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
17874 return CacheLineAlign;
17875 }
17876 }
17877
17878 MachineBasicBlock *Pre = ML->getLoopPreheader();
17879 MachineBasicBlock *Exit = ML->getExitBlock();
17880
17881 if (Pre && Exit) {
17882 auto PreTerm = Pre->getFirstTerminator();
17883 if (PreTerm == Pre->begin() ||
17884 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
17885 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
17886 .addImm(1); // prefetch 2 lines behind PC
17887
17888 auto ExitHead = Exit->getFirstNonDebugInstr();
17889 if (ExitHead == Exit->end() ||
17890 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
17891 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
17892 .addImm(2); // prefetch 1 line behind PC
17893 }
17894
17895 return CacheLineAlign;
17896}
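Condensing the size thresholds above into one place: loops up to 64 bytes keep the default alignment, loops up to 128 bytes only get cache-line alignment, and loops up to 192 bytes additionally get the S_INST_PREFETCH mode switch in the preheader and exit. A hedged sketch of just that decision; the out-parameter is illustrative, whereas the real code above emits the prefetch instructions directly.

#include "llvm/Support/Alignment.h"

static llvm::Align gfx10LoopAlignChoice(unsigned LoopSizeBytes,
                                        llvm::Align DefaultAlign,
                                        bool &WantsPrefetchSwitch) {
  const llvm::Align CacheLineAlign(64);
  WantsPrefetchSwitch = false;
  if (LoopSizeBytes <= 64 || LoopSizeBytes > 192)
    return DefaultAlign;      // small enough already, or too big to help
  if (LoopSizeBytes <= 128)
    return CacheLineAlign;    // alignment alone keeps the loop within 3 lines
  WantsPrefetchSwitch = true; // 129..192 bytes: also switch to 2-behind/1-ahead
  return CacheLineAlign;
}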
17897
17899static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
17900 assert(N->getOpcode() == ISD::CopyFromReg);
17901 do {
17902 // Follow the chain until we find an INLINEASM node.
17903 N = N->getOperand(0).getNode();
17904 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
17905 return true;
17906 } while (N->getOpcode() == ISD::CopyFromReg);
17907 return false;
17908}
17909
17912 UniformityInfo *UA) const {
17913 switch (N->getOpcode()) {
17914 case ISD::CopyFromReg: {
17915 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
17916 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
17917 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17918 Register Reg = R->getReg();
17919
17920 // FIXME: Why does this need to consider isLiveIn?
17921 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
17922 return !TRI->isSGPRReg(MRI, Reg);
17923
17924 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
17925 return UA->isDivergent(V);
17926
17928 return !TRI->isSGPRReg(MRI, Reg);
17929 }
17930 case ISD::LOAD: {
17931 const LoadSDNode *L = cast<LoadSDNode>(N);
17932 unsigned AS = L->getAddressSpace();
17933 // A flat load may access private memory.
17935 }
17936 case ISD::CALLSEQ_END:
17937 return true;
17939 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
17941 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
17960 // Target-specific read-modify-write atomics are sources of divergence.
17961 return true;
17962 default:
17963 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
17964 // Generic read-modify-write atomics are sources of divergence.
17965 return A->readMem() && A->writeMem();
17966 }
17967 return false;
17968 }
17969}
17970
17972 EVT VT) const {
17973 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
17974 case MVT::f32:
17976 case MVT::f64:
17977 case MVT::f16:
17979 default:
17980 return false;
17981 }
17982}
17983
17985 LLT Ty, const MachineFunction &MF) const {
17986 switch (Ty.getScalarSizeInBits()) {
17987 case 32:
17988 return !denormalModeIsFlushAllF32(MF);
17989 case 64:
17990 case 16:
17991 return !denormalModeIsFlushAllF64F16(MF);
17992 default:
17993 return false;
17994 }
17995}
17996
17998 const APInt &DemandedElts,
17999 const SelectionDAG &DAG,
18000 bool SNaN,
18001 unsigned Depth) const {
18002 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
18003 const MachineFunction &MF = DAG.getMachineFunction();
18005
18006 if (Info->getMode().DX10Clamp)
18007 return true; // Clamped to 0.
18008 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
18009 }
18010
18012 DAG, SNaN, Depth);
18013}
18014
18015// On older subtargets, global FP atomic instructions have a hardcoded FP mode
18016// and do not support FP32 denormals, and only support v2f16/f64 denormals.
18018 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
18019 return true;
18020
18022 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
18023 if (DenormMode == DenormalMode::getPreserveSign())
18024 return true;
18025
18026 // TODO: Remove this.
18027 return RMW->getFunction()
18028 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
18029 .getValueAsBool();
18030}
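For reference, the per-instruction opt-out checked first above is plain instruction metadata. A hedged IRBuilder-side sketch of producing such an atomic; the helper name is illustrative, and the surrounding function, pointer, and value are assumed.

#include "llvm/IR/IRBuilder.h"

static llvm::AtomicRMWInst *
createDenormalAgnosticFAdd(llvm::IRBuilder<> &Builder, llvm::Value *Ptr,
                           llvm::Value *Val) {
  llvm::AtomicRMWInst *RMW = Builder.CreateAtomicRMW(
      llvm::AtomicRMWInst::FAdd, Ptr, Val, llvm::Align(4),
      llvm::AtomicOrdering::SequentiallyConsistent);
  // Marks this single atomic as not caring about the denormal mode, which is
  // the first condition tested by atomicIgnoresDenormalModeOrFPModeIsFTZ.
  RMW->setMetadata("amdgpu.ignore.denormal.mode",
                   llvm::MDNode::get(RMW->getContext(), {}));
  return RMW;
}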
18031
18033 LLVMContext &Ctx = RMW->getContext();
18034 StringRef MemScope =
18035 Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");
18036
18037 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
18038 << "Hardware instruction generated for atomic "
18039 << RMW->getOperationName(RMW->getOperation())
18040 << " operation at memory scope " << MemScope;
18041}
18042
18043static bool isV2F16OrV2BF16(Type *Ty) {
18044 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
18045 Type *EltTy = VT->getElementType();
18046 return VT->getNumElements() == 2 &&
18047 (EltTy->isHalfTy() || EltTy->isBFloatTy());
18048 }
18049
18050 return false;
18051}
18052
18053static bool isV2F16(Type *Ty) {
18055 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
18056}
18057
18058static bool isV2BF16(Type *Ty) {
18060 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
18061}
18062
18063/// \return true if atomicrmw integer ops work for the type.
18064static bool isAtomicRMWLegalIntTy(Type *Ty) {
18065 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
18066 unsigned BW = IT->getBitWidth();
18067 return BW == 32 || BW == 64;
18068 }
18069
18070 return false;
18071}
18072
18073/// \return true if this atomicrmw xchg type can be selected.
18074static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
18075 Type *Ty = RMW->getType();
18076 if (isAtomicRMWLegalIntTy(Ty))
18077 return true;
18078
18079 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
18080 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
18081 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18082 return BW == 32 || BW == 64;
18083 }
18084
18085 if (Ty->isFloatTy() || Ty->isDoubleTy())
18086 return true;
18087
18089 return VT->getNumElements() == 2 &&
18090 VT->getElementType()->getPrimitiveSizeInBits() == 16;
18091 }
18092
18093 return false;
18094}
18095
18096/// \returns true if it's valid to emit a native instruction for \p RMW, based
18097/// on the properties of the target memory.
18098static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
18099 const AtomicRMWInst *RMW,
18100 bool HasSystemScope) {
18101 // The remote/fine-grained access logic is different from the integer
18102 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
18103 // fine-grained access does not work, even for a device local allocation.
18104 //
18105 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
18106 // allocations work.
18107 if (HasSystemScope) {
18109 RMW->hasMetadata("amdgpu.no.remote.memory"))
18110 return true;
18111 if (Subtarget.hasEmulatedSystemScopeAtomics())
18112 return true;
18114 return true;
18115
18116 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18117}
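The two metadata kinds consulted here are ordinary instruction annotations. A hedged sketch of the IR shape this predicate accepts, with illustrative operand names and types:

// Sketch only; values and names are illustrative.
//
//   %old = atomicrmw fadd ptr addrspace(1) %gptr, float %v seq_cst, align 4,
//            !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
//   !0 = !{}
//
// Without such annotations (and without the relevant subtarget features), the
// FP atomic is generally expanded rather than kept as a native instruction.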
18118
18119/// \return Action to perform on AtomicRMWInsts for integer operations.
18126
18127/// Return if a flat address space atomicrmw can access private memory.
18129 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18130 return !MD ||
18132}
18133
18141
18144 unsigned AS = RMW->getPointerAddressSpace();
18145 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
18147
18148 // 64-bit flat atomics that dynamically reside in private memory will silently
18149 // be dropped.
18150 //
18151 // Note that we will emit a new copy of the original atomic in the expansion,
18152 // which will be incrementally relegalized.
18153 const DataLayout &DL = RMW->getFunction()->getDataLayout();
18154 if (AS == AMDGPUAS::FLAT_ADDRESS &&
18155 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18158
18159 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
18161 ORE.emit([=]() {
18162 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
18163 });
18164 return Kind;
18165 };
18166
18167 auto SSID = RMW->getSyncScopeID();
18168 bool HasSystemScope =
18169 SSID == SyncScope::System ||
18170 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
18171
18172 auto Op = RMW->getOperation();
18173 switch (Op) {
18175 // PCIe supports add and xchg for system atomics.
18176 return isAtomicRMWLegalXChgTy(RMW)
18179 case AtomicRMWInst::Add:
18180 // PCIe supports add and xchg for system atomics.
18182 case AtomicRMWInst::Sub:
18183 case AtomicRMWInst::And:
18184 case AtomicRMWInst::Or:
18185 case AtomicRMWInst::Xor:
18186 case AtomicRMWInst::Max:
18187 case AtomicRMWInst::Min:
18194 if (Subtarget->hasEmulatedSystemScopeAtomics())
18196
18197 // On most subtargets, for atomicrmw operations other than add/xchg,
18198 // whether or not the instructions will behave correctly depends on where
18199 // the address physically resides and what interconnect is used in the
18200 // system configuration. On some targets the instruction will nop,
18201 // and in others synchronization will only occur at degraded device scope.
18202 //
18203 // If the allocation is known local to the device, the instructions should
18204 // work correctly.
18205 if (RMW->hasMetadata("amdgpu.no.remote.memory"))
18207
18208 // If fine-grained remote memory works at device scope, we don't need to
18209 // do anything.
18210 if (!HasSystemScope &&
18211 Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18213
18214 // If we are targeting a remote allocated address, it depends what kind of
18215 // allocation the address belongs to.
18216 //
18217 // If the allocation is fine-grained (in host memory, or in PCIe peer
18218 // device memory), the operation will fail depending on the target.
18219 //
18220 // Note fine-grained host memory access does work on APUs or if XGMI is
18221 // used, but we do not know if we are targeting an APU or the system
18222 // configuration from the ISA version/target-cpu.
18223 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
18225
18228 // Atomic sub/or/xor do not work over PCI express, but atomic add
18229 // does. InstCombine transforms these with 0 to or, so undo that.
18230 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
18231 ConstVal && ConstVal->isNullValue())
18233 }
18234
18235 // If the allocation could be in remote, fine-grained memory, the rmw
18236 // instructions may fail. cmpxchg should work, so emit that. On some
18237 // system configurations, PCIe atomics aren't supported so cmpxchg won't
18238 // even work, so you're out of luck anyway.
18239
18240 // In summary:
18241 //
18242 // Cases that may fail:
18243 // - fine-grained pinned host memory
18244 // - fine-grained migratable host memory
18245 // - fine-grained PCIe peer device
18246 //
18247 // Cases that should work, but may be treated overly conservatively.
18248 // - fine-grained host memory on an APU
18249 // - fine-grained XGMI peer device
18251 }
18252
18254 }
18255 case AtomicRMWInst::FAdd: {
18256 Type *Ty = RMW->getType();
18257
18258 // TODO: Handle REGION_ADDRESS
18259 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18260 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
18261 // is fixed to round-to-nearest-even.
18262 //
18263 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
18264 // round-to-nearest-even.
18265 //
18266 // We ignore the rounding mode problem, even in strictfp. The C++ standard
18267 // suggests it is OK if the floating-point mode may not match the calling
18268 // thread.
18269 if (Ty->isFloatTy()) {
18270 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
18272 }
18273
18274 if (Ty->isDoubleTy()) {
18275 // Ignores denormal mode, but we don't consider flushing mandatory.
18276 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
18278 }
18279
18280 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18282
18284 }
18285
18286 // LDS atomics respect the denormal mode from the mode register.
18287 //
18288 // Traditionally f32 global/buffer memory atomics would unconditionally
18289 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
18290 // flush.
18291 //
18292 // On targets with flat atomic fadd, denormals would flush depending on
18293 // whether the target address resides in LDS or global memory. We consider
18294 // this flat-maybe-flush as will-flush.
18295 if (Ty->isFloatTy() &&
18296 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18299
18300 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
18301 // safe. The message phrasing also should be better.
18302 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18303 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18304 // gfx942, gfx12
18305 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18306 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18307 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
18308 // gfx90a, gfx942, gfx12
18309 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18310 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18311
18312 // gfx942, gfx12
18313 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18314 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18315 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
18316 // gfx90a, gfx942, gfx12
18317 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18318 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18319
18320 // While gfx90a/gfx942 support v2bf16 for global/flat, they do not for
18321 // buffer. gfx12 does have the buffer version.
18322 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18323 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18324 }
18325
18326 // global and flat atomic fadd f64: gfx90a, gfx942.
18327 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18328 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18329
18330 if (AS != AMDGPUAS::FLAT_ADDRESS) {
18331 if (Ty->isFloatTy()) {
18332 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
18333 // gfx11+.
18334 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18335 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18336 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
18337 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18338 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18339 } else {
18340 // gfx908
18341 if (RMW->use_empty() &&
18342 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18343 isV2F16(Ty))
18344 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18345 }
18346 }
18347
18348 // flat atomic fadd f32: gfx942, gfx11+.
18349 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
18350 if (Subtarget->hasFlatAtomicFaddF32Inst())
18351 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18352
18353 // If it is in flat address space, and the type is float, we will try to
18354 // expand it, if the target supports global and lds atomic fadd. The
18355 // reason we need that is, in the expansion, we emit the check of
18356 // address space. If it is in global address space, we emit the global
18357 // atomic fadd; if it is in shared address space, we emit the LDS atomic
18358 // fadd.
18359 if (Subtarget->hasLDSFPAtomicAddF32()) {
18360 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18362 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18364 }
18365 }
18366 }
18367
18369 }
18371 case AtomicRMWInst::FMax: {
18372 Type *Ty = RMW->getType();
18373
18374 // LDS float and double fmin/fmax were always supported.
18375 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18376 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
18378 }
18379
18380 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18381 // For flat and global cases:
18382 // float, double in gfx7. Manual claims denormal support.
18383 // Removed in gfx8.
18384 // float, double restored in gfx10.
18385 // double removed again in gfx11, so only f32 for gfx11/gfx12.
18386 //
18387 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
18388 // no f32.
18389 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18390 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18391 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18392 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18393 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18394 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
18396 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18397 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18398 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18399 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18400 }
18401 }
18402
18404 }
18407 default:
18409 }
18410
18411 llvm_unreachable("covered atomicrmw op switch");
18412}
18413
18420
18427
18430 unsigned AddrSpace = CmpX->getPointerAddressSpace();
18431 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
18433
18434 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
18436
18437 const DataLayout &DL = CmpX->getDataLayout();
18438
18439 Type *ValTy = CmpX->getNewValOperand()->getType();
18440
18441 // If a 64-bit flat atomic may alias private, we need to avoid using the
18442 // atomic in the private case.
18443 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
18445}
18446
18447const TargetRegisterClass *
18448SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
18450 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18451 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18452 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18453 : &AMDGPU::SReg_32RegClass;
18454 if (!TRI->isSGPRClass(RC) && !isDivergent)
18455 return TRI->getEquivalentSGPRClass(RC);
18456 if (TRI->isSGPRClass(RC) && isDivergent)
18457 return TRI->getEquivalentVGPRClass(RC);
18458
18459 return RC;
18460}
18461
18462// FIXME: This is a workaround for DivergenceAnalysis not understanding always
18463// uniform values (as produced by the mask results of control flow intrinsics)
18464// used outside of divergent blocks. The phi users need to also be treated as
18465// always uniform.
18466//
18467// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
18468static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
18469 unsigned WaveSize) {
18470 // FIXME: We assume we never cast the mask results of a control flow
18471 // intrinsic.
18472 // Early exit if the type won't be consistent as a compile time hack.
18473 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
18474 if (!IT || IT->getBitWidth() != WaveSize)
18475 return false;
18476
18477 if (!isa<Instruction>(V))
18478 return false;
18479 if (!Visited.insert(V).second)
18480 return false;
18481 bool Result = false;
18482 for (const auto *U : V->users()) {
18484 if (V == U->getOperand(1)) {
18485 switch (Intrinsic->getIntrinsicID()) {
18486 default:
18487 Result = false;
18488 break;
18489 case Intrinsic::amdgcn_if_break:
18490 case Intrinsic::amdgcn_if:
18491 case Intrinsic::amdgcn_else:
18492 Result = true;
18493 break;
18494 }
18495 }
18496 if (V == U->getOperand(0)) {
18497 switch (Intrinsic->getIntrinsicID()) {
18498 default:
18499 Result = false;
18500 break;
18501 case Intrinsic::amdgcn_end_cf:
18502 case Intrinsic::amdgcn_loop:
18503 Result = true;
18504 break;
18505 }
18506 }
18507 } else {
18508 Result = hasCFUser(U, Visited, WaveSize);
18509 }
18510 if (Result)
18511 break;
18512 }
18513 return Result;
18514}
18515
18517 const Value *V) const {
18518 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
18519 if (CI->isInlineAsm()) {
18520 // FIXME: This cannot give a correct answer. This should only trigger in
18521 // the case where inline asm returns mixed SGPR and VGPR results, used
18522 // outside the defining block. We don't have a specific result to
18523 // consider, so this assumes if any value is SGPR, the overall register
18524 // also needs to be SGPR.
18525 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
18527 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
18528 for (auto &TC : TargetConstraints) {
18529 if (TC.Type == InlineAsm::isOutput) {
18531 const TargetRegisterClass *RC =
18532 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
18533 TC.ConstraintVT)
18534 .second;
18535 if (RC && SIRI->isSGPRClass(RC))
18536 return true;
18537 }
18538 }
18539 }
18540 }
18542 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
18543}
18544
18546 for (SDUse &Use : N->uses()) {
18548 if (getBasePtrIndex(M) == Use.getOperandNo())
18549 return true;
18550 }
18551 }
18552 return false;
18553}
18554
18556 SDValue N1) const {
18557 if (!N0.hasOneUse())
18558 return false;
18559 // Preserve the opportunity to keep N0 uniform.
18560 if (N0->isDivergent() || !N1->isDivergent())
18561 return true;
18562 // Check if we have a good chance to form the memory access pattern with the
18563 // base and offset
18564 return (DAG.isBaseWithConstantOffset(N0) &&
18566}
18567
18569 Register N0, Register N1) const {
18570 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
18571}
18572
18575 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
18577 if (I.getMetadata("amdgpu.noclobber"))
18578 Flags |= MONoClobber;
18579 if (I.getMetadata("amdgpu.last.use"))
18580 Flags |= MOLastUse;
18581 return Flags;
18582}
18583
18585 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
18586 const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const {
18587 if (User->getOpcode() != ISD::CopyToReg)
18588 return false;
18589 if (!Def->isMachineOpcode())
18590 return false;
18592 if (!MDef)
18593 return false;
18594
18595 unsigned ResNo = User->getOperand(Op).getResNo();
18596 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
18597 return false;
18598 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
18599 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
18600 PhysReg = AMDGPU::SCC;
18601 const TargetRegisterClass *RC =
18602 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
18603 Cost = RC->getCopyCost();
18604 return true;
18605 }
18606 return false;
18607}
18608
18610 Instruction *AI) const {
18611 // Given: atomicrmw fadd ptr %addr, float %val ordering
18612 //
18613 // With this expansion we produce the following code:
18614 // [...]
18615 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
18616 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
18617 //
18618 // atomicrmw.shared:
18619 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
18620 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
18621 // float %val ordering
18622 // br label %atomicrmw.phi
18623 //
18624 // atomicrmw.check.private:
18625 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
18626 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
18627 //
18628 // atomicrmw.private:
18629 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
18630 // %loaded.private = load float, ptr addrspace(5) %cast.private
18631 // %val.new = fadd float %loaded.private, %val
18632 // store float %val.new, ptr addrspace(5) %cast.private
18633 // br label %atomicrmw.phi
18634 //
18635 // atomicrmw.global:
18636 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
18637 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
18638 // float %val ordering
18639 // br label %atomicrmw.phi
18640 //
18641 // atomicrmw.phi:
18642 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
18643 // [ %loaded.private, %atomicrmw.private ],
18644 // [ %loaded.global, %atomicrmw.global ]
18645 // br label %atomicrmw.end
18646 //
18647 // atomicrmw.end:
18648 // [...]
18649 //
18650 //
18651 // For 64-bit atomics which may reside in private memory, we perform a simpler
18652 // version that only inserts the private check, and uses the flat operation.
18653
18654 IRBuilder<> Builder(AI);
18655 LLVMContext &Ctx = Builder.getContext();
18656
18657 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
18658 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
18660 Value *Addr = AI->getOperand(PtrOpIdx);
18661
18662 /// TODO: Only need to check private, then emit flat-known-not private (no
18663 /// need for shared block, or cast to global).
18665
18666 Align Alignment;
18667 if (RMW)
18668 Alignment = RMW->getAlign();
18669 else if (CX)
18670 Alignment = CX->getAlign();
18671 else
18672 llvm_unreachable("unhandled atomic operation");
18673
18674 // FullFlatEmulation is true if we need to issue the private, shared, and
18675 // global cases.
18676 //
18677 // If this is false, we are only dealing with the flat-targeting-private case,
18678 // where we only insert a check for private and still use the flat instruction
18679 // for global and shared.
18680
18681 bool FullFlatEmulation =
18682 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
18683 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
18684 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
18685 RMW->getType()->isDoubleTy()));
18686
18687 // If the return value isn't used, do not introduce a false use in the phi.
18688 bool ReturnValueIsUsed = !AI->use_empty();
18689
18690 BasicBlock *BB = Builder.GetInsertBlock();
18691 Function *F = BB->getParent();
18692 BasicBlock *ExitBB =
18693 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
18694 BasicBlock *SharedBB = nullptr;
18695
18696 BasicBlock *CheckPrivateBB = BB;
18697 if (FullFlatEmulation) {
18698 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
18699 CheckPrivateBB =
18700 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
18701 }
18702
18703 BasicBlock *PrivateBB =
18704 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
18705 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
18706 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
18707
18708 std::prev(BB->end())->eraseFromParent();
18709 Builder.SetInsertPoint(BB);
18710
18711 Value *LoadedShared = nullptr;
18712 if (FullFlatEmulation) {
18713 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
18714 {Addr}, nullptr, "is.shared");
18715 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
18716 Builder.SetInsertPoint(SharedBB);
18717 Value *CastToLocal = Builder.CreateAddrSpaceCast(
18719
18720 Instruction *Clone = AI->clone();
18721 Clone->insertInto(SharedBB, SharedBB->end());
18722 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
18723 LoadedShared = Clone;
18724
18725 Builder.CreateBr(PhiBB);
18726 Builder.SetInsertPoint(CheckPrivateBB);
18727 }
18728
18729 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
18730 {Addr}, nullptr, "is.private");
18731 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
18732
18733 Builder.SetInsertPoint(PrivateBB);
18734
18735 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
18737
18738 Value *LoadedPrivate;
18739 if (RMW) {
18740 LoadedPrivate = Builder.CreateAlignedLoad(
18741 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
18742
18743 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
18744 LoadedPrivate, RMW->getValOperand());
18745
18746 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
18747 } else {
18748 auto [ResultLoad, Equal] =
18749 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
18750 CX->getNewValOperand(), CX->getAlign());
18751
18752 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
18753 ResultLoad, 0);
18754 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
18755 }
18756
18757 Builder.CreateBr(PhiBB);
18758
18759 Builder.SetInsertPoint(GlobalBB);
18760
18761 // Continue using a flat instruction if we only emitted the check for private.
18762 Instruction *LoadedGlobal = AI;
18763 if (FullFlatEmulation) {
18764 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
18766 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
18767 }
18768
18769 AI->removeFromParent();
18770 AI->insertInto(GlobalBB, GlobalBB->end());
18771
18772 // The new atomicrmw may go through another round of legalization later.
18773 if (!FullFlatEmulation) {
18774 // We inserted the runtime check already, make sure we do not try to
18775 // re-expand this.
18776 // TODO: Should union with any existing metadata.
18777 MDBuilder MDB(F->getContext());
18778 MDNode *RangeNotPrivate =
18781 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
18782 RangeNotPrivate);
18783 }
18784
18785 Builder.CreateBr(PhiBB);
18786
18787 Builder.SetInsertPoint(PhiBB);
18788
18789 if (ReturnValueIsUsed) {
18790 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
18791 AI->replaceAllUsesWith(Loaded);
18792 if (FullFlatEmulation)
18793 Loaded->addIncoming(LoadedShared, SharedBB);
18794 Loaded->addIncoming(LoadedPrivate, PrivateBB);
18795 Loaded->addIncoming(LoadedGlobal, GlobalBB);
18796 Loaded->takeName(AI);
18797 }
18798
18799 Builder.CreateBr(ExitBB);
18800}
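The !noalias.addrspace annotation attached above (when only the private check is emitted) marks the rewritten atomic as not touching the private/scratch address space, which is address space 5 on AMDGPU. The exact range endpoints are built on lines elided from this listing, so the following is a hedged sketch of constructing an equivalent range:

#include "llvm/ADT/APInt.h"
#include "llvm/IR/MDBuilder.h"

// !noalias.addrspace carries a range of address spaces the access is known
// not to use; [5, 6) covers exactly the AMDGPU private address space.
static llvm::MDNode *makeNotPrivateRange(llvm::LLVMContext &Ctx) {
  llvm::MDBuilder MDB(Ctx);
  return MDB.createRange(llvm::APInt(32, 5), llvm::APInt(32, 6));
}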
18801
18803 unsigned PtrOpIdx) {
18804 Value *PtrOp = I->getOperand(PtrOpIdx);
18807
18808 Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
18809 Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
18810 I->getIterator());
18811 I->setOperand(PtrOpIdx, ASCast);
18812}
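In IR terms the rewrite above only changes the pointer operand; a hedged before/after sketch, with the pointer name and payload type chosen for illustration:

//   before: %old = atomicrmw add ptr addrspace(5) %p, i32 1 seq_cst, align 4
//   after:  %scratch.ascast = addrspacecast ptr addrspace(5) %p to ptr
//           %old = atomicrmw add ptr %scratch.ascast, i32 1 seq_cst, align 4
//
// On the subtargets that use this conversion, the flat (address space 0) form
// can still reach the scratch allocation, so the operation itself is unchanged.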
18813
18816
18819
18822 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
18823 ConstVal && ConstVal->isNullValue()) {
18824 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
18826
18827 // We may still need the private-alias-flat handling below.
18828
18829 // TODO: Skip this for cases where we cannot access remote memory.
18830 }
18831 }
18832
18833 // The non-flat expansions should only perform the de-canonicalization of
18834 // identity values.
18836 return;
18837
18839}
18840
18847
18851
18853 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
18854}
18855
18857 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
18858 return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
18859
18861 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
18862}
18863
18864LoadInst *
18866 IRBuilder<> Builder(AI);
18867 auto Order = AI->getOrdering();
18868
18869 // The optimization removes the store aspect of the atomicrmw. Therefore, the
18870 // cache must be flushed if the atomic ordering has release semantics. That
18871 // flush is not necessarily a fence, but a release fence happens to perform it.
18872 // So avoid replacing an atomicrmw that has release semantics.
18873 if (isReleaseOrStronger(Order))
18874 return nullptr;
18875
18876 LoadInst *LI = Builder.CreateAlignedLoad(
18877 AI->getType(), AI->getPointerOperand(), AI->getAlign());
18878 LI->setAtomic(Order, AI->getSyncScopeID());
18879 LI->copyMetadata(*AI);
18880 LI->takeName(AI);
18881 AI->replaceAllUsesWith(LI);
18882 AI->eraseFromParent();
18883 return LI;
18884}
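The net effect for an idempotent update such as "or" with 0, sketched in IR with illustrative names:

//   %old = atomicrmw or ptr %p, i32 0 acquire, align 4
// becomes
//   %old = load atomic i32, ptr %p acquire, align 4
//
// Release-or-stronger orderings are skipped above because removing the store
// side of the atomicrmw would also remove the cache flush it implies.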
static bool isMul(MachineInstr *MI)
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
return SDValue()
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
constexpr LLT S32
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
#define LLVM_ATTRIBUTE_UNUSED
Definition Compiler.h:298
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
dxil translate DXIL Translate Metadata
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
#define P(N)
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1247
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition SIDefines.h:1244
static cl::opt< bool > UseSelectionDAGPTRADD("amdgpu-use-sdag-ptradd", cl::Hidden, cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the " "SelectionDAG ISel"), cl::init(false))
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad, TargetLoweringBase::IntrinsicInfo &Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:167
#define LLVM_DEBUG(...)
Definition Debug.h:119
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
The Input class is used to parse a yaml document into in-memory structs and vectors.
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1120
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:6057
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1497
bool isNegative() const
Definition APFloat.h:1449
bool isNormal() const
Definition APFloat.h:1453
APInt bitcastToAPInt() const
Definition APFloat.h:1353
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1138
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1098
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1079
bool isInfinity() const
Definition APFloat.h:1446
Class for arbitrary precision integers.
Definition APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1385
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:466
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
Definition APInt.h:366
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1237
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1221
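A small sketch of the APInt bit-manipulation helpers listed above (illustrative only; the widths and bit positions are arbitrary):
  APInt Mask(32, 0);
  Mask.setHighBits(8);                            // top byte set: 0xFF000000
  APInt Block = APInt::getBitsSet(32, 4, 12);     // bits [4, 12) set
  APInt High  = APInt::getHighBitsSet(32, 16);    // top half set
  bool Ge = High.uge(Block);                      // unsigned >= comparison
  unsigned TrailingZeros = Block.countr_zero();   // == 4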
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition Function.cpp:339
const Function * getParent() const
Definition Argument.h:44
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
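A sketch of how an atomicrmw instruction is typically inspected before choosing an expansion (illustrative only; AI stands for a hypothetical AtomicRMWInst pointer):
  AtomicRMWInst::BinOp Op = AI->getOperation();            // e.g. AtomicRMWInst::FAdd
  unsigned AS = AI->getPointerAddressSpace();              // address space of the pointer operand
  AtomicOrdering Ordering = AI->getOrdering();             // ordering constraint of the rmw
  StringRef Name = AtomicRMWInst::getOperationName(Op);    // textual name, e.g. "fadd"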
bool isCompareAndSwap() const
Returns true if this SDNode represents a cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:472
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
BitVector & set()
Definition BitVector.h:351
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
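A sketch of the usual CCState flow when lowering formal arguments (illustrative only; CallConv, IsVarArg, MF and Ins are assumed to come from the surrounding lowering code, and the assign function is whatever the target selects for the calling convention):
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, MF.getFunction().getContext());
  CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, IsVarArg));
  for (const CCValAssign &VA : ArgLocs) {
    if (VA.isRegLoc()) {
      // Argument was assigned to a register: VA.getLocReg().
    } else {
      // Argument lives on the stack at offset VA.getLocMemOffset().
    }
  }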
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ ICMP_NE
not equal
Definition InstrTypes.h:700
bool isSigned() const
Definition InstrTypes.h:932
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:772
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:778
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition Constants.h:214
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isBigEndian() const
Definition DataLayout.h:199
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowering::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
iterator_range< arg_iterator > args()
Definition Function.h:890
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition Function.cpp:803
Argument * getArg(unsigned i) const
Definition Function.h:884
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMadF16() const
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasPrivateSegmentBuffer() const
const MachineFunction & getMachineFunction() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2780
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:33
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96
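A sketch of attaching !range metadata with createRange (illustrative only; Ctx and Load are assumed, and the [0, 1024) interval is an arbitrary example):
  MDBuilder MDB(Ctx);
  MDNode *Range = MDB.createRange(APInt(32, 0), APInt(32, 1024)); // describes [0, 1024)
  Load->setMetadata(LLVMContext::MD_range, Range);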
Metadata node.
Definition Metadata.h:1077
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1445
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
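A sketch of allocating a MachineMemOperand for a 4-byte invariant load (illustrative only; MF and PtrInfo are assumed from context):
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
      LLT::scalar(32), Align(4));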
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
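A sketch of the common per-function bookkeeping pattern these two helpers support (illustrative only; the register and register class are placeholders):
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  Register VReg = MF.addLiveIn(AMDGPU::SGPR0, &AMDGPU::SReg_32RegClass);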
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with.
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
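These builder methods are normally chained off BuildMI; a sketch (illustrative only; MBB, MI, DL, TII and DstReg are assumed, and the opcode is just an example):
  BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
      .addImm(0);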
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition ModRef.h:221
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:215
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition ModRef.h:218
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition Register.h:67
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:78
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if N can be combined with other nodes to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store in a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load in a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns whether it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
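A sketch of the load/store helpers in use (illustrative only; Chain, DL, Ptr and PtrInfo are assumed from context, and the type and alignment are examples):
  SDValue Val = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo);
  SDValue St  = DAG.getStore(Val.getValue(1), DL, Val, Ptr, PtrInfo, Align(4));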
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns the sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
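A sketch of composing nodes with these helpers (illustrative only; DAG, DL and Op are assumed from the surrounding lowering code):
  SDValue Four = DAG.getConstant(4, DL, MVT::i32);
  SDValue Sum  = DAG.getNode(ISD::ADD, DL, MVT::i32, Op, Four);
  SDValue Cmp  = DAG.getSetCC(DL, MVT::i1, Sum, Four, ISD::SETUGT);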
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition StringRef.h:862
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:154
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
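A sketch of the StringSwitch idiom (illustrative only; the constraint strings and values are made up, and Constraint is assumed to be a StringRef):
  unsigned Kind = StringSwitch<unsigned>(Constraint)
                      .Case("v", 0)
                      .Case("s", 1)
                      .Default(2);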
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
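A hedged sketch of how a TargetLowering subclass constructor might use this hook; the opcode, type, and action combinations below are illustrative and are not the settings made by this file:

// Inside a hypothetical MyTargetLowering constructor:
//   Expand  - the legalizer rewrites the node in terms of other operations,
//   Custom  - the target's LowerOperation() hook is invoked,
//   Promote - the operation is performed in a wider type.
setOperationAction(ISD::CTPOP, MVT::i64, Expand);
setOperationAction(ISD::FSIN, MVT::f32, Custom);
setOperationAction(ISD::SELECT, MVT::i16, Promote);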
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a type is legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op; at this point, we know that only the DemandedBits bits of the result of Op are ever used downstream.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:420
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition Type.h:258
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:107
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI void set(Value *Val)
Definition Value.h:905
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
const Use & getOperandUse(unsigned i) const
Definition User.h:245
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1101
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:396
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isZero() const
Definition TypeSize.h:154
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:134
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ GlobalAddress
Definition ISDOpcodes.h:88
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:656
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:773
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:528
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:535
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined; 0 Round to 0; 1 Round to nearest, ties to even; 2 Round to ...
Definition ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
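A short sketch of looking up an intrinsic declaration without creating one, assuming a Module is at hand; the specific AMDGPU intrinsic is only an example of a known ID:

#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Module.h"

static bool moduleUsesWorkitemIdX(const llvm::Module &M) {
  // Returns nullptr if no declaration of the intrinsic exists in M.
  llvm::Function *F = llvm::Intrinsic::getDeclarationIfExists(
      &M, llvm::Intrinsic::amdgcn_workitem_id_x);
  return F && !F->use_empty();
}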
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
Offsets
Offsets in bytes from the start of the input buffer.
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
Definition MathExtras.h:54
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:330
@ Offset
Definition DWP.cpp:477
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition Analysis.cpp:241
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:307
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:853
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
@ Done
Definition Threading.h:60
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition MathExtras.h:232
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:270
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2138
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition MathExtras.h:551
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
Definition ModRef.h:296
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:390
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition bit.h:157
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:282
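A self-contained sketch exercising a few of the bit-manipulation helpers listed here; the values are arbitrary:

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  assert(llvm::isInt<16>(-32768) && !llvm::isInt<16>(32768));
  assert(llvm::alignDown(37u, 8u) == 32u);  // round down to a multiple of 8
  assert(llvm::PowerOf2Ceil(33) == 64);     // smallest power of two >= 33
  assert(llvm::countr_zero(0x50u) == 4);    // number of trailing zero bits
  assert(llvm::isShiftedMask_64(0x0ff0));   // one contiguous run of ones
  return 0;
}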
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1734
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:336
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant bit, stopping at the first 1.
Definition bit.h:203
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition Analysis.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:399
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
unsigned getUndefRegState(bool B)
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:155
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition MathExtras.h:241
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1760
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1899
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:212
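A self-contained sketch of the alignment helpers; sizes and offsets are arbitrary:

#include "llvm/Support/Alignment.h"
#include <cassert>

int main() {
  llvm::Align A(16);
  assert(llvm::alignTo(40, A) == 48); // round 40 up to a multiple of 16
  // commonAlignment: the strongest alignment still guaranteed at base + Offset.
  assert(llvm::commonAlignment(A, 24) == llvm::Align(8));
  return 0;
}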
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:86
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853
#define N
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:304
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
Definition APFloat.cpp:264
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:85
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition SCCPSolver.h:42
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:390
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:295
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:368
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:238
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:113
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:380
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:465
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:407
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:311
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:318
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:251
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:323
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:331
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
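A small self-contained sketch of querying EVTs; a bare LLVMContext is enough, and the chosen vector type is arbitrary:

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

int main() {
  llvm::LLVMContext Ctx;
  llvm::EVT VT = llvm::EVT::getVectorVT(Ctx, llvm::MVT::f32, 4); // v4f32
  assert(VT.isVector() && VT.isFloatingPoint());
  assert(VT.getVectorNumElements() == 4);
  assert(VT.getSizeInBits() == 128);
  // Same shape, but with integer elements (v4i32).
  llvm::EVT IntVT = VT.changeTypeToInteger();
  assert(IntVT.getScalarType() == llvm::MVT::i32);
  return 0;
}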
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
unsigned getOrigArgIndex() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:165
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:218
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:173
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:340
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:241
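A self-contained sketch of the KnownBits queries listed above; the bit widths and known ranges are made up:

#include "llvm/Support/KnownBits.h"
#include <cassert>

int main() {
  // Two 32-bit values whose upper 16 bits are known to be zero.
  llvm::KnownBits LHS(32), RHS(32);
  LHS.Zero.setHighBits(16);
  RHS.Zero.setHighBits(16);
  // Their sum fits in 17 bits, so at least 15 leading zeros are known.
  llvm::KnownBits Sum = llvm::KnownBits::add(LHS, RHS);
  assert(Sum.countMinLeadingZeros() >= 15);
  // Zero-extending to 64 bits adds 32 more known-zero high bits.
  assert(Sum.zext(64).countMinLeadingZeros() >= 47);
  return 0;
}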
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs