SIISelLowering.cpp
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/IRBuilder.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
42#include "llvm/IR/MDBuilder.h"
45#include "llvm/Support/ModRef.h"
47#include <optional>
48
49using namespace llvm;
50using namespace llvm::SDPatternMatch;
51
52#define DEBUG_TYPE "si-lower"
53
54STATISTIC(NumTailCalls, "Number of tail calls");
55
56static cl::opt<bool>
57 DisableLoopAlignment("amdgpu-disable-loop-alignment",
58 cl::desc("Do not align and prefetch loops"),
59 cl::init(false));
60
62 "amdgpu-use-divergent-register-indexing", cl::Hidden,
63 cl::desc("Use indirect register addressing for divergent indexes"),
64 cl::init(false));
65
66// TODO: This option should be removed once we switch to always using PTRADD in
67// the SelectionDAG.
69 "amdgpu-use-sdag-ptradd", cl::Hidden,
70 cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the "
71 "SelectionDAG ISel"),
72 cl::init(false));
73
76 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
77}
78
81 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
82}
83
84static unsigned findFirstFreeSGPR(CCState &CCInfo) {
85 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
86 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
87 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
88 return AMDGPU::SGPR0 + Reg;
89 }
90 }
91 llvm_unreachable("Cannot allocate sgpr");
92}
93
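// A minimal usage sketch of the helper above (not code from this file; the
// wrapper name and the CCInfo variable are illustrative). It assumes a CCState
// that has already been populated by argument analysis:
static unsigned reserveFirstFreeSGPR(CCState &CCInfo) {
  unsigned Reg = findFirstFreeSGPR(CCInfo); // first SGPR not yet assigned
  CCInfo.AllocateReg(Reg);                  // mark it used for a special input
  return Reg;
}
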
95 const GCNSubtarget &STI)
96 : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
97 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
98 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
99
100 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
101 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
102
103 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
104
105 const SIRegisterInfo *TRI = STI.getRegisterInfo();
106 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
107
108 addRegisterClass(MVT::f64, V64RegClass);
109 addRegisterClass(MVT::v2f32, V64RegClass);
110 addRegisterClass(MVT::Untyped, V64RegClass);
111
112 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
113 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
114
115 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
116 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
117
118 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
119 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
120
121 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
122 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
123
124 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
125 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
126
127 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
128 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
129
130 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
131 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
132
133 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
134 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
135
136 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
137 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
138
139 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
140 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
141
142 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
143 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
144
145 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
146 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
147
148 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
149 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
150
151 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
152 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
153
154 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
155 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
156
157 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
158 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
159
160 if (Subtarget->has16BitInsts()) {
161 if (Subtarget->useRealTrue16Insts()) {
162 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
163 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
164 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
165 } else {
166 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
167 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
168 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
169 }
170
171 // Unless there are also VOP3P operations, not all operations are really legal.
172 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
173 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
174 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
175 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
176 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
177 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
178 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
179 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
180 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
181 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
182 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
183 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
184 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
185 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
186 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
187 }
188
189 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
190 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
191
192 computeRegisterProperties(Subtarget->getRegisterInfo());
193
194 // The boolean content concept here is too inflexible. Compares only ever
195 // really produce a 1-bit result. Any copy/extend from these will turn into a
196 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
197 // it's what most targets use.
200
201 // We need to custom lower vector stores from local memory
202 setOperationAction(ISD::LOAD,
203 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
204 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
205 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
206 MVT::i1, MVT::v32i32},
207 Custom);
208
209 setOperationAction(ISD::STORE,
210 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
211 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
212 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
213 MVT::i1, MVT::v32i32},
214 Custom);
215
216 if (isTypeLegal(MVT::bf16)) {
217 for (unsigned Opc :
219 ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
220 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
221 ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
222 ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
223 ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
224 ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
225 ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
226 ISD::SETCC}) {
227 // FIXME: The promoted to type shouldn't need to be explicit
228 setOperationAction(Opc, MVT::bf16, Promote);
229 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
230 }
231
233
235 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
236
237 setOperationAction(ISD::FABS, MVT::bf16, Legal);
238 setOperationAction(ISD::FNEG, MVT::bf16, Legal);
240
241 // We only need to custom lower because we can't specify an action for bf16
242 // sources.
245 }
246
247 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
248 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
249 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
250 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
251 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
252 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
253 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
254 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
255 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
256 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
257 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
258 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
259 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
260 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
261 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
262 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
263
264 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
265 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
266 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
267 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
268 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
269 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
270 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
271
272 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
273
277 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
278
279 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
280
282 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
283
285 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
286 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
287
289 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
290 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
291 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
292 Expand);
294 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
295 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
296 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
297 Expand);
298
300 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
301 MVT::v3i16, MVT::v4i16, MVT::Other},
302 Custom);
303
304 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
305 setOperationAction(ISD::BR_CC,
306 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
307
309
311
313 Expand);
314
315#if 0
317#endif
318
319 // We only support LOAD/STORE and vector manipulation ops for vectors
320 // with > 4 elements.
321 for (MVT VT :
322 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
323 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
324 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
325 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
326 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
327 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
328 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
329 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
330 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
331 switch (Op) {
332 case ISD::LOAD:
333 case ISD::STORE:
335 case ISD::BITCAST:
336 case ISD::UNDEF:
340 case ISD::IS_FPCLASS:
341 break;
346 break;
347 default:
349 break;
350 }
351 }
352 }
353
354 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
355
356 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
357 // is expanded to avoid having two separate loops in case the index is a VGPR.
358
359 // Most operations are naturally 32-bit vector operations. We only support
360 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
361 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
363 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
364
366 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
367
369 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
370
372 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
373 }
374
375 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
377 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
378
380 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
381
383 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
384
386 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
387 }
388
389 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
391 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
392
394 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
395
397 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
398
400 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
401 }
402
403 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
405 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
406
408 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
409
411 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
412
414 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
415 }
416
417 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
419 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
420
422 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
423
425 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
426
428 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
429 }
430
432 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
433 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
434 Custom);
435
436 if (Subtarget->hasPkMovB32()) {
437 // TODO: 16-bit element vectors should be legal with even aligned elements.
438 // TODO: Can be legal with wider source types than the result with
439 // subregister extracts.
440 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
441 }
442
444 // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
445 // instead lower to cndmask in SITargetLowering::LowerSELECT().
447 // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
448 // alignbit.
449 setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
450
451 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
452 Custom);
453
454 // Avoid stack access for these.
455 // TODO: Generalize to more vector types.
457 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
458 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
459 Custom);
460
461 // Deal with vec3 vector operations when widened to vec4.
463 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
464
465 // Deal with vec5/6/7 vector operations when widened to vec8.
467 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
468 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
469 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
470 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
471 Custom);
472
473 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
474 // and output demarshalling
475 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
476
477 // We can't return success/failure, only the old value,
478 // let LLVM add the comparison
479 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64},
480 Expand);
481
482 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
483
484 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
485
486 // FIXME: This should be narrowed to i32, but that only happens if i64 is
487 // illegal.
488 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
489 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
490
491 // This is s_memtime on SI and s_memrealtime on VI.
492 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
493
494 if (Subtarget->hasSMemRealTime() ||
495 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
496 setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
497 setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom);
498
499 if (Subtarget->has16BitInsts()) {
500 setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
501 setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
502 } else {
503 setOperationAction(ISD::FSQRT, MVT::f16, Custom);
504 }
505
506 if (Subtarget->hasMadMacF32Insts())
508
509 if (!Subtarget->hasBFI())
510 // fcopysign can be done in a single instruction with BFI.
511 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
512
513 if (!Subtarget->hasBCNT(32))
515
516 if (!Subtarget->hasBCNT(64))
518
519 if (Subtarget->hasFFBH())
521
522 if (Subtarget->hasFFBL())
524
525 // We only really have 32-bit BFE instructions (and 16-bit on VI).
526 //
527 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
528 // effort to match them now. We want this to be false for i64 cases when the
529 // extraction isn't restricted to the upper or lower half. Ideally we would
530 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
531 // span the midpoint are probably relatively rare, so don't worry about them
532 // for now.
533 if (Subtarget->hasBFE())
535
536 // Clamp modifier on add/sub
537 if (Subtarget->hasIntClamp())
539
540 if (Subtarget->hasAddNoCarry())
541 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
542 Legal);
543
545 {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
546 {MVT::f32, MVT::f64}, Custom);
547
548 // These are really only legal for ieee_mode functions. We should be avoiding
549 // them for functions that don't have ieee_mode enabled, so just say they are
550 // legal.
551 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
552 {MVT::f32, MVT::f64}, Legal);
553
554 if (Subtarget->haveRoundOpsF64())
555 setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64,
556 Legal);
557 else
558 setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
559 MVT::f64, Custom);
560
561 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
562 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
563 Legal);
564 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
565
566 setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
568
569 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
570 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
571
572 // Custom lower these because we can't specify a rule based on an illegal
573 // source bf16.
574 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom);
575 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom);
576
577 if (Subtarget->has16BitInsts()) {
580 MVT::i16, Legal);
581
582 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
583
585 MVT::i16, Expand);
586
590 ISD::CTPOP},
591 MVT::i16, Promote);
592
593 setOperationAction(ISD::LOAD, MVT::i16, Custom);
594
595 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
596
597 setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
598 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
599 setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
600 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
601
605
607
608 // F16 - Constant Actions.
611
612 // F16 - Load/Store Actions.
613 setOperationAction(ISD::LOAD, MVT::f16, Promote);
614 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
615 setOperationAction(ISD::STORE, MVT::f16, Promote);
616 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
617
618 // BF16 - Load/Store Actions.
619 setOperationAction(ISD::LOAD, MVT::bf16, Promote);
620 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
621 setOperationAction(ISD::STORE, MVT::bf16, Promote);
622 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
623
624 // F16 - VOP1 Actions.
626 ISD::FSIN, ISD::FROUND},
627 MVT::f16, Custom);
628
629 // BF16 - VOP1 Actions.
630 if (Subtarget->hasBF16TransInsts())
631 setOperationAction({ISD::FCOS, ISD::FSIN, ISD::FDIV}, MVT::bf16, Custom);
632
635
636 // F16 - VOP2 Actions.
637 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
638 Expand);
639 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);
640 setOperationAction(ISD::FFREXP, MVT::f16, Custom);
642
643 // F16 - VOP3 Actions.
645 if (STI.hasMadF16())
647
648 for (MVT VT :
649 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
650 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
651 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
652 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
653 switch (Op) {
654 case ISD::LOAD:
655 case ISD::STORE:
657 case ISD::BITCAST:
658 case ISD::UNDEF:
663 case ISD::IS_FPCLASS:
664 break;
668 break;
669 default:
671 break;
672 }
673 }
674 }
675
676 // v_perm_b32 can handle either of these.
677 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
679
680 // XXX - Do these do anything? Vector constants turn into build_vector.
681 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
682
683 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
684 Legal);
685
686 setOperationAction(ISD::STORE, MVT::v2i16, Promote);
687 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
688 setOperationAction(ISD::STORE, MVT::v2f16, Promote);
689 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
690
691 setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
692 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
693 setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
694 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
695
696 setOperationAction(ISD::AND, MVT::v2i16, Promote);
697 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
698 setOperationAction(ISD::OR, MVT::v2i16, Promote);
699 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
700 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
701 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
702
703 setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
704 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
705 setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
706 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
707 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
708 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
709
710 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
711 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
712 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
713 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
714 setOperationAction(ISD::STORE, MVT::v4bf16, Promote);
715 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
716
717 setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
718 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
719 setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
720 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
721 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
722 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
723
724 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
725 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
726 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
727 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
728
729 setOperationAction(ISD::STORE, MVT::v8i16, Promote);
730 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
731 setOperationAction(ISD::STORE, MVT::v8f16, Promote);
732 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
733 setOperationAction(ISD::STORE, MVT::v8bf16, Promote);
734 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
735
736 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
737 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
738 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
739 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
740 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
741 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
742
743 setOperationAction(ISD::STORE, MVT::v16i16, Promote);
744 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
745 setOperationAction(ISD::STORE, MVT::v16f16, Promote);
746 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
747 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
748 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
749
750 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
751 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
752 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
753 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
754 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
755 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
756
757 setOperationAction(ISD::STORE, MVT::v32i16, Promote);
758 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
759 setOperationAction(ISD::STORE, MVT::v32f16, Promote);
760 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
761 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
762 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
763
765 MVT::v2i32, Expand);
766 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
767
769 MVT::v4i32, Expand);
770
772 MVT::v8i32, Expand);
773
774 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
775 Subtarget->hasVOP3PInsts() ? Legal : Custom);
776
777 setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
778 // This isn't really legal, but this avoids the legalizer unrolling it (and
779 // allows matching fneg (fabs x) patterns)
780 setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);
781
782 // Can do this in one BFI plus a constant materialize.
784 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
785 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
786 MVT::v32f16, MVT::v32bf16},
787 Custom);
788
790 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
791 MVT::f16, Custom);
792 setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
793
794 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
795 ISD::FMAXIMUMNUM},
796 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
797 Custom);
798
799 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
800 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
801 Expand);
802
803 for (MVT Vec16 :
804 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
805 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
808 Vec16, Custom);
810 }
811 }
812
813 if (Subtarget->hasVOP3PInsts()) {
817 MVT::v2i16, Legal);
818
819 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
820 ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
821 MVT::v2f16, Legal);
822
824 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
825
827 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
828 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
829 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
830 Custom);
831
832 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
833 // Split vector operations.
838 VT, Custom);
839
840 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
841 // Split vector operations.
843 VT, Custom);
844
846 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
847 {MVT::v2f16, MVT::v4f16}, Custom);
848
849 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
850 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
851 Custom);
852
853 if (Subtarget->hasPackedFP32Ops()) {
855 MVT::v2f32, Legal);
857 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
858 Custom);
859 }
860 }
861
862 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);
863
864 if (Subtarget->has16BitInsts()) {
866 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
868 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
869 } else {
870 // Legalization hack.
871 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
872
873 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
874 }
875
877 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
878 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
879 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
880 MVT::v32f16, MVT::v32bf16},
881 Custom);
882
884
885 if (Subtarget->hasVectorMulU64())
887 else if (Subtarget->hasScalarSMulU64())
889
890 if (Subtarget->hasMad64_32())
892
893 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
894 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
895
896 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
897 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
898 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
899 } else {
900 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
901 if (Subtarget->hasMinimum3Maximum3F32())
902 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);
903
904 if (Subtarget->hasMinimum3Maximum3PKF16()) {
905 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal);
906
907 // If only the vector form is available, we need to widen to a vector.
908 if (!Subtarget->hasMinimum3Maximum3F16())
909 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16, Custom);
910 }
911 }
912
913 if (Subtarget->hasVOP3PInsts()) {
914 // We want to break these into v2f16 pieces, not scalarize.
915 setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
916 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
917 Custom);
918 }
919
920 if (Subtarget->hasIntMinMax64())
922 Legal);
923
925 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
926 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
927 MVT::i8},
928 Custom);
929
931 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
932 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
933 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
934 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
935 Custom);
936
938 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
939 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
940 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
941 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
942 Custom);
943
944 setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
946 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
947 setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
948 setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);
949
950 // TODO: Could move this to custom lowering, could benefit from combines on
951 // extract of relevant bits.
952 setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
953
955
956 if (Subtarget->hasBF16ConversionInsts()) {
957 setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
959 }
960
961 if (Subtarget->hasBF16PackedInsts()) {
963 {ISD::FADD, ISD::FMUL, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMA},
964 MVT::v2bf16, Legal);
965 }
966
967 if (Subtarget->hasBF16TransInsts()) {
968 setOperationAction({ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, MVT::bf16, Legal);
969 }
970
971 if (Subtarget->hasCvtPkF16F32Inst()) {
973 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
974 Custom);
975 }
976
978 ISD::PTRADD,
980 ISD::SUB,
982 ISD::MUL,
983 ISD::FADD,
984 ISD::FSUB,
985 ISD::FDIV,
986 ISD::FMUL,
987 ISD::FMINNUM,
988 ISD::FMAXNUM,
989 ISD::FMINNUM_IEEE,
990 ISD::FMAXNUM_IEEE,
991 ISD::FMINIMUM,
992 ISD::FMAXIMUM,
993 ISD::FMINIMUMNUM,
994 ISD::FMAXIMUMNUM,
995 ISD::FMA,
996 ISD::SMIN,
997 ISD::SMAX,
998 ISD::UMIN,
999 ISD::UMAX,
1000 ISD::SETCC,
1002 ISD::SMIN,
1003 ISD::SMAX,
1004 ISD::UMIN,
1005 ISD::UMAX,
1006 ISD::AND,
1007 ISD::OR,
1008 ISD::XOR,
1009 ISD::SHL,
1010 ISD::SRL,
1011 ISD::SRA,
1012 ISD::FSHR,
1022
1023 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1025
1026 // All memory operations. Some folding on the pointer operand is done to help
1027 // match the constant offsets in the addressing modes.
1028 setTargetDAGCombine({ISD::LOAD,
1029 ISD::STORE,
1030 ISD::ATOMIC_LOAD,
1031 ISD::ATOMIC_STORE,
1032 ISD::ATOMIC_CMP_SWAP,
1033 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
1034 ISD::ATOMIC_SWAP,
1035 ISD::ATOMIC_LOAD_ADD,
1036 ISD::ATOMIC_LOAD_SUB,
1037 ISD::ATOMIC_LOAD_AND,
1038 ISD::ATOMIC_LOAD_OR,
1039 ISD::ATOMIC_LOAD_XOR,
1040 ISD::ATOMIC_LOAD_NAND,
1041 ISD::ATOMIC_LOAD_MIN,
1042 ISD::ATOMIC_LOAD_MAX,
1043 ISD::ATOMIC_LOAD_UMIN,
1044 ISD::ATOMIC_LOAD_UMAX,
1045 ISD::ATOMIC_LOAD_FADD,
1046 ISD::ATOMIC_LOAD_FMIN,
1047 ISD::ATOMIC_LOAD_FMAX,
1048 ISD::ATOMIC_LOAD_UINC_WRAP,
1049 ISD::ATOMIC_LOAD_UDEC_WRAP,
1052
1053 // FIXME: In other contexts we pretend this is a per-function property.
1055
1057}
1058
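// The constructor above leans heavily on the Promote action paired with
// AddPromotedToType: a bf16 operation is legalized by extending its operands
// to f32, doing the arithmetic in f32, and converting the result back to bf16.
// A minimal host-side sketch of that behavior (illustrative only; the helper
// names are made up and it truncates instead of rounding to nearest-even):
#include <cstdint>
#include <cstring>

static float bf16ToF32(uint16_t B) {
  uint32_t Bits = uint32_t(B) << 16; // bf16 occupies the upper half of an f32
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}

static uint16_t f32ToBF16(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  return uint16_t(Bits >> 16); // simple truncation, good enough for the sketch
}

// What an FADD on bf16 promoted to f32 amounts to at runtime.
static uint16_t bf16FAddPromoted(uint16_t A, uint16_t B) {
  return f32ToBF16(bf16ToF32(A) + bf16ToF32(B));
}
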
1059const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1060
1062 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1063 return RCRegs;
1064}
1065
1066//===----------------------------------------------------------------------===//
1067// TargetLowering queries
1068//===----------------------------------------------------------------------===//
1069
1070// v_mad_mix* support a conversion from f16 to f32.
1071//
1072// There is only one special case, when denormals are enabled, that we don't
1073// currently handle, where this would still be OK to use.
1074bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1075 EVT DestVT, EVT SrcVT) const {
1076 return DestVT.getScalarType() == MVT::f32 &&
1077 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1078 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1079 SrcVT.getScalarType() == MVT::f16) ||
1080 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1081 SrcVT.getScalarType() == MVT::bf16)) &&
1082 // TODO: This probably only requires no input flushing?
1084}
1085
1087 LLT DestTy, LLT SrcTy) const {
1088 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1089 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1090 DestTy.getScalarSizeInBits() == 32 &&
1091 SrcTy.getScalarSizeInBits() == 16 &&
1092 // TODO: This probably only requires no input flushing?
1093 denormalModeIsFlushAllF32(*MI.getMF());
1094}
1095
1097 // SI has some legal vector types, but no legal vector operations. Say no
1098 // shuffles are legal in order to prefer scalarizing some vector operations.
1099 return false;
1100}
1101
1103 CallingConv::ID CC,
1104 EVT VT) const {
1106 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1107
1108 if (VT.isVector()) {
1109 EVT ScalarVT = VT.getScalarType();
1110 unsigned Size = ScalarVT.getSizeInBits();
1111 if (Size == 16) {
1112 if (Subtarget->has16BitInsts()) {
1113 if (VT.isInteger())
1114 return MVT::v2i16;
1115 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1116 }
1117 return VT.isInteger() ? MVT::i32 : MVT::f32;
1118 }
1119
1120 if (Size < 16)
1121 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1122 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1123 }
1124
1125 if (VT.getSizeInBits() > 32)
1126 return MVT::i32;
1127
1128 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1129}
1130
1132 CallingConv::ID CC,
1133 EVT VT) const {
1135 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1136
1137 if (VT.isVector()) {
1138 unsigned NumElts = VT.getVectorNumElements();
1139 EVT ScalarVT = VT.getScalarType();
1140 unsigned Size = ScalarVT.getSizeInBits();
1141
1142 // FIXME: Should probably promote 8-bit vectors to i16.
1143 if (Size == 16 && Subtarget->has16BitInsts())
1144 return (NumElts + 1) / 2;
1145
1146 if (Size <= 32)
1147 return NumElts;
1148
1149 if (Size > 32)
1150 return NumElts * ((Size + 31) / 32);
1151 } else if (VT.getSizeInBits() > 32)
1152 return (VT.getSizeInBits() + 31) / 32;
1153
1154 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1155}
1156
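// A minimal sketch of the register-count arithmetic used by the two
// calling-convention hooks above, with plain integers instead of EVT/MVT
// (illustrative only; the helper name is made up):
#include <cassert>

static unsigned numRegsForVectorArg(unsigned NumElts, unsigned ScalarBits,
                                    bool Has16BitInsts) {
  if (ScalarBits == 16 && Has16BitInsts)
    return (NumElts + 1) / 2;                  // packed into v2i16/v2f16 registers
  if (ScalarBits <= 32)
    return NumElts;                            // one 32-bit register per element
  return NumElts * ((ScalarBits + 31) / 32);   // wide elements split into dwords
}

int main() {
  assert(numRegsForVectorArg(4, 16, true) == 2); // v4f16 -> two packed registers
  assert(numRegsForVectorArg(3, 64, true) == 6); // v3i64 -> six 32-bit registers
  return 0;
}
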
1158 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1159 unsigned &NumIntermediates, MVT &RegisterVT) const {
1160 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1161 unsigned NumElts = VT.getVectorNumElements();
1162 EVT ScalarVT = VT.getScalarType();
1163 unsigned Size = ScalarVT.getSizeInBits();
1164 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1165 // support, but unless we can properly handle 3-vectors, it will still be
1166 // inconsistent.
1167 if (Size == 16 && Subtarget->has16BitInsts()) {
1168 if (ScalarVT == MVT::bf16) {
1169 RegisterVT = MVT::i32;
1170 IntermediateVT = MVT::v2bf16;
1171 } else {
1172 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1173 IntermediateVT = RegisterVT;
1174 }
1175 NumIntermediates = (NumElts + 1) / 2;
1176 return NumIntermediates;
1177 }
1178
1179 if (Size == 32) {
1180 RegisterVT = ScalarVT.getSimpleVT();
1181 IntermediateVT = RegisterVT;
1182 NumIntermediates = NumElts;
1183 return NumIntermediates;
1184 }
1185
1186 if (Size < 16 && Subtarget->has16BitInsts()) {
1187 // FIXME: Should probably form v2i16 pieces
1188 RegisterVT = MVT::i16;
1189 IntermediateVT = ScalarVT;
1190 NumIntermediates = NumElts;
1191 return NumIntermediates;
1192 }
1193
1194 if (Size != 16 && Size <= 32) {
1195 RegisterVT = MVT::i32;
1196 IntermediateVT = ScalarVT;
1197 NumIntermediates = NumElts;
1198 return NumIntermediates;
1199 }
1200
1201 if (Size > 32) {
1202 RegisterVT = MVT::i32;
1203 IntermediateVT = RegisterVT;
1204 NumIntermediates = NumElts * ((Size + 31) / 32);
1205 return NumIntermediates;
1206 }
1207 }
1208
1210 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1211}
1212
1214 const DataLayout &DL, Type *Ty,
1215 unsigned MaxNumLanes) {
1216 assert(MaxNumLanes != 0);
1217
1218 LLVMContext &Ctx = Ty->getContext();
1219 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1220 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1221 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1222 NumElts);
1223 }
1224
1225 return TLI.getValueType(DL, Ty);
1226}
1227
1228// Peek through TFE struct returns to only use the data size.
1230 const DataLayout &DL, Type *Ty,
1231 unsigned MaxNumLanes) {
1232 auto *ST = dyn_cast<StructType>(Ty);
1233 if (!ST)
1234 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1235
1236 // TFE intrinsics return an aggregate type.
1237 assert(ST->getNumContainedTypes() == 2 &&
1238 ST->getContainedType(1)->isIntegerTy(32));
1239 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1240}
1241
1242/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1243/// in-memory representation. This return value is a custom type because there
1244/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1245/// could cause issues during codegen, these address space 7 pointers will be
1246/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1247/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1248/// for cost modeling, to work. (This also sets us up decently for doing the
1249/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1251 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1252 return MVT::amdgpuBufferFatPointer;
1254 DL.getPointerSizeInBits(AS) == 192)
1255 return MVT::amdgpuBufferStridedPointer;
1257}
1258/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1259/// v8i32 when padding is added.
1260/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1261/// also v8i32 with padding.
1263 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1264 DL.getPointerSizeInBits(AS) == 160) ||
1266 DL.getPointerSizeInBits(AS) == 192))
1267 return MVT::v8i32;
1269}
1270
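// Sketch of the size reasoning above (illustrative): the value form of a buffer
// fat pointer is {p8 rsrc, i32 offset} = 160 bits, and a buffer strided pointer
// is {p8 rsrc, i32 index, i32 offset} = 192 bits; in memory both are padded out
// to a 256-bit v8i32.
static_assert(128 + 32 == 160, "buffer fat pointer: resource + offset");
static_assert(128 + 32 + 32 == 192, "buffer strided pointer: resource + index + offset");
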
1271static unsigned getIntrMemWidth(unsigned IntrID) {
1272 switch (IntrID) {
1273 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1274 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1275 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1276 return 8;
1277 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1278 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1279 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1280 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1281 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1282 return 32;
1283 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1284 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1285 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1286 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1287 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1288 return 64;
1289 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1290 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1291 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1292 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1293 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1294 return 128;
1295 default:
1296 llvm_unreachable("Unknown width");
1297 }
1298}
1299
1300static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad,
1302 Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2);
1303 unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
1304 switch (AtomicOrderingCABI(Ord)) {
1307 break;
1310 break;
1313 break;
1314 default:
1316 break;
1317 }
1318
1319 Info.flags =
1321 Info.flags |= MOCooperative;
1322
1323 MDNode *ScopeMD = cast<MDNode>(
1324 cast<MetadataAsValue>(CI.getArgOperand(IsLoad ? 2 : 3))->getMetadata());
1325 StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
1326 Info.ssid = CI.getContext().getOrInsertSyncScopeID(Scope);
1327}
1328
1330 const CallInst &CI,
1331 MachineFunction &MF,
1332 unsigned IntrID) const {
1333 Info.flags = MachineMemOperand::MONone;
1334 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1335 Info.flags |= MachineMemOperand::MOInvariant;
1336 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1338 Info.flags |= getTargetMMOFlags(CI);
1339
1340 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1342 AttributeSet Attr =
1344 MemoryEffects ME = Attr.getMemoryEffects();
1345 if (ME.doesNotAccessMemory())
1346 return false;
1347
1348 // TODO: Should images get their own address space?
1349 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1350
1351 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1352 if (RsrcIntr->IsImage) {
1353 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1355 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1356 Info.align.reset();
1357 }
1358
1359 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1360 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1361 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1362 // We conservatively set the memory operand of a buffer intrinsic to the
1363 // base resource pointer, so that we can access alias information about
1364 // those pointers. Cases like "this points at the same value
1365 // but with a different offset" are handled in
1366 // areMemAccessesTriviallyDisjoint.
1367 Info.ptrVal = RsrcArg;
1368 }
1369
1370 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1371 if (!IsSPrefetch) {
1372 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1373 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1374 Info.flags |= MachineMemOperand::MOVolatile;
1375 }
1376
1378 if (ME.onlyReadsMemory()) {
1379 if (RsrcIntr->IsImage) {
1380 unsigned MaxNumLanes = 4;
1381
1382 if (!BaseOpcode->Gather4) {
1383 // If this isn't a gather, we may have excess loaded elements in the
1384 // IR type. Check the dmask for the real number of elements loaded.
1385 unsigned DMask =
1386 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1387 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1388 }
1389
1390 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1391 CI.getType(), MaxNumLanes);
1392 } else {
1393 Info.memVT =
1395 std::numeric_limits<unsigned>::max());
1396 }
1397
1398 // FIXME: What does alignment mean for an image?
1399 Info.opc = ISD::INTRINSIC_W_CHAIN;
1400 Info.flags |= MachineMemOperand::MOLoad;
1401 } else if (ME.onlyWritesMemory()) {
1402 Info.opc = ISD::INTRINSIC_VOID;
1403
1404 Type *DataTy = CI.getArgOperand(0)->getType();
1405 if (RsrcIntr->IsImage) {
1406 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1407 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1408 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1409 DMaskLanes);
1410 } else
1411 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1412
1413 Info.flags |= MachineMemOperand::MOStore;
1414 } else {
1415 // Atomic, NoReturn Sampler or prefetch
1416 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1418 Info.flags |=
1420
1421 if (!IsSPrefetch)
1422 Info.flags |= MachineMemOperand::MOStore;
1423
1424 switch (IntrID) {
1425 default:
1426 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1427 // Fake memory access type for no return sampler intrinsics
1428 Info.memVT = MVT::i32;
1429 } else {
1430 // XXX - Should this be volatile without known ordering?
1431 Info.flags |= MachineMemOperand::MOVolatile;
1432 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1433 }
1434 break;
1435 case Intrinsic::amdgcn_raw_buffer_load_lds:
1436 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1437 case Intrinsic::amdgcn_struct_buffer_load_lds:
1438 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1439 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1440 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1441 Info.ptrVal = CI.getArgOperand(1);
1442 return true;
1443 }
1444 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1445 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1446 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1447 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1448 Info.memVT =
1450 std::numeric_limits<unsigned>::max());
1451 Info.flags &= ~MachineMemOperand::MOStore;
1452 return true;
1453 }
1454 }
1455 }
1456 return true;
1457 }
1458
1459 switch (IntrID) {
1460 case Intrinsic::amdgcn_ds_ordered_add:
1461 case Intrinsic::amdgcn_ds_ordered_swap: {
1462 Info.opc = ISD::INTRINSIC_W_CHAIN;
1463 Info.memVT = MVT::getVT(CI.getType());
1464 Info.ptrVal = CI.getOperand(0);
1465 Info.align.reset();
1467
1468 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1469 if (!Vol->isZero())
1470 Info.flags |= MachineMemOperand::MOVolatile;
1471
1472 return true;
1473 }
1474 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1475 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1476 Info.opc = ISD::INTRINSIC_W_CHAIN;
1477 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1478 Info.ptrVal = nullptr;
1479 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1481 return true;
1482 }
1483 case Intrinsic::amdgcn_ds_append:
1484 case Intrinsic::amdgcn_ds_consume: {
1485 Info.opc = ISD::INTRINSIC_W_CHAIN;
1486 Info.memVT = MVT::getVT(CI.getType());
1487 Info.ptrVal = CI.getOperand(0);
1488 Info.align.reset();
1490
1491 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1492 if (!Vol->isZero())
1493 Info.flags |= MachineMemOperand::MOVolatile;
1494
1495 return true;
1496 }
1497 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1498 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1499 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1502 Info.memVT = MVT::getVT(CI.getType());
1503 Info.ptrVal = CI.getOperand(0);
1504 Info.memVT = MVT::i64;
1505 Info.size = 8;
1506 Info.align.reset();
1508 return true;
1509 }
1510 case Intrinsic::amdgcn_global_atomic_csub: {
1511 Info.opc = ISD::INTRINSIC_W_CHAIN;
1512 Info.memVT = MVT::getVT(CI.getType());
1513 Info.ptrVal = CI.getOperand(0);
1514 Info.align.reset();
1517 return true;
1518 }
1519 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1520 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1521 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1522 Info.opc = ISD::INTRINSIC_W_CHAIN;
1523 Info.memVT =
1524 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1525 ? CI.getType()
1527 ->getElementType(0)); // XXX: what is correct VT?
1528
1529 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1530 Info.align.reset();
1531 Info.flags |=
1533 return true;
1534 }
1535 case Intrinsic::amdgcn_global_atomic_fmin_num:
1536 case Intrinsic::amdgcn_global_atomic_fmax_num:
1537 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1538 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1539 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1540 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1541 Info.opc = ISD::INTRINSIC_W_CHAIN;
1542 Info.memVT = MVT::getVT(CI.getType());
1543 Info.ptrVal = CI.getOperand(0);
1544 Info.align.reset();
1548 return true;
1549 }
1550 case Intrinsic::amdgcn_flat_load_monitor_b32:
1551 case Intrinsic::amdgcn_flat_load_monitor_b64:
1552 case Intrinsic::amdgcn_flat_load_monitor_b128:
1553 case Intrinsic::amdgcn_global_load_monitor_b32:
1554 case Intrinsic::amdgcn_global_load_monitor_b64:
1555 case Intrinsic::amdgcn_global_load_monitor_b128:
1556 case Intrinsic::amdgcn_cluster_load_b32:
1557 case Intrinsic::amdgcn_cluster_load_b64:
1558 case Intrinsic::amdgcn_cluster_load_b128:
1559 case Intrinsic::amdgcn_ds_load_tr6_b96:
1560 case Intrinsic::amdgcn_ds_load_tr4_b64:
1561 case Intrinsic::amdgcn_ds_load_tr8_b64:
1562 case Intrinsic::amdgcn_ds_load_tr16_b128:
1563 case Intrinsic::amdgcn_global_load_tr6_b96:
1564 case Intrinsic::amdgcn_global_load_tr4_b64:
1565 case Intrinsic::amdgcn_global_load_tr_b64:
1566 case Intrinsic::amdgcn_global_load_tr_b128:
1567 case Intrinsic::amdgcn_ds_read_tr4_b64:
1568 case Intrinsic::amdgcn_ds_read_tr6_b96:
1569 case Intrinsic::amdgcn_ds_read_tr8_b64:
1570 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1571 Info.opc = ISD::INTRINSIC_W_CHAIN;
1572 Info.memVT = MVT::getVT(CI.getType());
1573 Info.ptrVal = CI.getOperand(0);
1574 Info.align.reset();
1575 Info.flags |= MachineMemOperand::MOLoad;
1576 return true;
1577 }
1578 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1579 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1580 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1581 Info.opc = ISD::INTRINSIC_W_CHAIN;
1582 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1583 Info.ptrVal = CI.getOperand(0);
1584 Info.align.reset();
1585 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/true, Info);
1586 return true;
1587 }
1588 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1589 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1590 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1591 Info.opc = ISD::INTRINSIC_VOID;
1592 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1593 Info.ptrVal = CI.getArgOperand(0);
1594 Info.align.reset();
1595 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/false, Info);
1596 return true;
1597 }
1598 case Intrinsic::amdgcn_ds_gws_init:
1599 case Intrinsic::amdgcn_ds_gws_barrier:
1600 case Intrinsic::amdgcn_ds_gws_sema_v:
1601 case Intrinsic::amdgcn_ds_gws_sema_br:
1602 case Intrinsic::amdgcn_ds_gws_sema_p:
1603 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1604 Info.opc = ISD::INTRINSIC_VOID;
1605
1606 const GCNTargetMachine &TM =
1607 static_cast<const GCNTargetMachine &>(getTargetMachine());
1608
1610 Info.ptrVal = MFI->getGWSPSV(TM);
1611
1612 // This is an abstract access, but we need to specify a type and size.
1613 Info.memVT = MVT::i32;
1614 Info.size = 4;
1615 Info.align = Align(4);
1616
1617 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1618 Info.flags |= MachineMemOperand::MOLoad;
1619 else
1620 Info.flags |= MachineMemOperand::MOStore;
1621 return true;
1622 }
1623 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1624 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1625 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1626 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1627 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1628 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1629 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1630 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1631 Info.opc = ISD::INTRINSIC_VOID;
1632 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1633 Info.ptrVal = CI.getArgOperand(1);
1635 return true;
1636 }
1637 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1638 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1639 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1640 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1641 Info.opc = ISD::INTRINSIC_VOID;
1642 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1643 Info.ptrVal = CI.getArgOperand(0);
1645 return true;
1646 }
1647 case Intrinsic::amdgcn_load_to_lds:
1648 case Intrinsic::amdgcn_global_load_lds: {
1649 Info.opc = ISD::INTRINSIC_VOID;
1650 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1651 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1652 Info.ptrVal = CI.getArgOperand(1);
1654 return true;
1655 }
1656 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1657 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1658 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1659 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1660 Info.opc = ISD::INTRINSIC_W_CHAIN;
1661
1662 const GCNTargetMachine &TM =
1663 static_cast<const GCNTargetMachine &>(getTargetMachine());
1664
1666 Info.ptrVal = MFI->getGWSPSV(TM);
1667
1668 // This is an abstract access, but we need to specify a type and size.
1669 Info.memVT = MVT::i32;
1670 Info.size = 4;
1671 Info.align = Align(4);
1672
1674 return true;
1675 }
1676 case Intrinsic::amdgcn_s_prefetch_data:
1677 case Intrinsic::amdgcn_flat_prefetch:
1678 case Intrinsic::amdgcn_global_prefetch: {
1679 Info.opc = ISD::INTRINSIC_VOID;
1680 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1681 Info.ptrVal = CI.getArgOperand(0);
1682 Info.flags |= MachineMemOperand::MOLoad;
1683 return true;
1684 }
1685 default:
1686 return false;
1687 }
1688}
1689
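// Sketch of the dmask handling in the memory-intrinsic info hook above: for a
// non-gather image load, the number of lanes actually transferred is the
// population count of the dmask, with dmask == 0 treated as a single lane
// (illustrative, plain C++; the helper name is made up):
#include <bit>
#include <cassert>

static unsigned lanesFromDMask(unsigned DMask) {
  return DMask == 0 ? 1u : static_cast<unsigned>(std::popcount(DMask));
}

int main() {
  assert(lanesFromDMask(0x0) == 1); // degenerate dmask still transfers one lane
  assert(lanesFromDMask(0x5) == 2); // channels 0 and 2
  assert(lanesFromDMask(0xF) == 4); // full RGBA
  return 0;
}
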
1691 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1693 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1694 // The DAG's ValueType loses the addrspaces.
1695 // Add them as 2 extra Constant operands "from" and "to".
1696 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1697 unsigned DstAS = I.getType()->getPointerAddressSpace();
1698 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1699 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1700 break;
1701 }
1702 default:
1703 break;
1704 }
1705}
1706
1709 Type *&AccessTy) const {
1710 Value *Ptr = nullptr;
1711 switch (II->getIntrinsicID()) {
1712 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1713 case Intrinsic::amdgcn_cluster_load_b128:
1714 case Intrinsic::amdgcn_cluster_load_b64:
1715 case Intrinsic::amdgcn_cluster_load_b32:
1716 case Intrinsic::amdgcn_ds_append:
1717 case Intrinsic::amdgcn_ds_consume:
1718 case Intrinsic::amdgcn_ds_load_tr8_b64:
1719 case Intrinsic::amdgcn_ds_load_tr16_b128:
1720 case Intrinsic::amdgcn_ds_load_tr4_b64:
1721 case Intrinsic::amdgcn_ds_load_tr6_b96:
1722 case Intrinsic::amdgcn_ds_read_tr4_b64:
1723 case Intrinsic::amdgcn_ds_read_tr6_b96:
1724 case Intrinsic::amdgcn_ds_read_tr8_b64:
1725 case Intrinsic::amdgcn_ds_read_tr16_b64:
1726 case Intrinsic::amdgcn_ds_ordered_add:
1727 case Intrinsic::amdgcn_ds_ordered_swap:
1728 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1729 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1730 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1731 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1732 case Intrinsic::amdgcn_flat_load_monitor_b128:
1733 case Intrinsic::amdgcn_flat_load_monitor_b32:
1734 case Intrinsic::amdgcn_flat_load_monitor_b64:
1735 case Intrinsic::amdgcn_global_atomic_csub:
1736 case Intrinsic::amdgcn_global_atomic_fmax_num:
1737 case Intrinsic::amdgcn_global_atomic_fmin_num:
1738 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1739 case Intrinsic::amdgcn_global_load_monitor_b128:
1740 case Intrinsic::amdgcn_global_load_monitor_b32:
1741 case Intrinsic::amdgcn_global_load_monitor_b64:
1742 case Intrinsic::amdgcn_global_load_tr_b64:
1743 case Intrinsic::amdgcn_global_load_tr_b128:
1744 case Intrinsic::amdgcn_global_load_tr4_b64:
1745 case Intrinsic::amdgcn_global_load_tr6_b96:
1746 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1747 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1748 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1749 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1750 Ptr = II->getArgOperand(0);
1751 break;
1752 case Intrinsic::amdgcn_load_to_lds:
1753 case Intrinsic::amdgcn_global_load_lds:
1754 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1755 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1756 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1757 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1758 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1759 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1760 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1761 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1762 Ptr = II->getArgOperand(1);
1763 break;
1764 default:
1765 return false;
1766 }
1767 AccessTy = II->getType();
1768 Ops.push_back(Ptr);
1769 return true;
1770}
1771
1773 unsigned AddrSpace) const {
1774 if (!Subtarget->hasFlatInstOffsets()) {
1775 // Flat instructions do not have offsets, and only have the register
1776 // address.
1777 return AM.BaseOffs == 0 && AM.Scale == 0;
1778 }
1779
1780 decltype(SIInstrFlags::FLAT) FlatVariant =
1784
1785 return AM.Scale == 0 &&
1786 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1787 AM.BaseOffs, AddrSpace, FlatVariant));
1788}
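// Illustration of the check above: once flat instruction offsets are
// available, {BaseReg, BaseOffs = 0, Scale = 0} is always accepted, and a
// non-zero BaseOffs is accepted only if isLegalFLATOffset reports that the
// immediate fits the encoding for this flat variant and address space. A
// scaled index (Scale != 0) is always rejected, since flat addressing has no
// register-scaling form.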
1789
1791 if (Subtarget->hasFlatGlobalInsts())
1793
1794 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1795 // Assume that we will use FLAT for all global memory accesses
1796 // on VI.
1797 // FIXME: This assumption is currently wrong. On VI we still use
1798 // MUBUF instructions for the r + i addressing mode. As currently
1799 // implemented, the MUBUF instructions only work on buffer < 4GB.
1800 // It may be possible to support > 4GB buffers with MUBUF instructions,
1801 // by setting the stride value in the resource descriptor which would
1802 // increase the size limit to (stride * 4GB). However, this is risky,
1803 // because it has never been validated.
1805 }
1806
1807 return isLegalMUBUFAddressingMode(AM);
1808}
1809
1810bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1811 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1812 // additionally can do r + r + i with addr64. 32-bit has more addressing
1813 // mode options. Depending on the resource constant, it can also do
1814 // (i64 r0) + (i32 r1) * (i14 i).
1815 //
1816 // Private arrays end up using a scratch buffer most of the time, so also
1817 // assume those use MUBUF instructions. Scratch loads / stores are currently
1818 // implemented as mubuf instructions with the offen bit set, so slightly
1819 // different from the normal addr64.
1820 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1821 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1822 return false;
1823
1824 // FIXME: Since we can split immediate into soffset and immediate offset,
1825 // would it make sense to allow any immediate?
1826
1827 switch (AM.Scale) {
1828 case 0: // r + i or just i, depending on HasBaseReg.
1829 return true;
1830 case 1:
1831 return true; // We have r + r or r + i.
1832 case 2:
1833 if (AM.HasBaseReg) {
1834 // Reject 2 * r + r.
1835 return false;
1836 }
1837
1838 // Allow 2 * r as r + r,
1839 // and 2 * r + i as r + r + i.
1840 return true;
1841 default: // Don't allow n * r
1842 return false;
1843 }
1844}
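// In terms of the AddrMode fields, the cases accepted above are:
//   Scale == 0: i or r + i (subject to the MUBUF immediate offset check),
//   Scale == 1: r + i or r + r + i,
//   Scale == 2 with no base register: 2 * r (+ i), folded as r + r (+ i).
// Anything else, e.g. 3 * r or 2 * r + r, is rejected.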
1845
1847 const AddrMode &AM, Type *Ty,
1848 unsigned AS,
1849 Instruction *I) const {
1850 // No global is ever allowed as a base.
1851 if (AM.BaseGV)
1852 return false;
1853
1854 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1855 return isLegalGlobalAddressingMode(AM);
1856
1857 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1861 // If the offset isn't a multiple of 4, it probably isn't going to be
1862 // correctly aligned.
1863 // FIXME: Can we get the real alignment here?
1864 if (AM.BaseOffs % 4 != 0)
1865 return isLegalMUBUFAddressingMode(AM);
1866
1867 if (!Subtarget->hasScalarSubwordLoads()) {
1868 // There are no SMRD extloads, so if we have to do a small type access we
1869 // will use a MUBUF load.
1870 // FIXME?: We also need to do this if unaligned, but we don't know the
1871 // alignment here.
1872 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1873 return isLegalGlobalAddressingMode(AM);
1874 }
1875
1876 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1877 // SMRD instructions have an 8-bit, dword offset on SI.
1878 if (!isUInt<8>(AM.BaseOffs / 4))
1879 return false;
1880 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1881 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1882 // in 8-bits, it can use a smaller encoding.
1883 if (!isUInt<32>(AM.BaseOffs / 4))
1884 return false;
1885 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1886 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1887 if (!isUInt<20>(AM.BaseOffs))
1888 return false;
1889 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1890 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1891 // for S_BUFFER_* instructions).
1892 if (!isInt<21>(AM.BaseOffs))
1893 return false;
1894 } else {
1895 // On GFX12, all offsets are signed 24-bit in bytes.
1896 if (!isInt<24>(AM.BaseOffs))
1897 return false;
1898 }
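// Summary of the scalar-load immediate offset ranges checked above:
//   SI:      unsigned 8-bit, in dwords
//   CI:      unsigned 32-bit literal, in dwords
//   VI:      unsigned 20-bit, in bytes
//   GFX9-11: signed 21-bit, in bytes
//   GFX12+:  signed 24-bit, in bytes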
1899
1900 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1902 AM.BaseOffs < 0) {
1903 // Scalar (non-buffer) loads can only use a negative offset if
1904 // soffset+offset is non-negative. Since the compiler can only prove that
1905 // in a few special cases, it is safer to claim that negative offsets are
1906 // not supported.
1907 return false;
1908 }
1909
1910 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1911 return true;
1912
1913 if (AM.Scale == 1 && AM.HasBaseReg)
1914 return true;
1915
1916 return false;
1917 }
1918
1919 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1920 return Subtarget->enableFlatScratch()
1922 : isLegalMUBUFAddressingMode(AM);
1923
1924 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1925 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1926 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1927 // field.
1928 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1929 // an 8-bit dword offset but we don't know the alignment here.
1930 if (!isUInt<16>(AM.BaseOffs))
1931 return false;
1932
1933 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1934 return true;
1935
1936 if (AM.Scale == 1 && AM.HasBaseReg)
1937 return true;
1938
1939 return false;
1940 }
1941
1943 // For an unknown address space, this usually means that this is for some
1944 // reason being used for pure arithmetic, and not based on some addressing
1945 // computation. We don't have instructions that compute pointers with any
1946 // addressing modes, so treat them as having no offset like flat
1947 // instructions.
1949 }
1950
1951 // Assume a user alias of global for unknown address spaces.
1952 return isLegalGlobalAddressingMode(AM);
1953}
1954
1956 const MachineFunction &MF) const {
1958 return (MemVT.getSizeInBits() <= 4 * 32);
1959 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1960 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1961 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1962 }
1964 return (MemVT.getSizeInBits() <= 2 * 32);
1965 return true;
1966}
1967
1969 unsigned Size, unsigned AddrSpace, Align Alignment,
1970 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1971 if (IsFast)
1972 *IsFast = 0;
1973
1974 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1975 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1976 // Check if alignment requirements for ds_read/write instructions are
1977 // disabled.
1978 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1979 return false;
1980
1981 Align RequiredAlignment(
1982 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
1983 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1984 Alignment < RequiredAlignment)
1985 return false;
1986
1987 // Either the alignment requirements are "enabled", or there is an
1988 // unaligned-LDS-access-related hardware bug even though alignment
1989 // requirements are "disabled". In either case, we need to check for proper
1990 // alignment requirements.
1991 //
1992 switch (Size) {
1993 case 64:
1994 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1995 // address is negative, then the instruction is incorrectly treated as
1996 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1997 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1998 // load later in the SILoadStoreOptimizer.
1999 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
2000 return false;
2001
2002 // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
2003 // can do a 4-byte aligned, 8-byte access in a single operation using
2004 // ds_read2/write2_b32 with adjacent offsets.
2005 RequiredAlignment = Align(4);
2006
2007 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2008 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
2009 // ds_write2_b32 depending on the alignment. In either case with either
2010 // alignment there is no faster way of doing this.
2011
2012 // The numbers returned here and below are not additive; they form a 'speed
2013 // rank'. They are just meant to be compared to decide if a certain way
2014 // of lowering an operation is faster than another. For that purpose a
2015 // naturally aligned operation gets its bitsize to indicate that "it
2016 // operates with a speed comparable to an N-bit wide load". With full
2017 // alignment ds128 is slower than ds96, for example. If underaligned, it
2018 // is comparable to the speed of a single dword access, which would then
2019 // mean 32 < 128 and it is faster to issue a wide load regardless.
2020 // 1 simply means "slow, don't do it": when comparing an aligned load to a
2021 // wider load that will no longer be aligned, the latter is slower.
2022 if (IsFast)
2023 *IsFast = (Alignment >= RequiredAlignment) ? 64
2024 : (Alignment < Align(4)) ? 32
2025 : 1;
2026 return true;
2027 }
2028
2029 break;
2030 case 96:
2031 if (!Subtarget->hasDS96AndDS128())
2032 return false;
2033
2034 // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
2035 // gfx8 and older.
2036
2037 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2038 // Naturally aligned access is fastest. However, also report it as Fast
2039 // if memory is aligned to less than a DWORD. A narrow load or store will
2040 // be equally slow as a single ds_read_b96/ds_write_b96, but there will
2041 // be more of them, so overall we will pay less penalty issuing a single
2042 // instruction.
2043
2044 // See comment on the values above.
2045 if (IsFast)
2046 *IsFast = (Alignment >= RequiredAlignment) ? 96
2047 : (Alignment < Align(4)) ? 32
2048 : 1;
2049 return true;
2050 }
2051
2052 break;
2053 case 128:
2054 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2055 return false;
2056
2057 // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
2058 // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
2059 // single operation using ds_read2/write2_b64.
2060 RequiredAlignment = Align(8);
2061
2062 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2063 // Naturally aligned access is fastest. However, also report it as Fast
2064 // if memory is aligned to less than a DWORD. A narrow load or store will
2065 // be equally slow as a single ds_read_b128/ds_write_b128, but there
2066 // will be more of them, so overall we will pay less penalty issuing a
2067 // single instruction.
2068
2069 // See comment on the values above.
2070 if (IsFast)
2071 *IsFast = (Alignment >= RequiredAlignment) ? 128
2072 : (Alignment < Align(4)) ? 32
2073 : 1;
2074 return true;
2075 }
2076
2077 break;
2078 default:
2079 if (Size > 32)
2080 return false;
2081
2082 break;
2083 }
2084
2085 // See comment on the values above.
2086 // Note that we have a single-dword or sub-dword access here, so if it is
2087 // underaligned it is the slowest possible access, hence the returned value is 0.
2088 if (IsFast)
2089 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2090
2091 return Alignment >= RequiredAlignment ||
2092 Subtarget->hasUnalignedDSAccessEnabled();
2093 }
2094
2095 // FIXME: We have to be conservative here and assume that flat operations
2096 // will access scratch. If we had access to the IR function, then we
2097 // could determine if any private memory was used in the function.
2098 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2099 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2100 bool AlignedBy4 = Alignment >= Align(4);
2101 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2102 if (IsFast)
2103 *IsFast = AlignedBy4 ? Size : 1;
2104 return true;
2105 }
2106
2107 if (IsFast)
2108 *IsFast = AlignedBy4;
2109
2110 return AlignedBy4;
2111 }
2112
2113 // So long as they are correct, wide global memory operations perform better
2114 // than multiple smaller memory ops -- even when misaligned.
2115 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
2116 if (IsFast)
2117 *IsFast = Size;
2118
2119 return Alignment >= Align(4) ||
2120 Subtarget->hasUnalignedBufferAccessEnabled();
2121 }
2122
2123 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
2124 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
2125 // out-of-bounds behavior, but in the edge case where an access starts
2126 // out-of-bounds and then enters in-bounds, the entire access would be treated
2127 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2128 // natural alignment of buffer accesses.
2129 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2130 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2131 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2132 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2133 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
2134 return false;
2135 }
2136
2137 // Smaller than dword value must be aligned.
2138 if (Size < 32)
2139 return false;
2140
2141 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2142 // byte-address are ignored, thus forcing Dword alignment.
2143 // This applies to private, global, and constant memory.
2144 if (IsFast)
2145 *IsFast = 1;
2146
2147 return Size >= 32 && Alignment >= Align(4);
2148}
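// The *IsFast value filled in above is a speed rank, not a boolean: 0 is the
// slowest possible access and the access bitsize means "as fast as a naturally
// aligned access of that width". For example, for global address spaces the
// rank is simply the access size and the access is allowed whenever it is
// dword aligned or unaligned buffer accesses are enabled; for flat/private
// address spaces with unaligned scratch access enabled, a dword-aligned access
// gets the full size and anything less gets rank 1.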
2149
2151 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2152 unsigned *IsFast) const {
2154 Alignment, Flags, IsFast);
2155}
2156
2158 LLVMContext &Context, const MemOp &Op,
2159 const AttributeList &FuncAttributes) const {
2160 // FIXME: Should account for address space here.
2161
2162 // The default fallback uses the private pointer size as a guess for a type to
2163 // use. Make sure we switch these to 64-bit accesses.
2164
2165 if (Op.size() >= 16 &&
2166 Op.isDstAligned(Align(4))) // XXX: Should only do for global
2167 return MVT::v4i32;
2168
2169 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2170 return MVT::v2i32;
2171
2172 // Use the default.
2173 return MVT::Other;
2174}
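// For example, a 32-byte memcpy whose destination is known to be at least
// 4-byte aligned is widened to v4i32 (16-byte) operations, an 8-byte one to
// v2i32, and anything smaller or less aligned falls back to the generic
// choice (MVT::Other).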
2175
2177 const MemSDNode *MemNode = cast<MemSDNode>(N);
2178 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2179}
2180
2185
2187 unsigned DestAS) const {
2188 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2189 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2190 Subtarget->hasGloballyAddressableScratch()) {
2191 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2192 return false;
2193 }
2194
2195 // Flat -> private/local is a simple truncate.
2196 // Flat -> global is no-op
2197 return true;
2198 }
2199
2200 const GCNTargetMachine &TM =
2201 static_cast<const GCNTargetMachine &>(getTargetMachine());
2202 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2203}
2204
2212
2214 Type *Ty) const {
2215 // FIXME: Could be smarter if called for vector constants.
2216 return true;
2217}
2218
2220 unsigned Index) const {
2222 return false;
2223
2224 // TODO: Add more cases that are cheap.
2225 return Index == 0;
2226}
2227
2228bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2229 // TODO: This should be more aggressive, particularly for 16-bit element
2230 // vectors. However, there are some mixed improvements and regressions.
2231 EVT EltTy = VT.getVectorElementType();
2232 return EltTy.getSizeInBits() % 32 == 0;
2233}
2234
2236 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2237 switch (Op) {
2238 case ISD::LOAD:
2239 case ISD::STORE:
2240 return true;
2241 default:
2242 return false;
2243 }
2244 }
2245
2246 // SimplifySetCC uses this function to determine whether or not it should
2247 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2248 if (VT == MVT::i1 && Op == ISD::SETCC)
2249 return false;
2250
2252}
2253
2254SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2255 const SDLoc &SL,
2256 SDValue Chain,
2257 uint64_t Offset) const {
2258 const DataLayout &DL = DAG.getDataLayout();
2262
2263 auto [InputPtrReg, RC, ArgTy] =
2264 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2265
2266 // We may not have the kernarg segment argument if we have no kernel
2267 // arguments.
2268 if (!InputPtrReg)
2269 return DAG.getConstant(Offset, SL, PtrVT);
2270
2272 SDValue BasePtr = DAG.getCopyFromReg(
2273 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2274
2275 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2276}
2277
2278SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2279 const SDLoc &SL) const {
2282 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2283}
2284
2285SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2286 const SDLoc &SL) const {
2287
2289 std::optional<uint32_t> KnownSize =
2291 if (KnownSize.has_value())
2292 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2293 return SDValue();
2294}
2295
2296SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2297 const SDLoc &SL, SDValue Val,
2298 bool Signed,
2299 const ISD::InputArg *Arg) const {
2300 // First, if it is a widened vector, narrow it.
2301 if (VT.isVector() &&
2303 EVT NarrowedVT =
2306 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2307 DAG.getConstant(0, SL, MVT::i32));
2308 }
2309
2310 // Then convert the vector elements or scalar value.
2311 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2312 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2313 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2314 }
2315
2316 if (MemVT.isFloatingPoint())
2317 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2318 else if (Signed)
2319 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2320 else
2321 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2322
2323 return Val;
2324}
2325
2326SDValue SITargetLowering::lowerKernargMemParameter(
2327 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2328 uint64_t Offset, Align Alignment, bool Signed,
2329 const ISD::InputArg *Arg) const {
2330 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2331
2332 // Try to avoid using an extload by loading earlier than the argument address,
2333 // and extracting the relevant bits. The load should hopefully be merged with
2334 // the previous argument.
2335 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2336 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2337 int64_t AlignDownOffset = alignDown(Offset, 4);
2338 int64_t OffsetDiff = Offset - AlignDownOffset;
2339
2340 EVT IntVT = MemVT.changeTypeToInteger();
2341
2342 // TODO: If we passed in the base kernel offset we could have a better
2343 // alignment than 4, but we don't really need it.
2344 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2345 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2348
2349 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2350 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2351
2352 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2353 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2354 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2355
2356 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2357 }
2358
2359 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2360 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2363
2364 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2365 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2366}
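// Worked example of the sub-dword path above: a 2-byte argument at kernarg
// offset 6 with 2-byte alignment loads the dword at offset 4 (AlignDownOffset),
// shifts it right by OffsetDiff * 8 = 16 bits, truncates to 16 bits, and then
// bitcasts/extends to the expected value type; the dword load may then be
// merged with a neighbouring argument load.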
2367
2368/// Coerce an argument which was passed in a different ABI type to the original
2369/// expected value type.
2370SDValue SITargetLowering::convertABITypeToValueType(SelectionDAG &DAG,
2371 SDValue Val,
2372 CCValAssign &VA,
2373 const SDLoc &SL) const {
2374 EVT ValVT = VA.getValVT();
2375
2376 // If this is an 8 or 16-bit value, it is really passed promoted
2377 // to 32 bits. Insert an assert[sz]ext to capture this, then
2378 // truncate to the right size.
2379 switch (VA.getLocInfo()) {
2380 case CCValAssign::Full:
2381 return Val;
2382 case CCValAssign::BCvt:
2383 return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);
2384 case CCValAssign::SExt:
2385 Val = DAG.getNode(ISD::AssertSext, SL, VA.getLocVT(), Val,
2386 DAG.getValueType(ValVT));
2387 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2388 case CCValAssign::ZExt:
2389 Val = DAG.getNode(ISD::AssertZext, SL, VA.getLocVT(), Val,
2390 DAG.getValueType(ValVT));
2391 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2392 case CCValAssign::AExt:
2393 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2394 default:
2395 llvm_unreachable("Unknown loc info!");
2396 }
2397}
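// For instance, an i16 value that was promoted to i32 with SExt location info
// comes back as AssertSext(i32 Val, i16) followed by a truncate to i16; BCvt
// simply bitcasts between same-sized types, AExt only needs the truncate, and
// Full returns the value unchanged.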
2398
2399SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2400 CCValAssign &VA, const SDLoc &SL,
2401 SDValue Chain,
2402 const ISD::InputArg &Arg) const {
2403 MachineFunction &MF = DAG.getMachineFunction();
2404 MachineFrameInfo &MFI = MF.getFrameInfo();
2405
2406 if (Arg.Flags.isByVal()) {
2407 unsigned Size = Arg.Flags.getByValSize();
2408 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2409 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2410 }
2411
2412 unsigned ArgOffset = VA.getLocMemOffset();
2413 unsigned ArgSize = VA.getValVT().getStoreSize();
2414
2415 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2416
2417 // Create load nodes to retrieve arguments from the stack.
2418 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2419
2420 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2422 MVT MemVT = VA.getValVT();
2423
2424 switch (VA.getLocInfo()) {
2425 default:
2426 break;
2427 case CCValAssign::BCvt:
2428 MemVT = VA.getLocVT();
2429 break;
2430 case CCValAssign::SExt:
2431 ExtType = ISD::SEXTLOAD;
2432 break;
2433 case CCValAssign::ZExt:
2434 ExtType = ISD::ZEXTLOAD;
2435 break;
2436 case CCValAssign::AExt:
2437 ExtType = ISD::EXTLOAD;
2438 break;
2439 }
2440
2441 SDValue ArgValue = DAG.getExtLoad(
2442 ExtType, SL, VA.getLocVT(), Chain, FIN,
2444
2445 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2446 if (ConvertedVal == ArgValue)
2447 return ConvertedVal;
2448
2449 return DAG.getMergeValues({ConvertedVal, ArgValue.getValue(1)}, SL);
2450}
2451
2452SDValue SITargetLowering::lowerWorkGroupId(
2453 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2456 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
2457 if (!Subtarget->hasClusters())
2458 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2459
2460 // Clusters are supported. Return the global position in the grid. If clusters
2461 // are enabled, WorkGroupIdPV returns the cluster ID, not the workgroup ID.
2462
2463 // WorkGroupIdXYZ = ClusterId == 0 ?
2464 // ClusterIdXYZ :
2465 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
2466 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2467 SDLoc SL(ClusterIdXYZ);
2468 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2469 SDValue One = DAG.getConstant(1, SL, VT);
2470 SDValue ClusterSizeXYZ = DAG.getNode(ISD::ADD, SL, VT, ClusterMaxIdXYZ, One);
2471 SDValue ClusterWorkGroupIdXYZ =
2472 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2473 SDValue GlobalIdXYZ =
2474 DAG.getNode(ISD::ADD, SL, VT, ClusterWorkGroupIdXYZ,
2475 DAG.getNode(ISD::MUL, SL, VT, ClusterIdXYZ, ClusterSizeXYZ));
2476
2477 switch (MFI.getClusterDims().getKind()) {
2480 return GlobalIdXYZ;
2482 return ClusterIdXYZ;
2484 using namespace AMDGPU::Hwreg;
2485 SDValue ClusterIdField =
2486 DAG.getTargetConstant(HwregEncoding::encode(ID_IB_STS2, 6, 4), SL, VT);
2487 SDNode *GetReg =
2488 DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2489 SDValue ClusterId(GetReg, 0);
2490 SDValue Zero = DAG.getConstant(0, SL, VT);
2491 return DAG.getNode(ISD::SELECT_CC, SL, VT, ClusterId, Zero, ClusterIdXYZ,
2492 GlobalIdXYZ, DAG.getCondCode(ISD::SETEQ));
2493 }
2494 }
2495
2496 llvm_unreachable("nothing should reach here");
2497}
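// Worked example of the formula above: if each cluster is 4 workgroups wide
// in X (ClusterMaxIdX = 3), then for ClusterIdX = 2 and ClusterWorkGroupIdX = 1
// the global workgroup id is 2 * (3 + 1) + 1 = 9. If the dispatch is not
// clustered (the cluster ID field read from IB_STS2 is 0), ClusterIdXYZ
// already is the workgroup id and the select returns it unchanged.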
2498
2499SDValue SITargetLowering::getPreloadedValue(
2500 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2502 const ArgDescriptor *Reg = nullptr;
2503 const TargetRegisterClass *RC;
2504 LLT Ty;
2505
2507 const ArgDescriptor WorkGroupIDX =
2508 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2509 // If GridZ is not programmed in an entry function then the hardware will set
2510 // it to all zeros, so there is no need to mask the GridY value in the low
2511 // order bits.
2512 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2513 AMDGPU::TTMP7,
2514 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2515 const ArgDescriptor WorkGroupIDZ =
2516 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2517 const ArgDescriptor ClusterWorkGroupIDX =
2518 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
2519 const ArgDescriptor ClusterWorkGroupIDY =
2520 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
2521 const ArgDescriptor ClusterWorkGroupIDZ =
2522 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
2523 const ArgDescriptor ClusterWorkGroupMaxIDX =
2524 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
2525 const ArgDescriptor ClusterWorkGroupMaxIDY =
2526 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
2527 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2528 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
2529 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2530 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
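// As encoded by the masks above, TTMP6 packs the cluster-related IDs into
// 4-bit fields: [3:0] ClusterWorkGroupIDX, [7:4] IDY, [11:8] IDZ,
// [15:12] MaxIDX, [19:16] MaxIDY, [23:20] MaxIDZ, [27:24] MaxFlatID, while
// TTMP9 holds WorkGroupIDX and TTMP7 holds WorkGroupIDY/IDZ in its low/high
// halves.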
2531
2532 auto LoadConstant = [&](unsigned N) {
2533 return DAG.getConstant(N, SDLoc(), VT);
2534 };
2535
2536 if (Subtarget->hasArchitectedSGPRs() &&
2538 AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims();
2539 bool HasFixedDims = ClusterDims.isFixedDims();
2540
2541 switch (PVID) {
2543 Reg = &WorkGroupIDX;
2544 RC = &AMDGPU::SReg_32RegClass;
2545 Ty = LLT::scalar(32);
2546 break;
2548 Reg = &WorkGroupIDY;
2549 RC = &AMDGPU::SReg_32RegClass;
2550 Ty = LLT::scalar(32);
2551 break;
2553 Reg = &WorkGroupIDZ;
2554 RC = &AMDGPU::SReg_32RegClass;
2555 Ty = LLT::scalar(32);
2556 break;
2558 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
2559 return LoadConstant(0);
2560 Reg = &ClusterWorkGroupIDX;
2561 RC = &AMDGPU::SReg_32RegClass;
2562 Ty = LLT::scalar(32);
2563 break;
2565 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
2566 return LoadConstant(0);
2567 Reg = &ClusterWorkGroupIDY;
2568 RC = &AMDGPU::SReg_32RegClass;
2569 Ty = LLT::scalar(32);
2570 break;
2572 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
2573 return LoadConstant(0);
2574 Reg = &ClusterWorkGroupIDZ;
2575 RC = &AMDGPU::SReg_32RegClass;
2576 Ty = LLT::scalar(32);
2577 break;
2579 if (HasFixedDims)
2580 return LoadConstant(ClusterDims.getDims()[0] - 1);
2581 Reg = &ClusterWorkGroupMaxIDX;
2582 RC = &AMDGPU::SReg_32RegClass;
2583 Ty = LLT::scalar(32);
2584 break;
2586 if (HasFixedDims)
2587 return LoadConstant(ClusterDims.getDims()[1] - 1);
2588 Reg = &ClusterWorkGroupMaxIDY;
2589 RC = &AMDGPU::SReg_32RegClass;
2590 Ty = LLT::scalar(32);
2591 break;
2593 if (HasFixedDims)
2594 return LoadConstant(ClusterDims.getDims()[2] - 1);
2595 Reg = &ClusterWorkGroupMaxIDZ;
2596 RC = &AMDGPU::SReg_32RegClass;
2597 Ty = LLT::scalar(32);
2598 break;
2600 Reg = &ClusterWorkGroupMaxFlatID;
2601 RC = &AMDGPU::SReg_32RegClass;
2602 Ty = LLT::scalar(32);
2603 break;
2604 default:
2605 break;
2606 }
2607 }
2608
2609 if (!Reg)
2610 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2611 if (!Reg) {
2613 // It's possible for a kernarg intrinsic call to appear in a kernel with
2614 // no allocated segment, in which case we do not add the user sgpr
2615 // argument, so just return null.
2616 return DAG.getConstant(0, SDLoc(), VT);
2617 }
2618
2619 // It's undefined behavior if a function marked with the amdgpu-no-*
2620 // attributes uses the corresponding intrinsic.
2621 return DAG.getPOISON(VT);
2622 }
2623
2624 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2625}
2626
2628 CallingConv::ID CallConv,
2629 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2630 FunctionType *FType,
2632 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2633 const ISD::InputArg *Arg = &Ins[I];
2634
2635 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2636 "vector type argument should have been split");
2637
2638 // First check if it's a PS input addr.
2639 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2640 PSInputNum <= 15) {
2641 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2642
2643 // Inconveniently only the first part of the split is marked as isSplit,
2644 // so skip to the end. We only want to increment PSInputNum once for the
2645 // entire split argument.
2646 if (Arg->Flags.isSplit()) {
2647 while (!Arg->Flags.isSplitEnd()) {
2648 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2649 "unexpected vector split in ps argument type");
2650 if (!SkipArg)
2651 Splits.push_back(*Arg);
2652 Arg = &Ins[++I];
2653 }
2654 }
2655
2656 if (SkipArg) {
2657 // We can safely skip PS inputs.
2658 Skipped.set(Arg->getOrigArgIndex());
2659 ++PSInputNum;
2660 continue;
2661 }
2662
2663 Info->markPSInputAllocated(PSInputNum);
2664 if (Arg->Used)
2665 Info->markPSInputEnabled(PSInputNum);
2666
2667 ++PSInputNum;
2668 }
2669
2670 Splits.push_back(*Arg);
2671 }
2672}
2673
2674// Allocate special inputs passed in VGPRs.
2676 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2677 SIMachineFunctionInfo &Info) const {
2678 const LLT S32 = LLT::scalar(32);
2680
2681 if (Info.hasWorkItemIDX()) {
2682 Register Reg = AMDGPU::VGPR0;
2683 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2684
2685 CCInfo.AllocateReg(Reg);
2686 unsigned Mask =
2687 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2688 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2689 }
2690
2691 if (Info.hasWorkItemIDY()) {
2692 assert(Info.hasWorkItemIDX());
2693 if (Subtarget->hasPackedTID()) {
2694 Info.setWorkItemIDY(
2695 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2696 } else {
2697 unsigned Reg = AMDGPU::VGPR1;
2698 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2699
2700 CCInfo.AllocateReg(Reg);
2701 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2702 }
2703 }
2704
2705 if (Info.hasWorkItemIDZ()) {
2706 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2707 if (Subtarget->hasPackedTID()) {
2708 Info.setWorkItemIDZ(
2709 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2710 } else {
2711 unsigned Reg = AMDGPU::VGPR2;
2712 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2713
2714 CCInfo.AllocateReg(Reg);
2715 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2716 }
2717 }
2718}
2719
2720 // Try to allocate a VGPR at the end of the argument list, or, if no argument
2721 // VGPRs are left, allocate a stack slot.
2722 // If \p Mask is given, it indicates the bitfield position in the register.
2723 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2724static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2725 ArgDescriptor Arg = ArgDescriptor()) {
2726 if (Arg.isSet())
2727 return ArgDescriptor::createArg(Arg, Mask);
2728
2729 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2730 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2731 if (RegIdx == ArgVGPRs.size()) {
2732 // Spill to stack required.
2733 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2734
2735 return ArgDescriptor::createStack(Offset, Mask);
2736 }
2737
2738 unsigned Reg = ArgVGPRs[RegIdx];
2739 Reg = CCInfo.AllocateReg(Reg);
2740 assert(Reg != AMDGPU::NoRegister);
2741
2742 MachineFunction &MF = CCInfo.getMachineFunction();
2743 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2744 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2745 return ArgDescriptor::createRegister(Reg, Mask);
2746}
2747
2749 const TargetRegisterClass *RC,
2750 unsigned NumArgRegs) {
2751 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2752 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2753 if (RegIdx == ArgSGPRs.size())
2754 report_fatal_error("ran out of SGPRs for arguments");
2755
2756 unsigned Reg = ArgSGPRs[RegIdx];
2757 Reg = CCInfo.AllocateReg(Reg);
2758 assert(Reg != AMDGPU::NoRegister);
2759
2760 MachineFunction &MF = CCInfo.getMachineFunction();
2761 MF.addLiveIn(Reg, RC);
2763}
2764
2765// If this has a fixed position, we still should allocate the register in the
2766// CCInfo state. Technically we could get away with this for values passed
2767// outside of the normal argument range.
2769 const TargetRegisterClass *RC,
2770 MCRegister Reg) {
2771 Reg = CCInfo.AllocateReg(Reg);
2772 assert(Reg != AMDGPU::NoRegister);
2773 MachineFunction &MF = CCInfo.getMachineFunction();
2774 MF.addLiveIn(Reg, RC);
2775}
2776
2777static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2778 if (Arg) {
2779 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2780 Arg.getRegister());
2781 } else
2782 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2783}
2784
2785static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2786 if (Arg) {
2787 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2788 Arg.getRegister());
2789 } else
2790 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2791}
2792
2793/// Allocate implicit function VGPR arguments at the end of allocated user
2794/// arguments.
2796 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2797 SIMachineFunctionInfo &Info) const {
2798 const unsigned Mask = 0x3ff;
2799 ArgDescriptor Arg;
2800
2801 if (Info.hasWorkItemIDX()) {
2802 Arg = allocateVGPR32Input(CCInfo, Mask);
2803 Info.setWorkItemIDX(Arg);
2804 }
2805
2806 if (Info.hasWorkItemIDY()) {
2807 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2808 Info.setWorkItemIDY(Arg);
2809 }
2810
2811 if (Info.hasWorkItemIDZ())
2812 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2813}
2814
2815/// Allocate implicit function VGPR arguments in fixed registers.
2817 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2818 SIMachineFunctionInfo &Info) const {
2819 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2820 if (!Reg)
2821 report_fatal_error("failed to allocate VGPR for implicit arguments");
2822
2823 const unsigned Mask = 0x3ff;
2824 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2825 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2826 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2827}
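// With this fixed layout all three workitem IDs share a single VGPR (v31):
// bits [9:0] hold the X id, [19:10] the Y id and [29:20] the Z id, matching
// the 0x3ff masks above.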
2828
2830 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2831 SIMachineFunctionInfo &Info) const {
2832 auto &ArgInfo = Info.getArgInfo();
2833 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2834
2835 // TODO: Unify handling with private memory pointers.
2836 if (UserSGPRInfo.hasDispatchPtr())
2837 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2838
2839 if (UserSGPRInfo.hasQueuePtr())
2840 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2841
2842 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2843 // constant offset from the kernarg segment.
2844 if (Info.hasImplicitArgPtr())
2845 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2846
2847 if (UserSGPRInfo.hasDispatchID())
2848 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2849
2850 // flat_scratch_init is not applicable for non-kernel functions.
2851
2852 if (Info.hasWorkGroupIDX())
2853 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2854
2855 if (Info.hasWorkGroupIDY())
2856 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2857
2858 if (Info.hasWorkGroupIDZ())
2859 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2860
2861 if (Info.hasLDSKernelId())
2862 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2863}
2864
2865// Allocate special inputs passed in user SGPRs.
2867 MachineFunction &MF,
2868 const SIRegisterInfo &TRI,
2869 SIMachineFunctionInfo &Info) const {
2870 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2871 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2872 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2873 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2874 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2875 }
2876
2877 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2878 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2879 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2880 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2881 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2882 }
2883
2884 if (UserSGPRInfo.hasDispatchPtr()) {
2885 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2886 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2887 CCInfo.AllocateReg(DispatchPtrReg);
2888 }
2889
2890 if (UserSGPRInfo.hasQueuePtr()) {
2891 Register QueuePtrReg = Info.addQueuePtr(TRI);
2892 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2893 CCInfo.AllocateReg(QueuePtrReg);
2894 }
2895
2896 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2898 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2899 CCInfo.AllocateReg(InputPtrReg);
2900
2901 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2902 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2903 }
2904
2905 if (UserSGPRInfo.hasDispatchID()) {
2906 Register DispatchIDReg = Info.addDispatchID(TRI);
2907 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2908 CCInfo.AllocateReg(DispatchIDReg);
2909 }
2910
2911 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2912 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2913 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2914 CCInfo.AllocateReg(FlatScratchInitReg);
2915 }
2916
2917 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2918 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2919 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2920 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2921 }
2922
2923 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2924 // these from the dispatch pointer.
2925}
2926
2927 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2928 // sequential, starting from the first argument.
2930 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2932 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2933 Function &F = MF.getFunction();
2934 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2935 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2936 bool InPreloadSequence = true;
2937 unsigned InIdx = 0;
2938 bool AlignedForImplictArgs = false;
2939 unsigned ImplicitArgOffset = 0;
2940 for (auto &Arg : F.args()) {
2941 if (!InPreloadSequence || !Arg.hasInRegAttr())
2942 break;
2943
2944 unsigned ArgIdx = Arg.getArgNo();
2945 // Don't preload non-original args or parts not in the current preload
2946 // sequence.
2947 if (InIdx < Ins.size() &&
2948 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2949 break;
2950
2951 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2952 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2953 InIdx++) {
2954 assert(ArgLocs[ArgIdx].isMemLoc());
2955 auto &ArgLoc = ArgLocs[InIdx];
2956 const Align KernelArgBaseAlign = Align(16);
2957 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2958 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2959 unsigned NumAllocSGPRs =
2960 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2961
2962 // Fix alignment for hidden arguments.
2963 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2964 if (!AlignedForImplictArgs) {
2965 ImplicitArgOffset =
2966 alignTo(LastExplicitArgOffset,
2967 Subtarget->getAlignmentForImplicitArgPtr()) -
2968 LastExplicitArgOffset;
2969 AlignedForImplictArgs = true;
2970 }
2971 ArgOffset += ImplicitArgOffset;
2972 }
2973
2974 // Arg is preloaded into the previous SGPR.
2975 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2976 assert(InIdx >= 1 && "No previous SGPR");
2977 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2978 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2979 continue;
2980 }
2981
2982 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2983 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2984 // Check for free user SGPRs for preloading.
2985 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2986 InPreloadSequence = false;
2987 break;
2988 }
2989
2990 // Preload this argument.
2991 const TargetRegisterClass *RC =
2992 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2993 SmallVectorImpl<MCRegister> *PreloadRegs =
2994 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2995
2996 if (PreloadRegs->size() > 1)
2997 RC = &AMDGPU::SGPR_32RegClass;
2998 for (auto &Reg : *PreloadRegs) {
2999 assert(Reg);
3000 MF.addLiveIn(Reg, RC);
3001 CCInfo.AllocateReg(Reg);
3002 }
3003
3004 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3005 }
3006 }
3007}
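// Example of the padding computation above: if the previous argument ended at
// byte offset 4 (LastExplicitArgOffset) and the next preloadable argument
// starts at offset 12, Padding is 8 bytes and PaddingSGPRs = 2, so two user
// SGPRs are skipped before the argument's own NumAllocSGPRs are assigned; the
// whole request must still fit into the remaining free user SGPRs or the
// preload sequence stops.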
3008
3010 const SIRegisterInfo &TRI,
3011 SIMachineFunctionInfo &Info) const {
3012 // Always allocate this last since it is a synthetic preload.
3013 if (Info.hasLDSKernelId()) {
3014 Register Reg = Info.addLDSKernelId();
3015 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3016 CCInfo.AllocateReg(Reg);
3017 }
3018}
3019
3020// Allocate special input registers that are initialized per-wave.
3023 CallingConv::ID CallConv,
3024 bool IsShader) const {
3025 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3026 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
3027 // Note: user SGPRs are handled by the front-end for graphics shaders
3028 // Pad up the used user SGPRs with dead inputs.
3029
3030 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
3031 // before enabling architected SGPRs for workgroup IDs.
3032 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
3033
3034 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3035 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
3036 // rely on it to reach 16 since if we end up having no stack usage, it will
3037 // not really be added.
3038 unsigned NumRequiredSystemSGPRs =
3039 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3040 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3041 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3042 Register Reg = Info.addReservedUserSGPR();
3043 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3044 CCInfo.AllocateReg(Reg);
3045 }
3046 }
3047
3048 if (!HasArchitectedSGPRs) {
3049 if (Info.hasWorkGroupIDX()) {
3050 Register Reg = Info.addWorkGroupIDX();
3051 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3052 CCInfo.AllocateReg(Reg);
3053 }
3054
3055 if (Info.hasWorkGroupIDY()) {
3056 Register Reg = Info.addWorkGroupIDY();
3057 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3058 CCInfo.AllocateReg(Reg);
3059 }
3060
3061 if (Info.hasWorkGroupIDZ()) {
3062 Register Reg = Info.addWorkGroupIDZ();
3063 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3064 CCInfo.AllocateReg(Reg);
3065 }
3066 }
3067
3068 if (Info.hasWorkGroupInfo()) {
3069 Register Reg = Info.addWorkGroupInfo();
3070 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3071 CCInfo.AllocateReg(Reg);
3072 }
3073
3074 if (Info.hasPrivateSegmentWaveByteOffset()) {
3075 // Scratch wave offset passed in system SGPR.
3076 unsigned PrivateSegmentWaveByteOffsetReg;
3077
3078 if (IsShader) {
3079 PrivateSegmentWaveByteOffsetReg =
3080 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3081
3082 // This is true if the scratch wave byte offset doesn't have a fixed
3083 // location.
3084 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3085 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
3086 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3087 }
3088 } else
3089 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3090
3091 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3092 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
3093 }
3094
3095 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
3096 Info.getNumPreloadedSGPRs() >= 16);
3097}
3098
3100 MachineFunction &MF,
3101 const SIRegisterInfo &TRI,
3103 // Now that we've figured out where the scratch register inputs are, see if
3104 // we should reserve the arguments and use them directly.
3105 MachineFrameInfo &MFI = MF.getFrameInfo();
3106 bool HasStackObjects = MFI.hasStackObjects();
3107 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3108
3109 // Record that we know we have non-spill stack objects so we don't need to
3110 // check all stack objects later.
3111 if (HasStackObjects)
3112 Info.setHasNonSpillStackObjects(true);
3113
3114 // Everything live out of a block is spilled with fast regalloc, so it's
3115 // almost certain that spilling will be required.
3116 if (TM.getOptLevel() == CodeGenOptLevel::None)
3117 HasStackObjects = true;
3118
3119 // For now assume stack access is needed in any callee functions, so we need
3120 // the scratch registers to pass in.
3121 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
3122
3123 if (!ST.enableFlatScratch()) {
3124 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
3125 // If we have stack objects, we unquestionably need the private buffer
3126 // resource. For the Code Object V2 ABI, this will be the first 4 user
3127 // SGPR inputs. We can reserve those and use them directly.
3128
3129 Register PrivateSegmentBufferReg =
3131 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3132 } else {
3133 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
3134 // We tentatively reserve the last registers (skipping the last registers
3135 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
3136 // we'll replace these with the ones immediately after those which were
3137 // really allocated. In the prologue copies will be inserted from the
3138 // argument to these reserved registers.
3139
3140 // Without HSA, relocations are used for the scratch pointer and the
3141 // buffer resource setup is always inserted in the prologue. Scratch wave
3142 // offset is still in an input SGPR.
3143 Info.setScratchRSrcReg(ReservedBufferReg);
3144 }
3145 }
3146
3148
3149 // For entry functions we have to set up the stack pointer if we use it,
3150 // whereas non-entry functions get this "for free". This means there is no
3151 // intrinsic advantage to using S32 over S34 in cases where we do not have
3152 // calls but do need a frame pointer (i.e. if we are requested to have one
3153 // because frame pointer elimination is disabled). To keep things simple we
3154 // only ever use S32 as the call ABI stack pointer, and so using it does not
3155 // imply we need a separate frame pointer.
3156 //
3157 // Try to use s32 as the SP, but move it if it would interfere with input
3158 // arguments. This won't work with calls though.
3159 //
3160 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
3161 // registers.
3162 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
3163 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3164 } else {
3166
3167 if (MFI.hasCalls())
3168 report_fatal_error("call in graphics shader with too many input SGPRs");
3169
3170 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3171 if (!MRI.isLiveIn(Reg)) {
3172 Info.setStackPtrOffsetReg(Reg);
3173 break;
3174 }
3175 }
3176
3177 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3178 report_fatal_error("failed to find register for SP");
3179 }
3180
3181 // hasFP should be accurate for entry functions even before the frame is
3182 // finalized, because it does not rely on the known stack size, only
3183 // properties like whether variable sized objects are present.
3184 if (ST.getFrameLowering()->hasFP(MF)) {
3185 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3186 }
3187}
3188
3191 return !Info->isEntryFunction();
3192}
3193
3195
3197 MachineBasicBlock *Entry,
3198 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
3200
3201 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3202 if (!IStart)
3203 return;
3204
3205 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3206 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
3207 MachineBasicBlock::iterator MBBI = Entry->begin();
3208 for (const MCPhysReg *I = IStart; *I; ++I) {
3209 const TargetRegisterClass *RC = nullptr;
3210 if (AMDGPU::SReg_64RegClass.contains(*I))
3211 RC = &AMDGPU::SGPR_64RegClass;
3212 else if (AMDGPU::SReg_32RegClass.contains(*I))
3213 RC = &AMDGPU::SGPR_32RegClass;
3214 else
3215 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3216
3217 Register NewVR = MRI->createVirtualRegister(RC);
3218 // Create copy from CSR to a virtual register.
3219 Entry->addLiveIn(*I);
3220 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
3221 .addReg(*I);
3222
3223 // Insert the copy-back instructions right before the terminator.
3224 for (auto *Exit : Exits)
3225 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
3226 TII->get(TargetOpcode::COPY), *I)
3227 .addReg(NewVR);
3228 }
3229}
3230
3232 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3233 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3234 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3236
3238 const Function &Fn = MF.getFunction();
3241 bool IsError = false;
3242
3243 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
3245 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3246 IsError = true;
3247 }
3248
3251 BitVector Skipped(Ins.size());
3252 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3253 *DAG.getContext());
3254
3255 bool IsGraphics = AMDGPU::isGraphics(CallConv);
3256 bool IsKernel = AMDGPU::isKernel(CallConv);
3257 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
3258
3259 if (IsGraphics) {
3260 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3261 assert(!UserSGPRInfo.hasDispatchPtr() &&
3262 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3263 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3264 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3265 (void)UserSGPRInfo;
3266 if (!Subtarget->enableFlatScratch())
3267 assert(!UserSGPRInfo.hasFlatScratchInit());
3268 if ((CallConv != CallingConv::AMDGPU_CS &&
3269 CallConv != CallingConv::AMDGPU_Gfx &&
3270 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3271 !Subtarget->hasArchitectedSGPRs())
3272 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3273 !Info->hasWorkGroupIDZ());
3274 }
3275
3276 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3277
3278 if (CallConv == CallingConv::AMDGPU_PS) {
3279 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3280
3281 // At least one interpolation mode must be enabled or else the GPU will
3282 // hang.
3283 //
3284 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3285 // set PSInputAddr, the user wants to enable some bits after the compilation
3286 // based on run-time states. Since we can't know what the final PSInputEna
3287 // will look like, we shouldn't do anything here and the user should take
3288 // responsibility for the correct programming.
3289 //
3290 // Otherwise, the following restrictions apply:
3291 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3292 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3293 // enabled too.
3294 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3295 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3296 CCInfo.AllocateReg(AMDGPU::VGPR0);
3297 CCInfo.AllocateReg(AMDGPU::VGPR1);
3298 Info->markPSInputAllocated(0);
3299 Info->markPSInputEnabled(0);
3300 }
3301 if (Subtarget->isAmdPalOS()) {
3302 // For isAmdPalOS, the user does not enable some bits after compilation
3303 // based on run-time states; the register values being generated here are
3304 // the final ones set in hardware. Therefore we need to apply the
3305 // workaround to PSInputAddr and PSInputEnable together. (The case where
3306 // a bit is set in PSInputAddr but not PSInputEnable is where the
3307 // frontend set up an input arg for a particular interpolation mode, but
3308 // nothing uses that input arg. Really we should have an earlier pass
3309 // that removes such an arg.)
3310 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3311 if ((PsInputBits & 0x7F) == 0 ||
3312 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3313 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
3314 }
3315 } else if (IsKernel) {
3316 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3317 } else {
3318 Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3319 Ins.end());
3320 }
3321
3322 if (IsKernel)
3323 analyzeFormalArgumentsCompute(CCInfo, Ins);
3324
3325 if (IsEntryFunc) {
3326 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
3327 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
3328 if (IsKernel && Subtarget->hasKernargPreload())
3329 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
3330
3331 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
3332 } else if (!IsGraphics) {
3333 // For the fixed ABI, pass workitem IDs in the last argument register.
3334 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3335
3336 // FIXME: Sink this into allocateSpecialInputSGPRs
3337 if (!Subtarget->enableFlatScratch())
3338 CCInfo.AllocateReg(Info->getScratchRSrcReg());
3339
3340 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3341 }
3342
3343 if (!IsKernel) {
3344 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3345 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3346
3347 // This assumes the registers are allocated by CCInfo in ascending order
3348 // with no gaps.
3349 Info->setNumWaveDispatchSGPRs(
3350 CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3351 Info->setNumWaveDispatchVGPRs(
3352 CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3353 } else if (Info->getNumKernargPreloadedSGPRs()) {
3354 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3355 }
3356
3358
3359 if (IsWholeWaveFunc) {
3361 {MVT::i1, MVT::Other}, Chain);
3362 InVals.push_back(Setup.getValue(0));
3363 Chains.push_back(Setup.getValue(1));
3364 }
3365
3366 // FIXME: This is the minimum kernel argument alignment. We should improve
3367 // this to the maximum alignment of the arguments.
3368 //
3369 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3370 // kern arg offset.
3371 const Align KernelArgBaseAlign = Align(16);
3372
3373 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3374 ++i) {
3375 const ISD::InputArg &Arg = Ins[i];
3376 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3377 InVals.push_back(DAG.getPOISON(Arg.VT));
3378 continue;
3379 }
3380
3381 CCValAssign &VA = ArgLocs[ArgIdx++];
3382 MVT VT = VA.getLocVT();
3383
3384 if (IsEntryFunc && VA.isMemLoc()) {
3385 VT = Ins[i].VT;
3386 EVT MemVT = VA.getLocVT();
3387
3388 const uint64_t Offset = VA.getLocMemOffset();
3389 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
3390
3391 if (Arg.Flags.isByRef()) {
3392 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3393
3394 const GCNTargetMachine &TM =
3395 static_cast<const GCNTargetMachine &>(getTargetMachine());
3396 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3397 Arg.Flags.getPointerAddrSpace())) {
3398 Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS,
3399 Arg.Flags.getPointerAddrSpace());
3400 }
3401
3402 InVals.push_back(Ptr);
3403 continue;
3404 }
3405
3406 SDValue NewArg;
3407 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3408 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3409 // In this case the argument is packed into the previous preload SGPR.
3410 int64_t AlignDownOffset = alignDown(Offset, 4);
3411 int64_t OffsetDiff = Offset - AlignDownOffset;
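// For example (illustrative): a 2-byte argument at kernarg offset 6 gives
// AlignDownOffset = 4 and OffsetDiff = 2, so the value is recovered below by
// shifting the preloaded 32-bit SGPR right by OffsetDiff * 8 = 16 bits and
// truncating.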
3412 EVT IntVT = MemVT.changeTypeToInteger();
3413
3414 const SIMachineFunctionInfo *Info =
3415 MF.getInfo<SIMachineFunctionInfo>();
3417 Register Reg =
3418 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3419
3420 assert(Reg);
3421 Register VReg = MRI.getLiveInVirtReg(Reg);
3422 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3423
3424 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3425 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3426
3427 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3428 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3429 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3430 Ins[i].Flags.isSExt(), &Ins[i]);
3431
3432 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3433 } else {
3434 const SIMachineFunctionInfo *Info =
3435 MF.getInfo<SIMachineFunctionInfo>();
3437 const SmallVectorImpl<MCRegister> &PreloadRegs =
3438 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3439
3440 SDValue Copy;
3441 if (PreloadRegs.size() == 1) {
3442 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3443 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3444 NewArg = DAG.getCopyFromReg(
3445 Chain, DL, VReg,
3446 EVT::getIntegerVT(*DAG.getContext(),
3447 TRI->getRegSizeInBits(*RC)));
3448
3449 } else {
3450 // If the kernarg alignment does not match the alignment of the SGPR
3451 // tuple RC that can accommodate this argument, it will be built up
3452 // via copies from the individual SGPRs that the argument was
3453 // preloaded to.
3454 SmallVector<SDValue, 4> Elts;
3455 for (auto Reg : PreloadRegs) {
3456 Register VReg = MRI.getLiveInVirtReg(Reg);
3457 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3458 Elts.push_back(Copy);
3459 }
3460 NewArg =
3461 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3462 PreloadRegs.size()),
3463 DL, Elts);
3464 }
3465
3466 // If the argument was preloaded to multiple consecutive 32-bit
3467 // registers because of misalignment between addressable SGPR tuples
3468 // and the argument size, we can still assume, because of kernarg
3469 // segment alignment restrictions, that NewArg's size is the same as
3470 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
3471 // truncate since we cannot preload to less than a single SGPR and the
3472 // MemVT may be smaller.
3473 EVT MemVTInt =
3474 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
3475 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3476 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3477
3478 NewArg = DAG.getBitcast(MemVT, NewArg);
3479 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3480 Ins[i].Flags.isSExt(), &Ins[i]);
3481 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3482 }
3483 } else {
3484 // Hidden arguments that are in the kernel signature must be preloaded
3485 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3486 // the argument list and is not preloaded.
3487 if (Arg.isOrigArg()) {
3488 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3489 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3490 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
3491 *OrigArg->getParent(),
3492 "hidden argument in kernel signature was not preloaded",
3493 DL.getDebugLoc()));
3494 }
3495 }
3496
3497 NewArg =
3498 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3499 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3500 }
3501 Chains.push_back(NewArg.getValue(1));
3502
3503 auto *ParamTy =
3504 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3505 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3506 ParamTy &&
3507 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3508 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3509 // On SI, local pointers are just offsets into LDS, so they always fit
3510 // in 16 bits. On CI and newer they could potentially be real pointers,
3511 // so we can't guarantee their size.
3512 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3513 DAG.getValueType(MVT::i16));
3514 }
3515
3516 InVals.push_back(NewArg);
3517 continue;
3518 }
3519 if (!IsEntryFunc && VA.isMemLoc()) {
3520 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3521 InVals.push_back(Val);
3522 if (!Arg.Flags.isByVal())
3523 Chains.push_back(Val.getValue(1));
3524 continue;
3525 }
3526
3527 assert(VA.isRegLoc() && "Parameter must be in a register!");
3528
3529 Register Reg = VA.getLocReg();
3530 const TargetRegisterClass *RC = nullptr;
3531 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3532 RC = &AMDGPU::VGPR_32RegClass;
3533 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3534 RC = &AMDGPU::SGPR_32RegClass;
3535 else
3536 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3537
3538 Reg = MF.addLiveIn(Reg, RC);
3539 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3540
3541 if (Arg.Flags.isSRet()) {
3542 // The return object should be reasonably addressable.
3543
3544 // FIXME: This helps when the return is a real sret. If it is an
3545 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3546 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3547 unsigned NumBits =
3549 Val = DAG.getNode(
3550 ISD::AssertZext, DL, VT, Val,
3551 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3552 }
3553
3554 Val = convertABITypeToValueType(DAG, Val, VA, DL);
3555 InVals.push_back(Val);
3556 }
3557
3558 // Start adding system SGPRs.
3559 if (IsEntryFunc)
3560 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3561
3562 // DAG.getPass() returns nullptr when using new pass manager.
3563 // TODO: Use DAG.getMFAM() to access analysis result.
3564 if (DAG.getPass()) {
3565 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3566 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3567 }
3568
3569 unsigned StackArgSize = CCInfo.getStackSize();
3570 Info->setBytesInStackArgArea(StackArgSize);
3571
3572 return Chains.empty() ? Chain
3573 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3574}
3575
3576// TODO: If return values can't fit in registers, we should return as many as
3577// possible in registers before passing on stack.
3578 bool SITargetLowering::CanLowerReturn(
3579 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3580 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3581 const Type *RetTy) const {
3582 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3583 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3584 // for shaders. Vector types should be explicitly handled by CC.
3585 if (AMDGPU::isEntryFunctionCC(CallConv))
3586 return true;
3587
3588 SmallVector<CCValAssign, 16> RVLocs;
3589 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3590 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3591 return false;
3592
3593 // We must use the stack if return would require unavailable registers.
3594 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3595 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3596 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3597 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3598 return false;
3599
3600 return true;
3601}
3602
3603SDValue
3604 SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3605 bool isVarArg,
3606 const SmallVectorImpl<ISD::OutputArg> &Outs,
3607 const SmallVectorImpl<SDValue> &OutVals,
3608 const SDLoc &DL, SelectionDAG &DAG) const {
3609 MachineFunction &MF = DAG.getMachineFunction();
3610 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3611 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3612
3613 if (AMDGPU::isKernel(CallConv)) {
3614 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3615 OutVals, DL, DAG);
3616 }
3617
3618 bool IsShader = AMDGPU::isShader(CallConv);
3619
3620 Info->setIfReturnsVoid(Outs.empty());
3621 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3622
3623 // CCValAssign - represents the assignment of the return value to a location.
3624 SmallVector<CCValAssign, 48> RVLocs;
3625
3626 // CCState - Info about the registers and stack slots.
3627 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3628 *DAG.getContext());
3629
3630 // Analyze outgoing return values.
3631 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3632
3633 SDValue Glue;
3634 SmallVector<SDValue, 48> RetOps;
3635 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3636
3637 SDValue ReadFirstLane =
3638 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3639 // Copy the result values into the output registers.
3640 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3641 ++I, ++RealRVLocIdx) {
3642 CCValAssign &VA = RVLocs[I];
3643 assert(VA.isRegLoc() && "Can only return in registers!");
3644 // TODO: Partially return in registers if return values don't fit.
3645 SDValue Arg = OutVals[RealRVLocIdx];
3646
3647 // Copied from other backends.
3648 switch (VA.getLocInfo()) {
3649 case CCValAssign::Full:
3650 break;
3651 case CCValAssign::BCvt:
3652 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3653 break;
3654 case CCValAssign::SExt:
3655 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3656 break;
3657 case CCValAssign::ZExt:
3658 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3659 break;
3660 case CCValAssign::AExt:
3661 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3662 break;
3663 default:
3664 llvm_unreachable("Unknown loc info!");
3665 }
3666 if (TRI->isSGPRPhysReg(VA.getLocReg()))
3668 ReadFirstLane, Arg);
3669 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3670 Glue = Chain.getValue(1);
3671 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3672 }
3673
3674 // FIXME: Does sret work properly?
3675 if (!Info->isEntryFunction()) {
3676 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3677 const MCPhysReg *I =
3678 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3679 if (I) {
3680 for (; *I; ++I) {
3681 if (AMDGPU::SReg_64RegClass.contains(*I))
3682 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3683 else if (AMDGPU::SReg_32RegClass.contains(*I))
3684 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3685 else
3686 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3687 }
3688 }
3689 }
3690
3691 // Update chain and glue.
3692 RetOps[0] = Chain;
3693 if (Glue.getNode())
3694 RetOps.push_back(Glue);
3695
3696 unsigned Opc = AMDGPUISD::ENDPGM;
3697 if (!IsWaveEnd)
3698 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3699 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3700 : AMDGPUISD::RET_GLUE;
3701 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3702}
3703
3704 SDValue SITargetLowering::LowerCallResult(
3705 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3706 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3707 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3708 SDValue ThisVal) const {
3709 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3710
3711 // Assign locations to each value returned by this call.
3712 SmallVector<CCValAssign, 16> RVLocs;
3713 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3714 *DAG.getContext());
3715 CCInfo.AnalyzeCallResult(Ins, RetCC);
3716
3717 // Copy all of the result registers out of their specified physreg.
3718 for (CCValAssign VA : RVLocs) {
3719 SDValue Val;
3720
3721 if (VA.isRegLoc()) {
3722 Val =
3723 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3724 Chain = Val.getValue(1);
3725 InGlue = Val.getValue(2);
3726 } else if (VA.isMemLoc()) {
3727 report_fatal_error("TODO: return values in memory");
3728 } else
3729 llvm_unreachable("unknown argument location type");
3730
3731 switch (VA.getLocInfo()) {
3732 case CCValAssign::Full:
3733 break;
3734 case CCValAssign::BCvt:
3735 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3736 break;
3737 case CCValAssign::ZExt:
3738 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3739 DAG.getValueType(VA.getValVT()));
3740 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3741 break;
3742 case CCValAssign::SExt:
3743 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3744 DAG.getValueType(VA.getValVT()));
3745 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3746 break;
3747 case CCValAssign::AExt:
3748 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3749 break;
3750 default:
3751 llvm_unreachable("Unknown loc info!");
3752 }
3753
3754 InVals.push_back(Val);
3755 }
3756
3757 return Chain;
3758}
3759
3760 // Add code to pass the special inputs required by the used features, separate
3761 // from the explicit user arguments present in the IR.
3762 void SITargetLowering::passSpecialInputs(
3763 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3764 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3765 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3766 // If we don't have a call site, this was a call inserted by
3767 // legalization. These can never use special inputs.
3768 if (!CLI.CB)
3769 return;
3770
3771 SelectionDAG &DAG = CLI.DAG;
3772 const SDLoc &DL = CLI.DL;
3773 const Function &F = DAG.getMachineFunction().getFunction();
3774
3775 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3776 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3777
3778 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3779 &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
3780 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3781 // DAG.getPass() returns nullptr when using new pass manager.
3782 // TODO: Use DAG.getMFAM() to access analysis result.
3783 if (DAG.getPass()) {
3784 auto &ArgUsageInfo =
3785 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3786 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3787 }
3788 }
3789
3790 // TODO: Unify with private memory register handling. This is complicated by
3791 // the fact that at least in kernels, the input argument is not necessarily
3792 // in the same location as the input.
3793 // clang-format off
3794 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3795 StringLiteral> ImplicitAttrs[] = {
3796 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3797 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3798 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3799 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3800 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3801 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
3802 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
3803 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
3804 };
3805 // clang-format on
3806
3807 for (auto [InputID, Attr] : ImplicitAttrs) {
3808 // If the callee does not use the attribute value, skip copying the value.
3809 if (CLI.CB->hasFnAttr(Attr))
3810 continue;
3811
3812 const auto [OutgoingArg, ArgRC, ArgTy] =
3813 CalleeArgInfo->getPreloadedValue(InputID);
3814 if (!OutgoingArg)
3815 continue;
3816
3817 const auto [IncomingArg, IncomingArgRC, Ty] =
3818 CallerArgInfo.getPreloadedValue(InputID);
3819 assert(IncomingArgRC == ArgRC);
3820
3821 // All special arguments are ints for now.
3822 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3823 SDValue InputReg;
3824
3825 if (IncomingArg) {
3826 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3827 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3828 // The implicit arg ptr is special because it doesn't have a corresponding
3829 // input for kernels, and is computed from the kernarg segment pointer.
3830 InputReg = getImplicitArgPtr(DAG, DL);
3831 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3832 std::optional<uint32_t> Id =
3833 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
3834 if (Id.has_value()) {
3835 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3836 } else {
3837 InputReg = DAG.getPOISON(ArgVT);
3838 }
3839 } else {
3840 // We may have proven the input wasn't needed, although the ABI still
3841 // requires it. We just need to allocate the register appropriately.
3842 InputReg = DAG.getPOISON(ArgVT);
3843 }
3844
3845 if (OutgoingArg->isRegister()) {
3846 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3847 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3848 report_fatal_error("failed to allocate implicit input argument");
3849 } else {
3850 unsigned SpecialArgOffset =
3851 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3852 SDValue ArgStore =
3853 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3854 MemOpChains.push_back(ArgStore);
3855 }
3856 }
3857
3858 // Pack workitem IDs into a single register, or pass them as-is if already
3859 // packed.
3860
3861 auto [OutgoingArg, ArgRC, Ty] =
3862 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3863 if (!OutgoingArg)
3864 std::tie(OutgoingArg, ArgRC, Ty) =
3865 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3866 if (!OutgoingArg)
3867 std::tie(OutgoingArg, ArgRC, Ty) =
3868 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3869 if (!OutgoingArg)
3870 return;
3871
3872 const ArgDescriptor *IncomingArgX = std::get<0>(
3873 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X));
3874 const ArgDescriptor *IncomingArgY = std::get<0>(
3875 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
3876 const ArgDescriptor *IncomingArgZ = std::get<0>(
3877 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z));
3878
3879 SDValue InputReg;
3880 SDLoc SL;
3881
3882 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3883 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3884 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3885
3886 // If incoming ids are not packed we need to pack them.
3887 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3888 NeedWorkItemIDX) {
3889 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3890 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3891 } else {
3892 InputReg = DAG.getConstant(0, DL, MVT::i32);
3893 }
3894 }
3895
3896 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3897 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3898 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3899 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3900 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3901 InputReg = InputReg.getNode()
3902 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3903 : Y;
3904 }
3905
3906 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3907 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3908 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3909 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3910 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3911 InputReg = InputReg.getNode()
3912 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3913 : Z;
3914 }
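// At this point InputReg, if set, holds the IDs in the packed layout the
// callee expects: bits [9:0] = X, [19:10] = Y, [29:20] = Z, matching the
// shifts by 10 and 20 above.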
3915
3916 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3917 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3918 // We're in a situation where the outgoing function requires the workitem
3919 // ID, but the calling function does not have it (e.g. a graphics function
3920 // calling a C calling convention function). This is illegal, but we need
3921 // to produce something.
3922 InputReg = DAG.getPOISON(MVT::i32);
3923 } else {
3924 // Workitem IDs are already packed; any of the present incoming arguments
3925 // will carry all required fields.
3926 ArgDescriptor IncomingArg =
3927 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3928 : IncomingArgY ? *IncomingArgY
3929 : *IncomingArgZ,
3930 ~0u);
3931 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3932 }
3933 }
3934
3935 if (OutgoingArg->isRegister()) {
3936 if (InputReg)
3937 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3938
3939 CCInfo.AllocateReg(OutgoingArg->getRegister());
3940 } else {
3941 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3942 if (InputReg) {
3943 SDValue ArgStore =
3944 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3945 MemOpChains.push_back(ArgStore);
3946 }
3947 }
3948}
3949
3950 bool SITargetLowering::isEligibleForTailCallOptimization(
3951 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3952 const SmallVectorImpl<ISD::OutputArg> &Outs,
3953 const SmallVectorImpl<SDValue> &OutVals,
3954 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3955 if (AMDGPU::isChainCC(CalleeCC))
3956 return true;
3957
3958 if (!AMDGPU::mayTailCallThisCC(CalleeCC))
3959 return false;
3960
3961 // For a divergent call target, we need to do a waterfall loop over the
3962 // possible callees which precludes us from using a simple jump.
3963 if (Callee->isDivergent())
3964 return false;
3965
3966 MachineFunction &MF = DAG.getMachineFunction();
3967 const Function &CallerF = MF.getFunction();
3968 CallingConv::ID CallerCC = CallerF.getCallingConv();
3969 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3970 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3971
3972 // Kernels aren't callable, and don't have a live-in return address, so it
3973 // doesn't make sense to do a tail call with entry functions.
3974 if (!CallerPreserved)
3975 return false;
3976
3977 bool CCMatch = CallerCC == CalleeCC;
3978
3980 if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
3981 return true;
3982 return false;
3983 }
3984
3985 // TODO: Can we handle var args?
3986 if (IsVarArg)
3987 return false;
3988
3989 for (const Argument &Arg : CallerF.args()) {
3990 if (Arg.hasByValAttr())
3991 return false;
3992 }
3993
3994 LLVMContext &Ctx = *DAG.getContext();
3995
3996 // Check that the call results are passed in the same way.
3997 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3998 CCAssignFnForCall(CalleeCC, IsVarArg),
3999 CCAssignFnForCall(CallerCC, IsVarArg)))
4000 return false;
4001
4002 // The callee has to preserve all registers the caller needs to preserve.
4003 if (!CCMatch) {
4004 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4005 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4006 return false;
4007 }
4008
4009 // Nothing more to check if the callee is taking no arguments.
4010 if (Outs.empty())
4011 return true;
4012
4013 SmallVector<CCValAssign, 16> ArgLocs;
4014 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4015
4016 // FIXME: We are not allocating special input registers, so we will be
4017 // deciding based on incorrect register assignments.
4018 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
4019
4020 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4021 // If the stack arguments for this call do not fit into our own save area then
4022 // the call cannot be made tail.
4023 // TODO: Is this really necessary?
4024 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
4025 return false;
4026
4027 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
4028 // FIXME: What about inreg arguments that end up passed in memory?
4029 if (!CCVA.isRegLoc())
4030 continue;
4031
4032 // If we are passing an argument in an SGPR, and the value is divergent,
4033 // this call requires a waterfall loop.
4034 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4035 LLVM_DEBUG(
4036 dbgs() << "Cannot tail call due to divergent outgoing argument in "
4037 << printReg(CCVA.getLocReg(), TRI) << '\n');
4038 return false;
4039 }
4040 }
4041
4042 const MachineRegisterInfo &MRI = MF.getRegInfo();
4043 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
4044}
4045
4046 bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
4047 if (!CI->isTailCall())
4048 return false;
4049
4050 const Function *ParentFn = CI->getParent()->getParent();
4051 if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
4052 return false;
4053 return true;
4054}
4055
4056namespace {
4057// Chain calls have special arguments that we need to handle. These are
4058// tagging along at the end of the arguments list(s), after the SGPR and VGPR
4059// arguments (index 0 and 1 respectively).
4060enum ChainCallArgIdx {
4061 Exec = 2,
4062 Flags,
4063 NumVGPRs,
4064 FallbackExec,
4065 FallbackCallee
4066};
4067} // anonymous namespace
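// So, counting from the start of the outgoing argument list, a chain call
// carries the SGPR and VGPR argument structs (indices 0 and 1), the EXEC mask
// (2) and a flags word (3); when bit 0 of the flags is set, the dynamic-VGPR
// arguments follow: number of VGPRs (4), fallback EXEC (5) and fallback
// callee (6), as handled below.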
4068
4069// The wave scratch offset register is used as the global base pointer.
4070 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
4071 SmallVectorImpl<SDValue> &InVals) const {
4072 CallingConv::ID CallConv = CLI.CallConv;
4073 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
4074
4075 SelectionDAG &DAG = CLI.DAG;
4076
4077 const SDLoc &DL = CLI.DL;
4078 SDValue Chain = CLI.Chain;
4079 SDValue Callee = CLI.Callee;
4080
4081 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
4082 bool UsesDynamicVGPRs = false;
4083 if (IsChainCallConv) {
4084 // The last arguments should be the value that we need to put in EXEC,
4085 // followed by the flags and any other arguments with special meanings.
4086 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
4087 // we don't treat them like the "real" arguments.
4088 auto RequestedExecIt =
4089 llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
4090 return Arg.OrigArgIndex == 2;
4091 });
4092 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
4093
4094 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
4095 CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
4096 CLI.OutVals.end());
4097 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
4098
4099 assert(CLI.Outs.back().OrigArgIndex < 2 &&
4100 "Haven't popped all the special args");
4101
4102 TargetLowering::ArgListEntry RequestedExecArg =
4103 CLI.Args[ChainCallArgIdx::Exec];
4104 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
4105 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
4106
4107 // Convert constants into TargetConstants, so they become immediate operands
4108 // instead of being selected into S_MOV.
4109 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
4110 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
4111 ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
4112 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
4113 } else
4114 ChainCallSpecialArgs.push_back(Arg.Node);
4115 };
4116
4117 PushNodeOrTargetConstant(RequestedExecArg);
4118
4119 // Process any other special arguments depending on the value of the flags.
4120 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
4121
4122 const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
4123 if (FlagsValue.isZero()) {
4124 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
4125 return lowerUnhandledCall(CLI, InVals,
4126 "no additional args allowed if flags == 0");
4127 } else if (FlagsValue.isOneBitSet(0)) {
4128 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4129 return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
4130 }
4131
4132 if (!Subtarget->isWave32()) {
4133 return lowerUnhandledCall(
4134 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
4135 }
4136
4137 UsesDynamicVGPRs = true;
4138 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
4139 CLI.Args.end(), PushNodeOrTargetConstant);
4140 }
4141 }
4142
4143 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
4144 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
4145 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
4146 bool &IsTailCall = CLI.IsTailCall;
4147 bool IsVarArg = CLI.IsVarArg;
4148 bool IsSibCall = false;
4149 MachineFunction &MF = DAG.getMachineFunction();
4150
4151 if (Callee.isUndef() || isNullConstant(Callee)) {
4152 if (!CLI.IsTailCall) {
4153 for (ISD::InputArg &Arg : CLI.Ins)
4154 InVals.push_back(DAG.getPOISON(Arg.VT));
4155 }
4156
4157 return Chain;
4158 }
4159
4160 if (IsVarArg) {
4161 return lowerUnhandledCall(CLI, InVals,
4162 "unsupported call to variadic function ");
4163 }
4164
4165 if (!CLI.CB)
4166 return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
4167
4168 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
4169 return lowerUnhandledCall(CLI, InVals,
4170 "unsupported required tail call to function ");
4171 }
4172
4173 if (IsTailCall) {
4174 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
4175 Outs, OutVals, Ins, DAG);
4176 if (!IsTailCall &&
4177 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
4178 report_fatal_error("failed to perform tail call elimination on a call "
4179 "site marked musttail or on llvm.amdgcn.cs.chain");
4180 }
4181
4182 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4183
4184 // A sibling call is one where we're under the usual C ABI and not planning
4185 // to change that but can still do a tail call:
4186 if (!TailCallOpt && IsTailCall)
4187 IsSibCall = true;
4188
4189 if (IsTailCall)
4190 ++NumTailCalls;
4191 }
4192
4195 SmallVector<SDValue, 8> MemOpChains;
4196
4197 // Analyze operands of the call, assigning locations to each operand.
4198 SmallVector<CCValAssign, 16> ArgLocs;
4199 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4200 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
4201
4202 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
4204 // With a fixed ABI, allocate fixed registers before user arguments.
4205 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
4206 }
4207
4208 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
4209
4210 // Get a count of how many bytes are to be pushed on the stack.
4211 unsigned NumBytes = CCInfo.getStackSize();
4212
4213 if (IsSibCall) {
4214 // Since we're not changing the ABI to make this a tail call, the memory
4215 // operands are already available in the caller's incoming argument space.
4216 NumBytes = 0;
4217 }
4218
4219 // FPDiff is the byte offset of the call's argument area from the callee's.
4220 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4221 // by this amount for a tail call. In a sibling call it must be 0 because the
4222 // caller will deallocate the entire stack and the callee still expects its
4223 // arguments to begin at SP+0. Completely unused for non-tail calls.
4224 int32_t FPDiff = 0;
4225 MachineFrameInfo &MFI = MF.getFrameInfo();
4226 auto *TRI = Subtarget->getRegisterInfo();
4227
4228 // Adjust the stack pointer for the new arguments...
4229 // These operations are automatically eliminated by the prolog/epilog pass
4230 if (!IsSibCall)
4231 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
4232
4233 if (!IsSibCall || IsChainCallConv) {
4234 if (!Subtarget->enableFlatScratch()) {
4235 SmallVector<SDValue, 4> CopyFromChains;
4236
4237 // In the HSA case, this should be an identity copy.
4238 SDValue ScratchRSrcReg =
4239 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
4240 RegsToPass.emplace_back(IsChainCallConv
4241 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4242 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4243 ScratchRSrcReg);
4244 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
4245 Chain = DAG.getTokenFactor(DL, CopyFromChains);
4246 }
4247 }
4248
4249 const unsigned NumSpecialInputs = RegsToPass.size();
4250
4251 MVT PtrVT = MVT::i32;
4252
4253 // Walk the register/memloc assignments, inserting copies/loads.
4254 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4255 CCValAssign &VA = ArgLocs[i];
4256 SDValue Arg = OutVals[i];
4257
4258 // Promote the value if needed.
4259 switch (VA.getLocInfo()) {
4260 case CCValAssign::Full:
4261 break;
4262 case CCValAssign::BCvt:
4263 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4264 break;
4265 case CCValAssign::ZExt:
4266 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4267 break;
4268 case CCValAssign::SExt:
4269 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
4270 break;
4271 case CCValAssign::AExt:
4272 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
4273 break;
4274 case CCValAssign::FPExt:
4275 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
4276 break;
4277 default:
4278 llvm_unreachable("Unknown loc info!");
4279 }
4280
4281 if (VA.isRegLoc()) {
4282 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
4283 } else {
4284 assert(VA.isMemLoc());
4285
4286 SDValue DstAddr;
4287 MachinePointerInfo DstInfo;
4288
4289 unsigned LocMemOffset = VA.getLocMemOffset();
4290 int32_t Offset = LocMemOffset;
4291
4292 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
4293 MaybeAlign Alignment;
4294
4295 if (IsTailCall) {
4296 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4297 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4298 : VA.getValVT().getStoreSize();
4299
4300 // FIXME: We can have better than the minimum byval required alignment.
4301 Alignment =
4302 Flags.isByVal()
4303 ? Flags.getNonZeroByValAlign()
4304 : commonAlignment(Subtarget->getStackAlignment(), Offset);
4305
4306 Offset = Offset + FPDiff;
4307 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
4308
4309 DstAddr = DAG.getFrameIndex(FI, PtrVT);
4310 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4311
4312 // Make sure any stack arguments overlapping with where we're storing
4313 // are loaded before this eventual operation. Otherwise they'll be
4314 // clobbered.
4315
4316 // FIXME: Why is this really necessary? This seems to just result in a
4317 // lot of code to copy the stack and write them back to the same
4318 // locations, which are supposed to be immutable?
4319 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
4320 } else {
4321 // Stores to the argument stack area are relative to the stack pointer.
4322 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
4323 MVT::i32);
4324 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
4325 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
4326 Alignment =
4327 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
4328 }
4329
4330 if (Outs[i].Flags.isByVal()) {
4331 SDValue SizeNode =
4332 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4333 SDValue Cpy =
4334 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
4335 Outs[i].Flags.getNonZeroByValAlign(),
4336 /*isVol = */ false, /*AlwaysInline = */ true,
4337 /*CI=*/nullptr, std::nullopt, DstInfo,
4339
4340 MemOpChains.push_back(Cpy);
4341 } else {
4342 SDValue Store =
4343 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4344 MemOpChains.push_back(Store);
4345 }
4346 }
4347 }
4348
4349 if (!MemOpChains.empty())
4350 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4351
4352 SDValue ReadFirstLaneID =
4353 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4354
4355 SDValue TokenGlue;
4356 if (CLI.ConvergenceControlToken) {
4357 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4358 CLI.ConvergenceControlToken);
4359 }
4360
4361 // Build a sequence of copy-to-reg nodes chained together with token chain
4362 // and flag operands which copy the outgoing args into the appropriate regs.
4363 SDValue InGlue;
4364
4365 unsigned ArgIdx = 0;
4366 for (auto [Reg, Val] : RegsToPass) {
4367 if (ArgIdx++ >= NumSpecialInputs &&
4368 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4369 // For chain calls, the inreg arguments are required to be
4370 // uniform. Speculatively insert a readfirstlane in case we cannot prove
4371 // they are uniform.
4372 //
4373 // For other calls, if an inreg argument is known to be uniform,
4374 // speculatively insert a readfirstlane in case it is in a VGPR.
4375 //
4376 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4377 // value, so let that continue to produce invalid code.
4378
4379 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4380 if (TokenGlue)
4381 ReadfirstlaneArgs.push_back(TokenGlue);
4383 ReadfirstlaneArgs);
4384 }
4385
4386 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
4387 InGlue = Chain.getValue(1);
4388 }
4389
4390 // We don't usually want to end the call-sequence here because we would tidy
4391 // the frame up *after* the call; however, in the ABI-changing tail-call case
4392 // we've carefully laid out the parameters so that when sp is reset they'll be
4393 // in the correct location.
4394 if (IsTailCall && !IsSibCall) {
4395 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
4396 InGlue = Chain.getValue(1);
4397 }
4398
4399 std::vector<SDValue> Ops({Chain});
4400
4401 // Add a redundant copy of the callee global which will not be legalized, as
4402 // we need direct access to the callee later.
4403 if (const GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
4404 const GlobalValue *GV = GSD->getGlobal();
4405 Ops.push_back(Callee);
4406 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
4407 } else {
4408 if (IsTailCall) {
4409 // isEligibleForTailCallOptimization considered whether the call target is
4410 // divergent, but we may still end up with a uniform value in a VGPR.
4411 // Insert a readfirstlane just in case.
4412 SDValue ReadFirstLaneID =
4413 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4414
4415 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4416 if (TokenGlue)
4417 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
4418 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
4419 ReadfirstlaneArgs);
4420 }
4421
4422 Ops.push_back(Callee);
4423 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
4424 }
4425
4426 if (IsTailCall) {
4427 // Each tail call may have to adjust the stack by a different amount, so
4428 // this information must travel along with the operation for eventual
4429 // consumption by emitEpilogue.
4430 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4431 }
4432
4433 if (IsChainCallConv)
4434 llvm::append_range(Ops, ChainCallSpecialArgs);
4435
4436 // Add argument registers to the end of the list so that they are known live
4437 // into the call.
4438 for (auto &[Reg, Val] : RegsToPass)
4439 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
4440
4441 // Add a register mask operand representing the call-preserved registers.
4442 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4443 assert(Mask && "Missing call preserved mask for calling convention");
4444 Ops.push_back(DAG.getRegisterMask(Mask));
4445
4446 if (SDValue Token = CLI.ConvergenceControlToken) {
4447 SmallVector<SDValue, 2> GlueOps;
4448 GlueOps.push_back(Token);
4449 if (InGlue)
4450 GlueOps.push_back(InGlue);
4451
4452 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
4453 MVT::Glue, GlueOps),
4454 0);
4455 }
4456
4457 if (InGlue)
4458 Ops.push_back(InGlue);
4459
4460 // If we're doing a tail call, use a TC_RETURN here rather than an
4461 // actual call instruction.
4462 if (IsTailCall) {
4463 MFI.setHasTailCall();
4464 unsigned OPC = AMDGPUISD::TC_RETURN;
4465 switch (CallConv) {
4468 break;
4471 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4472 : AMDGPUISD::TC_RETURN_CHAIN;
4473 break;
4474 }
4475
4476 // If the caller is a whole wave function, we need to use a special opcode
4477 // so we can patch up EXEC.
4478 if (Info->isWholeWaveFunction())
4480
4481 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4482 }
4483
4484 // Returns a chain and a flag for retval copy to use.
4485 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4486 Chain = Call.getValue(0);
4487 InGlue = Call.getValue(1);
4488
4489 uint64_t CalleePopBytes = NumBytes;
4490 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4491 if (!Ins.empty())
4492 InGlue = Chain.getValue(1);
4493
4494 // Handle result values, copying them out of physregs into vregs that we
4495 // return.
4496 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4497 InVals, /*IsThisReturn=*/false, SDValue());
4498}
4499
4500// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4501// except for:
4502 // 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4503 // 2. Scaled size, where scaled size = wave-reduction(alloca-size) * wave-size.
4505 SelectionDAG &DAG) const {
4506 const MachineFunction &MF = DAG.getMachineFunction();
4507 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4508
4509 SDLoc dl(Op);
4510 EVT VT = Op.getValueType();
4511 SDValue Chain = Op.getOperand(0);
4512 Register SPReg = Info->getStackPtrOffsetReg();
4513
4514 // Chain the dynamic stack allocation so that it doesn't modify the stack
4515 // pointer when other instructions are using the stack.
4516 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4517
4518 SDValue Size = Op.getOperand(1);
4519 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4520 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4521
4522 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4523 assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
4524 "Stack grows upwards for AMDGPU");
4525
4526 Chain = BaseAddr.getValue(1);
4527 Align StackAlign = TFL->getStackAlign();
4528 if (Alignment > StackAlign) {
4529 uint64_t ScaledAlignment = Alignment.value()
4530 << Subtarget->getWavefrontSizeLog2();
4531 uint64_t StackAlignMask = ScaledAlignment - 1;
4532 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4533 DAG.getConstant(StackAlignMask, dl, VT));
4534 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4535 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4536 }
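// E.g. (illustrative) a requested 16-byte alignment on a wave64 subtarget is
// scaled to 16 << 6 = 1024 here, since the stack pointer used by this code
// tracks wave-scaled scratch offsets rather than per-lane addresses.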
4537
4538 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4539 SDValue NewSP;
4540 if (isa<ConstantSDNode>(Size)) {
4541 // For a constant-sized alloca, scale the alloca size by the wave size.
4542 SDValue ScaledSize = DAG.getNode(
4543 ISD::SHL, dl, VT, Size,
4544 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4545 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4546 } else {
4547 // For a dynamically sized alloca, perform a wave-wide reduction to get the
4548 // max of the (divergent) alloca size and then scale it by the wave size.
4549 SDValue WaveReduction =
4550 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4551 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4552 Size, DAG.getConstant(0, dl, MVT::i32));
4553 SDValue ScaledSize = DAG.getNode(
4554 ISD::SHL, dl, VT, Size,
4555 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4556 NewSP =
4557 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4558 SDValue ReadFirstLaneID =
4559 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4560 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4561 NewSP);
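// The wave_reduce_umax above makes the possibly divergent size uniform by
// taking the maximum across the wave, and the readfirstlane keeps the updated
// stack pointer in a scalar register before it is copied back into SPReg.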
4562 }
4563
4564 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4565 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4566
4567 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4568}
4569
4571 if (Op.getValueType() != MVT::i32)
4572 return Op; // Defer to cannot select error.
4573
4575 SDLoc SL(Op);
4576
4577 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4578
4579 // Convert from wave uniform to swizzled vector address. This should protect
4580 // from any edge cases where the stacksave result isn't directly used with
4581 // stackrestore.
4582 SDValue VectorAddress =
4583 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4584 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4585}
4586
4588 SelectionDAG &DAG) const {
4589 SDLoc SL(Op);
4590 assert(Op.getValueType() == MVT::i32);
4591
4592 uint32_t BothRoundHwReg =
4594 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4595
4596 SDValue IntrinID =
4597 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4598 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4599 Op.getOperand(0), IntrinID, GetRoundBothImm);
4600
4601 // There are two rounding modes, one for f32 and one for f64/f16. We only
4602 // report in the standard value range if both are the same.
4603 //
4604 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4605 // ties away from zero is not supported, and the other values are rotated by
4606 // 1.
4607 //
4608 // If the two rounding modes are not the same, report a target defined value.
4609
4610 // Mode register rounding mode fields:
4611 //
4612 // [1:0] Single-precision round mode.
4613 // [3:2] Double/Half-precision round mode.
4614 //
4615 // 0 = nearest even, 1 = +infinity, 2 = -infinity, 3 = toward zero.
4616 //
4617 //                Hardware   Spec
4618 //  Toward-0         3        0
4619 //  Nearest Even     0        1
4620 //  +Inf             1        2
4621 //  -Inf             2        3
4622 //  NearestAway0    N/A       4
4623 //
4624 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4625 // table we can index by the raw hardware mode.
4626 //
4627 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
4628
4629 SDValue BitTable =
4630 DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64);
4631
4632 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4633 SDValue RoundModeTimesNumBits =
4634 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4635
4636 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4637 // knew only one mode was demanded.
4638 SDValue TableValue =
4639 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4640 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4641
4642 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4643 SDValue TableEntry =
4644 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4645
4646 // There's a gap between the 4-bit encoded table values and the actual enum
4647 // values, so offset if it's an extended value.
4648 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4649 SDValue IsStandardValue =
4650 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4651 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4652 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4653 TableEntry, EnumOffset);
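// In other words, table entries 0-3 are already the standard FLT_ROUNDS
// values and are returned unchanged, while the extended entries (4 and up)
// are reported in the target-defined range starting at 8.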
4654
4655 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4656}
4657
4659 SelectionDAG &DAG) const {
4660 SDLoc SL(Op);
4661
4662 SDValue NewMode = Op.getOperand(1);
4663 assert(NewMode.getValueType() == MVT::i32);
4664
4665 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4666 // hardware MODE.fp_round values.
4667 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4668 uint32_t ClampedVal = std::min(
4669 static_cast<uint32_t>(ConstMode->getZExtValue()),
4671 NewMode = DAG.getConstant(
4672 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4673 } else {
4674 // If we know the input can only be one of the supported standard modes in
4675 // the range 0-3, we can use a simplified mapping to hardware values.
4676 KnownBits KB = DAG.computeKnownBits(NewMode);
4677 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4678 // The supported standard values are 0-3. The extended values start at 8. We
4679 // need to offset by 4 if the value is in the extended range.
4680
4681 if (UseReducedTable) {
4682 // Truncate to the low 32-bits.
4683 SDValue BitTable = DAG.getConstant(
4684 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4685
4686 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4687 SDValue RoundModeTimesNumBits =
4688 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4689
4690 NewMode =
4691 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4692
4693 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4694 // the table extracted bits into inline immediates.
4695 } else {
4696 // table_index = umin(value, value - 4)
4697 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
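//
// The umin below maps the standard inputs 0-3 to themselves (value - 4 wraps
// to a huge unsigned value) and the extended inputs starting at 8 down to 4
// and up, giving a compact index into the conversion table.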
4698 SDValue BitTable =
4699 DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
4700
4701 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4702 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4703 SDValue IndexVal =
4704 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4705
4706 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4707 SDValue RoundModeTimesNumBits =
4708 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4709
4710 SDValue TableValue =
4711 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4712 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4713
4714 // No need to mask out the high bits since the setreg will ignore them
4715 // anyway.
4716 NewMode = TruncTable;
4717 }
4718
4719 // Insert a readfirstlane in case the value is a VGPR. We could do this
4720 // earlier and keep more operations scalar, but that interferes with
4721 // combining the source.
4722 SDValue ReadFirstLaneID =
4723 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4724 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4725 ReadFirstLaneID, NewMode);
4726 }
4727
4728 // N.B. The setreg will be later folded into s_round_mode on supported
4729 // targets.
4730 SDValue IntrinID =
4731 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4732 uint32_t BothRoundHwReg =
4734 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4735
4736 SDValue SetReg =
4737 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4738 IntrinID, RoundBothImm, NewMode);
4739
4740 return SetReg;
4741}
4742
4744 if (Op->isDivergent() &&
4745 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4746 // Cannot do I$ prefetch with divergent pointer.
4747 return SDValue();
4748
4749 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4753 break;
4755 if (Subtarget->hasSafeSmemPrefetch())
4756 break;
4757 [[fallthrough]];
4758 default:
4759 return SDValue();
4760 }
4761
4762 // I$ prefetch
4763 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4764 return SDValue();
4765
4766 return Op;
4767}
4768
4769// Work around DAG legality rules only based on the result type.
4771 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4772 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4773 EVT SrcVT = Src.getValueType();
4774
4775 if (SrcVT.getScalarType() != MVT::bf16)
4776 return Op;
4777
4778 SDLoc SL(Op);
4779 SDValue BitCast =
4780 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4781
4782 EVT DstVT = Op.getValueType();
4783 if (IsStrict)
4784 llvm_unreachable("Need STRICT_BF16_TO_FP");
4785
4786 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4787}
4788
4790 SDLoc SL(Op);
4791 if (Op.getValueType() != MVT::i64)
4792 return Op;
4793
4794 uint32_t ModeHwReg =
4796 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4797 uint32_t TrapHwReg =
4799 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4800
4801 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4802 SDValue IntrinID =
4803 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4804 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4805 Op.getOperand(0), IntrinID, ModeHwRegImm);
4806 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4807 Op.getOperand(0), IntrinID, TrapHwRegImm);
4808 SDValue TokenReg =
4809 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4810 GetTrapReg.getValue(1));
4811
4812 SDValue CvtPtr =
4813 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4814 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4815
4816 return DAG.getMergeValues({Result, TokenReg}, SL);
4817}
4818
4820 SDLoc SL(Op);
4821 if (Op.getOperand(1).getValueType() != MVT::i64)
4822 return Op;
4823
4824 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4825 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4826 DAG.getConstant(0, SL, MVT::i32));
4827 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4828 DAG.getConstant(1, SL, MVT::i32));
4829
4830 SDValue ReadFirstLaneID =
4831 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4832 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4833 ReadFirstLaneID, NewModeReg);
4834 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4835 ReadFirstLaneID, NewTrapReg);
4836
4837 unsigned ModeHwReg =
4839 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4840 unsigned TrapHwReg =
4842 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4843
4844 SDValue IntrinID =
4845 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4846 SDValue SetModeReg =
4847 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4848 IntrinID, ModeHwRegImm, NewModeReg);
4849 SDValue SetTrapReg =
4850 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4851 IntrinID, TrapHwRegImm, NewTrapReg);
4852 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4853}
4854
4855 Register SITargetLowering::getRegisterByName(const char *RegName, LLT VT,
4856 const MachineFunction &MF) const {
4857 const Function &Fn = MF.getFunction();
4858
4859 Register Reg = StringSwitch<Register>(RegName)
4860 .Case("m0", AMDGPU::M0)
4861 .Case("exec", AMDGPU::EXEC)
4862 .Case("exec_lo", AMDGPU::EXEC_LO)
4863 .Case("exec_hi", AMDGPU::EXEC_HI)
4864 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4865 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4866 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4867 .Default(Register());
4868 if (!Reg)
4869 return Reg;
4870
4871 if (!Subtarget->hasFlatScrRegister() &&
4872 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4873 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
4874 "\" for subtarget."));
4875 }
4876
4877 switch (Reg) {
4878 case AMDGPU::M0:
4879 case AMDGPU::EXEC_LO:
4880 case AMDGPU::EXEC_HI:
4881 case AMDGPU::FLAT_SCR_LO:
4882 case AMDGPU::FLAT_SCR_HI:
4883 if (VT.getSizeInBits() == 32)
4884 return Reg;
4885 break;
4886 case AMDGPU::EXEC:
4887 case AMDGPU::FLAT_SCR:
4888 if (VT.getSizeInBits() == 64)
4889 return Reg;
4890 break;
4891 default:
4892 llvm_unreachable("missing register type checking");
4893 }
4894
4896 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4897}
4898
4899// If kill is not the last instruction, split the block so kill is always a
4900// proper terminator.
4903 MachineBasicBlock *BB) const {
4904 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
4905 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4906 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4907 return SplitBB;
4908}
4909
4910 // Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4911// \p MI will be the only instruction in the loop body block. Otherwise, it will
4912// be the first instruction in the remainder block.
4913//
4914/// \returns { LoopBody, Remainder }
4915static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4916 splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
4917 MachineFunction *MF = MBB.getParent();
4918 MachineBasicBlock::iterator I(&MI);
4919
4920 // To insert the loop we need to split the block. Move everything after this
4921 // point to a new block, and insert a new empty block between the two.
4922 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
4923 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4924 MachineFunction::iterator MBBI(MBB);
4925 ++MBBI;
4926
4927 MF->insert(MBBI, LoopBB);
4928 MF->insert(MBBI, RemainderBB);
4929
4930 LoopBB->addSuccessor(LoopBB);
4931 LoopBB->addSuccessor(RemainderBB);
4932
4933 // Move the rest of the block into a new block.
4934 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4935
4936 if (InstInLoop) {
4937 auto Next = std::next(I);
4938
4939 // Move instruction to loop body.
4940 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4941
4942 // Move the rest of the block.
4943 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4944 } else {
4945 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4946 }
4947
4948 MBB.addSuccessor(LoopBB);
4949
4950 return std::pair(LoopBB, RemainderBB);
4951}
4952
4953/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4955 MachineBasicBlock *MBB = MI.getParent();
4956 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4957 auto I = MI.getIterator();
4958 auto E = std::next(I);
4959
4960 // clang-format off
4961 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4962 .addImm(0);
4963 // clang-format on
4964
4965 MIBundleBuilder Bundler(*MBB, I, E);
4966 finalizeBundle(*MBB, Bundler.begin());
4967}
4968
4971 MachineBasicBlock *BB) const {
4972 const DebugLoc &DL = MI.getDebugLoc();
4973
4975
4977
4978 // Apparently kill flags are only valid if the def is in the same block?
4979 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4980 Src->setIsKill(false);
4981
4982 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
4983
4984 MachineBasicBlock::iterator I = LoopBB->end();
4985
4986 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4988
4989 // Clear TRAP_STS.MEM_VIOL
4990 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4991 .addImm(0)
4992 .addImm(EncodedReg);
4993
4995
4996 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4997
4998 // Load and check TRAP_STS.MEM_VIOL
4999 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
5000 .addImm(EncodedReg);
5001
5002 // FIXME: Do we need to use an isel pseudo that may clobber scc?
5003 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5004 .addReg(Reg, RegState::Kill)
5005 .addImm(0);
5006 // clang-format off
5007 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5008 .addMBB(LoopBB);
5009 // clang-format on
5010
5011 return RemainderBB;
5012}
5013
5014// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
5015// wavefront. If the value is uniform and just happens to be in a VGPR, this
5016// will only do one iteration. In the worst case, this will loop 64 times.
5017//
5018// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
5021 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
5022 const DebugLoc &DL, const MachineOperand &Idx,
5023 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
5024 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
5025 Register &SGPRIdxReg) {
5026
5027 MachineFunction *MF = OrigBB.getParent();
5028 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5029 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5031
5032 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5033 Register PhiExec = MRI.createVirtualRegister(BoolRC);
5034 Register NewExec = MRI.createVirtualRegister(BoolRC);
5035 Register CurrentIdxReg =
5036 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5037 Register CondReg = MRI.createVirtualRegister(BoolRC);
5038
5039 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
5040 .addReg(InitReg)
5041 .addMBB(&OrigBB)
5042 .addReg(ResultReg)
5043 .addMBB(&LoopBB);
5044
5045 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
5046 .addReg(InitSaveExecReg)
5047 .addMBB(&OrigBB)
5048 .addReg(NewExec)
5049 .addMBB(&LoopBB);
5050
5051 // Read the next variant <- also loop target.
5052 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5053 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
5054
5055 // Compare the just read M0 value to all possible Idx values.
5056 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5057 .addReg(CurrentIdxReg)
5058 .addReg(Idx.getReg(), 0, Idx.getSubReg());
5059
5060 // Update EXEC, save the original EXEC value to VCC.
5061 BuildMI(LoopBB, I, DL,
5062 TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
5063 : AMDGPU::S_AND_SAVEEXEC_B64),
5064 NewExec)
5065 .addReg(CondReg, RegState::Kill);
5066
5067 MRI.setSimpleHint(NewExec, CondReg);
5068
5069 if (UseGPRIdxMode) {
5070 if (Offset == 0) {
5071 SGPRIdxReg = CurrentIdxReg;
5072 } else {
5073 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5074 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5075 .addReg(CurrentIdxReg, RegState::Kill)
5076 .addImm(Offset);
5077 }
5078 } else {
5079 // Move index from VCC into M0
5080 if (Offset == 0) {
5081 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5082 .addReg(CurrentIdxReg, RegState::Kill);
5083 } else {
5084 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5085 .addReg(CurrentIdxReg, RegState::Kill)
5086 .addImm(Offset);
5087 }
5088 }
5089
5090 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
5091 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5092 MachineInstr *InsertPt =
5093 BuildMI(LoopBB, I, DL,
5094 TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
5095 : AMDGPU::S_XOR_B64_term),
5096 Exec)
5097 .addReg(Exec)
5098 .addReg(NewExec);
5099
5100 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
5101 // s_cbranch_scc0?
5102
5103 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
5104 // clang-format off
5105 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5106 .addMBB(&LoopBB);
5107 // clang-format on
5108
5109 return InsertPt->getIterator();
5110}
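// Per-iteration shape of the waterfall loop emitted above (sketch; register
// names are illustrative):
//
//   cur  = v_readfirstlane_b32 Idx        // one lane's index value
//   cond = v_cmp_eq_u32 cur, Idx          // every lane sharing that value
//   save = s_and_saveexec cond            // EXEC &= cond, old EXEC kept in save
//   M0 / SGPRIdxReg = cur (+ Offset)
//   <caller inserts the indexed access at the returned insertion point>
//   EXEC = save ^ EXEC                    // lanes not yet handled
//   s_cbranch_execnz loop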
5111
5112 // This has slightly sub-optimal regalloc when the source vector is killed by
5113 // the read. The register allocator does not understand that the kill is
5114 // per-workitem, so the source is kept alive for the whole loop and we end up
5115 // not re-using a subregister from it, using one more VGPR than necessary. That
5116 // VGPR was saved when this was expanded after register allocation.
5119 unsigned InitResultReg, unsigned PhiReg, int Offset,
5120 bool UseGPRIdxMode, Register &SGPRIdxReg) {
5121 MachineFunction *MF = MBB.getParent();
5122 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5123 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5125 const DebugLoc &DL = MI.getDebugLoc();
5127
5128 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
5129 Register DstReg = MI.getOperand(0).getReg();
5130 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
5131 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
5132 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5133 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5134
5135 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
5136
5137 // Save the EXEC mask
5138 // clang-format off
5139 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
5140 .addReg(Exec);
5141 // clang-format on
5142
5143 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
5144
5145 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5146
5147 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
5148 InitResultReg, DstReg, PhiReg, TmpExec,
5149 Offset, UseGPRIdxMode, SGPRIdxReg);
5150
5151 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
5153 ++MBBI;
5154 MF->insert(MBBI, LandingPad);
5155 LoopBB->removeSuccessor(RemainderBB);
5156 LandingPad->addSuccessor(RemainderBB);
5157 LoopBB->addSuccessor(LandingPad);
5158 MachineBasicBlock::iterator First = LandingPad->begin();
5159 // clang-format off
5160 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
5161 .addReg(SaveExec);
5162 // clang-format on
5163
5164 return InsPt;
5165}
5166
5167// Returns subreg index, offset
5168static std::pair<unsigned, int>
5170 const TargetRegisterClass *SuperRC, unsigned VecReg,
5171 int Offset) {
5172 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
5173
5174 // Skip out of bounds offsets, or else we would end up using an undefined
5175 // register.
5176 if (Offset >= NumElts || Offset < 0)
5177 return std::pair(AMDGPU::sub0, Offset);
5178
5179 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
5180}
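// Worked example: for a 128-bit super-register (NumElts = 4), Offset 2 maps to
// (sub2, 0), a plain subregister access, while Offset 5 is out of bounds and
// maps to (sub0, 5), leaving the whole offset to be applied dynamically.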
5181
5184 int Offset) {
5185 MachineBasicBlock *MBB = MI.getParent();
5186 const DebugLoc &DL = MI.getDebugLoc();
5188
5189 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5190
5191 assert(Idx->getReg() != AMDGPU::NoRegister);
5192
5193 if (Offset == 0) {
5194 // clang-format off
5195 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5196 .add(*Idx);
5197 // clang-format on
5198 } else {
5199 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5200 .add(*Idx)
5201 .addImm(Offset);
5202 }
5203}
5204
5207 int Offset) {
5208 MachineBasicBlock *MBB = MI.getParent();
5209 const DebugLoc &DL = MI.getDebugLoc();
5211
5212 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5213
5214 if (Offset == 0)
5215 return Idx->getReg();
5216
5217 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5218 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
5219 .add(*Idx)
5220 .addImm(Offset);
5221 return Tmp;
5222}
5223
5226 const GCNSubtarget &ST) {
5227 const SIInstrInfo *TII = ST.getInstrInfo();
5228 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5229 MachineFunction *MF = MBB.getParent();
5231
5232 Register Dst = MI.getOperand(0).getReg();
5233 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5234 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5235 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5236
5237 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
5238 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5239
5240 unsigned SubReg;
5241 std::tie(SubReg, Offset) =
5242 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
5243
5244 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5245
5246 // Check for a SGPR index.
5247 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5249 const DebugLoc &DL = MI.getDebugLoc();
5250
5251 if (UseGPRIdxMode) {
5252 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5253 // to avoid interfering with other uses, so probably requires a new
5254 // optimization pass.
5256
5257 const MCInstrDesc &GPRIDXDesc =
5258 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5259 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5260 .addReg(SrcReg)
5261 .addReg(Idx)
5262 .addImm(SubReg);
5263 } else {
5265
5266 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5267 .addReg(SrcReg, 0, SubReg)
5268 .addReg(SrcReg, RegState::Implicit);
5269 }
5270
5271 MI.eraseFromParent();
5272
5273 return &MBB;
5274 }
5275
5276 // Control flow needs to be inserted if indexing with a VGPR.
5277 const DebugLoc &DL = MI.getDebugLoc();
5279
5280 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5281 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5282
5283 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
5284
5285 Register SGPRIdxReg;
5286 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
5287 UseGPRIdxMode, SGPRIdxReg);
5288
5289 MachineBasicBlock *LoopBB = InsPt->getParent();
5290
5291 if (UseGPRIdxMode) {
5292 const MCInstrDesc &GPRIDXDesc =
5293 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5294
5295 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5296 .addReg(SrcReg)
5297 .addReg(SGPRIdxReg)
5298 .addImm(SubReg);
5299 } else {
5300 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5301 .addReg(SrcReg, 0, SubReg)
5302 .addReg(SrcReg, RegState::Implicit);
5303 }
5304
5305 MI.eraseFromParent();
5306
5307 return LoopBB;
5308}
5309
5312 const GCNSubtarget &ST) {
5313 const SIInstrInfo *TII = ST.getInstrInfo();
5314 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5315 MachineFunction *MF = MBB.getParent();
5317
5318 Register Dst = MI.getOperand(0).getReg();
5319 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
5320 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5321 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
5322 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5323 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
5324 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5325
5326 // This can be an immediate, but will be folded later.
5327 assert(Val->getReg());
5328
5329 unsigned SubReg;
5330 std::tie(SubReg, Offset) =
5331 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
5332 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5333
5334 if (Idx->getReg() == AMDGPU::NoRegister) {
5336 const DebugLoc &DL = MI.getDebugLoc();
5337
5338 assert(Offset == 0);
5339
5340 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
5341 .add(*SrcVec)
5342 .add(*Val)
5343 .addImm(SubReg);
5344
5345 MI.eraseFromParent();
5346 return &MBB;
5347 }
5348
5349 // Check for a SGPR index.
5350 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5352 const DebugLoc &DL = MI.getDebugLoc();
5353
5354 if (UseGPRIdxMode) {
5356
5357 const MCInstrDesc &GPRIDXDesc =
5358 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5359 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5360 .addReg(SrcVec->getReg())
5361 .add(*Val)
5362 .addReg(Idx)
5363 .addImm(SubReg);
5364 } else {
5366
5367 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5368 TRI.getRegSizeInBits(*VecRC), 32, false);
5369 BuildMI(MBB, I, DL, MovRelDesc, Dst)
5370 .addReg(SrcVec->getReg())
5371 .add(*Val)
5372 .addImm(SubReg);
5373 }
5374 MI.eraseFromParent();
5375 return &MBB;
5376 }
5377
5378 // Control flow needs to be inserted if indexing with a VGPR.
5379 if (Val->isReg())
5380 MRI.clearKillFlags(Val->getReg());
5381
5382 const DebugLoc &DL = MI.getDebugLoc();
5383
5384 Register PhiReg = MRI.createVirtualRegister(VecRC);
5385
5386 Register SGPRIdxReg;
5387 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5388 UseGPRIdxMode, SGPRIdxReg);
5389 MachineBasicBlock *LoopBB = InsPt->getParent();
5390
5391 if (UseGPRIdxMode) {
5392 const MCInstrDesc &GPRIDXDesc =
5393 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5394
5395 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5396 .addReg(PhiReg)
5397 .add(*Val)
5398 .addReg(SGPRIdxReg)
5399 .addImm(SubReg);
5400 } else {
5401 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5402 TRI.getRegSizeInBits(*VecRC), 32, false);
5403 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5404 .addReg(PhiReg)
5405 .add(*Val)
5406 .addImm(SubReg);
5407 }
5408
5409 MI.eraseFromParent();
5410 return LoopBB;
5411}
5412
5414 MachineBasicBlock *BB) {
5415 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5416 // For GFX12, we emit s_add_u64 and s_sub_u64.
5417 MachineFunction *MF = BB->getParent();
5418 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5419 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5421 const DebugLoc &DL = MI.getDebugLoc();
5422 MachineOperand &Dest = MI.getOperand(0);
5423 MachineOperand &Src0 = MI.getOperand(1);
5424 MachineOperand &Src1 = MI.getOperand(2);
5425 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5426 if (ST.hasScalarAddSub64()) {
5427 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5428 // clang-format off
5429 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5430 .add(Src0)
5431 .add(Src1);
5432 // clang-format on
5433 } else {
5434 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5435 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5436
5437 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5438 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5439
5440 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5441 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5442 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5443 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5444
5445 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5446 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5447 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5448 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5449
5450 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5451 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5452 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
5453 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
5454 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5455 .addReg(DestSub0)
5456 .addImm(AMDGPU::sub0)
5457 .addReg(DestSub1)
5458 .addImm(AMDGPU::sub1);
5459 }
5460 MI.eraseFromParent();
5461 return BB;
5462}
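// Without s_add_u64/s_sub_u64, the expansion above is equivalent to (sketch,
// shown for the add case):
//
//   s_add_u32  dst.sub0, src0.sub0, src1.sub0   // sets SCC to the carry-out
//   s_addc_u32 dst.sub1, src0.sub1, src1.sub1   // consumes SCC as carry-in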
5463
5464 static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
5465 switch (Opc) {
5466 case AMDGPU::S_MIN_U32:
5467 return std::numeric_limits<uint32_t>::max();
5468 case AMDGPU::S_MIN_I32:
5469 return std::numeric_limits<int32_t>::max();
5470 case AMDGPU::S_MAX_U32:
5471 return std::numeric_limits<uint32_t>::min();
5472 case AMDGPU::S_MAX_I32:
5473 return std::numeric_limits<int32_t>::min();
5474 case AMDGPU::S_ADD_I32:
5475 case AMDGPU::S_SUB_I32:
5476 case AMDGPU::S_OR_B32:
5477 case AMDGPU::S_XOR_B32:
5478 return std::numeric_limits<uint32_t>::min();
5479 case AMDGPU::S_AND_B32:
5480 return std::numeric_limits<uint32_t>::max();
5481 default:
5483 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5484 }
5485}
5486
5487 static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) {
5488 switch (Opc) {
5489 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5490 return std::numeric_limits<uint64_t>::max();
5491 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5492 return std::numeric_limits<int64_t>::max();
5493 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5494 return std::numeric_limits<uint64_t>::min();
5495 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5496 return std::numeric_limits<int64_t>::min();
5497 case AMDGPU::S_ADD_U64_PSEUDO:
5498 case AMDGPU::S_SUB_U64_PSEUDO:
5499 case AMDGPU::S_OR_B64:
5500 case AMDGPU::S_XOR_B64:
5501 return std::numeric_limits<uint64_t>::min();
5502 case AMDGPU::S_AND_B64:
5503 return std::numeric_limits<uint64_t>::max();
5504 default:
5506 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5507 }
5508}
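// Most of the values above are the identity element of the corresponding
// operation (e.g. min(x, UINT_MAX) == x, umax(x, 0) == x, (x | 0) == x,
// (x & ~0u) == x), so they are safe initial accumulator values for the
// iterative lowering below.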
5509
5510static bool is32bitWaveReduceOperation(unsigned Opc) {
5511 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5512 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5513 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5514 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5515 Opc == AMDGPU::S_XOR_B32;
5516}
5517
5520 const GCNSubtarget &ST,
5521 unsigned Opc) {
5523 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5524 const DebugLoc &DL = MI.getDebugLoc();
5525 const SIInstrInfo *TII = ST.getInstrInfo();
5526
5527 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5528 Register SrcReg = MI.getOperand(1).getReg();
5529 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5530 Register DstReg = MI.getOperand(0).getReg();
5531 MachineBasicBlock *RetBB = nullptr;
5532 if (isSGPR) {
5533 switch (Opc) {
5534 case AMDGPU::S_MIN_U32:
5535 case AMDGPU::S_MIN_I32:
5536 case AMDGPU::S_MAX_U32:
5537 case AMDGPU::S_MAX_I32:
5538 case AMDGPU::S_AND_B32:
5539 case AMDGPU::S_OR_B32: {
5540 // Idempotent operations.
5541 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5542 RetBB = &BB;
5543 break;
5544 }
5545 case AMDGPU::V_CMP_LT_U64_e64: // umin
5546 case AMDGPU::V_CMP_LT_I64_e64: // min
5547 case AMDGPU::V_CMP_GT_U64_e64: // umax
5548 case AMDGPU::V_CMP_GT_I64_e64: // max
5549 case AMDGPU::S_AND_B64:
5550 case AMDGPU::S_OR_B64: {
5551 // Idempotent operations.
5552 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
5553 RetBB = &BB;
5554 break;
5555 }
5556 case AMDGPU::S_XOR_B32:
5557 case AMDGPU::S_XOR_B64:
5558 case AMDGPU::S_ADD_I32:
5559 case AMDGPU::S_ADD_U64_PSEUDO:
5560 case AMDGPU::S_SUB_I32:
5561 case AMDGPU::S_SUB_U64_PSEUDO: {
5562 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5563 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5564 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5565 Register NumActiveLanes =
5566 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5567
5568 bool IsWave32 = ST.isWave32();
5569 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5570 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5571 unsigned BitCountOpc =
5572 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5573
5574 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5575
5576 auto NewAccumulator =
5577 BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
5578 .addReg(ExecMask);
5579
5580 switch (Opc) {
5581 case AMDGPU::S_XOR_B32:
5582 case AMDGPU::S_XOR_B64: {
5583 // Performing an XOR operation on a uniform value
5584 // depends on the parity of the number of active lanes.
5585 // For even parity the result will be 0; for odd
5586 // parity it will be the same as the input value.
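// For example, with a uniform input v and five active lanes the wave-wide
// XOR is v^v^v^v^v == v, while with four active lanes it is 0; the AND with 1
// below extracts that parity bit and the multiply then selects v or 0.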
5587 Register ParityRegister =
5588 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5589
5590 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5591 .addReg(NewAccumulator->getOperand(0).getReg())
5592 .addImm(1)
5593 .setOperandDead(3); // Dead scc
5594 if (Opc == AMDGPU::S_XOR_B32) {
5595 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5596 .addReg(SrcReg)
5597 .addReg(ParityRegister);
5598 } else {
5599 Register DestSub0 =
5600 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5601 Register DestSub1 =
5602 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5603
5604 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5605 const TargetRegisterClass *SrcSubRC =
5606 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5607
5608 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5609 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5610 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5611 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5612
5613 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5614 .add(Op1L)
5615 .addReg(ParityRegister);
5616
5617 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5618 .add(Op1H)
5619 .addReg(ParityRegister);
5620
5621 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5622 .addReg(DestSub0)
5623 .addImm(AMDGPU::sub0)
5624 .addReg(DestSub1)
5625 .addImm(AMDGPU::sub1);
5626 }
5627 break;
5628 }
5629 case AMDGPU::S_SUB_I32: {
5630 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5631
5632 // Take the negation of the source operand.
5633 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5634 .addImm(0)
5635 .addReg(SrcReg);
5636 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5637 .addReg(NegatedVal)
5638 .addReg(NewAccumulator->getOperand(0).getReg());
5639 break;
5640 }
5641 case AMDGPU::S_ADD_I32: {
5642 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5643 .addReg(SrcReg)
5644 .addReg(NewAccumulator->getOperand(0).getReg());
5645 break;
5646 }
5647 case AMDGPU::S_ADD_U64_PSEUDO:
5648 case AMDGPU::S_SUB_U64_PSEUDO: {
5649 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5650 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5651 Register Op1H_Op0L_Reg =
5652 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5653 Register Op1L_Op0H_Reg =
5654 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5655 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5656 Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5657 Register NegatedValLo =
5658 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5659 Register NegatedValHi =
5660 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5661
5662 const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
5663 const TargetRegisterClass *Src1SubRC =
5664 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5665
5666 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5667 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5668 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5669 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5670
5671 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5672 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5673 .addImm(0)
5674 .addReg(NewAccumulator->getOperand(0).getReg())
5675 .setOperandDead(3); // Dead scc
5676 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5677 .addReg(NegatedValLo)
5678 .addImm(31)
5679 .setOperandDead(3); // Dead scc
5680 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5681 .add(Op1L)
5682 .addReg(NegatedValHi);
5683 }
5684 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5685 ? NegatedValLo
5686 : NewAccumulator->getOperand(0).getReg();
5687 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5688 .add(Op1L)
5689 .addReg(LowOpcode);
5690 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5691 .add(Op1L)
5692 .addReg(LowOpcode);
5693 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5694 .add(Op1H)
5695 .addReg(LowOpcode);
5696
5697 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5698 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5699 .addReg(CarryReg)
5700 .addReg(Op1H_Op0L_Reg)
5701 .setOperandDead(3); // Dead scc
5702
5703 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5704 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5705 .addReg(HiVal)
5706 .addReg(Op1L_Op0H_Reg)
5707 .setOperandDead(3); // Dead scc
5708 }
5709 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5710 .addReg(DestSub0)
5711 .addImm(AMDGPU::sub0)
5712 .addReg(DestSub1)
5713 .addImm(AMDGPU::sub1);
5714 break;
5715 }
5716 }
5717 RetBB = &BB;
5718 }
5719 }
5720 } else {
5721 // TODO: Implement DPP Strategy and switch based on immediate strategy
5722 // operand. For now, for all the cases (default, Iterative and DPP) we use
5723 // the iterative approach by default.
5724
5725 // To reduce the VGPR using the iterative approach, we need to iterate
5726 // over all the active lanes. Lowering consists of a ComputeLoop,
5727 // which iterates over only the active lanes. We use a copy of the EXEC
5728 // register as the induction variable, and each iteration clears the handled
5729 // lane's bit with bitset0 so that we get the next active lane next time.
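// Pseudocode sketch of the emitted loop (not literal MIR):
//
//   acc    = identity value for Opc
//   active = EXEC
//   do {
//     lane   = s_ff1(active)             // lowest remaining active lane
//     val    = v_readlane(src, lane)
//     acc    = <Opc>(acc, val)
//     active = s_bitset0(active, lane)   // retire that lane
//   } while (active != 0)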
5731 Register SrcReg = MI.getOperand(1).getReg();
5732 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5733
5734 // Create control flow for the loop: split MI's machine basic block into
5735 // the loop body (ComputeLoop) and the remainder (ComputeEnd).
5736 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
5737
5738 // Create virtual registers required for lowering.
5739 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5740 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5741 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5742 Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
5743 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5744 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5745 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5746 Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5747 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
5748
5749 bool IsWave32 = ST.isWave32();
5750 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5751 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5752
5753 // Create the initial values of the induction variable (from EXEC) and the
5754 // accumulator (the identity value), then branch to the new ComputeLoop block.
5755 BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
5756 if (is32BitOpc) {
5757 uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc);
5758 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5759 .addImm(IdentityValue);
5760 } else {
5761 uint64_t IdentityValue = getIdentityValueFor64BitWaveReduction(Opc);
5762 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
5763 .addImm(IdentityValue);
5764 }
5765 // clang-format off
5766 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5767 .addMBB(ComputeLoop);
5768 // clang-format on
5769
5770 // Start constructing ComputeLoop
5771 I = ComputeLoop->begin();
5772 auto Accumulator =
5773 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5774 .addReg(IdentityValReg)
5775 .addMBB(&BB);
5776 auto ActiveBits =
5777 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5778 .addReg(LoopIterator)
5779 .addMBB(&BB);
5780
5781 I = ComputeLoop->end();
5782 MachineInstr *NewAccumulator;
5783 // Perform the computations
5784 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5785 BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5786 .addReg(ActiveBitsReg);
5787 if (is32BitOpc) {
5788 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5789 LaneValueReg)
5790 .addReg(SrcReg)
5791 .addReg(FF1Reg);
5792 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5793 .addReg(Accumulator->getOperand(0).getReg())
5794 .addReg(LaneValueReg);
5795 } else {
5796 Register LaneValueLoReg =
5797 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5798 Register LaneValueHiReg =
5799 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5800 Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5801 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5802 const TargetRegisterClass *SrcSubRC =
5803 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5804 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5805 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5806 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5807 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5808 // The lane value input should be in an SGPR.
5809 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5810 LaneValueLoReg)
5811 .add(Op1L)
5812 .addReg(FF1Reg);
5813 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5814 LaneValueHiReg)
5815 .add(Op1H)
5816 .addReg(FF1Reg);
5817 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5818 TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
5819 .addReg(LaneValueLoReg)
5820 .addImm(AMDGPU::sub0)
5821 .addReg(LaneValueHiReg)
5822 .addImm(AMDGPU::sub1);
5823 switch (Opc) {
5824 case AMDGPU::S_OR_B64:
5825 case AMDGPU::S_AND_B64:
5826 case AMDGPU::S_XOR_B64: {
5827 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5828 .addReg(Accumulator->getOperand(0).getReg())
5829 .addReg(LaneValue->getOperand(0).getReg())
5830 .setOperandDead(3); // Dead scc
5831 break;
5832 }
5833 case AMDGPU::V_CMP_GT_I64_e64:
5834 case AMDGPU::V_CMP_GT_U64_e64:
5835 case AMDGPU::V_CMP_LT_I64_e64:
5836 case AMDGPU::V_CMP_LT_U64_e64: {
5837 Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
5838 Register ComparisonResultReg =
5839 MRI.createVirtualRegister(WaveMaskRegClass);
5840 const TargetRegisterClass *VregClass = TRI->getVGPR64Class();
5841 const TargetRegisterClass *VSubRegClass =
5842 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
5843 Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
5844 MachineOperand SrcReg0Sub0 =
5845 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5846 VregClass, AMDGPU::sub0, VSubRegClass);
5847 MachineOperand SrcReg0Sub1 =
5848 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5849 VregClass, AMDGPU::sub1, VSubRegClass);
5850 BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
5851 AccumulatorVReg)
5852 .add(SrcReg0Sub0)
5853 .addImm(AMDGPU::sub0)
5854 .add(SrcReg0Sub1)
5855 .addImm(AMDGPU::sub1);
5856 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
5857 .addReg(LaneValue->getOperand(0).getReg())
5858 .addReg(AccumulatorVReg);
5859
5860 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5861 BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
5862 .addReg(LaneMaskReg)
5863 .addReg(ActiveBitsReg);
5864
5865 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5866 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
5867 .addReg(LaneValue->getOperand(0).getReg())
5868 .addReg(Accumulator->getOperand(0).getReg());
5869 break;
5870 }
5871 case AMDGPU::S_ADD_U64_PSEUDO:
5872 case AMDGPU::S_SUB_U64_PSEUDO: {
5873 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5874 .addReg(Accumulator->getOperand(0).getReg())
5875 .addReg(LaneValue->getOperand(0).getReg());
5876 ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
5877 break;
5878 }
5879 }
5880 }
5881 // Clear the handled lane's bit in the induction variable to get the next active lane.
5882 unsigned BITSETOpc =
5883 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5884 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5885 .addReg(FF1Reg)
5886 .addReg(ActiveBitsReg);
5887
5888 // Add phi nodes
5889 Accumulator.addReg(DstReg).addMBB(ComputeLoop);
5890 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
5891
5892 // Create the loop back-branch.
5893 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5894 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5895 .addReg(NewActiveBitsReg)
5896 .addImm(0);
5897 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5898 .addMBB(ComputeLoop);
5899
5900 RetBB = ComputeEnd;
5901 }
5902 MI.eraseFromParent();
5903 return RetBB;
5904}
5905
5908 MachineBasicBlock *BB) const {
5909
5911 MachineFunction *MF = BB->getParent();
5913
5914 switch (MI.getOpcode()) {
5915 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5916 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5917 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
5918 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
5919 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5920 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
5921 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
5922 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
5923 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5924 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5925 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
5926 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
5927 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5928 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
5929 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
5930 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
5931 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5932 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5933 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
5934 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
5935 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5936 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5937 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
5938 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
5939 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5940 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
5941 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
5942 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
5943 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5944 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
5945 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
5946 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
5947 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5948 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
5949 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
5950 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
5951 case AMDGPU::S_UADDO_PSEUDO:
5952 case AMDGPU::S_USUBO_PSEUDO: {
5953 const DebugLoc &DL = MI.getDebugLoc();
5954 MachineOperand &Dest0 = MI.getOperand(0);
5955 MachineOperand &Dest1 = MI.getOperand(1);
5956 MachineOperand &Src0 = MI.getOperand(2);
5957 MachineOperand &Src1 = MI.getOperand(3);
5958
5959 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5960 ? AMDGPU::S_ADD_I32
5961 : AMDGPU::S_SUB_I32;
5962 // clang-format off
5963 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
5964 .add(Src0)
5965 .add(Src1);
5966 // clang-format on
5967
5968 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
5969 .addImm(1)
5970 .addImm(0);
5971
5972 MI.eraseFromParent();
5973 return BB;
5974 }
5975 case AMDGPU::S_ADD_U64_PSEUDO:
5976 case AMDGPU::S_SUB_U64_PSEUDO: {
5977 return Expand64BitScalarArithmetic(MI, BB);
5978 }
5979 case AMDGPU::V_ADD_U64_PSEUDO:
5980 case AMDGPU::V_SUB_U64_PSEUDO: {
5982 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5983 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5984 const DebugLoc &DL = MI.getDebugLoc();
5985
5986 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5987
5988 MachineOperand &Dest = MI.getOperand(0);
5989 MachineOperand &Src0 = MI.getOperand(1);
5990 MachineOperand &Src1 = MI.getOperand(2);
5991
5992 if (ST.hasAddSubU64Insts()) {
5993 auto I = BuildMI(*BB, MI, DL,
5994 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
5995 : AMDGPU::V_SUB_U64_e64),
5996 Dest.getReg())
5997 .add(Src0)
5998 .add(Src1)
5999 .addImm(0); // clamp
6000 TII->legalizeOperands(*I);
6001 MI.eraseFromParent();
6002 return BB;
6003 }
6004
6005 if (IsAdd && ST.hasLshlAddU64Inst()) {
6006 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
6007 Dest.getReg())
6008 .add(Src0)
6009 .addImm(0)
6010 .add(Src1);
6011 TII->legalizeOperands(*Add);
6012 MI.eraseFromParent();
6013 return BB;
6014 }
6015
6016 const auto *CarryRC = TRI->getWaveMaskRegClass();
6017
6018 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6019 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6020
6021 Register CarryReg = MRI.createVirtualRegister(CarryRC);
6022 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6023
6024 const TargetRegisterClass *Src0RC = Src0.isReg()
6025 ? MRI.getRegClass(Src0.getReg())
6026 : &AMDGPU::VReg_64RegClass;
6027 const TargetRegisterClass *Src1RC = Src1.isReg()
6028 ? MRI.getRegClass(Src1.getReg())
6029 : &AMDGPU::VReg_64RegClass;
6030
6031 const TargetRegisterClass *Src0SubRC =
6032 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6033 const TargetRegisterClass *Src1SubRC =
6034 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6035
6036 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
6037 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6038 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
6039 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6040
6041 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
6042 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6043 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
6044 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6045
6046 unsigned LoOpc =
6047 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6048 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
6049 .addReg(CarryReg, RegState::Define)
6050 .add(SrcReg0Sub0)
6051 .add(SrcReg1Sub0)
6052 .addImm(0); // clamp bit
6053
6054 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6055 MachineInstr *HiHalf =
6056 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
6057 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
6058 .add(SrcReg0Sub1)
6059 .add(SrcReg1Sub1)
6060 .addReg(CarryReg, RegState::Kill)
6061 .addImm(0); // clamp bit
6062
6063 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
6064 .addReg(DestSub0)
6065 .addImm(AMDGPU::sub0)
6066 .addReg(DestSub1)
6067 .addImm(AMDGPU::sub1);
6068 TII->legalizeOperands(*LoHalf);
6069 TII->legalizeOperands(*HiHalf);
6070 MI.eraseFromParent();
6071 return BB;
6072 }
6073 case AMDGPU::S_ADD_CO_PSEUDO:
6074 case AMDGPU::S_SUB_CO_PSEUDO: {
6075 // This pseudo has a chance to be selected
6076 // only from a uniform add/subcarry node. All the VGPR operands
6077 // are therefore assumed to be splat vectors.
6079 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
6080 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6082 const DebugLoc &DL = MI.getDebugLoc();
6083 MachineOperand &Dest = MI.getOperand(0);
6084 MachineOperand &CarryDest = MI.getOperand(1);
6085 MachineOperand &Src0 = MI.getOperand(2);
6086 MachineOperand &Src1 = MI.getOperand(3);
6087 MachineOperand &Src2 = MI.getOperand(4);
6088 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
6089 ? AMDGPU::S_ADDC_U32
6090 : AMDGPU::S_SUBB_U32;
6091 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
6092 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6093 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6094 .addReg(Src0.getReg());
6095 Src0.setReg(RegOp0);
6096 }
6097 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
6098 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6099 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6100 .addReg(Src1.getReg());
6101 Src1.setReg(RegOp1);
6102 }
6103 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6104 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
6105 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6106 .addReg(Src2.getReg());
6107 Src2.setReg(RegOp2);
6108 }
6109
6110 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
6111 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
6112 assert(WaveSize == 64 || WaveSize == 32);
6113
6114 if (WaveSize == 64) {
6115 if (ST.hasScalarCompareEq64()) {
6116 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
6117 .addReg(Src2.getReg())
6118 .addImm(0);
6119 } else {
6120 const TargetRegisterClass *SubRC =
6121 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6122 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
6123 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6124 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
6125 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6126 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6127
6128 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
6129 .add(Src2Sub0)
6130 .add(Src2Sub1);
6131
6132 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6133 .addReg(Src2_32, RegState::Kill)
6134 .addImm(0);
6135 }
6136 } else {
6137 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6138 .addReg(Src2.getReg())
6139 .addImm(0);
6140 }
6141
6142 // clang-format off
6143 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg())
6144 .add(Src0)
6145 .add(Src1);
6146 // clang-format on
6147
6148 unsigned SelOpc =
6149 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6150
6151 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
6152 .addImm(-1)
6153 .addImm(0);
6154
6155 MI.eraseFromParent();
6156 return BB;
6157 }
6158 case AMDGPU::SI_INIT_M0: {
6159 MachineOperand &M0Init = MI.getOperand(0);
6160 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6161 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6162 AMDGPU::M0)
6163 .add(M0Init);
6164 MI.eraseFromParent();
6165 return BB;
6166 }
6167 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6168 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6169 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6170 TII->get(AMDGPU::S_CMP_EQ_U32))
6171 .addImm(0)
6172 .addImm(0);
6173 return BB;
6174 }
6175 case AMDGPU::GET_GROUPSTATICSIZE: {
6176 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
6177 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
6178 DebugLoc DL = MI.getDebugLoc();
6179 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
6180 .add(MI.getOperand(0))
6181 .addImm(MFI->getLDSSize());
6182 MI.eraseFromParent();
6183 return BB;
6184 }
6185 case AMDGPU::GET_SHADERCYCLESHILO: {
6188 const DebugLoc &DL = MI.getDebugLoc();
6189 // The algorithm is:
6190 //
6191 // hi1 = getreg(SHADER_CYCLES_HI)
6192 // lo1 = getreg(SHADER_CYCLES_LO)
6193 // hi2 = getreg(SHADER_CYCLES_HI)
6194 //
6195 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
6196 // Otherwise there was overflow and the result is hi2:0. In both cases the
6197 // result should represent the actual time at some point during the sequence
6198 // of three getregs.
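// For instance, if the counter advances from 0x1_ffffffff to 0x2_00000000
// between the two SHADER_CYCLES_HI reads, hi1 (1) != hi2 (2) and the result
// 0x2_00000000 is still a value the counter held during the sequence.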
6199 using namespace AMDGPU::Hwreg;
6200 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6201 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
6202 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6203 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6204 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
6205 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6206 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6207 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
6208 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6209 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
6210 .addReg(RegHi1)
6211 .addReg(RegHi2);
6212 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6213 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
6214 .addReg(RegLo1)
6215 .addImm(0);
6216 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
6217 .add(MI.getOperand(0))
6218 .addReg(RegLo)
6219 .addImm(AMDGPU::sub0)
6220 .addReg(RegHi2)
6221 .addImm(AMDGPU::sub1);
6222 MI.eraseFromParent();
6223 return BB;
6224 }
6225 case AMDGPU::SI_INDIRECT_SRC_V1:
6226 case AMDGPU::SI_INDIRECT_SRC_V2:
6227 case AMDGPU::SI_INDIRECT_SRC_V4:
6228 case AMDGPU::SI_INDIRECT_SRC_V8:
6229 case AMDGPU::SI_INDIRECT_SRC_V9:
6230 case AMDGPU::SI_INDIRECT_SRC_V10:
6231 case AMDGPU::SI_INDIRECT_SRC_V11:
6232 case AMDGPU::SI_INDIRECT_SRC_V12:
6233 case AMDGPU::SI_INDIRECT_SRC_V16:
6234 case AMDGPU::SI_INDIRECT_SRC_V32:
6235 return emitIndirectSrc(MI, *BB, *getSubtarget());
6236 case AMDGPU::SI_INDIRECT_DST_V1:
6237 case AMDGPU::SI_INDIRECT_DST_V2:
6238 case AMDGPU::SI_INDIRECT_DST_V4:
6239 case AMDGPU::SI_INDIRECT_DST_V8:
6240 case AMDGPU::SI_INDIRECT_DST_V9:
6241 case AMDGPU::SI_INDIRECT_DST_V10:
6242 case AMDGPU::SI_INDIRECT_DST_V11:
6243 case AMDGPU::SI_INDIRECT_DST_V12:
6244 case AMDGPU::SI_INDIRECT_DST_V16:
6245 case AMDGPU::SI_INDIRECT_DST_V32:
6246 return emitIndirectDst(MI, *BB, *getSubtarget());
6247 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6248 case AMDGPU::SI_KILL_I1_PSEUDO:
6249 return splitKillBlock(MI, BB);
6250 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6252 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
6253 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6254
6255 Register Dst = MI.getOperand(0).getReg();
6256 const MachineOperand &Src0 = MI.getOperand(1);
6257 const MachineOperand &Src1 = MI.getOperand(2);
6258 const DebugLoc &DL = MI.getDebugLoc();
6259 Register SrcCond = MI.getOperand(3).getReg();
6260
6261 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6262 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6263 const auto *CondRC = TRI->getWaveMaskRegClass();
6264 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
6265
6266 const TargetRegisterClass *Src0RC = Src0.isReg()
6267 ? MRI.getRegClass(Src0.getReg())
6268 : &AMDGPU::VReg_64RegClass;
6269 const TargetRegisterClass *Src1RC = Src1.isReg()
6270 ? MRI.getRegClass(Src1.getReg())
6271 : &AMDGPU::VReg_64RegClass;
6272
6273 const TargetRegisterClass *Src0SubRC =
6274 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6275 const TargetRegisterClass *Src1SubRC =
6276 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6277
6278 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
6279 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6280 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
6281 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6282
6283 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
6284 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6285 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
6286 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6287
6288 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
6289 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
6290 .addImm(0)
6291 .add(Src0Sub0)
6292 .addImm(0)
6293 .add(Src1Sub0)
6294 .addReg(SrcCondCopy);
6295 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
6296 .addImm(0)
6297 .add(Src0Sub1)
6298 .addImm(0)
6299 .add(Src1Sub1)
6300 .addReg(SrcCondCopy);
6301
6302 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
6303 .addReg(DstLo)
6304 .addImm(AMDGPU::sub0)
6305 .addReg(DstHi)
6306 .addImm(AMDGPU::sub1);
6307 MI.eraseFromParent();
6308 return BB;
6309 }
6310 case AMDGPU::SI_BR_UNDEF: {
6312 const DebugLoc &DL = MI.getDebugLoc();
6313 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
6314 .add(MI.getOperand(0));
6315 Br->getOperand(1).setIsUndef(); // read undef SCC
6316 MI.eraseFromParent();
6317 return BB;
6318 }
6319 case AMDGPU::ADJCALLSTACKUP:
6320 case AMDGPU::ADJCALLSTACKDOWN: {
6322 MachineInstrBuilder MIB(*MF, &MI);
6323 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
6324 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
6325 return BB;
6326 }
6327 case AMDGPU::SI_CALL_ISEL: {
6329 const DebugLoc &DL = MI.getDebugLoc();
6330
6331 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
6332
6334 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6335
6336 for (const MachineOperand &MO : MI.operands())
6337 MIB.add(MO);
6338
6339 MIB.cloneMemRefs(MI);
6340 MI.eraseFromParent();
6341 return BB;
6342 }
6343 case AMDGPU::V_ADD_CO_U32_e32:
6344 case AMDGPU::V_SUB_CO_U32_e32:
6345 case AMDGPU::V_SUBREV_CO_U32_e32: {
6346 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
6347 const DebugLoc &DL = MI.getDebugLoc();
6348 unsigned Opc = MI.getOpcode();
6349
6350 bool NeedClampOperand = false;
6351 if (TII->pseudoToMCOpcode(Opc) == -1) {
6353 NeedClampOperand = true;
6354 }
6355
6356 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
6357 if (TII->isVOP3(*I)) {
6358 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
6359 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6360 I.addReg(TRI->getVCC(), RegState::Define);
6361 }
6362 I.add(MI.getOperand(1)).add(MI.getOperand(2));
6363 if (NeedClampOperand)
6364 I.addImm(0); // clamp bit for e64 encoding
6365
6366 TII->legalizeOperands(*I);
6367
6368 MI.eraseFromParent();
6369 return BB;
6370 }
6371 case AMDGPU::V_ADDC_U32_e32:
6372 case AMDGPU::V_SUBB_U32_e32:
6373 case AMDGPU::V_SUBBREV_U32_e32:
6374 // These instructions have an implicit use of vcc which counts towards the
6375 // constant bus limit.
6376 TII->legalizeOperands(MI);
6377 return BB;
6378 case AMDGPU::DS_GWS_INIT:
6379 case AMDGPU::DS_GWS_SEMA_BR:
6380 case AMDGPU::DS_GWS_BARRIER:
6381 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
6382 [[fallthrough]];
6383 case AMDGPU::DS_GWS_SEMA_V:
6384 case AMDGPU::DS_GWS_SEMA_P:
6385 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6386 // An s_waitcnt 0 is required to be the instruction immediately following.
6387 if (getSubtarget()->hasGWSAutoReplay()) {
6389 return BB;
6390 }
6391
6392 return emitGWSMemViolTestLoop(MI, BB);
6393 case AMDGPU::S_SETREG_B32: {
6394 // Try to optimize cases that only set the denormal mode or rounding mode.
6395 //
6396 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
6397 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
6398 // instead.
6399 //
6400 // FIXME: This could be predicates on the immediate, but tablegen doesn't
6401 // allow you to have a no side effect instruction in the output of a
6402 // sideeffecting pattern.
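// For example (assuming the usual MODE layout with round bits in [3:0] and
// denorm bits in [7:4]), a constant write covering exactly bits [3:0] can be
// emitted as s_round_mode, and one covering exactly bits [7:4] as
// s_denorm_mode.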
6403 auto [ID, Offset, Width] =
6404 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
6405 if (ID != AMDGPU::Hwreg::ID_MODE)
6406 return BB;
6407
6408 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
6409 const unsigned SetMask = WidthMask << Offset;
6410
6411 if (getSubtarget()->hasDenormModeInst()) {
6412 unsigned SetDenormOp = 0;
6413 unsigned SetRoundOp = 0;
6414
6415 // The dedicated instructions can only set the whole denorm or round mode
6416 // at once, not a subset of bits in either.
6417 if (SetMask ==
6418 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
6419 // If this fully sets both the round and denorm mode, emit the two
6420 // dedicated instructions for these.
6421 SetRoundOp = AMDGPU::S_ROUND_MODE;
6422 SetDenormOp = AMDGPU::S_DENORM_MODE;
6423 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
6424 SetRoundOp = AMDGPU::S_ROUND_MODE;
6425 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
6426 SetDenormOp = AMDGPU::S_DENORM_MODE;
6427 }
6428
6429 if (SetRoundOp || SetDenormOp) {
6431 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
6432 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6433 unsigned ImmVal = Def->getOperand(1).getImm();
6434 if (SetRoundOp) {
6435 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
6436 .addImm(ImmVal & 0xf);
6437
6438 // If we also have the denorm mode, get just the denorm mode bits.
6439 ImmVal >>= 4;
6440 }
6441
6442 if (SetDenormOp) {
6443 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
6444 .addImm(ImmVal & 0xf);
6445 }
6446
6447 MI.eraseFromParent();
6448 return BB;
6449 }
6450 }
6451 }
6452
6453 // If only FP bits are touched, use the no-side-effects pseudo.
6454 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
6455 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
6456 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
6457
6458 return BB;
6459 }
6460 case AMDGPU::S_INVERSE_BALLOT_U32:
6461 case AMDGPU::S_INVERSE_BALLOT_U64:
6462 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
6463 // necessary. After that they are equivalent to a COPY.
6464 MI.setDesc(TII->get(AMDGPU::COPY));
6465 return BB;
6466 case AMDGPU::ENDPGM_TRAP: {
6467 const DebugLoc &DL = MI.getDebugLoc();
6468 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
6469 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
6470 MI.addOperand(MachineOperand::CreateImm(0));
6471 return BB;
6472 }
6473
6474 // We need a block split to make the real endpgm a terminator. We also don't
6475 // want to break phis in successor blocks, so we can't just delete to the
6476 // end of the block.
6477
6478 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
6480 MF->push_back(TrapBB);
6481 // clang-format off
6482 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
6483 .addImm(0);
6484 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
6485 .addMBB(TrapBB);
6486 // clang-format on
6487
6488 BB->addSuccessor(TrapBB);
6489 MI.eraseFromParent();
6490 return SplitBB;
6491 }
6492 case AMDGPU::SIMULATED_TRAP: {
6493 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6495 MachineBasicBlock *SplitBB =
6496 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6497 MI.eraseFromParent();
6498 return SplitBB;
6499 }
6500 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6501 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6503
6504 // During ISel, it's difficult to propagate the original EXEC mask to use as
6505 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
6506 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
6507 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6508 Register OriginalExec = Setup->getOperand(0).getReg();
6509 MF->getRegInfo().clearKillFlags(OriginalExec);
6510 MI.getOperand(0).setReg(OriginalExec);
6511 return BB;
6512 }
6513 default:
6514 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
6515 if (!MI.mayStore())
6517 return BB;
6518 }
6520 }
6521}
6522
6524 // This currently forces unfolding various combinations of fsub into fma with
6525 // free fneg'd operands. As long as we have fast FMA (controlled by
6526 // isFMAFasterThanFMulAndFAdd), we should perform these.
6527
6528 // When fma is quarter rate, for f64 where add / sub are at best half rate,
6529 // most of these combines appear to be cycle neutral but save on instruction
6530 // count / code size.
6531 return true;
6532}
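// One representative fold this enables (illustrative, not an exhaustive list):
// (a - b*c) can be selected as fma(-b, c, a), with the negation folded into a
// free source modifier.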
6533
6535
6537 EVT VT) const {
6538 if (!VT.isVector()) {
6539 return MVT::i1;
6540 }
6541 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
6542}
6543
6545 // TODO: Should i16 be used always if legal? For now it would force VALU
6546 // shifts.
6547 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6548}
6549
6551 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6552 ? Ty.changeElementSize(16)
6553 : Ty.changeElementSize(32);
6554}
6555
6556 // Answering this is somewhat tricky and depends on the specific device, since
6557 // devices have different rates for fma and for f64 operations in general.
6558//
6559// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
6560// regardless of which device (although the number of cycles differs between
6561// devices), so it is always profitable for f64.
6562//
6563// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
6564// only on full rate devices. Normally, we should prefer selecting v_mad_f32
6565// which we can always do even without fused FP ops since it returns the same
6566// result as the separate operations and since it is always full
6567// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
6568// however does not support denormals, so we do report fma as faster if we have
6569// a fast fma device and require denormals.
6570//
6572 EVT VT) const {
6573 VT = VT.getScalarType();
6574
6575 switch (VT.getSimpleVT().SimpleTy) {
6576 case MVT::f32: {
6577 // If mad is not available this depends only on if f32 fma is full rate.
6578 if (!Subtarget->hasMadMacF32Insts())
6579 return Subtarget->hasFastFMAF32();
6580
6581 // Otherwise f32 mad is always full rate and returns the same result as
6582 // the separate operations so should be preferred over fma.
6583 // However, it does not support denormals.
6585 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6586
6587 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
6588 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6589 }
6590 case MVT::f64:
6591 return true;
6592 case MVT::f16:
6593 case MVT::bf16:
6594 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
6595 default:
6596 break;
6597 }
6598
6599 return false;
6600}
6601
6603 LLT Ty) const {
6604 switch (Ty.getScalarSizeInBits()) {
6605 case 16:
6606 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
6607 case 32:
6608 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
6609 case 64:
6610 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
6611 default:
6612 break;
6613 }
6614
6615 return false;
6616}
6617
6619 if (!Ty.isScalar())
6620 return false;
6621
6622 if (Ty.getScalarSizeInBits() == 16)
6623 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
6624 if (Ty.getScalarSizeInBits() == 32)
6625 return Subtarget->hasMadMacF32Insts() &&
6626 denormalModeIsFlushAllF32(*MI.getMF());
6627
6628 return false;
6629}
6630
6632 const SDNode *N) const {
6633 // TODO: Check future ftz flag
6634 // v_mad_f32/v_mac_f32 do not support denormals.
6635 EVT VT = N->getValueType(0);
6636 if (VT == MVT::f32)
6637 return Subtarget->hasMadMacF32Insts() &&
6638 denormalModeIsFlushAllF32(DAG.getMachineFunction());
6639 if (VT == MVT::f16) {
6640 return Subtarget->hasMadF16() &&
6641 denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
6642 }
6643
6644 return false;
6645}
6646
6647//===----------------------------------------------------------------------===//
6648// Custom DAG Lowering Operations
6649//===----------------------------------------------------------------------===//
6650
6651// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6652// wider vector type is legal.
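// For example, a v4f16 unary op is split here into two v2f16 ops on the low
// and high halves and recombined with CONCAT_VECTORS, rather than being fully
// scalarized into four f16 ops.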
6654 SelectionDAG &DAG) const {
6655 unsigned Opc = Op.getOpcode();
6656 EVT VT = Op.getValueType();
6657 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
6658 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
6659 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6660 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
6661
6662 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
6663
6664 SDLoc SL(Op);
6665 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
6666 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
6667
6668 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6669}
6670
6671// Enable lowering of ROTR for vxi32 types. This is a workaround for a
6672// regression whereby extra unnecessary instructions were added to codegen
6673 // for rotr operations, caused by legalising v2i32 or. This resulted in extra
6674// instructions to extract the result from the vector.
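// UnrollVectorOp scalarizes the vector rotr into per-element i32 rotates,
// which the normal scalar handling then takes care of.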
6676 [[maybe_unused]] EVT VT = Op.getValueType();
6677
6678 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6679 VT == MVT::v16i32) &&
6680 "Unexpected ValueType.");
6681
6682 return DAG.UnrollVectorOp(Op.getNode());
6683}
6684
6685// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6686// wider vector type is legal.
6688 SelectionDAG &DAG) const {
6689 unsigned Opc = Op.getOpcode();
6690 EVT VT = Op.getValueType();
6691 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6692 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6693 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6694 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6695 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6696 VT == MVT::v32bf16);
6697
6698 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
6699 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6700
6701 SDLoc SL(Op);
6702
6703 SDValue OpLo =
6704 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6705 SDValue OpHi =
6706 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6707
6708 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6709}
6710
6712 SelectionDAG &DAG) const {
6713 unsigned Opc = Op.getOpcode();
6714 EVT VT = Op.getValueType();
6715 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6716 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6717 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6718 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6719 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6720 VT == MVT::v32bf16);
6721
6722 SDValue Op0 = Op.getOperand(0);
6723 auto [Lo0, Hi0] = Op0.getValueType().isVector()
6724 ? DAG.SplitVectorOperand(Op.getNode(), 0)
6725 : std::pair(Op0, Op0);
6726
6727 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6728 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
6729
6730 SDLoc SL(Op);
6731 auto ResVT = DAG.GetSplitDestVTs(VT);
6732
6733 SDValue OpLo =
6734 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
6735 SDValue OpHi =
6736 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
6737
6738 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6739}
6740
6742 switch (Op.getOpcode()) {
6743 default:
6745 case ISD::BRCOND:
6746 return LowerBRCOND(Op, DAG);
6747 case ISD::RETURNADDR:
6748 return LowerRETURNADDR(Op, DAG);
6749 case ISD::LOAD: {
6750 SDValue Result = LowerLOAD(Op, DAG);
6751 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6752 "Load should return a value and a chain");
6753 return Result;
6754 }
6755 case ISD::FSQRT: {
6756 EVT VT = Op.getValueType();
6757 if (VT == MVT::f32)
6758 return lowerFSQRTF32(Op, DAG);
6759 if (VT == MVT::f64)
6760 return lowerFSQRTF64(Op, DAG);
6761 return SDValue();
6762 }
6763 case ISD::FSIN:
6764 case ISD::FCOS:
6765 return LowerTrig(Op, DAG);
6766 case ISD::SELECT:
6767 return LowerSELECT(Op, DAG);
6768 case ISD::FDIV:
6769 return LowerFDIV(Op, DAG);
6770 case ISD::FFREXP:
6771 return LowerFFREXP(Op, DAG);
6772 case ISD::ATOMIC_CMP_SWAP:
6773 return LowerATOMIC_CMP_SWAP(Op, DAG);
6774 case ISD::STORE:
6775 return LowerSTORE(Op, DAG);
6776 case ISD::GlobalAddress: {
6779 return LowerGlobalAddress(MFI, Op, DAG);
6780 }
6782 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6784 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6786 return LowerINTRINSIC_VOID(Op, DAG);
6787 case ISD::ADDRSPACECAST:
6788 return lowerADDRSPACECAST(Op, DAG);
6790 return lowerINSERT_SUBVECTOR(Op, DAG);
6792 return lowerINSERT_VECTOR_ELT(Op, DAG);
6794 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
6796 return lowerVECTOR_SHUFFLE(Op, DAG);
6798 return lowerSCALAR_TO_VECTOR(Op, DAG);
6799 case ISD::BUILD_VECTOR:
6800 return lowerBUILD_VECTOR(Op, DAG);
6801 case ISD::FP_ROUND:
6803 return lowerFP_ROUND(Op, DAG);
6804 case ISD::TRAP:
6805 return lowerTRAP(Op, DAG);
6806 case ISD::DEBUGTRAP:
6807 return lowerDEBUGTRAP(Op, DAG);
6808 case ISD::ABS:
6809 case ISD::FABS:
6810 case ISD::FNEG:
6811 case ISD::FCANONICALIZE:
6812 case ISD::BSWAP:
6813 return splitUnaryVectorOp(Op, DAG);
6814 case ISD::FMINNUM:
6815 case ISD::FMAXNUM:
6816 return lowerFMINNUM_FMAXNUM(Op, DAG);
6817 case ISD::FMINIMUMNUM:
6818 case ISD::FMAXIMUMNUM:
6819 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
6820 case ISD::FMINIMUM:
6821 case ISD::FMAXIMUM:
6822 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
6823 case ISD::FLDEXP:
6824 case ISD::STRICT_FLDEXP:
6825 return lowerFLDEXP(Op, DAG);
6826 case ISD::FMA:
6827 return splitTernaryVectorOp(Op, DAG);
6828 case ISD::FP_TO_SINT:
6829 case ISD::FP_TO_UINT:
6830 return LowerFP_TO_INT(Op, DAG);
6831 case ISD::SHL:
6832 case ISD::SRA:
6833 case ISD::SRL:
6834 case ISD::ADD:
6835 case ISD::SUB:
6836 case ISD::SMIN:
6837 case ISD::SMAX:
6838 case ISD::UMIN:
6839 case ISD::UMAX:
6840 case ISD::FADD:
6841 case ISD::FMUL:
6842 case ISD::FMINNUM_IEEE:
6843 case ISD::FMAXNUM_IEEE:
6844 case ISD::UADDSAT:
6845 case ISD::USUBSAT:
6846 case ISD::SADDSAT:
6847 case ISD::SSUBSAT:
6848 return splitBinaryVectorOp(Op, DAG);
6849 case ISD::FCOPYSIGN:
6850 return lowerFCOPYSIGN(Op, DAG);
6851 case ISD::MUL:
6852 return lowerMUL(Op, DAG);
6853 case ISD::SMULO:
6854 case ISD::UMULO:
6855 return lowerXMULO(Op, DAG);
6856 case ISD::SMUL_LOHI:
6857 case ISD::UMUL_LOHI:
6858 return lowerXMUL_LOHI(Op, DAG);
6859 case ISD::DYNAMIC_STACKALLOC:
6860 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6861 case ISD::STACKSAVE:
6862 return LowerSTACKSAVE(Op, DAG);
6863 case ISD::GET_ROUNDING:
6864 return lowerGET_ROUNDING(Op, DAG);
6865 case ISD::SET_ROUNDING:
6866 return lowerSET_ROUNDING(Op, DAG);
6867 case ISD::PREFETCH:
6868 return lowerPREFETCH(Op, DAG);
6869 case ISD::FP_EXTEND:
6871 return lowerFP_EXTEND(Op, DAG);
6872 case ISD::GET_FPENV:
6873 return lowerGET_FPENV(Op, DAG);
6874 case ISD::SET_FPENV:
6875 return lowerSET_FPENV(Op, DAG);
6876 case ISD::ROTR:
6877 return lowerROTR(Op, DAG);
6878 }
6879 return SDValue();
6880}
6881
6882// Used for D16: Casts the result of an instruction into the right vector,
6883// packs values if loads return unpacked values.
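// For example, with unpacked D16 a v3f16 load comes back as v3i32; each lane
// is truncated to i16, padded out to v4i16, and bitcast back to v4f16.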
6885 const SDLoc &DL, SelectionDAG &DAG,
6886 bool Unpacked) {
6887 if (!LoadVT.isVector())
6888 return Result;
6889
6890 // Cast back to the original packed type or to a larger type that is a
6891 // multiple of 32 bits for D16. Widening the return type is required for
6892 // legalization.
6893 EVT FittingLoadVT = LoadVT;
6894 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6895 FittingLoadVT =
6897 LoadVT.getVectorNumElements() + 1);
6898 }
6899
6900 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6901 // Truncate to v2i16/v4i16.
6902 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6903
6904 // Workaround legalizer not scalarizing truncate after vector op
6905 // legalization but not creating intermediate vector trunc.
6907 DAG.ExtractVectorElements(Result, Elts);
6908 for (SDValue &Elt : Elts)
6909 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6910
6911 // Pad illegal v1i16/v3f16 to v4i16
6912 if ((LoadVT.getVectorNumElements() % 2) == 1)
6913 Elts.push_back(DAG.getPOISON(MVT::i16));
6914
6915 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6916
6917 // Bitcast to original type (v2f16/v4f16).
6918 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6919 }
6920
6921 // Cast back to the original packed type.
6922 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6923}
6924
6925SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6926 SelectionDAG &DAG,
6928 bool IsIntrinsic) const {
6929 SDLoc DL(M);
6930
6931 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6932 EVT LoadVT = M->getValueType(0);
6933
6934 EVT EquivLoadVT = LoadVT;
6935 if (LoadVT.isVector()) {
6936 if (Unpacked) {
6937 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6938 LoadVT.getVectorNumElements());
6939 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
6940 // Widen v3f16 to legal type
6941 EquivLoadVT =
6943 LoadVT.getVectorNumElements() + 1);
6944 }
6945 }
6946
6947 // Change from v4f16/v2f16 to EquivLoadVT.
6948 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
6949
6951 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
6952 M->getMemoryVT(), M->getMemOperand());
6953
6954 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
6955
6956 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
6957}
6958
6959SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
6960 SelectionDAG &DAG,
6961 ArrayRef<SDValue> Ops) const {
6962 SDLoc DL(M);
6963 EVT LoadVT = M->getValueType(0);
6964 EVT EltType = LoadVT.getScalarType();
6965 EVT IntVT = LoadVT.changeTypeToInteger();
6966
6967 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
6968
6969 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6970 bool IsTFE = M->getNumValues() == 3;
6971
6972 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
6973 : AMDGPUISD::BUFFER_LOAD_FORMAT)
6974 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
6975 : AMDGPUISD::BUFFER_LOAD;
6976
6977 if (IsD16) {
6978 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
6979 }
6980
6981 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6982 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
6983 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6984 IsTFE);
6985
6986 if (isTypeLegal(LoadVT)) {
6987 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6988 M->getMemOperand(), DAG);
6989 }
6990
6991 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
6992 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6993 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
6994 M->getMemOperand(), DAG);
6995 return DAG.getMergeValues(
6996 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6997 DL);
6998}
6999
7001 SelectionDAG &DAG) {
7002 EVT VT = N->getValueType(0);
7003 unsigned CondCode = N->getConstantOperandVal(3);
7004 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
7005 return DAG.getPOISON(VT);
7006
7007 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
7008
7009 SDValue LHS = N->getOperand(1);
7010 SDValue RHS = N->getOperand(2);
7011
7012 SDLoc DL(N);
7013
7014 EVT CmpVT = LHS.getValueType();
7015 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
7016 unsigned PromoteOp =
7017 ICmpInst::isSigned(IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7018 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
7019 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
7020 }
7021
7022 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
7023
7024 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7025 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7026
7027 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
7028 DAG.getCondCode(CCOpcode));
7029 if (VT.bitsEq(CCVT))
7030 return SetCC;
7031 return DAG.getZExtOrTrunc(SetCC, DL, VT);
7032}
7033
7035 SelectionDAG &DAG) {
7036 EVT VT = N->getValueType(0);
7037
7038 unsigned CondCode = N->getConstantOperandVal(3);
7039 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
7040 return DAG.getPOISON(VT);
7041
7042 SDValue Src0 = N->getOperand(1);
7043 SDValue Src1 = N->getOperand(2);
7044 EVT CmpVT = Src0.getValueType();
7045 SDLoc SL(N);
7046
7047 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
7048 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7049 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7050 }
7051
7052 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
7053 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
7054 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7055 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7056 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7057 DAG.getCondCode(CCOpcode));
7058 if (VT.bitsEq(CCVT))
7059 return SetCC;
7060 return DAG.getZExtOrTrunc(SetCC, SL, VT);
7061}
7062
7064 SelectionDAG &DAG) {
7065 EVT VT = N->getValueType(0);
7066 SDValue Src = N->getOperand(1);
7067 SDLoc SL(N);
7068
7069 if (Src.getOpcode() == ISD::SETCC) {
7070 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
7071 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
7072 Src.getOperand(1), Src.getOperand(2));
7073 }
7074 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
7075 // (ballot 0) -> 0
7076 if (Arg->isZero())
7077 return DAG.getConstant(0, SL, VT);
7078
7079 // (ballot 1) -> EXEC/EXEC_LO
7080 if (Arg->isOne()) {
7081 Register Exec;
7082 if (VT.getScalarSizeInBits() == 32)
7083 Exec = AMDGPU::EXEC_LO;
7084 else if (VT.getScalarSizeInBits() == 64)
7085 Exec = AMDGPU::EXEC;
7086 else
7087 return SDValue();
7088
7089 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
7090 }
7091 }
7092
7093 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
7094 // ISD::SETNE)
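// i.e. a generic i1 source is zero-extended to i32 and compared against zero;
// the wave-wide AMDGPUISD::SETCC result has one bit per lane, which is exactly
// the requested ballot mask.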
7095 return DAG.getNode(
7096 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
7097 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
7098}
7099
7101 SelectionDAG &DAG) {
7102 EVT VT = N->getValueType(0);
7103 unsigned ValSize = VT.getSizeInBits();
7104 unsigned IID = N->getConstantOperandVal(0);
7105 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7106 IID == Intrinsic::amdgcn_permlanex16;
7107 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7108 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7109 SDLoc SL(N);
7110 MVT IntVT = MVT::getIntegerVT(ValSize);
7111 const GCNSubtarget *ST = TLI.getSubtarget();
7112 unsigned SplitSize = 32;
7113 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7114 ST->hasDPALU_DPP() &&
7115 AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
7116 SplitSize = 64;
7117
7118 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
7119 SDValue Src2, MVT ValT) -> SDValue {
7121 switch (IID) {
7122 case Intrinsic::amdgcn_permlane16:
7123 case Intrinsic::amdgcn_permlanex16:
7124 case Intrinsic::amdgcn_update_dpp:
7125 Operands.push_back(N->getOperand(6));
7126 Operands.push_back(N->getOperand(5));
7127 Operands.push_back(N->getOperand(4));
7128 [[fallthrough]];
7129 case Intrinsic::amdgcn_writelane:
7130 Operands.push_back(Src2);
7131 [[fallthrough]];
7132 case Intrinsic::amdgcn_readlane:
7133 case Intrinsic::amdgcn_set_inactive:
7134 case Intrinsic::amdgcn_set_inactive_chain_arg:
7135 case Intrinsic::amdgcn_mov_dpp8:
7136 Operands.push_back(Src1);
7137 [[fallthrough]];
7138 case Intrinsic::amdgcn_readfirstlane:
7139 case Intrinsic::amdgcn_permlane64:
7140 Operands.push_back(Src0);
7141 break;
7142 default:
7143 llvm_unreachable("unhandled lane op");
7144 }
7145
7146 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
7147 std::reverse(Operands.begin(), Operands.end());
7148
7149 if (SDNode *GL = N->getGluedNode()) {
7150 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7151 GL = GL->getOperand(0).getNode();
7152 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7153 SDValue(GL, 0)));
7154 }
7155
7156 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
7157 };
7158
7159 SDValue Src0 = N->getOperand(1);
7160 SDValue Src1, Src2;
7161 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7162 IID == Intrinsic::amdgcn_mov_dpp8 ||
7163 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7164 Src1 = N->getOperand(2);
7165 if (IID == Intrinsic::amdgcn_writelane ||
7166 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7167 Src2 = N->getOperand(3);
7168 }
7169
7170 if (ValSize == SplitSize) {
7171 // Already legal
7172 return SDValue();
7173 }
7174
7175 if (ValSize < 32) {
7176 bool IsFloat = VT.isFloatingPoint();
7177 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
7178 SL, MVT::i32);
7179
7180 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7181 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
7182 SL, MVT::i32);
7183 }
7184
7185 if (IID == Intrinsic::amdgcn_writelane) {
7186 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
7187 SL, MVT::i32);
7188 }
7189
7190 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7191 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
7192 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
7193 }
7194
7195 if (ValSize % SplitSize != 0)
7196 return SDValue();
7197
7198 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
7199 EVT VT = N->getValueType(0);
7200 unsigned NE = VT.getVectorNumElements();
7201 EVT EltVT = VT.getVectorElementType();
7203 unsigned NumOperands = N->getNumOperands();
7204 SmallVector<SDValue, 4> Operands(NumOperands);
7205 SDNode *GL = N->getGluedNode();
7206
7207 // only handle convergencectrl_glue
7208 assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7209
7210 for (unsigned i = 0; i != NE; ++i) {
7211 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7212 ++j) {
7213 SDValue Operand = N->getOperand(j);
7214 EVT OperandVT = Operand.getValueType();
7215 if (OperandVT.isVector()) {
7216 // A vector operand; extract a single element.
7217 EVT OperandEltVT = OperandVT.getVectorElementType();
7218 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
7219 Operand, DAG.getVectorIdxConstant(i, SL));
7220 } else {
7221 // A scalar operand; just use it as is.
7222 Operands[j] = Operand;
7223 }
7224 }
7225
7226 if (GL)
7227 Operands[NumOperands - 1] =
7228 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7229 SDValue(GL->getOperand(0).getNode(), 0));
7230
7231 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
7232 }
7233
7234 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
7235 return DAG.getBuildVector(VecVT, SL, Scalars);
7236 };
7237
7238 if (VT.isVector()) {
7239 switch (MVT::SimpleValueType EltTy =
7241 case MVT::i32:
7242 case MVT::f32:
7243 if (SplitSize == 32) {
7244 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
7245 return unrollLaneOp(LaneOp.getNode());
7246 }
7247 [[fallthrough]];
7248 case MVT::i16:
7249 case MVT::f16:
7250 case MVT::bf16: {
7251 unsigned SubVecNumElt =
7252 SplitSize / VT.getVectorElementType().getSizeInBits();
7253 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
7255 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7256 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7257 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
7258 DAG.getConstant(EltIdx, SL, MVT::i32));
7259
7260 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7261 IsPermLane16)
7262 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
7263 DAG.getConstant(EltIdx, SL, MVT::i32));
7264
7265 if (IID == Intrinsic::amdgcn_writelane)
7266 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
7267 DAG.getConstant(EltIdx, SL, MVT::i32));
7268
7269 Pieces.push_back(
7270 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7271 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7272 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7273 EltIdx += SubVecNumElt;
7274 }
7275 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
7276 }
7277 default:
7278 // Handle all other cases by bitcasting to i32 vectors
7279 break;
7280 }
7281 }
7282
7283 MVT VecVT =
7284 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
7285 Src0 = DAG.getBitcast(VecVT, Src0);
7286
7287 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7288 Src1 = DAG.getBitcast(VecVT, Src1);
7289
7290 if (IID == Intrinsic::amdgcn_writelane)
7291 Src2 = DAG.getBitcast(VecVT, Src2);
7292
7293 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7294 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
7295 return DAG.getBitcast(VT, UnrolledLaneOp);
7296}
7297
7300 SelectionDAG &DAG) const {
7301 switch (N->getOpcode()) {
7303 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
7304 Results.push_back(Res);
7305 return;
7306 }
7308 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
7309 Results.push_back(Res);
7310 return;
7311 }
7313 unsigned IID = N->getConstantOperandVal(0);
7314 switch (IID) {
7315 case Intrinsic::amdgcn_make_buffer_rsrc:
7316 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
7317 return;
7318 case Intrinsic::amdgcn_cvt_pkrtz: {
7319 SDValue Src0 = N->getOperand(1);
7320 SDValue Src1 = N->getOperand(2);
7321 SDLoc SL(N);
7322 SDValue Cvt =
7323 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
7324 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
7325 return;
7326 }
7327 case Intrinsic::amdgcn_cvt_pknorm_i16:
7328 case Intrinsic::amdgcn_cvt_pknorm_u16:
7329 case Intrinsic::amdgcn_cvt_pk_i16:
7330 case Intrinsic::amdgcn_cvt_pk_u16: {
7331 SDValue Src0 = N->getOperand(1);
7332 SDValue Src1 = N->getOperand(2);
7333 SDLoc SL(N);
7334 unsigned Opcode;
7335
7336 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7337 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7338 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7339 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7340 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7341 Opcode = AMDGPUISD::CVT_PK_I16_I32;
7342 else
7343 Opcode = AMDGPUISD::CVT_PK_U16_U32;
7344
7345 EVT VT = N->getValueType(0);
7346 if (isTypeLegal(VT))
7347 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
7348 else {
7349 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
7350 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
7351 }
7352 return;
7353 }
7354 case Intrinsic::amdgcn_s_buffer_load: {
7355 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
7356 // s_buffer_load_u8 for signed and unsigned load instructions. Next, the DAG
7357 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
7358 // (performSignExtendInRegCombine()) and replaces s_buffer_load_u8 with
7359 // s_buffer_load_i8.
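// Below, a uniform offset takes the scalar buffer-load path and the 32-bit
// result is truncated back to i8, while a divergent offset is lowered through
// the VMEM buffer-load path via handleByteShortBufferLoads.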
7360 if (!Subtarget->hasScalarSubwordLoads())
7361 return;
7362 SDValue Op = SDValue(N, 0);
7363 SDValue Rsrc = Op.getOperand(1);
7364 SDValue Offset = Op.getOperand(2);
7365 SDValue CachePolicy = Op.getOperand(3);
7366 EVT VT = Op.getValueType();
7367 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
7368 SDLoc DL(Op);
7370 const DataLayout &DataLayout = DAG.getDataLayout();
7371 Align Alignment =
7377 VT.getStoreSize(), Alignment);
7378 SDValue LoadVal;
7379 if (!Offset->isDivergent()) {
7380 SDValue Ops[] = {Rsrc, // source register
7381 Offset, CachePolicy};
7382 SDValue BufferLoad =
7384 DAG.getVTList(MVT::i32), Ops, VT, MMO);
7385 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
7386 } else {
7387 SDValue Ops[] = {
7388 DAG.getEntryNode(), // Chain
7389 Rsrc, // rsrc
7390 DAG.getConstant(0, DL, MVT::i32), // vindex
7391 {}, // voffset
7392 {}, // soffset
7393 {}, // offset
7394 CachePolicy, // cachepolicy
7395 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7396 };
7397 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
7398 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
7399 }
7400 Results.push_back(LoadVal);
7401 return;
7402 }
7403 case Intrinsic::amdgcn_dead: {
7404 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
7405 Results.push_back(DAG.getPOISON(N->getValueType(I)));
7406 return;
7407 }
7408 }
7409 break;
7410 }
7412 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
7413 if (Res.getOpcode() == ISD::MERGE_VALUES) {
7414 // FIXME: Hacky
7415 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
7416 Results.push_back(Res.getOperand(I));
7417 }
7418 } else {
7419 Results.push_back(Res);
7420 Results.push_back(Res.getValue(1));
7421 }
7422 return;
7423 }
7424
7425 break;
7426 }
7427 case ISD::SELECT: {
7428 SDLoc SL(N);
7429 EVT VT = N->getValueType(0);
7430 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
7431 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
7432 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
7433
7434 EVT SelectVT = NewVT;
7435 if (NewVT.bitsLT(MVT::i32)) {
7436 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
7437 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
7438 SelectVT = MVT::i32;
7439 }
7440
7441 SDValue NewSelect =
7442 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
7443
7444 if (NewVT != SelectVT)
7445 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
7446 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
7447 return;
7448 }
7449 case ISD::FNEG: {
7450 if (N->getValueType(0) != MVT::v2f16)
7451 break;
7452
7453 SDLoc SL(N);
7454 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7455
7456 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
7457 DAG.getConstant(0x80008000, SL, MVT::i32));
7458 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7459 return;
7460 }
7461 case ISD::FABS: {
7462 if (N->getValueType(0) != MVT::v2f16)
7463 break;
7464
7465 SDLoc SL(N);
7466 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7467
7468 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
7469 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
7470 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7471 return;
7472 }
7473 case ISD::FSQRT: {
7474 if (N->getValueType(0) != MVT::f16)
7475 break;
7476 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
7477 break;
7478 }
7479 default:
7481 break;
7482 }
7483}
7484
7485/// Helper function for LowerBRCOND
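/// Returns the first user of \p Value with the given \p Opcode, skipping uses
/// that refer to a different result value of the defining node.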
7486static SDNode *findUser(SDValue Value, unsigned Opcode) {
7487
7488 for (SDUse &U : Value->uses()) {
7489 if (U.get() != Value)
7490 continue;
7491
7492 if (U.getUser()->getOpcode() == Opcode)
7493 return U.getUser();
7494 }
7495 return nullptr;
7496}
7497
7498unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7499 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
7500 switch (Intr->getConstantOperandVal(1)) {
7501 case Intrinsic::amdgcn_if:
7502 return AMDGPUISD::IF;
7503 case Intrinsic::amdgcn_else:
7504 return AMDGPUISD::ELSE;
7505 case Intrinsic::amdgcn_loop:
7506 return AMDGPUISD::LOOP;
7507 case Intrinsic::amdgcn_end_cf:
7508 llvm_unreachable("should not occur");
7509 default:
7510 return 0;
7511 }
7512 }
7513
7514 // break, if_break, else_break are all only used as inputs to loop, not
7515 // directly as branch conditions.
7516 return 0;
7517}
7518
7525
7527 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7528 return false;
7529
7530 // FIXME: Either avoid relying on address space here or change the default
7531 // address space for functions to avoid the explicit check.
7532 return (GV->getValueType()->isFunctionTy() ||
7535}
7536
7538 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
7539}
7540
7542 if (!GV->hasExternalLinkage())
7543 return true;
7544
7545 const auto OS = getTargetMachine().getTargetTriple().getOS();
7546 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
7547}
7548
7549 /// This transforms the control flow intrinsics to get the branch destination as
7550 /// the last parameter. It also switches the branch target with BR if the need arises.
7551SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
7552 SDLoc DL(BRCOND);
7553
7554 SDNode *Intr = BRCOND.getOperand(1).getNode();
7555 SDValue Target = BRCOND.getOperand(2);
7556 SDNode *BR = nullptr;
7557 SDNode *SetCC = nullptr;
7558
7559 if (Intr->getOpcode() == ISD::SETCC) {
7560 // As long as we negate the condition everything is fine
7561 SetCC = Intr;
7562 Intr = SetCC->getOperand(0).getNode();
7563
7564 } else {
7565 // Get the target from BR if we don't negate the condition
7566 BR = findUser(BRCOND, ISD::BR);
7567 assert(BR && "brcond missing unconditional branch user");
7568 Target = BR->getOperand(1);
7569 }
7570
7571 unsigned CFNode = isCFIntrinsic(Intr);
7572 if (CFNode == 0) {
7573 // This is a uniform branch so we don't need to legalize.
7574 return BRCOND;
7575 }
7576
7577 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
7579
7580 assert(!SetCC ||
7581 (SetCC->getConstantOperandVal(1) == 1 &&
7582 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
7583 ISD::SETNE));
7584
7585 // operands of the new intrinsic call
7587 if (HaveChain)
7588 Ops.push_back(BRCOND.getOperand(0));
7589
7590 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
7591 Ops.push_back(Target);
7592
7593 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
7594
7595 // build the new intrinsic call
7596 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
7597
7598 if (!HaveChain) {
7599 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
7600
7602 }
7603
7604 if (BR) {
7605 // Give the branch instruction our target
7606 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
7607 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
7608 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
7609 }
7610
7611 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
7612
7613 // Copy the intrinsic results to registers
7614 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
7615 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
7616 if (!CopyToReg)
7617 continue;
7618
7619 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
7620 SDValue(Result, i - 1), SDValue());
7621
7622 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
7623 }
7624
7625 // Remove the old intrinsic from the chain
7626 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
7627 Intr->getOperand(0));
7628
7629 return Chain;
7630}
7631
7632SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
7633 MVT VT = Op.getSimpleValueType();
7634 SDLoc DL(Op);
7635 // Checking the depth
7636 if (Op.getConstantOperandVal(0) != 0)
7637 return DAG.getConstant(0, DL, VT);
7638
7639 MachineFunction &MF = DAG.getMachineFunction();
7640 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7641 // Check for kernel and shader functions
7642 if (Info->isEntryFunction())
7643 return DAG.getConstant(0, DL, VT);
7644
7645 MachineFrameInfo &MFI = MF.getFrameInfo();
7646 // There is a call to @llvm.returnaddress in this function
7647 MFI.setReturnAddressIsTaken(true);
7648
7649 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
7650 // Get the return address reg and mark it as an implicit live-in
7651 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
7652 getRegClassFor(VT, Op.getNode()->isDivergent()));
7653
7654 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
7655}
7656
7657SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
7658 const SDLoc &DL, EVT VT) const {
7659 return Op.getValueType().bitsLE(VT)
7660 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
7661 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
7662 DAG.getTargetConstant(0, DL, MVT::i32));
7663}
7664
7665SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
7666 SelectionDAG &DAG) const {
7667 EVT DstVT = Op.getValueType();
7668 unsigned NumElts = DstVT.getVectorNumElements();
7669 assert(NumElts > 2 && isPowerOf2_32(NumElts));
7670
7671 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
7672
7673 SDLoc DL(Op);
7674 unsigned Opc = Op.getOpcode();
7675 SDValue Flags = Op.getOperand(1);
7676 EVT HalfDstVT =
7677 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
7678 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
7679 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
7680
7681 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
7682}
7683
7684SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
7685 SDValue Src = Op.getOperand(0);
7686 EVT SrcVT = Src.getValueType();
7687 EVT DstVT = Op.getValueType();
7688
7689 if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
7690 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
7691 if (SrcVT.getScalarType() != MVT::f32)
7692 return SDValue();
7693 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
7694 }
7695
7696 if (SrcVT.getScalarType() != MVT::f64)
7697 return Op;
7698
7699 SDLoc DL(Op);
7700 if (DstVT == MVT::f16) {
7701 // TODO: Handle strictfp
7702 if (Op.getOpcode() != ISD::FP_ROUND)
7703 return Op;
7704
7705 if (!Subtarget->has16BitInsts()) {
7706 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
7707 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7708 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7709 }
7710 if (Op->getFlags().hasApproximateFuncs()) {
7711 SDValue Flags = Op.getOperand(1);
7712 SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
7713 return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
7714 }
7715 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
7716 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7717 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7718 }
7719
7720 assert(DstVT.getScalarType() == MVT::bf16 &&
7721 "custom lower FP_ROUND for f16 or bf16");
7722 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
7723
7724 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
7725 // hardware f32 -> bf16 instruction.
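// Rounding the f64 to f32 with round-to-odd keeps the sticky information in
// the low mantissa bit, so the final f32 -> bf16 rounding cannot suffer from
// double rounding.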
7726 EVT F32VT = SrcVT.isVector() ? SrcVT.changeVectorElementType(MVT::f32) :
7727 MVT::f32;
7728 SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
7729 return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
7730 DAG.getTargetConstant(0, DL, MVT::i32));
7731}
7732
7733SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
7734 SelectionDAG &DAG) const {
7735 EVT VT = Op.getValueType();
7736 const MachineFunction &MF = DAG.getMachineFunction();
7737 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7738 bool IsIEEEMode = Info->getMode().IEEE;
7739
7740 // FIXME: Assert during selection that this is only selected for
7741 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
7742 // mode functions, but this happens to be OK since it's only done in cases
7743 // where it is known that there is no sNaN.
7744 if (IsIEEEMode)
7745 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
7746
7747 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7748 VT == MVT::v16bf16)
7749 return splitBinaryVectorOp(Op, DAG);
7750 return Op;
7751}
7752
7753SDValue
7754SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
7755 SelectionDAG &DAG) const {
7756 EVT VT = Op.getValueType();
7757 const MachineFunction &MF = DAG.getMachineFunction();
7758 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7759 bool IsIEEEMode = Info->getMode().IEEE;
7760
7761 if (IsIEEEMode)
7762 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
7763
7764 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7765 VT == MVT::v16bf16)
7766 return splitBinaryVectorOp(Op, DAG);
7767 return Op;
7768}
7769
7770SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
7771 SelectionDAG &DAG) const {
7772 EVT VT = Op.getValueType();
7773 if (VT.isVector())
7774 return splitBinaryVectorOp(Op, DAG);
7775
7776 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7777 !Subtarget->hasMinimum3Maximum3F16() &&
7778 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7779 "should not need to widen f16 minimum/maximum to v2f16");
7780
7781 // Widen f16 operation to v2f16
7782
7783 // fminimum f16:x, f16:y ->
7784 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
7785 // (v2f16 (scalar_to_vector y))), 0
7786 SDLoc SL(Op);
7787 SDValue WideSrc0 =
7788 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
7789 SDValue WideSrc1 =
7790 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
7791
7792 SDValue Widened =
7793 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7794
7795 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
7796 DAG.getConstant(0, SL, MVT::i32));
7797}
7798
7799SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
7800 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
7801 EVT VT = Op.getValueType();
7802 assert(VT == MVT::f16);
7803
7804 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
7805 EVT ExpVT = Exp.getValueType();
7806 if (ExpVT == MVT::i16)
7807 return Op;
7808
7809 SDLoc DL(Op);
7810
7811 // Correct the exponent type for f16 to i16.
7812 // Clamp the range of the exponent to the instruction's range.
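// For example, an i32 exponent is first clamped to [-32768, 32767] (the i16
// range) and only then truncated, so out-of-range exponents still saturate
// instead of wrapping to an unrelated value.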
7813
7814 // TODO: This should be a generic narrowing legalization, and can easily be
7815 // done for GlobalISel.
7816
7817 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
7818 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
7819
7820 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
7821 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
7822
7823 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
7824
7825 if (IsStrict) {
7826 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
7827 {Op.getOperand(0), Op.getOperand(1), TruncExp});
7828 }
7829
7830 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
7831}
7832
7834 switch (Op->getOpcode()) {
7835 case ISD::SRA:
7836 case ISD::SMIN:
7837 case ISD::SMAX:
7838 return ISD::SIGN_EXTEND;
7839 case ISD::SRL:
7840 case ISD::UMIN:
7841 case ISD::UMAX:
7842 return ISD::ZERO_EXTEND;
7843 case ISD::ADD:
7844 case ISD::SUB:
7845 case ISD::AND:
7846 case ISD::OR:
7847 case ISD::XOR:
7848 case ISD::SHL:
7849 case ISD::SELECT:
7850 case ISD::MUL:
7851 // operation result won't be influenced by garbage high bits.
7852 // TODO: are all of those cases correct, and are there more?
7853 return ISD::ANY_EXTEND;
7854 case ISD::SETCC: {
7855 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7857 }
7858 default:
7859 llvm_unreachable("unexpected opcode!");
7860 }
7861}
7862
7863SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
7864 DAGCombinerInfo &DCI) const {
7865 const unsigned Opc = Op.getOpcode();
7866 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
7867 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
7868 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
7869 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
7870 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
7871
7872 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
7873 : Op->getOperand(0).getValueType();
7874 auto ExtTy = OpTy.changeElementType(MVT::i32);
7875
7876 if (DCI.isBeforeLegalizeOps() ||
7877 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
7878 return SDValue();
7879
7880 auto &DAG = DCI.DAG;
7881
7882 SDLoc DL(Op);
7883 SDValue LHS;
7884 SDValue RHS;
7885 if (Opc == ISD::SELECT) {
7886 LHS = Op->getOperand(1);
7887 RHS = Op->getOperand(2);
7888 } else {
7889 LHS = Op->getOperand(0);
7890 RHS = Op->getOperand(1);
7891 }
7892
7893 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
7894 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
7895
7896 // Special case: for shifts, the RHS always needs a zext.
7897 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
7898 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
7899 else
7900 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
7901
7902 // setcc always returns i1 / an i1 vector, so there is no need to truncate after.
7903 if (Opc == ISD::SETCC) {
7904 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7905 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
7906 }
7907
7908 // For other ops, we extend the operation's return type as well so we need to
7909 // truncate back to the original type.
7910 SDValue NewVal;
7911 if (Opc == ISD::SELECT)
7912 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
7913 else
7914 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
7915
7916 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
7917}
7918
7919SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
7920 SDValue Mag = Op.getOperand(0);
7921 EVT MagVT = Mag.getValueType();
7922
7923 if (MagVT.getVectorNumElements() > 2)
7924 return splitBinaryVectorOp(Op, DAG);
7925
7926 SDValue Sign = Op.getOperand(1);
7927 EVT SignVT = Sign.getValueType();
7928
7929 if (MagVT == SignVT)
7930 return Op;
7931
7932 // fcopysign v2f16:mag, v2f32:sign ->
7933 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
7934
7935 SDLoc SL(Op);
7936 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
7937 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
7938
7939 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
7940
7941 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
7942}
7943
7944// Custom lowering for vector multiplications and s_mul_u64.
7945SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
7946 EVT VT = Op.getValueType();
7947
7948 // Split vector operands.
7949 if (VT.isVector())
7950 return splitBinaryVectorOp(Op, DAG);
7951
7952 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
7953
7954 // There are four ways to lower s_mul_u64:
7955 //
7956 // 1. If all the operands are uniform, then we lower it as it is.
7957 //
7958 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
7959 // multiplications because there is not a vector equivalent of s_mul_u64.
7960 //
7961 // 3. If the cost model decides that it is more efficient to use vector
7962 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
7963 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
7964 //
7965 // 4. If the cost model decides to use vector registers and both of the
7966 // operands are zero-extended/sign-extended from 32-bits, then we split the
7967 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
7968 // possible to check if the operands are zero-extended or sign-extended in
7969 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
7970 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
7971 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
7972 // If the cost model decides that we have to use vector registers, then
7973 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
7974 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
7975 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
7976 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
7977 // SIInstrInfo.cpp .
7978
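// The code below handles cases 1 and 4 here: divergent multiplies fall back
// to the default lowering (which splits them into 32-bit pieces), operands
// known to be zero- or sign-extended from 32 bits select the corresponding
// pseudo, and anything else is left as s_mul_u64.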
7979 if (Op->isDivergent())
7980 return SDValue();
7981
7982 SDValue Op0 = Op.getOperand(0);
7983 SDValue Op1 = Op.getOperand(1);
7984 // If all the operands are zero-extended to 32 bits, then we replace s_mul_u64
7985 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
7986 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
7987 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
7988 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
7989 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
7990 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
7991 SDLoc SL(Op);
7992 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7993 return SDValue(
7994 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
7995 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
7996 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
7997 if (Op0SignBits >= 33 && Op1SignBits >= 33)
7998 return SDValue(
7999 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
8000 // If all the operands are uniform, then we lower s_mul_u64 as it is.
8001 return Op;
8002}
8003
8004SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
8005 EVT VT = Op.getValueType();
8006 SDLoc SL(Op);
8007 SDValue LHS = Op.getOperand(0);
8008 SDValue RHS = Op.getOperand(1);
8009 bool isSigned = Op.getOpcode() == ISD::SMULO;
8010
8011 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
8012 const APInt &C = RHSC->getAPIntValue();
8013 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
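// e.g. umulo(x, 8): Result = x << 3, and overflow iff (Result >> 3) != x,
// i.e. iff any of the top 3 bits of x were set. For smulo an arithmetic shift
// is used instead so the sign bits survive the comparison.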
8014 if (C.isPowerOf2()) {
8015 // smulo(x, signed_min) is same as umulo(x, signed_min).
8016 bool UseArithShift = isSigned && !C.isMinSignedValue();
8017 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
8018 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
8019 SDValue Overflow =
8020 DAG.getSetCC(SL, MVT::i1,
8021 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
8022 Result, ShiftAmt),
8023 LHS, ISD::SETNE);
8024 return DAG.getMergeValues({Result, Overflow}, SL);
8025 }
8026 }
8027
8028 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
8029 SDValue Top =
8030 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
8031
8032 SDValue Sign = isSigned
8033 ? DAG.getNode(ISD::SRA, SL, VT, Result,
8034 DAG.getConstant(VT.getScalarSizeInBits() - 1,
8035 SL, MVT::i32))
8036 : DAG.getConstant(0, SL, VT);
8037 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
8038
8039 return DAG.getMergeValues({Result, Overflow}, SL);
8040}
8041
8042SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
8043 if (Op->isDivergent()) {
8044 // Select to V_MAD_[IU]64_[IU]32.
8045 return Op;
8046 }
8047 if (Subtarget->hasSMulHi()) {
8048 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
8049 return SDValue();
8050 }
8051 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
8052 // calculate the high part, so we might as well do the whole thing with
8053 // V_MAD_[IU]64_[IU]32.
8054 return Op;
8055}
8056
8057SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
8058 if (!Subtarget->isTrapHandlerEnabled() ||
8059 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
8060 return lowerTrapEndpgm(Op, DAG);
8061
8062 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
8063 : lowerTrapHsaQueuePtr(Op, DAG);
8064}
8065
8066SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
8067 SDLoc SL(Op);
8068 SDValue Chain = Op.getOperand(0);
8069 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8070}
8071
8072SDValue
8073SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
8074 const SDLoc &DL, Align Alignment,
8075 ImplicitParameter Param) const {
8076 MachineFunction &MF = DAG.getMachineFunction();
8077 uint64_t Offset = getImplicitParameterOffset(MF, Param);
8078 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
8079 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
8080 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
8083}
8084
8085SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
8086 SelectionDAG &DAG) const {
8087 SDLoc SL(Op);
8088 SDValue Chain = Op.getOperand(0);
8089
8090 SDValue QueuePtr;
8091 // For code object version 5, QueuePtr is passed through implicit kernarg.
8092 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8094 QueuePtr =
8095 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
8096 } else {
8097 MachineFunction &MF = DAG.getMachineFunction();
8098 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8099 Register UserSGPR = Info->getQueuePtrUserSGPR();
8100
8101 if (UserSGPR == AMDGPU::NoRegister) {
8102 // We probably are in a function incorrectly marked with
8103 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
8104 // trap, so just use a null pointer.
8105 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
8106 } else {
8107 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
8108 MVT::i64);
8109 }
8110 }
8111
8112 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
8113 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
8114
8115 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8116 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
8117 ToReg.getValue(1)};
8118 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8119}
8120
8121SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
8122 SDLoc SL(Op);
8123 SDValue Chain = Op.getOperand(0);
8124
8125 // We need to simulate the 's_trap 2' instruction on targets that run in
8126 // PRIV=1 (where it is treated as a nop).
8127 if (Subtarget->hasPrivEnabledTrap2NopBug())
8128 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
8129
8130 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8131 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8132 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8133}
8134
8135SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
8136 SDLoc SL(Op);
8137 SDValue Chain = Op.getOperand(0);
8138 MachineFunction &MF = DAG.getMachineFunction();
8139
8140 if (!Subtarget->isTrapHandlerEnabled() ||
8141 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
8142 LLVMContext &Ctx = MF.getFunction().getContext();
8143 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
8144 "debugtrap handler not supported",
8145 Op.getDebugLoc(), DS_Warning));
8146 return Chain;
8147 }
8148
8149 uint64_t TrapID =
8150 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
8151 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8152 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8153}
8154
8155SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
8156 SelectionDAG &DAG) const {
8157 if (Subtarget->hasApertureRegs()) {
8158 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
8159 ? AMDGPU::SRC_SHARED_BASE
8160 : AMDGPU::SRC_PRIVATE_BASE;
8161 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8162 !Subtarget->hasGloballyAddressableScratch()) &&
8163 "Cannot use src_private_base with globally addressable scratch!");
8164 // Note: this feature (register) is broken. When used as a 32-bit operand,
8165 // it returns a wrong value (all zeroes?). The real value is in the upper 32
8166 // bits.
8167 //
8168 // To work around the issue, directly emit a 64 bit mov from this register
8169 // then extract the high bits. Note that this shouldn't even result in a
8170 // shift being emitted and simply become a pair of registers (e.g.):
8171 // s_mov_b64 s[6:7], src_shared_base
8172 // v_mov_b32_e32 v1, s7
8173 //
8174 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
8175 // coalescing would kick in and it would think it's okay to use the "HI"
8176 // subregister directly (instead of extracting the HI 32 bits) which is an
8177 // artificial (unusable) register.
8178 // Register TableGen definitions would need an overhaul to get rid of the
8179 // artificial "HI" aperture registers and prevent this kind of issue from
8180 // happening.
8181 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
8182 DAG.getRegister(ApertureRegNo, MVT::i64));
8183 return DAG.getNode(
8184 ISD::TRUNCATE, DL, MVT::i32,
8185 DAG.getNode(ISD::SRL, DL, MVT::i64,
8186 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
8187 }
8188
8189 // For code object version 5, private_base and shared_base are passed through
8190 // implicit kernargs.
8191 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8195 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
8196 }
8197
8198 MachineFunction &MF = DAG.getMachineFunction();
8199 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8200 Register UserSGPR = Info->getQueuePtrUserSGPR();
8201 if (UserSGPR == AMDGPU::NoRegister) {
8202 // We probably are in a function incorrectly marked with
8203 // amdgpu-no-queue-ptr. This is undefined.
8204 return DAG.getPOISON(MVT::i32);
8205 }
8206
8207 SDValue QueuePtr =
8208 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
8209
8210 // Offset into amd_queue_t for group_segment_aperture_base_hi /
8211 // private_segment_aperture_base_hi.
8212 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
8213
8214 SDValue Ptr =
8215 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
8216
8217 // TODO: Use custom target PseudoSourceValue.
8218 // TODO: We should use the value from the IR intrinsic call, but it might not
8219 // be available and how do we get it?
8220 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
8221 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
8222 commonAlignment(Align(64), StructOffset),
8225}
8226
8227/// Return true if the value is a known valid address, such that a null check is
8228/// not necessary.
8230 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
8232 return true;
8233
8234 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
8235 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
8236
8237 // TODO: Search through arithmetic, handle arguments and loads
8238 // marked nonnull.
8239 return false;
8240}
8241
8242SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
8243 SelectionDAG &DAG) const {
8244 SDLoc SL(Op);
8245
8246 const AMDGPUTargetMachine &TM =
8247 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
8248
8249 unsigned DestAS, SrcAS;
8250 SDValue Src;
8251 bool IsNonNull = false;
8252 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
8253 SrcAS = ASC->getSrcAddressSpace();
8254 Src = ASC->getOperand(0);
8255 DestAS = ASC->getDestAddressSpace();
8256 } else {
8257 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
8258 Op.getConstantOperandVal(0) ==
8259 Intrinsic::amdgcn_addrspacecast_nonnull);
8260 Src = Op->getOperand(1);
8261 SrcAS = Op->getConstantOperandVal(2);
8262 DestAS = Op->getConstantOperandVal(3);
8263 IsNonNull = true;
8264 }
8265
8266 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
8267
8268 // flat -> local/private
8269 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
8270 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
8271 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
8272 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8273
8274 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
8275 Subtarget->hasGloballyAddressableScratch()) {
8276 // flat -> private with globally addressable scratch: subtract
8277 // src_flat_scratch_base_lo.
8278 SDValue FlatScratchBaseLo(
8279 DAG.getMachineNode(
8280 AMDGPU::S_MOV_B32, SL, MVT::i32,
8281 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8282 0);
8283 Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
8284 }
8285
8286 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8287 return Ptr;
8288
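// The incoming flat pointer may be null; flat null must map to the segment
// null value rather than to a valid local/private address, so compare and select.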
8289 unsigned NullVal = TM.getNullPointerValue(DestAS);
8290 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8291 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
8292
8293 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
8294 SegmentNullPtr);
8295 }
8296 }
8297
8298 // local/private -> flat
8299 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
8300 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
8301 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
8302 SDValue CvtPtr;
8303 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
8304 Subtarget->hasGloballyAddressableScratch()) {
8305 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
8306 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
8307 SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
8308 SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
8309 ThreadID = DAG.getNode(
8310 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8311 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
8312 AllOnes, ThreadID);
8313 if (Subtarget->isWave64())
8314 ThreadID = DAG.getNode(
8315 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8316 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
8317 AllOnes, ThreadID);
8318 SDValue ShAmt = DAG.getShiftAmountConstant(
8319 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8320 SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
8321 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
8322 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8323 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
8324 // 64-bit hi:lo value.
8325 SDValue FlatScratchBase = {
8326 DAG.getMachineNode(
8327 AMDGPU::S_MOV_B64, SL, MVT::i64,
8328 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8329 0};
8330 CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8331 } else {
8332 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8333 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
8334 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8335 }
8336
8337 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8338 return CvtPtr;
8339
8340 unsigned NullVal = TM.getNullPointerValue(SrcAS);
8341 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8342
8343 SDValue NonNull =
8344 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
8345
8346 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
8347 FlatNullPtr);
8348 }
8349 }
8350
8351 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8352 Op.getValueType() == MVT::i64) {
8353 const SIMachineFunctionInfo *Info =
8354 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
8355 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
8356 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
8357 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
8358 }
8359
8360 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8361 Src.getValueType() == MVT::i64)
8362 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8363
8364 // global <-> flat are no-ops and never emitted.
8365
8366 // Invalid casts are poison.
8367 return DAG.getPOISON(Op->getValueType(0));
8368}
8369
8370// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
8371// the small vector and inserting them into the big vector. That is better than
8372// the default expansion of doing it via a stack slot. Even though the use of
8373// the stack slot would be optimized away afterwards, the stack slot itself
8374// remains.
8375SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
8376 SelectionDAG &DAG) const {
8377 SDValue Vec = Op.getOperand(0);
8378 SDValue Ins = Op.getOperand(1);
8379 SDValue Idx = Op.getOperand(2);
8380 EVT VecVT = Vec.getValueType();
8381 EVT InsVT = Ins.getValueType();
8382 EVT EltVT = VecVT.getVectorElementType();
8383 unsigned InsNumElts = InsVT.getVectorNumElements();
8384 unsigned IdxVal = Idx->getAsZExtVal();
8385 SDLoc SL(Op);
8386
8387 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
8388 // Insert 32-bit registers at a time.
8389 assert(InsNumElts % 2 == 0 && "expect legal vector types");
8390
8391 unsigned VecNumElts = VecVT.getVectorNumElements();
8392 EVT NewVecVT =
8393 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
8394 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8395 : EVT::getVectorVT(*DAG.getContext(),
8396 MVT::i32, InsNumElts / 2);
8397
8398 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
8399 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
8400
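// Each iteration moves one 32-bit chunk (a pair of 16-bit elements) of Ins into
// the matching i32 lane of the bitcast destination vector.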
8401 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
8402 SDValue Elt;
8403 if (InsNumElts == 2) {
8404 Elt = Ins;
8405 } else {
8406 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
8407 DAG.getConstant(I, SL, MVT::i32));
8408 }
8409 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
8410 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
8411 }
8412
8413 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
8414 }
8415
8416 for (unsigned I = 0; I != InsNumElts; ++I) {
8417 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
8418 DAG.getConstant(I, SL, MVT::i32));
8419 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
8420 DAG.getConstant(IdxVal + I, SL, MVT::i32));
8421 }
8422 return Vec;
8423}
8424
8425SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
8426 SelectionDAG &DAG) const {
8427 SDValue Vec = Op.getOperand(0);
8428 SDValue InsVal = Op.getOperand(1);
8429 SDValue Idx = Op.getOperand(2);
8430 EVT VecVT = Vec.getValueType();
8431 EVT EltVT = VecVT.getVectorElementType();
8432 unsigned VecSize = VecVT.getSizeInBits();
8433 unsigned EltSize = EltVT.getSizeInBits();
8434 SDLoc SL(Op);
8435
8436 // Specially handle the case of v4i16 with static indexing.
8437 unsigned NumElts = VecVT.getVectorNumElements();
8438 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
8439 if (NumElts == 4 && EltSize == 16 && KIdx) {
8440 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
8441
8442 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8443 DAG.getConstant(0, SL, MVT::i32));
8444 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8445 DAG.getConstant(1, SL, MVT::i32));
8446
8447 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
8448 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
8449
8450 unsigned Idx = KIdx->getZExtValue();
8451 bool InsertLo = Idx < 2;
8452 SDValue InsHalf = DAG.getNode(
8453 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
8454 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
8455 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8456
8457 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
8458
8459 SDValue Concat =
8460 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
8461 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8462
8463 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
8464 }
8465
8466 // Static indexing does not lower to stack access, and hence there is no need
8467 // for special custom lowering to avoid stack access.
8468 if (isa<ConstantSDNode>(Idx))
8469 return SDValue();
8470
8471 // Avoid stack access for dynamic indexing by custom lowering to
8472 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
8473
8474 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
8475
8476 MVT IntVT = MVT::getIntegerVT(VecSize);
8477
8478 // Convert vector index to bit-index and get the required bit mask.
8479 assert(isPowerOf2_32(EltSize));
8480 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
8481 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8482 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8483 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
8484 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
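// BFM now holds EltSize consecutive set bits shifted to the selected element's
// bit position; these are the bits of the packed vector that get replaced.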
8485
8486 // 1. Create a congruent vector with the target value in each element.
8487 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
8488 DAG.getSplatBuildVector(VecVT, SL, InsVal));
8489
8490 // 2. Mask off all other indices except the required index within (1).
8491 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
8492
8493 // 3. Mask off the required index within the target vector.
8494 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8495 SDValue RHS =
8496 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
8497
8498 // 4. Get (2) and (3) ORed into the target vector.
8499 SDValue BFI =
8500 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
8501
8502 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
8503}
8504
8505SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
8506 SelectionDAG &DAG) const {
8507 SDLoc SL(Op);
8508
8509 EVT ResultVT = Op.getValueType();
8510 SDValue Vec = Op.getOperand(0);
8511 SDValue Idx = Op.getOperand(1);
8512 EVT VecVT = Vec.getValueType();
8513 unsigned VecSize = VecVT.getSizeInBits();
8514 EVT EltVT = VecVT.getVectorElementType();
8515
8516 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
8517
8518 // Make sure we do any optimizations that will make it easier to fold
8519 // source modifiers before obscuring it with bit operations.
8520
8521 // XXX - Why doesn't this get called when vector_shuffle is expanded?
8522 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
8523 return Combined;
8524
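// For 128/256/512-bit vectors, split the source into two halves, select the half
// that holds the element, and extract from it with the index masked into that half.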
8525 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8526 SDValue Lo, Hi;
8527 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
8528
8529 if (VecSize == 128) {
8530 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
8531 Lo = DAG.getBitcast(LoVT,
8532 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8533 DAG.getConstant(0, SL, MVT::i32)));
8534 Hi = DAG.getBitcast(HiVT,
8535 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8536 DAG.getConstant(1, SL, MVT::i32)));
8537 } else if (VecSize == 256) {
8538 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
8539 SDValue Parts[4];
8540 for (unsigned P = 0; P < 4; ++P) {
8541 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8542 DAG.getConstant(P, SL, MVT::i32));
8543 }
8544
8545 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8546 Parts[0], Parts[1]));
8547 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8548 Parts[2], Parts[3]));
8549 } else {
8550 assert(VecSize == 512);
8551
8552 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
8553 SDValue Parts[8];
8554 for (unsigned P = 0; P < 8; ++P) {
8555 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8556 DAG.getConstant(P, SL, MVT::i32));
8557 }
8558
8559 Lo = DAG.getBitcast(LoVT,
8560 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8561 Parts[0], Parts[1], Parts[2], Parts[3]));
8562 Hi = DAG.getBitcast(HiVT,
8563 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8564 Parts[4], Parts[5], Parts[6], Parts[7]));
8565 }
8566
8567 EVT IdxVT = Idx.getValueType();
8568 unsigned NElem = VecVT.getVectorNumElements();
8569 assert(isPowerOf2_32(NElem));
8570 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
8571 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
8572 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
8573 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
8574 }
8575
8576 assert(VecSize <= 64);
8577
8578 MVT IntVT = MVT::getIntegerVT(VecSize);
8579
8580 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
8581 SDValue VecBC = peekThroughBitcasts(Vec);
8582 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8583 SDValue Src = VecBC.getOperand(0);
8584 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8585 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
8586 }
8587
8588 unsigned EltSize = EltVT.getSizeInBits();
8589 assert(isPowerOf2_32(EltSize));
8590
8591 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8592
8593 // Convert vector index to bit-index (* EltSize)
8594 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8595
8596 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8597 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
8598
8599 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8600 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
8601 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
8602 }
8603
8604 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
8605}
8606
8607static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
8608 assert(Elt % 2 == 0);
8609 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8610}
8611
8612static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
8613 assert(Elt % 2 == 0);
8614 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8615 !(Mask[Elt + 1] & 1);
8616}
8617
8618SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
8619 SelectionDAG &DAG) const {
8620 SDLoc SL(Op);
8621 EVT ResultVT = Op.getValueType();
8622 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
8623 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
8624 const int NewSrcNumElts = 2;
8625 MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
8626 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
8627
8628 // Break up the shuffle into register-sized pieces.
8629 //
8630 // We're trying to form sub-shuffles that the register allocation pipeline
8631 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
8632 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
8633 // pair of copies into a consecutive register copy, so use the ordinary
8634 // extract_vector_elt lowering unless we can use the shuffle.
8635 //
8636 // TODO: This is a bit of a hack, and we should probably always use
8637 // extract_subvector for the largest possible subvector we can (or at least
8638 // use it for PackVT aligned pieces). However, we have worse support for
8639 // combines on them and don't directly treat extract_subvector /
8640 // insert_subvector as legal. The DAG scheduler also ends up doing a worse
8641 // job with the extract_subvectors.
8642 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
8643
8644 // vector_shuffle <0,1,6,7> lhs, rhs
8645 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
8646 //
8647 // vector_shuffle <6,7,2,3> lhs, rhs
8648 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
8649 //
8650 // vector_shuffle <6,7,0,1> lhs, rhs
8651 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
8652
8653 // Avoid scalarizing when both halves are reading from consecutive elements.
8654
8655 // If we're treating 2 element shuffles as legal, also create odd-to-even
8656 // shuffles of neighboring pairs.
8657 //
8658 // vector_shuffle <3,2,7,6> lhs, rhs
8659 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
8660 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
8661
8663 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
8664 if (ShouldUseConsecutiveExtract &&
8665 elementPairIsContiguous(SVN->getMask(), I)) {
8666 const int Idx = SVN->getMaskElt(I);
8667 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8668 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8669 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
8670 SVN->getOperand(VecIdx),
8671 DAG.getConstant(EltIdx, SL, MVT::i32));
8672 Pieces.push_back(SubVec);
8673 } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
8674 isOperationLegal(ISD::VECTOR_SHUFFLE, PackVT)) {
8675 int Idx0 = SVN->getMaskElt(I);
8676 int Idx1 = SVN->getMaskElt(I + 1);
8677
8678 SDValue SrcOp0 = SVN->getOperand(0);
8679 SDValue SrcOp1 = SrcOp0;
8680 if (Idx0 >= SrcNumElts) {
8681 SrcOp0 = SVN->getOperand(1);
8682 Idx0 -= SrcNumElts;
8683 }
8684
8685 if (Idx1 >= SrcNumElts) {
8686 SrcOp1 = SVN->getOperand(1);
8687 Idx1 -= SrcNumElts;
8688 }
8689
8690 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8691 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8692
8693 // Extract nearest even aligned piece.
8694 SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
8695 DAG.getConstant(AlignedIdx0, SL, MVT::i32));
8696 SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
8697 DAG.getConstant(AlignedIdx1, SL, MVT::i32));
8698
8699 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8700 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8701
8702 SDValue Result0 = SubVec0;
8703 SDValue Result1 = SubVec0;
8704
8705 if (SubVec0 != SubVec1) {
8706 NewMaskIdx1 += NewSrcNumElts;
8707 Result1 = SubVec1;
8708 } else {
8709 Result1 = DAG.getPOISON(PackVT);
8710 }
8711
8712 SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
8713 {NewMaskIdx0, NewMaskIdx1});
8714 Pieces.push_back(Shuf);
8715 } else {
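// Fall back to scalarizing this pair: extract both source elements and rebuild
// a two-element vector from them.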
8716 const int Idx0 = SVN->getMaskElt(I);
8717 const int Idx1 = SVN->getMaskElt(I + 1);
8718 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8719 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8720 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8721 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8722
8723 SDValue Vec0 = SVN->getOperand(VecIdx0);
8724 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
8725 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
8726
8727 SDValue Vec1 = SVN->getOperand(VecIdx1);
8728 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
8729 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
8730 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
8731 }
8732 }
8733
8734 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
8735}
8736
8737SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
8738 SelectionDAG &DAG) const {
8739 SDValue SVal = Op.getOperand(0);
8740 EVT ResultVT = Op.getValueType();
8741 EVT SValVT = SVal.getValueType();
8742 SDValue UndefVal = DAG.getPOISON(SValVT);
8743 SDLoc SL(Op);
8744
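// Lane 0 gets the scalar; every remaining lane of the result is poison.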
8745 SmallVector<SDValue, 16> VElts;
8746 VElts.push_back(SVal);
8747 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
8748 VElts.push_back(UndefVal);
8749
8750 return DAG.getBuildVector(ResultVT, SL, VElts);
8751}
8752
8753SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
8754 SelectionDAG &DAG) const {
8755 SDLoc SL(Op);
8756 EVT VT = Op.getValueType();
8757
8758 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8759 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
8760
8761 SDValue Lo = Op.getOperand(0);
8762 SDValue Hi = Op.getOperand(1);
8763
8764 // Avoid adding defined bits with the zero_extend.
8765 if (Hi.isUndef()) {
8766 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8767 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
8768 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
8769 }
8770
8771 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
8772 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
8773
8774 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
8775 DAG.getConstant(16, SL, MVT::i32));
8776 if (Lo.isUndef())
8777 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
8778
8779 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8780 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
8781
8782 SDValue Or =
8783 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
8784 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
8785 }
8786
8787 // Split into 2-element chunks.
8788 const unsigned NumParts = VT.getVectorNumElements() / 2;
8789 EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
8790 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
8791
8792 SmallVector<SDValue, 8> Casts;
8793 for (unsigned P = 0; P < NumParts; ++P) {
8794 SDValue Vec = DAG.getBuildVector(
8795 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
8796 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
8797 }
8798
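// Reassemble the 32-bit chunks as an integer vector and bitcast back to the
// original vector type.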
8799 SDValue Blend =
8800 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
8801 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
8802}
8803
8804 bool SITargetLowering::isOffsetFoldingLegal(
8805 const GlobalAddressSDNode *GA) const {
8806 // OSes that use ELF REL relocations (instead of RELA) can only store a
8807 // 32-bit addend in the instruction, so it is not safe to allow offset folding
8808 // which can create arbitrary 64-bit addends. (This is only a problem for
8809 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
8810 // the high 32 bits of the addend.)
8811 //
8812 // This should be kept in sync with how HasRelocationAddend is initialized in
8813 // the constructor of ELFAMDGPUAsmBackend.
8814 if (!Subtarget->isAmdHsaOS())
8815 return false;
8816
8817 // We can fold offsets for anything that doesn't require a GOT relocation.
8818 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
8819 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
8820 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
8821 !shouldEmitGOTReloc(GA->getGlobal());
8822}
8823
8824static SDValue
8825 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
8826 const SDLoc &DL, int64_t Offset, EVT PtrVT,
8827 unsigned GAFlags = SIInstrInfo::MO_NONE) {
8828 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
8829 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
8830 // lowered to the following code sequence:
8831 //
8832 // For constant address space:
8833 // s_getpc_b64 s[0:1]
8834 // s_add_u32 s0, s0, $symbol
8835 // s_addc_u32 s1, s1, 0
8836 //
8837 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8838 // a fixup or relocation is emitted to replace $symbol with a literal
8839 // constant, which is a pc-relative offset from the encoding of the $symbol
8840 // operand to the global variable.
8841 //
8842 // For global address space:
8843 // s_getpc_b64 s[0:1]
8844 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
8845 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
8846 //
8847 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8848 // fixups or relocations are emitted to replace $symbol@*@lo and
8849 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
8850 // which is a 64-bit pc-relative offset from the encoding of the $symbol
8851 // operand to the global variable.
8852 if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
8853 assert(GAFlags != SIInstrInfo::MO_NONE);
8854
8855 SDValue Ptr =
8856 DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
8857 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
8858 }
8859
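// Without 64-bit literals, materialize the address as a lo/hi pair of 32-bit
// relocated operands combined by PC_ADD_REL_OFFSET.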
8860 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
8861 SDValue PtrHi;
8862 if (GAFlags == SIInstrInfo::MO_NONE)
8863 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
8864 else
8865 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
8866 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
8867}
8868
8869SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
8870 SDValue Op,
8871 SelectionDAG &DAG) const {
8872 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
8873 SDLoc DL(GSD);
8874 EVT PtrVT = Op.getValueType();
8875
8876 const GlobalValue *GV = GSD->getGlobal();
8877 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8878 shouldUseLDSConstAddress(GV)) ||
8879 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
8880 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
8881 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8882 GV->hasExternalLinkage()) {
8883 Type *Ty = GV->getValueType();
8884 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
8885 // zero-sized type in other languages to declare dynamic shared memory
8886 // whose size is not known at compile time. It will be allocated by the
8887 // runtime and placed directly after the statically allocated memory.
8888 // All such declarations share the same offset.
8889 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
8890 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
8891 // Adjust alignment for that dynamic shared memory array.
8894 MFI->setUsesDynamicLDS(true);
8895 return SDValue(
8896 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
8897 }
8898 }
8899 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
8900 }
8901
8902 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
8903 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
8904 SIInstrInfo::MO_ABS32_LO);
8905 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
8906 }
8907
8908 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
8909 if (Subtarget->has64BitLiterals()) {
8910 SDValue Addr = DAG.getTargetGlobalAddress(
8911 GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
8912 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
8913 0);
8914 }
8915
8916 SDValue AddrLo = DAG.getTargetGlobalAddress(
8917 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
8918 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
8919
8920 SDValue AddrHi = DAG.getTargetGlobalAddress(
8921 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
8922 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
8923
8924 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
8925 }
8926
8927 if (shouldEmitFixup(GV))
8928 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
8929
8930 if (shouldEmitPCReloc(GV))
8931 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
8932 SIInstrInfo::MO_REL32);
8933
8934 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
8935 SIInstrInfo::MO_GOTPCREL32);
8936 PointerType *PtrTy =
8937 PointerType::get(*DAG.getContext(), AMDGPUAS::CONSTANT_ADDRESS);
8938 const DataLayout &DataLayout = DAG.getDataLayout();
8939 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
8940 MachinePointerInfo PtrInfo =
8941 MachinePointerInfo::getGOT(DAG.getMachineFunction());
8942
8943 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
8944 MachineMemOperand::MODereferenceable |
8945 MachineMemOperand::MOInvariant);
8946}
8947
8948 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
8949 const SDLoc &DL, SDValue V) const {
8950 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
8951 // the destination register.
8952 //
8953 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
8954 // so we will end up with redundant moves to m0.
8955 //
8956 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
8957
8958 // A Null SDValue creates a glue result.
8959 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
8960 V, Chain);
8961 return SDValue(M0, 0);
8962}
8963
8964SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
8965 MVT VT,
8966 unsigned Offset) const {
8967 SDLoc SL(Op);
8968 SDValue Param = lowerKernargMemParameter(
8969 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
8970 // The local size values will have the hi 16-bits as zero.
8971 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
8972 DAG.getValueType(VT));
8973}
8974
8975 static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8976 EVT VT) {
8977 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8978 DAG.getMachineFunction().getFunction(),
8979 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
8980 return DAG.getPOISON(VT);
8981}
8982
8983 static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8984 EVT VT) {
8985 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8986 DAG.getMachineFunction().getFunction(),
8987 "intrinsic not supported on subtarget", DL.getDebugLoc()));
8988 return DAG.getPOISON(VT);
8989}
8990
8991 static SDValue getBuildDwordsVector(SelectionDAG &DAG, const SDLoc &DL,
8992 ArrayRef<SDValue> Elts) {
8993 assert(!Elts.empty());
8994 MVT Type;
8995 unsigned NumElts = Elts.size();
8996
8997 if (NumElts <= 12) {
8998 Type = MVT::getVectorVT(MVT::f32, NumElts);
8999 } else {
9000 assert(Elts.size() <= 16);
9001 Type = MVT::v16f32;
9002 NumElts = 16;
9003 }
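// Bitcast every element to f32 and pad any remaining lanes with poison up to
// the chosen element count.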
9004
9005 SmallVector<SDValue, 16> VecElts(NumElts);
9006 for (unsigned i = 0; i < Elts.size(); ++i) {
9007 SDValue Elt = Elts[i];
9008 if (Elt.getValueType() != MVT::f32)
9009 Elt = DAG.getBitcast(MVT::f32, Elt);
9010 VecElts[i] = Elt;
9011 }
9012 for (unsigned i = Elts.size(); i < NumElts; ++i)
9013 VecElts[i] = DAG.getPOISON(MVT::f32);
9014
9015 if (NumElts == 1)
9016 return VecElts[0];
9017 return DAG.getBuildVector(Type, DL, VecElts);
9018}
9019
9020static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
9021 SDValue Src, int ExtraElts) {
9022 EVT SrcVT = Src.getValueType();
9023
9024 SmallVector<SDValue, 8> Elts;
9025
9026 if (SrcVT.isVector())
9027 DAG.ExtractVectorElements(Src, Elts);
9028 else
9029 Elts.push_back(Src);
9030
9031 SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
9032 while (ExtraElts--)
9033 Elts.push_back(Undef);
9034
9035 return DAG.getBuildVector(CastVT, DL, Elts);
9036}
9037
9038// Re-construct the required return value for a image load intrinsic.
9039 // This is more complicated due to the optional use of TexFailCtrl, which
9040 // means the required return type is an aggregate.
9041 static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
9042 ArrayRef<EVT> ResultTypes, bool IsTexFail,
9043 bool Unpacked, bool IsD16, int DMaskPop,
9044 int NumVDataDwords, bool IsAtomicPacked16Bit,
9045 const SDLoc &DL) {
9046 // Determine the required return type. This is the same regardless of
9047 // IsTexFail flag
9048 EVT ReqRetVT = ResultTypes[0];
9049 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
9050 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9051 ? (ReqRetNumElts + 1) / 2
9052 : ReqRetNumElts;
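// Packed D16 results and packed 16-bit atomics return two elements per dword,
// so round the element count up to a dword count.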
9053
9054 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9055
9056 MVT DataDwordVT =
9057 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
9058
9059 MVT MaskPopVT =
9060 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
9061
9062 SDValue Data(Result, 0);
9063 SDValue TexFail;
9064
9065 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
9066 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
9067 if (MaskPopVT.isVector()) {
9068 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
9069 SDValue(Result, 0), ZeroIdx);
9070 } else {
9071 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
9072 SDValue(Result, 0), ZeroIdx);
9073 }
9074 }
9075
9076 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
9077 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
9078 NumDataDwords - MaskPopDwords);
9079
9080 if (IsD16)
9081 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
9082
9083 EVT LegalReqRetVT = ReqRetVT;
9084 if (!ReqRetVT.isVector()) {
9085 if (!Data.getValueType().isInteger())
9086 Data = DAG.getNode(ISD::BITCAST, DL,
9087 Data.getValueType().changeTypeToInteger(), Data);
9088 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
9089 } else {
9090 // We need to widen the return vector to a legal type
9091 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
9092 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
9093 LegalReqRetVT =
9094 EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
9095 ReqRetVT.getVectorNumElements() + 1);
9096 }
9097 }
9098 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
9099
9100 if (IsTexFail) {
9101 TexFail =
9102 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
9103 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
9104
9105 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
9106 }
9107
9108 if (Result->getNumValues() == 1)
9109 return Data;
9110
9111 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
9112}
9113
9114static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
9115 SDValue *LWE, bool &IsTexFail) {
9116 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
9117
9118 uint64_t Value = TexFailCtrlConst->getZExtValue();
9119 if (Value) {
9120 IsTexFail = true;
9121 }
9122
9123 SDLoc DL(TexFailCtrlConst);
9124 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
9125 Value &= ~(uint64_t)0x1;
9126 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
9127 Value &= ~(uint64_t)0x2;
9128
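// Any bits still set are unrecognized TexFailCtrl flags; returning false makes
// the caller give up on lowering this intrinsic.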
9129 return Value == 0;
9130}
9131
9132 static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
9133 MVT PackVectorVT,
9134 SmallVectorImpl<SDValue> &PackedAddrs,
9135 unsigned DimIdx, unsigned EndIdx,
9136 unsigned NumGradients) {
9137 SDLoc DL(Op);
9138 for (unsigned I = DimIdx; I < EndIdx; I++) {
9139 SDValue Addr = Op.getOperand(I);
9140
9141 // Gradients are packed with undef for each coordinate.
9142 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
9143 // 1D: undef,dx/dh; undef,dx/dv
9144 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
9145 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
9146 if (((I + 1) >= EndIdx) ||
9147 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
9148 I == DimIdx + NumGradients - 1))) {
9149 if (Addr.getValueType() != MVT::i16)
9150 Addr = DAG.getBitcast(MVT::i16, Addr);
9151 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
9152 } else {
9153 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
9154 I++;
9155 }
9156 Addr = DAG.getBitcast(MVT::f32, Addr);
9157 PackedAddrs.push_back(Addr);
9158 }
9159}
9160
9161SDValue SITargetLowering::lowerImage(SDValue Op,
9162 const AMDGPU::ImageDimIntrinsicInfo *Intr,
9163 SelectionDAG &DAG, bool WithChain) const {
9164 SDLoc DL(Op);
9165 MachineFunction &MF = DAG.getMachineFunction();
9166 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
9167 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9168 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
9169 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
9170 unsigned IntrOpcode = Intr->BaseOpcode;
9171 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
9172 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9173 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9174
9175 SmallVector<EVT, 3> ResultTypes(Op->values());
9176 SmallVector<EVT, 3> OrigResultTypes(Op->values());
9177 bool IsD16 = false;
9178 bool IsG16 = false;
9179 bool IsA16 = false;
9180 SDValue VData;
9181 int NumVDataDwords = 0;
9182 bool AdjustRetType = false;
9183 bool IsAtomicPacked16Bit = false;
9184
9185 // Offset of intrinsic arguments
9186 const unsigned ArgOffset = WithChain ? 2 : 1;
9187
9188 unsigned DMask;
9189 unsigned DMaskLanes = 0;
9190
9191 if (BaseOpcode->Atomic) {
9192 VData = Op.getOperand(2);
9193
9194 IsAtomicPacked16Bit =
9195 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9196 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
9197
9198 bool Is64Bit = VData.getValueSizeInBits() == 64;
9199 if (BaseOpcode->AtomicX2) {
9200 SDValue VData2 = Op.getOperand(3);
9201 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
9202 {VData, VData2});
9203 if (Is64Bit)
9204 VData = DAG.getBitcast(MVT::v4i32, VData);
9205
9206 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9207 DMask = Is64Bit ? 0xf : 0x3;
9208 NumVDataDwords = Is64Bit ? 4 : 2;
9209 } else {
9210 DMask = Is64Bit ? 0x3 : 0x1;
9211 NumVDataDwords = Is64Bit ? 2 : 1;
9212 }
9213 } else {
9214 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
9215 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
9216
9217 if (BaseOpcode->Store) {
9218 VData = Op.getOperand(2);
9219
9220 MVT StoreVT = VData.getSimpleValueType();
9221 if (StoreVT.getScalarType() == MVT::f16) {
9222 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9223 return Op; // D16 is unsupported for this instruction
9224
9225 IsD16 = true;
9226 VData = handleD16VData(VData, DAG, true);
9227 }
9228
9229 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
9230 } else if (!BaseOpcode->NoReturn) {
9231 // Work out the num dwords based on the dmask popcount and underlying type
9232 // and whether packing is supported.
9233 MVT LoadVT = ResultTypes[0].getSimpleVT();
9234 if (LoadVT.getScalarType() == MVT::f16) {
9235 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9236 return Op; // D16 is unsupported for this instruction
9237
9238 IsD16 = true;
9239 }
9240
9241 // Confirm that the return type is large enough for the dmask specified
9242 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
9243 (!LoadVT.isVector() && DMaskLanes > 1))
9244 return Op;
9245
9246 // The sq block of gfx8 and gfx9 does not estimate register use correctly
9247 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
9248 // instructions.
9249 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9250 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9251 NumVDataDwords = (DMaskLanes + 1) / 2;
9252 else
9253 NumVDataDwords = DMaskLanes;
9254
9255 AdjustRetType = true;
9256 }
9257 }
9258
9259 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
9260 SmallVector<SDValue, 4> VAddrs;
9261
9262 // Check for 16 bit addresses or derivatives and pack if true.
9263 MVT VAddrVT =
9264 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
9265 MVT VAddrScalarVT = VAddrVT.getScalarType();
9266 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9267 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9268
9269 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
9270 VAddrScalarVT = VAddrVT.getScalarType();
9271 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9272 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9273
9274 // Push back extra arguments.
9275 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
9276 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
9277 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
9278 // Special handling of bias when A16 is on. Bias is of type half but
9279 // occupies full 32-bit.
9280 SDValue Bias = DAG.getBuildVector(
9281 MVT::v2f16, DL,
9282 {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
9283 VAddrs.push_back(Bias);
9284 } else {
9285 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
9286 "Bias needs to be converted to 16 bit in A16 mode");
9287 VAddrs.push_back(Op.getOperand(ArgOffset + I));
9288 }
9289 }
9290
9291 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
9292 // 16 bit gradients are supported, but are tied to the A16 control
9293 // so both gradients and addresses must be 16 bit
9294 LLVM_DEBUG(
9295 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
9296 "require 16 bit args for both gradients and addresses");
9297 return Op;
9298 }
9299
9300 if (IsA16) {
9301 if (!ST->hasA16()) {
9302 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
9303 "support 16 bit addresses\n");
9304 return Op;
9305 }
9306 }
9307
9308 // We've dealt with incorrect input, so we know that if IsA16 or IsG16
9309 // is set then we have to compress/pack operands (either addresses,
9310 // gradients, or both).
9311 // In the case where a16 and gradients are tied (no G16 support), we have
9312 // already verified that both IsA16 and IsG16 are true.
9313 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
9314 // Activate g16
9315 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9316 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
9317 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
9318 }
9319
9320 // Add gradients (packed or unpacked)
9321 if (IsG16) {
9322 // Pack the gradients
9323 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
9324 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
9325 ArgOffset + Intr->GradientStart,
9326 ArgOffset + Intr->CoordStart, Intr->NumGradients);
9327 } else {
9328 for (unsigned I = ArgOffset + Intr->GradientStart;
9329 I < ArgOffset + Intr->CoordStart; I++)
9330 VAddrs.push_back(Op.getOperand(I));
9331 }
9332
9333 // Add addresses (packed or unpacked)
9334 if (IsA16) {
9335 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
9336 ArgOffset + Intr->CoordStart, VAddrEnd,
9337 0 /* No gradients */);
9338 } else {
9339 // Add uncompressed address
9340 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
9341 VAddrs.push_back(Op.getOperand(I));
9342 }
9343
9344 // If the register allocator cannot place the address registers contiguously
9345 // without introducing moves, then using the non-sequential address encoding
9346 // is always preferable, since it saves VALU instructions and is usually a
9347 // wash in terms of code size or even better.
9348 //
9349 // However, we currently have no way of hinting to the register allocator that
9350 // MIMG addresses should be placed contiguously when it is possible to do so,
9351 // so force non-NSA for the common 2-address case as a heuristic.
9352 //
9353 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
9354 // allocation when possible.
9355 //
9356 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
9357 // set of the remaining addresses.
9358 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
9359 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
9360 const bool UseNSA = ST->hasNSAEncoding() &&
9361 VAddrs.size() >= ST->getNSAThreshold(MF) &&
9362 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
9363 const bool UsePartialNSA =
9364 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
9365
9366 SDValue VAddr;
9367 if (UsePartialNSA) {
9368 VAddr = getBuildDwordsVector(DAG, DL,
9369 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9370 } else if (!UseNSA) {
9371 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
9372 }
9373
9374 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
9375 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
9376 SDValue Unorm;
9377 if (!BaseOpcode->Sampler) {
9378 Unorm = True;
9379 } else {
9380 uint64_t UnormConst =
9381 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
9382
9383 Unorm = UnormConst ? True : False;
9384 }
9385
9386 SDValue TFE;
9387 SDValue LWE;
9388 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
9389 bool IsTexFail = false;
9390 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9391 return Op;
9392
9393 if (IsTexFail) {
9394 if (!DMaskLanes) {
9395 // Expecting to get an error flag since TFC is on - and dmask is 0
9396 // Force dmask to be at least 1 otherwise the instruction will fail
9397 DMask = 0x1;
9398 DMaskLanes = 1;
9399 NumVDataDwords = 1;
9400 }
9401 NumVDataDwords += 1;
9402 AdjustRetType = true;
9403 }
9404
9405 // Something earlier tagged that the return type needs adjusting.
9406 // This happens if the instruction is a load or has set TexFailCtrl flags.
9407 if (AdjustRetType) {
9408 // NumVDataDwords reflects the true number of dwords required in the return
9409 // type
9410 if (DMaskLanes == 0 && !BaseOpcode->Store) {
9411 // This is a no-op load. This can be eliminated
9412 SDValue Undef = DAG.getPOISON(Op.getValueType());
9413 if (isa<MemSDNode>(Op))
9414 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
9415 return Undef;
9416 }
9417
9418 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
9419 MVT::i32, NumVDataDwords)
9420 : MVT::i32;
9421
9422 ResultTypes[0] = NewVT;
9423 if (ResultTypes.size() == 3) {
9424 // Original result was aggregate type used for TexFailCtrl results
9425 // The actual instruction returns as a vector type which has now been
9426 // created. Remove the aggregate result.
9427 ResultTypes.erase(&ResultTypes[1]);
9428 }
9429 }
9430
9431 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
9432 if (BaseOpcode->Atomic)
9433 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
9434 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
9435 AMDGPU::CPol::VOLATILE))
9436 return Op;
9437
9439 if (BaseOpcode->Store || BaseOpcode->Atomic)
9440 Ops.push_back(VData); // vdata
9441 if (UsePartialNSA) {
9442 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
9443 Ops.push_back(VAddr);
9444 } else if (UseNSA)
9445 append_range(Ops, VAddrs);
9446 else
9447 Ops.push_back(VAddr);
9448 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
9449 EVT RsrcVT = Rsrc.getValueType();
9450 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9451 return Op;
9452 Ops.push_back(Rsrc);
9453 if (BaseOpcode->Sampler) {
9454 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
9455 if (Samp.getValueType() != MVT::v4i32)
9456 return Op;
9457 Ops.push_back(Samp);
9458 }
9459 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
9460 if (IsGFX10Plus)
9461 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
9462 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9463 Ops.push_back(Unorm);
9464 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
9465 Ops.push_back(IsA16 && // r128, a16 for gfx9
9466 ST->hasFeature(AMDGPU::FeatureR128A16)
9467 ? True
9468 : False);
9469 if (IsGFX10Plus)
9470 Ops.push_back(IsA16 ? True : False);
9471
9472 if (!Subtarget->hasGFX90AInsts())
9473 Ops.push_back(TFE); // tfe
9474 else if (TFE->getAsZExtVal()) {
9475 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9476 MF.getFunction(),
9477 "TFE is not supported on this GPU", DL.getDebugLoc()));
9478 }
9479
9480 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9481 Ops.push_back(LWE); // lwe
9482 if (!IsGFX10Plus)
9483 Ops.push_back(DimInfo->DA ? True : False);
9484 if (BaseOpcode->HasD16)
9485 Ops.push_back(IsD16 ? True : False);
9486 if (isa<MemSDNode>(Op))
9487 Ops.push_back(Op.getOperand(0)); // chain
9488
9489 int NumVAddrDwords =
9490 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
9491 int Opcode = -1;
9492
9493 if (IsGFX12Plus) {
9494 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
9495 NumVDataDwords, NumVAddrDwords);
9496 } else if (IsGFX11Plus) {
9497 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9498 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9499 : AMDGPU::MIMGEncGfx11Default,
9500 NumVDataDwords, NumVAddrDwords);
9501 } else if (IsGFX10Plus) {
9502 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9503 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9504 : AMDGPU::MIMGEncGfx10Default,
9505 NumVDataDwords, NumVAddrDwords);
9506 } else {
9507 if (Subtarget->hasGFX90AInsts()) {
9508 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
9509 NumVDataDwords, NumVAddrDwords);
9510 if (Opcode == -1) {
9511 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9512 MF.getFunction(),
9513 "requested image instruction is not supported on this GPU",
9514 DL.getDebugLoc()));
9515
9516 unsigned Idx = 0;
9517 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
9518 for (EVT VT : OrigResultTypes) {
9519 if (VT == MVT::Other)
9520 RetValues[Idx++] = Op.getOperand(0); // Chain
9521 else
9522 RetValues[Idx++] = DAG.getPOISON(VT);
9523 }
9524
9525 return DAG.getMergeValues(RetValues, DL);
9526 }
9527 }
9528 if (Opcode == -1 &&
9529 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9530 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
9531 NumVDataDwords, NumVAddrDwords);
9532 if (Opcode == -1)
9533 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
9534 NumVDataDwords, NumVAddrDwords);
9535 }
9536 if (Opcode == -1)
9537 return Op;
9538
9539 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
9540 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
9541 MachineMemOperand *MemRef = MemOp->getMemOperand();
9542 DAG.setNodeMemRefs(NewNode, {MemRef});
9543 }
9544
9545 if (BaseOpcode->AtomicX2) {
9546 SmallVector<SDValue, 1> Elt;
9547 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
9548 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
9549 }
9550 if (BaseOpcode->NoReturn)
9551 return SDValue(NewNode, 0);
9552 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
9553 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9554 NumVDataDwords, IsAtomicPacked16Bit, DL);
9555}
9556
9557SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
9558 SDValue Offset, SDValue CachePolicy,
9559 SelectionDAG &DAG) const {
9560 MachineFunction &MF = DAG.getMachineFunction();
9561
9562 const DataLayout &DataLayout = DAG.getDataLayout();
9563 Align Alignment =
9564 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
9565
9566 MachineMemOperand *MMO = MF.getMachineMemOperand(
9567 MachinePointerInfo(),
9568 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
9569 MachineMemOperand::MOInvariant,
9570 VT.getStoreSize(), Alignment);
9571
9572 if (!Offset->isDivergent()) {
9573 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
9574
9575 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
9576 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
9577 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
9578 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
9579 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9580 SDValue BufferLoad =
9581 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
9582 DAG.getVTList(MVT::i32), Ops, VT, MMO);
9583 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
9584 }
9585
9586 // Widen vec3 load to vec4.
9587 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
9588 !Subtarget->hasScalarDwordx3Loads()) {
9589 EVT WidenedVT =
9590 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
9591 auto WidenedOp = DAG.getMemIntrinsicNode(
9592 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
9593 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
9594 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
9595 DAG.getVectorIdxConstant(0, DL));
9596 return Subvector;
9597 }
9598
9599 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
9600 DAG.getVTList(VT), Ops, VT, MMO);
9601 }
9602
9603 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
9604 // assume that the buffer is unswizzled.
9605 SDValue Ops[] = {
9606 DAG.getEntryNode(), // Chain
9607 Rsrc, // rsrc
9608 DAG.getConstant(0, DL, MVT::i32), // vindex
9609 {}, // voffset
9610 {}, // soffset
9611 {}, // offset
9612 CachePolicy, // cachepolicy
9613 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9614 };
9615 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9616 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
9617 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
9618 }
9619
9620 SmallVector<SDValue, 4> Loads;
9621 unsigned NumLoads = 1;
9622 MVT LoadVT = VT.getSimpleVT();
9623 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
9624 assert((LoadVT.getScalarType() == MVT::i32 ||
9625 LoadVT.getScalarType() == MVT::f32));
9626
9627 if (NumElts == 8 || NumElts == 16) {
9628 NumLoads = NumElts / 4;
9629 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
9630 }
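// Wide results are fetched as a series of 16-byte (4-dword) buffer loads and
// concatenated back together below.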
9631
9632 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
9633
9634 // Use the alignment to ensure that the required offsets will fit into the
9635 // immediate offsets.
9636 setBufferOffsets(Offset, DAG, &Ops[3],
9637 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
9638
9639 uint64_t InstOffset = Ops[5]->getAsZExtVal();
9640 for (unsigned i = 0; i < NumLoads; ++i) {
9641 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
9642 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
9643 LoadVT, MMO, DAG));
9644 }
9645
9646 if (NumElts == 8 || NumElts == 16)
9647 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
9648
9649 return Loads[0];
9650}
9651
9652SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
9653 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
9654 if (!Subtarget->hasArchitectedSGPRs())
9655 return {};
9656 SDLoc SL(Op);
9657 MVT VT = MVT::i32;
9658 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
9659 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
9660 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
9661}
9662
9663SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
9664 AMDGPU::Hwreg::Id HwReg,
9665 unsigned LowBit,
9666 unsigned Width) const {
9667 SDLoc SL(Op);
9668 using namespace AMDGPU::Hwreg;
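// Encode the (hwreg id, first bit, width) triple into the immediate operand
// form that s_getreg_b32 expects.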
9669 return {DAG.getMachineNode(
9670 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
9671 DAG.getTargetConstant(HwregEncoding::encode(HwReg, LowBit, Width),
9672 SL, MVT::i32)),
9673 0};
9674}
9675
9676SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
9677 unsigned Dim,
9678 const ArgDescriptor &Arg) const {
9679 SDLoc SL(Op);
9680 MachineFunction &MF = DAG.getMachineFunction();
9681 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
9682 if (MaxID == 0)
9683 return DAG.getConstant(0, SL, MVT::i32);
9684
9685 // It's undefined behavior if a function marked with the amdgpu-no-*
9686 // attributes uses the corresponding intrinsic.
9687 if (!Arg)
9688 return DAG.getPOISON(Op->getValueType(0));
9689
9690 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
9691 SDLoc(DAG.getEntryNode()), Arg);
9692
9693 // Don't bother inserting AssertZext for packed IDs since we're emitting the
9694 // masking operations anyway.
9695 //
9696 // TODO: We could assert the top bit is 0 for the source copy.
9697 if (Arg.isMasked())
9698 return Val;
9699
9700 // Preserve the known bits after expansion to a copy.
9701 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
9702 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
9703 DAG.getValueType(SmallVT));
9704}
9705
9706SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
9707 SelectionDAG &DAG) const {
9708 MachineFunction &MF = DAG.getMachineFunction();
9709 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
9710
9711 EVT VT = Op.getValueType();
9712 SDLoc DL(Op);
9713 unsigned IntrinsicID = Op.getConstantOperandVal(0);
9714
9715 // TODO: Should this propagate fast-math-flags?
9716
9717 switch (IntrinsicID) {
9718 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9719 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
9720 return emitNonHSAIntrinsicError(DAG, DL, VT);
9721 return getPreloadedValue(DAG, *MFI, VT,
9722 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
9723 }
9724 case Intrinsic::amdgcn_dispatch_ptr:
9725 case Intrinsic::amdgcn_queue_ptr: {
9726 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
9727 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9728 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
9729 DL.getDebugLoc()));
9730 return DAG.getPOISON(VT);
9731 }
9732
9733 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9734 ? AMDGPUFunctionArgInfo::DISPATCH_PTR
9735 : AMDGPUFunctionArgInfo::QUEUE_PTR;
9736 return getPreloadedValue(DAG, *MFI, VT, RegID);
9737 }
9738 case Intrinsic::amdgcn_implicitarg_ptr: {
9739 if (MFI->isEntryFunction())
9740 return getImplicitArgPtr(DAG, DL);
9741 return getPreloadedValue(DAG, *MFI, VT,
9742 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
9743 }
9744 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9745 if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
9746 // This only makes sense to call in a kernel, so just lower to null.
9747 return DAG.getConstant(0, DL, VT);
9748 }
9749
9750 return getPreloadedValue(DAG, *MFI, VT,
9751 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
9752 }
9753 case Intrinsic::amdgcn_dispatch_id: {
9754 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
9755 }
9756 case Intrinsic::amdgcn_rcp:
9757 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
9758 case Intrinsic::amdgcn_rsq:
9759 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9760 case Intrinsic::amdgcn_rsq_legacy:
9761 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9762 return emitRemovedIntrinsicError(DAG, DL, VT);
9763 return SDValue();
9764 case Intrinsic::amdgcn_rcp_legacy:
9765 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9766 return emitRemovedIntrinsicError(DAG, DL, VT);
9767 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
9768 case Intrinsic::amdgcn_rsq_clamp: {
9769 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9770 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
9771
9772 Type *Type = VT.getTypeForEVT(*DAG.getContext());
9773 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
9774 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
9775
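// Clamp the rsq result into the finite range of the type: take the minimum
// with +largest, then the maximum with -largest.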
9776 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9777 SDValue Tmp =
9778 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
9779 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
9780 DAG.getConstantFP(Min, DL, VT));
9781 }
9782 case Intrinsic::r600_read_ngroups_x:
9783 if (Subtarget->isAmdHsaOS())
9784 return emitNonHSAIntrinsicError(DAG, DL, VT);
9785
9786 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9788 false);
9789 case Intrinsic::r600_read_ngroups_y:
9790 if (Subtarget->isAmdHsaOS())
9791 return emitNonHSAIntrinsicError(DAG, DL, VT);
9792
9793 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9795 false);
9796 case Intrinsic::r600_read_ngroups_z:
9797 if (Subtarget->isAmdHsaOS())
9798 return emitNonHSAIntrinsicError(DAG, DL, VT);
9799
9800 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9802 false);
9803 case Intrinsic::r600_read_local_size_x:
9804 if (Subtarget->isAmdHsaOS())
9805 return emitNonHSAIntrinsicError(DAG, DL, VT);
9806
9807 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9809 case Intrinsic::r600_read_local_size_y:
9810 if (Subtarget->isAmdHsaOS())
9811 return emitNonHSAIntrinsicError(DAG, DL, VT);
9812
9813 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9815 case Intrinsic::r600_read_local_size_z:
9816 if (Subtarget->isAmdHsaOS())
9817 return emitNonHSAIntrinsicError(DAG, DL, VT);
9818
9819 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9821 case Intrinsic::amdgcn_workgroup_id_x:
9822 return lowerWorkGroupId(DAG, *MFI, VT,
9826 case Intrinsic::amdgcn_workgroup_id_y:
9827 return lowerWorkGroupId(DAG, *MFI, VT,
9831 case Intrinsic::amdgcn_workgroup_id_z:
9832 return lowerWorkGroupId(DAG, *MFI, VT,
9836 case Intrinsic::amdgcn_cluster_id_x:
9837 return Subtarget->hasClusters()
9838 ? getPreloadedValue(DAG, *MFI, VT,
9840 : DAG.getPOISON(VT);
9841 case Intrinsic::amdgcn_cluster_id_y:
9842 return Subtarget->hasClusters()
9843 ? getPreloadedValue(DAG, *MFI, VT,
9845 : DAG.getPOISON(VT);
9846 case Intrinsic::amdgcn_cluster_id_z:
9847 return Subtarget->hasClusters()
9848 ? getPreloadedValue(DAG, *MFI, VT,
9850 : DAG.getPOISON(VT);
9851 case Intrinsic::amdgcn_cluster_workgroup_id_x:
9852 return Subtarget->hasClusters()
9853 ? getPreloadedValue(
9854 DAG, *MFI, VT,
9856 : DAG.getPOISON(VT);
9857 case Intrinsic::amdgcn_cluster_workgroup_id_y:
9858 return Subtarget->hasClusters()
9859 ? getPreloadedValue(
9860 DAG, *MFI, VT,
9862 : DAG.getPOISON(VT);
9863 case Intrinsic::amdgcn_cluster_workgroup_id_z:
9864 return Subtarget->hasClusters()
9865 ? getPreloadedValue(
9866 DAG, *MFI, VT,
9868 : DAG.getPOISON(VT);
9869 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
9870 return Subtarget->hasClusters()
9871 ? lowerConstHwRegRead(DAG, Op, AMDGPU::Hwreg::ID_IB_STS2, 21, 4)
9872 : SDValue();
9873 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
9874 return Subtarget->hasClusters()
9875 ? getPreloadedValue(
9876 DAG, *MFI, VT,
9878 : DAG.getPOISON(VT);
9879 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
9880 return Subtarget->hasClusters()
9881 ? getPreloadedValue(
9882 DAG, *MFI, VT,
9884 : DAG.getPOISON(VT);
9885 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
9886 return Subtarget->hasClusters()
9887 ? getPreloadedValue(
9888 DAG, *MFI, VT,
9890 : DAG.getPOISON(VT);
9891 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
9892 return Subtarget->hasClusters()
9893 ? getPreloadedValue(
9894 DAG, *MFI, VT,
9896 : DAG.getPOISON(VT);
9897 case Intrinsic::amdgcn_wave_id:
9898 return lowerWaveID(DAG, Op);
9899 case Intrinsic::amdgcn_lds_kernel_id: {
9900 if (MFI->isEntryFunction())
9901 return getLDSKernelId(DAG, DL);
9902 return getPreloadedValue(DAG, *MFI, VT,
9903 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
9904 }
9905 case Intrinsic::amdgcn_workitem_id_x:
9906 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
9907 case Intrinsic::amdgcn_workitem_id_y:
9908 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
9909 case Intrinsic::amdgcn_workitem_id_z:
9910 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
9911 case Intrinsic::amdgcn_wavefrontsize:
9912 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
9913 SDLoc(Op), MVT::i32);
9914 case Intrinsic::amdgcn_s_buffer_load: {
9915 unsigned CPol = Op.getConstantOperandVal(3);
9916 // s_buffer_load, because of how it's optimized, can't be volatile
9917 // so reject ones with the volatile bit set.
9918 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
9919 ? AMDGPU::CPol::ALL
9920 : AMDGPU::CPol::ALL_pregfx12))
9921 return Op;
9922 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
9923 Op.getOperand(3), DAG);
9924 }
9925 case Intrinsic::amdgcn_fdiv_fast:
9926 return lowerFDIV_FAST(Op, DAG);
9927 case Intrinsic::amdgcn_sin:
9928 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
9929
9930 case Intrinsic::amdgcn_cos:
9931 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
9932
9933 case Intrinsic::amdgcn_mul_u24:
9934 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
9935 Op.getOperand(2));
9936 case Intrinsic::amdgcn_mul_i24:
9937 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
9938 Op.getOperand(2));
9939
9940 case Intrinsic::amdgcn_log_clamp: {
9941 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9942 return SDValue();
9943
9944 return emitRemovedIntrinsicError(DAG, DL, VT);
9945 }
9946 case Intrinsic::amdgcn_fract:
9947 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
9948
9949 case Intrinsic::amdgcn_class:
9950 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
9951 Op.getOperand(2));
9952 case Intrinsic::amdgcn_div_fmas:
9953 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
9954 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9955
9956 case Intrinsic::amdgcn_div_fixup:
9957 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
9958 Op.getOperand(2), Op.getOperand(3));
9959
9960 case Intrinsic::amdgcn_div_scale: {
9961 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
9962
9963 // Translate to the operands expected by the machine instruction. The first
9964 // operand must match the value selected by the intrinsic's third parameter.
9965 SDValue Numerator = Op.getOperand(1);
9966 SDValue Denominator = Op.getOperand(2);
9967
9968 // Note this order is opposite of the machine instruction's operations,
9969 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
9970 // intrinsic has the numerator as the first operand to match a normal
9971 // division operation.
9972
9973 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
9974
9975 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
9976 Denominator, Numerator);
9977 }
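// Illustrative example of the operand swap above: for
//   llvm.amdgcn.div.scale(a, b, true)
// Param is all-ones, so Src0 is the numerator and the node is built as
// DIV_SCALE(a, b, a); with a false third operand it becomes DIV_SCALE(b, b, a).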
9978 case Intrinsic::amdgcn_icmp: {
9979 // There is a Pat that handles this variant, so return it as-is.
9980 if (Op.getOperand(1).getValueType() == MVT::i1 &&
9981 Op.getConstantOperandVal(2) == 0 &&
9982 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
9983 return Op;
9984 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
9985 }
9986 case Intrinsic::amdgcn_fcmp: {
9987 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
9988 }
9989 case Intrinsic::amdgcn_ballot:
9990 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
9991 case Intrinsic::amdgcn_fmed3:
9992 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
9993 Op.getOperand(2), Op.getOperand(3));
9994 case Intrinsic::amdgcn_fdot2:
9995 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
9996 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9997 case Intrinsic::amdgcn_fmul_legacy:
9998 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
9999 Op.getOperand(2));
10000 case Intrinsic::amdgcn_sffbh:
10001 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
10002 case Intrinsic::amdgcn_sbfe:
10003 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
10004 Op.getOperand(2), Op.getOperand(3));
10005 case Intrinsic::amdgcn_ubfe:
10006 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
10007 Op.getOperand(2), Op.getOperand(3));
10008 case Intrinsic::amdgcn_cvt_pkrtz:
10009 case Intrinsic::amdgcn_cvt_pknorm_i16:
10010 case Intrinsic::amdgcn_cvt_pknorm_u16:
10011 case Intrinsic::amdgcn_cvt_pk_i16:
10012 case Intrinsic::amdgcn_cvt_pk_u16: {
10013 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
10014 EVT VT = Op.getValueType();
10015 unsigned Opcode;
10016
10017 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10018 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
10019 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10020 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
10021 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10022 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
10023 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10024 Opcode = AMDGPUISD::CVT_PK_I16_I32;
10025 else
10026 Opcode = AMDGPUISD::CVT_PK_U16_U32;
10027
10028 if (isTypeLegal(VT))
10029 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
10030
10031 SDValue Node =
10032 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
10033 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
10034 }
10035 case Intrinsic::amdgcn_fmad_ftz:
10036 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
10037 Op.getOperand(2), Op.getOperand(3));
10038
10039 case Intrinsic::amdgcn_if_break:
10040 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
10041 Op->getOperand(1), Op->getOperand(2)),
10042 0);
10043
10044 case Intrinsic::amdgcn_groupstaticsize: {
10045 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
10046 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
10047 return Op;
10048
10049 const Module *M = MF.getFunction().getParent();
10050 const GlobalValue *GV =
10051 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
10052 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
10053 SIInstrInfo::MO_ABS32_LO);
10054 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10055 }
10056 case Intrinsic::amdgcn_is_shared:
10057 case Intrinsic::amdgcn_is_private: {
10058 SDLoc SL(Op);
10059 SDValue SrcVec =
10060 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10061 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
10062 DAG.getConstant(1, SL, MVT::i32));
10063
10064 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10065 ? AMDGPUAS::LOCAL_ADDRESS
10066 : AMDGPUAS::PRIVATE_ADDRESS;
10067 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
10068 Subtarget->hasGloballyAddressableScratch()) {
10069 SDValue FlatScratchBaseHi(
10070 DAG.getMachineNode(
10071 AMDGPU::S_MOV_B32, DL, MVT::i32,
10072 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10073 0);
10074 // Test bits 63..58 against the aperture address.
10075 return DAG.getSetCC(
10076 SL, MVT::i1,
10077 DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
10078 DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
10079 }
10080
10081 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10082 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
10083 }
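// Illustrative example: both intrinsics reduce to a check on the high 32 bits
// of the flat pointer. In the globally addressable scratch path,
//   (SrcHi ^ FlatScratchBaseHi) u< (1 << 26)
// holds exactly when the top 6 bits (bits 63..58 of the full address) match
// the scratch base; otherwise the high dword is compared for equality with
// the aperture returned by getSegmentAperture().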
10084 case Intrinsic::amdgcn_perm:
10085 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
10086 Op.getOperand(2), Op.getOperand(3));
10087 case Intrinsic::amdgcn_reloc_constant: {
10088 Module *M = MF.getFunction().getParent();
10089 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
10090 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
10091 auto *RelocSymbol = cast<GlobalVariable>(
10092 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
10093 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
10094 SIInstrInfo::MO_ABS32_LO);
10095 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10096 }
10097 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10098 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10099 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10100 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10101 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10102 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10103 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10104 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10105 if (Op.getOperand(4).getValueType() == MVT::i32)
10106 return SDValue();
10107
10108 SDLoc SL(Op);
10109 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
10110 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10111 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10112 Op.getOperand(3), IndexKeyi32);
10113 }
10114 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10115 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10116 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10117 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10118 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10119 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10120 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10121 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10122 if (Op.getOperand(4).getValueType() == MVT::i64)
10123 return SDValue();
10124
10125 SDLoc SL(Op);
10126 auto IndexKeyi64 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
10127 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10128 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10129 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10130 Op.getOperand(6)});
10131 }
10132 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10133 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10134 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10135 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10136 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10137 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10138 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10139 ? MVT::i64
10140 : MVT::i32;
10141 if (Op.getOperand(6).getValueType() == IndexKeyTy)
10142 return SDValue();
10143
10144 SDLoc SL(Op);
10145 auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
10146 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10147 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10148 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10149 IndexKey, Op.getOperand(7),
10150 Op.getOperand(8)}); // No clamp operand
10151 }
10152 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10153 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10154 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10155 if (Op.getOperand(6).getValueType() == MVT::i32)
10156 return SDValue();
10157
10158 SDLoc SL(Op);
10159 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
10160 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10161 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10162 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10163 IndexKeyi32, Op.getOperand(7)});
10164 }
10165 case Intrinsic::amdgcn_addrspacecast_nonnull:
10166 return lowerADDRSPACECAST(Op, DAG);
10167 case Intrinsic::amdgcn_readlane:
10168 case Intrinsic::amdgcn_readfirstlane:
10169 case Intrinsic::amdgcn_writelane:
10170 case Intrinsic::amdgcn_permlane16:
10171 case Intrinsic::amdgcn_permlanex16:
10172 case Intrinsic::amdgcn_permlane64:
10173 case Intrinsic::amdgcn_set_inactive:
10174 case Intrinsic::amdgcn_set_inactive_chain_arg:
10175 case Intrinsic::amdgcn_mov_dpp8:
10176 case Intrinsic::amdgcn_update_dpp:
10177 return lowerLaneOp(*this, Op.getNode(), DAG);
10178 case Intrinsic::amdgcn_dead: {
10179 SmallVector<SDValue, 8> Poisons;
10180 for (const EVT ValTy : Op.getNode()->values())
10181 Poisons.push_back(DAG.getPOISON(ValTy));
10182 return DAG.getMergeValues(Poisons, SDLoc(Op));
10183 }
10184 default:
10185 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10186 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
10187 return lowerImage(Op, ImageDimIntr, DAG, false);
10188
10189 return Op;
10190 }
10191}
10192
10193 // On targets that do not support a constant in the soffset field, turn a
10194 // zero soffset into SGPR_NULL to avoid generating an extra s_mov of zero.
10195 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
10196 const GCNSubtarget *Subtarget) {
10197 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
10198 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10199 return SOffset;
10200}
10201
10202SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
10203 SelectionDAG &DAG,
10204 unsigned NewOpcode) const {
10205 SDLoc DL(Op);
10206
10207 SDValue VData = Op.getOperand(2);
10208 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10209 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10210 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10211 SDValue Ops[] = {
10212 Op.getOperand(0), // Chain
10213 VData, // vdata
10214 Rsrc, // rsrc
10215 DAG.getConstant(0, DL, MVT::i32), // vindex
10216 VOffset, // voffset
10217 SOffset, // soffset
10218 Offset, // offset
10219 Op.getOperand(6), // cachepolicy
10220 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10221 };
10222
10223 auto *M = cast<MemSDNode>(Op);
10224
10225 EVT MemVT = VData.getValueType();
10226 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10227 M->getMemOperand());
10228}
10229
10230SDValue
10231SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
10232 unsigned NewOpcode) const {
10233 SDLoc DL(Op);
10234
10235 SDValue VData = Op.getOperand(2);
10236 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10237 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10238 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10239 SDValue Ops[] = {
10240 Op.getOperand(0), // Chain
10241 VData, // vdata
10242 Rsrc, // rsrc
10243 Op.getOperand(4), // vindex
10244 VOffset, // voffset
10245 SOffset, // soffset
10246 Offset, // offset
10247 Op.getOperand(7), // cachepolicy
10248 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10249 };
10250
10251 auto *M = cast<MemSDNode>(Op);
10252
10253 EVT MemVT = VData.getValueType();
10254 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10255 M->getMemOperand());
10256}
10257
10258SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
10259 SelectionDAG &DAG) const {
10260 unsigned IntrID = Op.getConstantOperandVal(1);
10261 SDLoc DL(Op);
10262
10263 switch (IntrID) {
10264 case Intrinsic::amdgcn_ds_ordered_add:
10265 case Intrinsic::amdgcn_ds_ordered_swap: {
10266 MemSDNode *M = cast<MemSDNode>(Op);
10267 SDValue Chain = M->getOperand(0);
10268 SDValue M0 = M->getOperand(2);
10269 SDValue Value = M->getOperand(3);
10270 unsigned IndexOperand = M->getConstantOperandVal(7);
10271 unsigned WaveRelease = M->getConstantOperandVal(8);
10272 unsigned WaveDone = M->getConstantOperandVal(9);
10273
10274 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10275 IndexOperand &= ~0x3f;
10276 unsigned CountDw = 0;
10277
10278 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
10279 CountDw = (IndexOperand >> 24) & 0xf;
10280 IndexOperand &= ~(0xf << 24);
10281
10282 if (CountDw < 1 || CountDw > 4) {
10283 const Function &Fn = DAG.getMachineFunction().getFunction();
10284 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10285 Fn, "ds_ordered_count: dword count must be between 1 and 4",
10286 DL.getDebugLoc()));
10287 CountDw = 1;
10288 }
10289 }
10290
10291 if (IndexOperand) {
10292 const Function &Fn = DAG.getMachineFunction().getFunction();
10293 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10294 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
10295 }
10296
10297 if (WaveDone && !WaveRelease) {
10298 // TODO: Move this to IR verifier
10299 const Function &Fn = DAG.getMachineFunction().getFunction();
10300 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10301 Fn, "ds_ordered_count: wave_done requires wave_release",
10302 DL.getDebugLoc()));
10303 }
10304
10305 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10306 unsigned ShaderType =
10307 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
10308 unsigned Offset0 = OrderedCountIndex << 2;
10309 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10310
10311 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
10312 Offset1 |= (CountDw - 1) << 6;
10313
10314 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
10315 Offset1 |= ShaderType << 2;
10316
10317 unsigned Offset = Offset0 | (Offset1 << 8);
10318
10319 SDValue Ops[] = {
10320 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
10321 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
10322 };
10323 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
10324 M->getVTList(), Ops, M->getMemoryVT(),
10325 M->getMemOperand());
10326 }
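// Worked example for the offset encoding above (assuming a shader type value
// of 0 and a pre-GFX10 target, so no dword-count field): for ds_ordered_add
// with OrderedCountIndex = 1, WaveRelease = 1, WaveDone = 0:
//   Offset0 = 1 << 2 = 4
//   Offset1 = 1 | (0 << 1) | (0 << 4) = 1
//   Offset  = 4 | (1 << 8) = 0x104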
10327 case Intrinsic::amdgcn_raw_buffer_load:
10328 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10329 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10330 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10331 case Intrinsic::amdgcn_raw_buffer_load_format:
10332 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10333 const bool IsFormat =
10334 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10335 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10336
10337 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10338 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10339 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10340 SDValue Ops[] = {
10341 Op.getOperand(0), // Chain
10342 Rsrc, // rsrc
10343 DAG.getConstant(0, DL, MVT::i32), // vindex
10344 VOffset, // voffset
10345 SOffset, // soffset
10346 Offset, // offset
10347 Op.getOperand(5), // cachepolicy, swizzled buffer
10348 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10349 };
10350
10351 auto *M = cast<MemSDNode>(Op);
10352 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
10353 }
10354 case Intrinsic::amdgcn_struct_buffer_load:
10355 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10356 case Intrinsic::amdgcn_struct_buffer_load_format:
10357 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10358 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10359 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10360 const bool IsFormat =
10361 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10362 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10363
10364 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10365 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10366 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10367 SDValue Ops[] = {
10368 Op.getOperand(0), // Chain
10369 Rsrc, // rsrc
10370 Op.getOperand(3), // vindex
10371 VOffset, // voffset
10372 SOffset, // soffset
10373 Offset, // offset
10374 Op.getOperand(6), // cachepolicy, swizzled buffer
10375 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10376 };
10377
10378 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
10379 }
10380 case Intrinsic::amdgcn_raw_tbuffer_load:
10381 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10382 MemSDNode *M = cast<MemSDNode>(Op);
10383 EVT LoadVT = Op.getValueType();
10384 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10385 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10386 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10387
10388 SDValue Ops[] = {
10389 Op.getOperand(0), // Chain
10390 Rsrc, // rsrc
10391 DAG.getConstant(0, DL, MVT::i32), // vindex
10392 VOffset, // voffset
10393 SOffset, // soffset
10394 Offset, // offset
10395 Op.getOperand(5), // format
10396 Op.getOperand(6), // cachepolicy, swizzled buffer
10397 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10398 };
10399
10400 if (LoadVT.getScalarType() == MVT::f16)
10401 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10402 Ops);
10403 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10404 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10405 DAG);
10406 }
10407 case Intrinsic::amdgcn_struct_tbuffer_load:
10408 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10409 MemSDNode *M = cast<MemSDNode>(Op);
10410 EVT LoadVT = Op.getValueType();
10411 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10412 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10413 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10414
10415 SDValue Ops[] = {
10416 Op.getOperand(0), // Chain
10417 Rsrc, // rsrc
10418 Op.getOperand(3), // vindex
10419 VOffset, // voffset
10420 SOffset, // soffset
10421 Offset, // offset
10422 Op.getOperand(6), // format
10423 Op.getOperand(7), // cachepolicy, swizzled buffer
10424 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10425 };
10426
10427 if (LoadVT.getScalarType() == MVT::f16)
10428 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10429 Ops);
10430 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10431 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10432 DAG);
10433 }
10434 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10435 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10436 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
10437 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10438 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10439 return lowerStructBufferAtomicIntrin(Op, DAG,
10440 AMDGPUISD::BUFFER_ATOMIC_FADD);
10441 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10442 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10443 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
10444 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10445 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10446 return lowerStructBufferAtomicIntrin(Op, DAG,
10447 AMDGPUISD::BUFFER_ATOMIC_FMIN);
10448 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10449 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10450 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
10451 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10452 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10453 return lowerStructBufferAtomicIntrin(Op, DAG,
10454 AMDGPUISD::BUFFER_ATOMIC_FMAX);
10455 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10456 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10457 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
10458 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10459 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10460 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10461 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10462 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10463 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10464 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10465 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10466 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
10467 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10468 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10469 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
10470 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10471 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10472 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
10473 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10474 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10475 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
10476 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10477 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10478 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10479 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10480 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10481 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10482 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10483 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10484 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10485 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10486 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10487 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10488 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10489 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10490 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10491 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10492 return lowerRawBufferAtomicIntrin(Op, DAG,
10493 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10494 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10495 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10496 return lowerStructBufferAtomicIntrin(Op, DAG,
10497 AMDGPUISD::BUFFER_ATOMIC_SWAP);
10498 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10499 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10500 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10501 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10502 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10503 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10504 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10505 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10506 return lowerStructBufferAtomicIntrin(Op, DAG,
10507 AMDGPUISD::BUFFER_ATOMIC_SMIN);
10508 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10509 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10510 return lowerStructBufferAtomicIntrin(Op, DAG,
10511 AMDGPUISD::BUFFER_ATOMIC_UMIN);
10512 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10513 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10514 return lowerStructBufferAtomicIntrin(Op, DAG,
10515 AMDGPUISD::BUFFER_ATOMIC_SMAX);
10516 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10517 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10518 return lowerStructBufferAtomicIntrin(Op, DAG,
10519 AMDGPUISD::BUFFER_ATOMIC_UMAX);
10520 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10521 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10522 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10523 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10524 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10525 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10526 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10527 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10528 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10529 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10530 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10531 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10532 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10533 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10534 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10535 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10536 return lowerStructBufferAtomicIntrin(Op, DAG,
10537 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10538
10539 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10540 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10541 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
10542 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10543 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10544 SDValue Ops[] = {
10545 Op.getOperand(0), // Chain
10546 Op.getOperand(2), // src
10547 Op.getOperand(3), // cmp
10548 Rsrc, // rsrc
10549 DAG.getConstant(0, DL, MVT::i32), // vindex
10550 VOffset, // voffset
10551 SOffset, // soffset
10552 Offset, // offset
10553 Op.getOperand(7), // cachepolicy
10554 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10555 };
10556 EVT VT = Op.getValueType();
10557 auto *M = cast<MemSDNode>(Op);
10558
10559 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10560 Op->getVTList(), Ops, VT,
10561 M->getMemOperand());
10562 }
10563 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10564 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10565 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
10566 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
10567 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
10568 SDValue Ops[] = {
10569 Op.getOperand(0), // Chain
10570 Op.getOperand(2), // src
10571 Op.getOperand(3), // cmp
10572 Rsrc, // rsrc
10573 Op.getOperand(5), // vindex
10574 VOffset, // voffset
10575 SOffset, // soffset
10576 Offset, // offset
10577 Op.getOperand(8), // cachepolicy
10578 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10579 };
10580 EVT VT = Op.getValueType();
10581 auto *M = cast<MemSDNode>(Op);
10582
10583 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10584 Op->getVTList(), Ops, VT,
10585 M->getMemOperand());
10586 }
10587 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10588 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10589 MemSDNode *M = cast<MemSDNode>(Op);
10590 SDValue NodePtr = M->getOperand(2);
10591 SDValue RayExtent = M->getOperand(3);
10592 SDValue InstanceMask = M->getOperand(4);
10593 SDValue RayOrigin = M->getOperand(5);
10594 SDValue RayDir = M->getOperand(6);
10595 SDValue Offsets = M->getOperand(7);
10596 SDValue TDescr = M->getOperand(8);
10597
10598 assert(NodePtr.getValueType() == MVT::i64);
10599 assert(RayDir.getValueType() == MVT::v3f32);
10600
10601 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10602 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10603 return SDValue();
10604 }
10605
10606 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10607 const unsigned NumVDataDwords = 10;
10608 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10609 int Opcode = AMDGPU::getMIMGOpcode(
10610 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10611 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10612 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10613 assert(Opcode != -1);
10614
10616 Ops.push_back(NodePtr);
10617 Ops.push_back(DAG.getBuildVector(
10618 MVT::v2i32, DL,
10619 {DAG.getBitcast(MVT::i32, RayExtent),
10620 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10621 Ops.push_back(RayOrigin);
10622 Ops.push_back(RayDir);
10623 Ops.push_back(Offsets);
10624 Ops.push_back(TDescr);
10625 Ops.push_back(M->getChain());
10626
10627 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10628 MachineMemOperand *MemRef = M->getMemOperand();
10629 DAG.setNodeMemRefs(NewNode, {MemRef});
10630 return SDValue(NewNode, 0);
10631 }
10632 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10633 MemSDNode *M = cast<MemSDNode>(Op);
10634 SDValue NodePtr = M->getOperand(2);
10635 SDValue RayExtent = M->getOperand(3);
10636 SDValue RayOrigin = M->getOperand(4);
10637 SDValue RayDir = M->getOperand(5);
10638 SDValue RayInvDir = M->getOperand(6);
10639 SDValue TDescr = M->getOperand(7);
10640
10641 assert(NodePtr.getValueType() == MVT::i32 ||
10642 NodePtr.getValueType() == MVT::i64);
10643 assert(RayDir.getValueType() == MVT::v3f16 ||
10644 RayDir.getValueType() == MVT::v3f32);
10645
10646 if (!Subtarget->hasGFX10_AEncoding()) {
10647 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10648 return SDValue();
10649 }
10650
10651 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
10652 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
10653 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10654 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
10655 const bool Is64 = NodePtr.getValueType() == MVT::i64;
10656 const unsigned NumVDataDwords = 4;
10657 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10658 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10659 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10660 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
10661 IsGFX12Plus;
10662 const unsigned BaseOpcodes[2][2] = {
10663 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10664 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10665 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10666 int Opcode;
10667 if (UseNSA) {
10668 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10669 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10670 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10671 : AMDGPU::MIMGEncGfx10NSA,
10672 NumVDataDwords, NumVAddrDwords);
10673 } else {
10674 assert(!IsGFX12Plus);
10675 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10676 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10677 : AMDGPU::MIMGEncGfx10Default,
10678 NumVDataDwords, NumVAddrDwords);
10679 }
10680 assert(Opcode != -1);
10681
10683
10684 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
10685 SmallVector<SDValue, 3> Lanes;
10686 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
10687 if (Lanes[0].getValueSizeInBits() == 32) {
10688 for (unsigned I = 0; I < 3; ++I)
10689 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
10690 } else {
10691 if (IsAligned) {
10692 Ops.push_back(DAG.getBitcast(
10693 MVT::i32,
10694 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
10695 Ops.push_back(Lanes[2]);
10696 } else {
10697 SDValue Elt0 = Ops.pop_back_val();
10698 Ops.push_back(DAG.getBitcast(
10699 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
10700 Ops.push_back(DAG.getBitcast(
10701 MVT::i32,
10702 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
10703 }
10704 }
10705 };
10706
10707 if (UseNSA && IsGFX11Plus) {
10708 Ops.push_back(NodePtr);
10709 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10710 Ops.push_back(RayOrigin);
10711 if (IsA16) {
10712 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
10713 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
10714 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
10715 for (unsigned I = 0; I < 3; ++I) {
10716 MergedLanes.push_back(DAG.getBitcast(
10717 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
10718 {DirLanes[I], InvDirLanes[I]})));
10719 }
10720 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
10721 } else {
10722 Ops.push_back(RayDir);
10723 Ops.push_back(RayInvDir);
10724 }
10725 } else {
10726 if (Is64)
10727 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
10728 2);
10729 else
10730 Ops.push_back(NodePtr);
10731
10732 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10733 packLanes(RayOrigin, true);
10734 packLanes(RayDir, true);
10735 packLanes(RayInvDir, false);
10736 }
10737
10738 if (!UseNSA) {
10739 // Build a single vector containing all the operands so far prepared.
10740 if (NumVAddrDwords > 12) {
10741 SDValue Undef = DAG.getPOISON(MVT::i32);
10742 Ops.append(16 - Ops.size(), Undef);
10743 }
10744 assert(Ops.size() >= 8 && Ops.size() <= 12);
10745 SDValue MergedOps =
10746 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
10747 Ops.clear();
10748 Ops.push_back(MergedOps);
10749 }
10750
10751 Ops.push_back(TDescr);
10752 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
10753 Ops.push_back(M->getChain());
10754
10755 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10756 MachineMemOperand *MemRef = M->getMemOperand();
10757 DAG.setNodeMemRefs(NewNode, {MemRef});
10758 return SDValue(NewNode, 0);
10759 }
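// Illustrative note: with an NSA encoding the address operands above are
// passed as separate operands, while without NSA they are merged into a
// single build_vector (padded out to 16 dwords when more than 12 address
// dwords are needed). For an a16 ray with a 64-bit node pointer this yields
// 9 address dwords: 2 (node) + 1 (extent) + 3 (origin) + 3 (packed f16
// dir/inv_dir).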
10760 case Intrinsic::amdgcn_global_atomic_fmin_num:
10761 case Intrinsic::amdgcn_global_atomic_fmax_num:
10762 case Intrinsic::amdgcn_flat_atomic_fmin_num:
10763 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10764 MemSDNode *M = cast<MemSDNode>(Op);
10765 SDValue Ops[] = {
10766 M->getOperand(0), // Chain
10767 M->getOperand(2), // Ptr
10768 M->getOperand(3) // Value
10769 };
10770 unsigned Opcode = 0;
10771 switch (IntrID) {
10772 case Intrinsic::amdgcn_global_atomic_fmin_num:
10773 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10774 Opcode = ISD::ATOMIC_LOAD_FMIN;
10775 break;
10776 }
10777 case Intrinsic::amdgcn_global_atomic_fmax_num:
10778 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10779 Opcode = ISD::ATOMIC_LOAD_FMAX;
10780 break;
10781 }
10782 default:
10783 llvm_unreachable("unhandled atomic opcode");
10784 }
10785 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
10786 Ops, M->getMemOperand());
10787 }
10788 case Intrinsic::amdgcn_s_get_barrier_state:
10789 case Intrinsic::amdgcn_s_get_named_barrier_state: {
10790 SDValue Chain = Op->getOperand(0);
10792 unsigned Opc;
10793
10794 if (isa<ConstantSDNode>(Op->getOperand(2))) {
10795 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
10796 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10797 BarID = (BarID >> 4) & 0x3F;
10798 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10799 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10800 Ops.push_back(K);
10801 Ops.push_back(Chain);
10802 } else {
10803 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10804 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
10805 SDValue M0Val;
10806 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
10807 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10808 M0Val = SDValue(
10809 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10810 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10811 0);
10812 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10813 } else
10814 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
10815 }
10816
10817 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10818 return SDValue(NewMI, 0);
10819 }
10820 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
10821 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
10822 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
10823 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
10824 SDValue Chain = Op->getOperand(0);
10825 SDValue Ptr = Op->getOperand(2);
10826 EVT VT = Op->getValueType(0);
10827 return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
10828 Chain, Ptr, MII->getMemOperand());
10829 }
10830 default:
10831
10832 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10833 AMDGPU::getImageDimIntrinsicInfo(IntrID))
10834 return lowerImage(Op, ImageDimIntr, DAG, true);
10835
10836 return SDValue();
10837 }
10838}
10839
10840// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
10841// dwordx4 if on SI and handle TFE loads.
10842SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
10843 SDVTList VTList,
10844 ArrayRef<SDValue> Ops, EVT MemVT,
10845 MachineMemOperand *MMO,
10846 SelectionDAG &DAG) const {
10847 LLVMContext &C = *DAG.getContext();
10848 MachineFunction &MF = DAG.getMachineFunction();
10849 EVT VT = VTList.VTs[0];
10850
10851 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
10852 bool IsTFE = VTList.NumVTs == 3;
10853 if (IsTFE) {
10854 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
10855 unsigned NumOpDWords = NumValueDWords + 1;
10856 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
10857 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
10858 MachineMemOperand *OpDWordsMMO =
10859 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
10860 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
10861 OpDWordsVT, OpDWordsMMO, DAG);
10862 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10863 DAG.getVectorIdxConstant(NumValueDWords, DL));
10864 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
10865 SDValue ValueDWords =
10866 NumValueDWords == 1
10867 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
10868 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
10869 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
10870 ZeroIdx);
10871 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
10872 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10873 }
10874
10875 if (!Subtarget->hasDwordx3LoadStores() &&
10876 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10877 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
10878 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
10879 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
10880 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
10881 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
10882 WidenedMemVT, WidenedMMO);
10883 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
10884 DAG.getVectorIdxConstant(0, DL));
10885 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
10886 }
10887
10888 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
10889}
10890
10891SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
10892 bool ImageStore) const {
10893 EVT StoreVT = VData.getValueType();
10894
10895 // No change for f16 and legal vector D16 types.
10896 if (!StoreVT.isVector())
10897 return VData;
10898
10899 SDLoc DL(VData);
10900 unsigned NumElements = StoreVT.getVectorNumElements();
10901
10902 if (Subtarget->hasUnpackedD16VMem()) {
10903 // We need to unpack the packed data to store.
10904 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10905 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10906
10907 EVT EquivStoreVT =
10908 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
10909 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
10910 return DAG.UnrollVectorOp(ZExt.getNode());
10911 }
10912
10913 // The sq block of gfx8.1 does not estimate register use correctly for d16
10914 // image store instructions. The data operand is computed as if it were not a
10915 // d16 image instruction.
10916 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
10917 // Bitcast to i16
10918 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10919 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10920
10921 // Decompose into scalars
10922 SmallVector<SDValue, 4> Elts;
10923 DAG.ExtractVectorElements(IntVData, Elts);
10924
10925 // Group pairs of i16 into v2i16 and bitcast to i32
10926 SmallVector<SDValue, 4> PackedElts;
10927 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
10928 SDValue Pair =
10929 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
10930 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10931 PackedElts.push_back(IntPair);
10932 }
10933 if ((NumElements % 2) == 1) {
10934 // Handle v3i16
10935 unsigned I = Elts.size() / 2;
10936 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
10937 {Elts[I * 2], DAG.getPOISON(MVT::i16)});
10938 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10939 PackedElts.push_back(IntPair);
10940 }
10941
10942 // Pad with poison values.
10943 PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));
10944
10945 // Build final vector
10946 EVT VecVT =
10947 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
10948 return DAG.getBuildVector(VecVT, DL, PackedElts);
10949 }
10950
10951 if (NumElements == 3) {
10952 EVT IntStoreVT =
10953 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
10954 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10955
10956 EVT WidenedStoreVT = EVT::getVectorVT(
10957 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
10958 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
10959 WidenedStoreVT.getStoreSizeInBits());
10960 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
10961 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
10962 }
10963
10964 assert(isTypeLegal(StoreVT));
10965 return VData;
10966}
10967
10968SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
10969 SelectionDAG &DAG) const {
10970 SDLoc DL(Op);
10971 SDValue Chain = Op.getOperand(0);
10972 unsigned IntrinsicID = Op.getConstantOperandVal(1);
10973 MachineFunction &MF = DAG.getMachineFunction();
10974
10975 switch (IntrinsicID) {
10976 case Intrinsic::amdgcn_exp_compr: {
10977 if (!Subtarget->hasCompressedExport()) {
10978 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10980 "intrinsic not supported on subtarget", DL.getDebugLoc()));
10981 }
10982 SDValue Src0 = Op.getOperand(4);
10983 SDValue Src1 = Op.getOperand(5);
10984 // Hack around illegal type on SI by directly selecting it.
10985 if (isTypeLegal(Src0.getValueType()))
10986 return SDValue();
10987
10988 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
10989 SDValue Undef = DAG.getPOISON(MVT::f32);
10990 const SDValue Ops[] = {
10991 Op.getOperand(2), // tgt
10992 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
10993 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
10994 Undef, // src2
10995 Undef, // src3
10996 Op.getOperand(7), // vm
10997 DAG.getTargetConstant(1, DL, MVT::i1), // compr
10998 Op.getOperand(3), // en
10999 Op.getOperand(0) // Chain
11000 };
11001
11002 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
11003 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
11004 }
11005
11006 case Intrinsic::amdgcn_struct_tbuffer_store:
11007 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
11008 SDValue VData = Op.getOperand(2);
11009 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11010 if (IsD16)
11011 VData = handleD16VData(VData, DAG);
11012 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11013 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11014 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11015 SDValue Ops[] = {
11016 Chain,
11017 VData, // vdata
11018 Rsrc, // rsrc
11019 Op.getOperand(4), // vindex
11020 VOffset, // voffset
11021 SOffset, // soffset
11022 Offset, // offset
11023 Op.getOperand(7), // format
11024 Op.getOperand(8), // cachepolicy, swizzled buffer
11025 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11026 };
11027 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11028 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11029 MemSDNode *M = cast<MemSDNode>(Op);
11030 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11031 M->getMemoryVT(), M->getMemOperand());
11032 }
11033
11034 case Intrinsic::amdgcn_raw_tbuffer_store:
11035 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11036 SDValue VData = Op.getOperand(2);
11037 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11038 if (IsD16)
11039 VData = handleD16VData(VData, DAG);
11040 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11041 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11042 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11043 SDValue Ops[] = {
11044 Chain,
11045 VData, // vdata
11046 Rsrc, // rsrc
11047 DAG.getConstant(0, DL, MVT::i32), // vindex
11048 VOffset, // voffset
11049 SOffset, // soffset
11050 Offset, // offset
11051 Op.getOperand(6), // format
11052 Op.getOperand(7), // cachepolicy, swizzled buffer
11053 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11054 };
11055 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11056 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11057 MemSDNode *M = cast<MemSDNode>(Op);
11058 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11059 M->getMemoryVT(), M->getMemOperand());
11060 }
11061
11062 case Intrinsic::amdgcn_raw_buffer_store:
11063 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11064 case Intrinsic::amdgcn_raw_buffer_store_format:
11065 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11066 const bool IsFormat =
11067 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11068 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11069
11070 SDValue VData = Op.getOperand(2);
11071 EVT VDataVT = VData.getValueType();
11072 EVT EltType = VDataVT.getScalarType();
11073 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11074 if (IsD16) {
11075 VData = handleD16VData(VData, DAG);
11076 VDataVT = VData.getValueType();
11077 }
11078
11079 if (!isTypeLegal(VDataVT)) {
11080 VData =
11081 DAG.getNode(ISD::BITCAST, DL,
11082 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11083 }
11084
11085 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11086 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11087 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11088 SDValue Ops[] = {
11089 Chain,
11090 VData,
11091 Rsrc,
11092 DAG.getConstant(0, DL, MVT::i32), // vindex
11093 VOffset, // voffset
11094 SOffset, // soffset
11095 Offset, // offset
11096 Op.getOperand(6), // cachepolicy, swizzled buffer
11097 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11098 };
11099 unsigned Opc =
11100 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT
11101 : AMDGPUISD::BUFFER_STORE;
11102 MemSDNode *M = cast<MemSDNode>(Op);
11103
11104 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11105 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11106 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
11107
11108 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11109 M->getMemoryVT(), M->getMemOperand());
11110 }
11111
11112 case Intrinsic::amdgcn_struct_buffer_store:
11113 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11114 case Intrinsic::amdgcn_struct_buffer_store_format:
11115 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11116 const bool IsFormat =
11117 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11118 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11119
11120 SDValue VData = Op.getOperand(2);
11121 EVT VDataVT = VData.getValueType();
11122 EVT EltType = VDataVT.getScalarType();
11123 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11124
11125 if (IsD16) {
11126 VData = handleD16VData(VData, DAG);
11127 VDataVT = VData.getValueType();
11128 }
11129
11130 if (!isTypeLegal(VDataVT)) {
11131 VData =
11132 DAG.getNode(ISD::BITCAST, DL,
11133 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11134 }
11135
11136 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11137 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11138 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11139 SDValue Ops[] = {
11140 Chain,
11141 VData,
11142 Rsrc,
11143 Op.getOperand(4), // vindex
11144 VOffset, // voffset
11145 SOffset, // soffset
11146 Offset, // offset
11147 Op.getOperand(7), // cachepolicy, swizzled buffer
11148 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11149 };
11150 unsigned Opc =
11151 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT
11152 : AMDGPUISD::BUFFER_STORE;
11153 MemSDNode *M = cast<MemSDNode>(Op);
11154
11155 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11156 EVT VDataType = VData.getValueType().getScalarType();
11157 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11158 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
11159
11160 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11161 M->getMemoryVT(), M->getMemOperand());
11162 }
11163 case Intrinsic::amdgcn_raw_buffer_load_lds:
11164 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11165 case Intrinsic::amdgcn_struct_buffer_load_lds:
11166 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
11167 if (!Subtarget->hasVMemToLDSLoad())
11168 return SDValue();
11169 unsigned Opc;
11170 bool HasVIndex =
11171 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11172 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
11173 unsigned OpOffset = HasVIndex ? 1 : 0;
11174 SDValue VOffset = Op.getOperand(5 + OpOffset);
11175 bool HasVOffset = !isNullConstant(VOffset);
11176 unsigned Size = Op->getConstantOperandVal(4);
11177
11178 switch (Size) {
11179 default:
11180 return SDValue();
11181 case 1:
11182 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11183 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11184 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11185 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11186 break;
11187 case 2:
11188 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11189 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11190 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11191 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11192 break;
11193 case 4:
11194 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11195 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11196 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11197 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11198 break;
11199 case 12:
11200 if (!Subtarget->hasLDSLoadB96_B128())
11201 return SDValue();
11202 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11203 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11204 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11205 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11206 break;
11207 case 16:
11208 if (!Subtarget->hasLDSLoadB96_B128())
11209 return SDValue();
11210 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11211 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11212 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11213 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
11214 break;
11215 }
11216
11217 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11218
11220
11221 if (HasVIndex && HasVOffset)
11222 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
11223 {Op.getOperand(5), // VIndex
11224 VOffset}));
11225 else if (HasVIndex)
11226 Ops.push_back(Op.getOperand(5));
11227 else if (HasVOffset)
11228 Ops.push_back(VOffset);
11229
11230 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11231 Ops.push_back(Rsrc);
11232 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
11233 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
11234 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
11235 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
11236 Ops.push_back(DAG.getTargetConstant(
11237 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
11238 DL, MVT::i8)); // cpol
11239 Ops.push_back(DAG.getTargetConstant(
11240 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
11241 ? 1
11242 : 0,
11243 DL, MVT::i8)); // swz
11244 Ops.push_back(M0Val.getValue(0)); // Chain
11245 Ops.push_back(M0Val.getValue(1)); // Glue
11246
11247 auto *M = cast<MemSDNode>(Op);
11248 MachineMemOperand *LoadMMO = M->getMemOperand();
11249 // Don't set the offset value here because the pointer points to the base of
11250 // the buffer.
11251 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11252
11253 MachinePointerInfo StorePtrI = LoadPtrI;
11254 LoadPtrI.V = PoisonValue::get(
11258
11259 auto F = LoadMMO->getFlags() &
11261 LoadMMO =
11263 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11264
11265 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11266 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
11267 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11268
11269 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
11270 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11271
11272 return SDValue(Load, 0);
11273 }
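// Note: the LDS-load pseudo built above carries two memory operands, a MOLoad
// describing the buffer read and a MOStore describing the LDS write, so later
// passes see both sides of the transfer.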
11274 // Buffers are handled by LowerBufferFatPointers, and we're going to go
11275 // for "trust me" that the remaining cases are global pointers until
11276 // such time as we can put two mem operands on an intrinsic.
11277 case Intrinsic::amdgcn_load_to_lds:
11278 case Intrinsic::amdgcn_global_load_lds: {
11279 if (!Subtarget->hasVMemToLDSLoad())
11280 return SDValue();
11281
11282 unsigned Opc;
11283 unsigned Size = Op->getConstantOperandVal(4);
11284 switch (Size) {
11285 default:
11286 return SDValue();
11287 case 1:
11288 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11289 break;
11290 case 2:
11291 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11292 break;
11293 case 4:
11294 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11295 break;
11296 case 12:
11297 if (!Subtarget->hasLDSLoadB96_B128())
11298 return SDValue();
11299 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11300 break;
11301 case 16:
11302 if (!Subtarget->hasLDSLoadB96_B128())
11303 return SDValue();
11304 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11305 break;
11306 }
11307
11308 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11309
11310 SmallVector<SDValue, 6> Ops;
11311
11312 SDValue Addr = Op.getOperand(2); // Global ptr
11313 SDValue VOffset;
11314 // Try to split SAddr and VOffset. Global and LDS pointers share the same
11315 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
11316 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
11317 SDValue LHS = Addr.getOperand(0);
11318 SDValue RHS = Addr.getOperand(1);
11319
11320 if (LHS->isDivergent())
11321 std::swap(LHS, RHS);
11322
11323 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
11324 RHS.getOperand(0).getValueType() == MVT::i32) {
11325 // add (i64 sgpr), (zero_extend (i32 vgpr))
11326 Addr = LHS;
11327 VOffset = RHS.getOperand(0);
11328 }
11329 }
11330
11331 Ops.push_back(Addr);
11332 if (!Addr->isDivergent()) {
11333 Opc = AMDGPU::getGlobalSaddrOp(Opc);
11334 if (!VOffset)
11335 VOffset =
11336 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
11337 DAG.getTargetConstant(0, DL, MVT::i32)),
11338 0);
11339 Ops.push_back(VOffset);
11340 }
11341
11342 Ops.push_back(Op.getOperand(5)); // Offset
11343 Ops.push_back(Op.getOperand(6)); // CPol
11344 Ops.push_back(M0Val.getValue(0)); // Chain
11345 Ops.push_back(M0Val.getValue(1)); // Glue
11346
11347 auto *M = cast<MemSDNode>(Op);
11348 MachineMemOperand *LoadMMO = M->getMemOperand();
11349 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11350 LoadPtrI.Offset = Op->getConstantOperandVal(5);
11351 MachinePointerInfo StorePtrI = LoadPtrI;
11352 LoadPtrI.V = PoisonValue::get(
11353 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
11354 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
11355 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
11356 auto F = LoadMMO->getFlags() &
11357 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
11358 LoadMMO =
11359 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
11360 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11361 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11362 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
11363 LoadMMO->getAAInfo());
11364
11365 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11366 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11367
11368 return SDValue(Load, 0);
11369 }
11370 case Intrinsic::amdgcn_end_cf:
11371 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
11372 Op->getOperand(2), Chain),
11373 0);
11374 case Intrinsic::amdgcn_s_barrier_init:
11375 case Intrinsic::amdgcn_s_barrier_signal_var: {
11376 // these two intrinsics have two operands: barrier pointer and member count
11377 SDValue Chain = Op->getOperand(0);
11378 SmallVector<SDValue, 2> Ops;
11379 SDValue BarOp = Op->getOperand(2);
11380 SDValue CntOp = Op->getOperand(3);
11381 SDValue M0Val;
11382 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11383 ? AMDGPU::S_BARRIER_INIT_M0
11384 : AMDGPU::S_BARRIER_SIGNAL_M0;
11385 // extract the BarrierID from bits 4-9 of BarOp
11386 SDValue BarID;
11387 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11388 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11389 BarID =
11390 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
11391 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11392 0);
11393 // Member count should be put into M0[ShAmt:+6]
11394 // Barrier ID should be put into M0[5:0]
11395 M0Val =
11396 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
11397 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11398 0);
11399 constexpr unsigned ShAmt = 16;
11400 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
11401 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
11402
11403 M0Val = SDValue(
11404 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
11405
11406 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11407
11408 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11409 return SDValue(NewMI, 0);
11410 }
11411 case Intrinsic::amdgcn_s_barrier_join: {
11412 // this intrinsic has one operand: the barrier pointer
11413 SDValue Chain = Op->getOperand(0);
11414 SmallVector<SDValue, 2> Ops;
11415 SDValue BarOp = Op->getOperand(2);
11416 unsigned Opc;
11417
11418 if (isa<ConstantSDNode>(BarOp)) {
11419 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
11420 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11421
11422 // extract the BarrierID from bits 4-9 of the immediate
11423 unsigned BarID = (BarVal >> 4) & 0x3F;
11424 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
11425 Ops.push_back(K);
11426 Ops.push_back(Chain);
11427 } else {
11428 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11429
11430 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
11431 SDValue M0Val;
11432 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11433 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11434 M0Val =
11435 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
11436 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11437 0);
11438 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11439 }
11440
11441 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11442 return SDValue(NewMI, 0);
11443 }
11444 case Intrinsic::amdgcn_s_prefetch_data: {
11445 // For non-global address space preserve the chain and remove the call.
11446 if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
11447 return Op.getOperand(0);
11448 return Op;
11449 }
11450 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11451 SDValue Ops[] = {
11452 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
11453 Op.getOperand(3), // offset
11454 Op.getOperand(4), // length
11455 };
11456
11457 MemSDNode *M = cast<MemSDNode>(Op);
11458 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
11459 Op->getVTList(), Ops, M->getMemoryVT(),
11460 M->getMemOperand());
11461 }
11462 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11463 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11464 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11465 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11466 SDValue Chain = Op->getOperand(0);
11467 SDValue Ptr = Op->getOperand(2);
11468 SDValue Val = Op->getOperand(3);
11469 return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
11470 Ptr, MII->getMemOperand());
11471 }
11472 default: {
11473 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11474 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
11475 return lowerImage(Op, ImageDimIntr, DAG, true);
11476
11477 return Op;
11478 }
11479 }
11480}
11481
11482// Return whether the operation has NoUnsignedWrap property.
11483static bool isNoUnsignedWrap(SDValue Addr) {
11484 return (Addr.getOpcode() == ISD::ADD &&
11485 Addr->getFlags().hasNoUnsignedWrap()) ||
11486 Addr->getOpcode() == ISD::OR;
11487}
11488
11489 bool SITargetLowering::shouldPreservePtrArith(const Function &F,
11490 EVT PtrVT) const {
11491 return UseSelectionDAGPTRADD && PtrVT == MVT::i64;
11492}
11493
11494// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
11495// offset (the offset that is included in bounds checking and swizzling, to be
11496// split between the instruction's voffset and immoffset fields) and soffset
11497// (the offset that is excluded from bounds checking and swizzling, to go in
11498// the instruction's soffset field). This function takes the first kind of
11499// offset and figures out how to split it between voffset and immoffset.
11500std::pair<SDValue, SDValue>
11501SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
11502 SDLoc DL(Offset);
11503 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
11504 SDValue N0 = Offset;
11505 ConstantSDNode *C1 = nullptr;
11506
11507 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
11508 N0 = SDValue();
11509 else if (DAG.isBaseWithConstantOffset(N0)) {
11510 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
11511 // being added, so we can only safely match a 32-bit addition with no
11512 // unsigned overflow.
11513 bool CheckNUW = AMDGPU::isGFX1250(*Subtarget);
11514 if (!CheckNUW || isNoUnsignedWrap(N0)) {
11515 C1 = cast<ConstantSDNode>(N0.getOperand(1));
11516 N0 = N0.getOperand(0);
11517 }
11518 }
11519
11520 if (C1) {
11521 unsigned ImmOffset = C1->getZExtValue();
11522 // If the immediate value is too big for the immoffset field, put only bits
11523 // that would normally fit in the immoffset field. The remaining value that
11524 // is copied/added for the voffset field is a large power of 2, and it
11525 // stands more chance of being CSEd with the copy/add for another similar
11526 // load/store.
11527 // However, do not do that rounding down if the part left for the VGPR
11528 // would be negative, as it appears to be illegal to have a negative offset
11529 // in the VGPR, even if adding the immediate offset makes it positive.
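// For example, with MaxImm == 4095 a combined offset of 4100 becomes an
// immoffset of 4 plus a VGPR add of 4096, while a negative combined offset
// is passed through the VGPR add in full with an immoffset of 0.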
11530 unsigned Overflow = ImmOffset & ~MaxImm;
11531 ImmOffset -= Overflow;
11532 if ((int32_t)Overflow < 0) {
11533 Overflow += ImmOffset;
11534 ImmOffset = 0;
11535 }
11536 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
11537 if (Overflow) {
11538 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
11539 if (!N0)
11540 N0 = OverflowVal;
11541 else {
11542 SDValue Ops[] = {N0, OverflowVal};
11543 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
11544 }
11545 }
11546 }
11547 if (!N0)
11548 N0 = DAG.getConstant(0, DL, MVT::i32);
11549 if (!C1)
11550 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
11551 return {N0, SDValue(C1, 0)};
11552}
11553
11554// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
11555// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
11556// pointed to by Offsets.
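// Three cases are handled, in order of preference: a plain constant that
// splitMUBUFOffset can encode becomes (0, SOffset, ImmOffset); a base plus an
// encodable non-negative constant becomes (Base, SOffset, ImmOffset); anything
// else is passed entirely through voffset, with soffset 0 (or SGPR_NULL on
// subtargets with a restricted SOffset) and instoffset 0.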
11557void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
11558 SelectionDAG &DAG, SDValue *Offsets,
11559 Align Alignment) const {
11560 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11561 SDLoc DL(CombinedOffset);
11562 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
11563 uint32_t Imm = C->getZExtValue();
11564 uint32_t SOffset, ImmOffset;
11565 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11566 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
11567 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11568 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11569 return;
11570 }
11571 }
11572 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
11573 SDValue N0 = CombinedOffset.getOperand(0);
11574 SDValue N1 = CombinedOffset.getOperand(1);
11575 uint32_t SOffset, ImmOffset;
11576 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
11577 if (Offset >= 0 &&
11578 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
11579 Offsets[0] = N0;
11580 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11581 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11582 return;
11583 }
11584 }
11585
11586 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11587 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
11588 : DAG.getConstant(0, DL, MVT::i32);
11589
11590 Offsets[0] = CombinedOffset;
11591 Offsets[1] = SOffsetZero;
11592 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
11593}
11594
11595SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11596 SelectionDAG &DAG) const {
11597 if (!MaybePointer.getValueType().isScalarInteger())
11598 return MaybePointer;
11599
11600 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
11601 return Rsrc;
11602}
11603
11604// Wrap a global or flat pointer into a buffer intrinsic using the flags
11605// specified in the intrinsic.
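// The resulting v4i32 descriptor holds the low 32 bits of the pointer in
// word 0; pointer bits [47:32] in the low half of word 1 with the stride in
// bits [31:16]; NumRecords in word 2; and the flags in word 3.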
11606SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
11607 SelectionDAG &DAG) const {
11608 SDLoc Loc(Op);
11609
11610 SDValue Pointer = Op->getOperand(1);
11611 SDValue Stride = Op->getOperand(2);
11612 SDValue NumRecords = Op->getOperand(3);
11613 SDValue Flags = Op->getOperand(4);
11614
11615 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11616 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
11617 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
11618 std::optional<uint32_t> ConstStride = std::nullopt;
11619 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
11620 ConstStride = ConstNode->getZExtValue();
11621
11622 SDValue NewHighHalf = Masked;
11623 if (!ConstStride || *ConstStride != 0) {
11624 SDValue ShiftedStride;
11625 if (ConstStride) {
11626 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
11627 } else {
11628 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
11629 ShiftedStride =
11630 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11631 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
11632 }
11633 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
11634 }
11635
11636 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
11637 NewHighHalf, NumRecords, Flags);
11638 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
11639 return RsrcPtr;
11640}
11641
11642// Handle 8 bit and 16 bit buffer loads
11643SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
11644 EVT LoadVT, SDLoc DL,
11645 ArrayRef<SDValue> Ops,
11646 MachineMemOperand *MMO,
11647 bool IsTFE) const {
11648 EVT IntVT = LoadVT.changeTypeToInteger();
11649
11650 if (IsTFE) {
11651 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
11652 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
11653 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
11654 MachineFunction &MF = DAG.getMachineFunction();
11655 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
11656 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
11657 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
11658 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11659 DAG.getConstant(1, DL, MVT::i32));
11660 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11661 DAG.getConstant(0, DL, MVT::i32));
11662 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
11663 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
11664 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
11665 }
11666
11667 unsigned Opc = LoadVT.getScalarType() == MVT::i8
11668 ? AMDGPUISD::BUFFER_LOAD_UBYTE
11669 : AMDGPUISD::BUFFER_LOAD_USHORT;
11670
11671 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
11672 SDValue BufferLoad =
11673 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
11674 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
11675 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
11676
11677 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
11678}
11679
11680// Handle 8 bit and 16 bit buffer stores
11681SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
11682 EVT VDataType, SDLoc DL,
11683 SDValue Ops[],
11684 MemSDNode *M) const {
11685 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11686 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
11687
11688 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
11689 Ops[1] = BufferStoreExt;
11690 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
11691 : AMDGPUISD::BUFFER_STORE_SHORT;
11692 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
11693 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
11694 M->getMemOperand());
11695}
11696
11697 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
11698 SDValue Op, const SDLoc &SL, EVT VT) {
11699 if (VT.bitsLT(Op.getValueType()))
11700 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
11701
11702 switch (ExtType) {
11703 case ISD::SEXTLOAD:
11704 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
11705 case ISD::ZEXTLOAD:
11706 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
11707 case ISD::EXTLOAD:
11708 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
11709 case ISD::NON_EXTLOAD:
11710 return Op;
11711 }
11712
11713 llvm_unreachable("invalid ext type");
11714}
11715
11716// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
11717// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
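// For example, a 4-byte-aligned uniform zextload of i8 from constant address
// space becomes a 32-bit scalar load followed by a zero-extend-in-register of
// the low 8 bits.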
11718SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
11719 DAGCombinerInfo &DCI) const {
11720 SelectionDAG &DAG = DCI.DAG;
11721 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
11722 return SDValue();
11723
11724 // FIXME: Constant loads should all be marked invariant.
11725 unsigned AS = Ld->getAddressSpace();
11726 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
11727 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
11728 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
11729 return SDValue();
11730
11731 // Don't do this early, since it may interfere with adjacent load merging for
11732 // illegal types. We can avoid losing alignment information for exotic types
11733 // pre-legalize.
11734 EVT MemVT = Ld->getMemoryVT();
11735 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
11736 MemVT.getSizeInBits() >= 32)
11737 return SDValue();
11738
11739 SDLoc SL(Ld);
11740
11741 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
11742 "unexpected vector extload");
11743
11744 // TODO: Drop only high part of range.
11745 SDValue Ptr = Ld->getBasePtr();
11746 SDValue NewLoad = DAG.getLoad(
11747 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
11748 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
11749 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
11750 nullptr); // Drop ranges
11751
11752 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
11753 if (MemVT.isFloatingPoint()) {
11754 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
11755 "unexpected fp extload");
11756 TruncVT = MemVT.changeTypeToInteger();
11757 }
11758
11759 SDValue Cvt = NewLoad;
11760 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
11761 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
11762 DAG.getValueType(TruncVT));
11763 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
11764 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
11765 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
11766 } else {
11767 assert(Ld->getExtensionType() == ISD::EXTLOAD);
11768 }
11769
11770 EVT VT = Ld->getValueType(0);
11771 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
11772
11773 DCI.AddToWorklist(Cvt.getNode());
11774
11775 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
11776 // the appropriate extension from the 32-bit load.
11777 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
11778 DCI.AddToWorklist(Cvt.getNode());
11779
11780 // Handle conversion back to floating point if necessary.
11781 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
11782
11783 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
11784}
11785
11786 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
11787 const SIMachineFunctionInfo &Info) {
11788 // TODO: Should check if the address can definitely not access stack.
11789 if (Info.isEntryFunction())
11790 return Info.getUserSGPRInfo().hasFlatScratchInit();
11791 return true;
11792}
11793
11794SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
11795 SDLoc DL(Op);
11796 LoadSDNode *Load = cast<LoadSDNode>(Op);
11797 ISD::LoadExtType ExtType = Load->getExtensionType();
11798 EVT MemVT = Load->getMemoryVT();
11799 MachineMemOperand *MMO = Load->getMemOperand();
11800
11801 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
11802 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
11803 return SDValue();
11804
11805 // FIXME: Copied from PPC
11806 // First, load into 32 bits, then truncate to 1 bit.
11807
11808 SDValue Chain = Load->getChain();
11809 SDValue BasePtr = Load->getBasePtr();
11810
11811 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
11812
11813 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
11814 RealMemVT, MMO);
11815
11816 if (!MemVT.isVector()) {
11817 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
11818 NewLD.getValue(1)};
11819
11820 return DAG.getMergeValues(Ops, DL);
11821 }
11822
11823 SmallVector<SDValue, 3> Elts;
11824 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
11825 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
11826 DAG.getConstant(I, DL, MVT::i32));
11827
11828 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
11829 }
11830
11831 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
11832
11833 return DAG.getMergeValues(Ops, DL);
11834 }
11835
11836 if (!MemVT.isVector())
11837 return SDValue();
11838
11839 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
11840 "Custom lowering for non-i32 vectors hasn't been implemented.");
11841
11842 Align Alignment = Load->getAlign();
11843 unsigned AS = Load->getAddressSpace();
11844 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
11845 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
11846 return SplitVectorLoad(Op, DAG);
11847 }
11848
11849 MachineFunction &MF = DAG.getMachineFunction();
11850 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11851 // If there is a possibility that a flat instruction accesses scratch memory
11852 // then we need to use the same legalization rules we use for private.
11853 if (AS == AMDGPUAS::FLAT_ADDRESS &&
11854 !Subtarget->hasMultiDwordFlatScratchAddressing())
11855 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
11856 ? AMDGPUAS::PRIVATE_ADDRESS
11857 : AMDGPUAS::GLOBAL_ADDRESS;
11858
11859 unsigned NumElements = MemVT.getVectorNumElements();
11860
11861 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11862 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
11863 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
11864 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
11865 isMemOpHasNoClobberedMemOperand(Load))) {
11866 if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
11867 Alignment >= Align(4) && NumElements < 32) {
11868 if (MemVT.isPow2VectorType() ||
11869 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
11870 return SDValue();
11871 return WidenOrSplitVectorLoad(Op, DAG);
11872 }
11873 // Non-uniform loads will be selected to MUBUF instructions, so they
11874 // have the same legalization requirements as global and private
11875 // loads.
11876 //
11877 }
11878 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11879 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
11880 AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
11881 if (NumElements > 4)
11882 return SplitVectorLoad(Op, DAG);
11883 // v3 loads not supported on SI.
11884 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11885 return WidenOrSplitVectorLoad(Op, DAG);
11886
11887 // v3 and v4 loads are supported for private and global memory.
11888 return SDValue();
11889 }
11890 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11891 // Depending on the setting of the private_element_size field in the
11892 // resource descriptor, we can only make private accesses up to a certain
11893 // size.
11894 switch (Subtarget->getMaxPrivateElementSize()) {
11895 case 4: {
11896 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
11897 return DAG.getMergeValues({Op0, Op1}, DL);
11898 }
11899 case 8:
11900 if (NumElements > 2)
11901 return SplitVectorLoad(Op, DAG);
11902 return SDValue();
11903 case 16:
11904 // Same as global/flat
11905 if (NumElements > 4)
11906 return SplitVectorLoad(Op, DAG);
11907 // v3 loads not supported on SI.
11908 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11909 return WidenOrSplitVectorLoad(Op, DAG);
11910
11911 return SDValue();
11912 default:
11913 llvm_unreachable("unsupported private_element_size");
11914 }
11915 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11916 unsigned Fast = 0;
11917 auto Flags = Load->getMemOperand()->getFlags();
11918 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
11919 Load->getAlign(), Flags, &Fast) &&
11920 Fast > 1)
11921 return SDValue();
11922
11923 if (MemVT.isVector())
11924 return SplitVectorLoad(Op, DAG);
11925 }
11926
11927 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
11928 MemVT, *Load->getMemOperand())) {
11929 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
11930 return DAG.getMergeValues({Op0, Op1}, DL);
11931 }
11932
11933 return SDValue();
11934}
11935
11936SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
11937 EVT VT = Op.getValueType();
11938 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
11939 VT.getSizeInBits() == 512)
11940 return splitTernaryVectorOp(Op, DAG);
11941
11942 assert(VT.getSizeInBits() == 64);
11943
11944 SDLoc DL(Op);
11945 SDValue Cond = DAG.getFreeze(Op.getOperand(0));
11946
11947 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
11948 SDValue One = DAG.getConstant(1, DL, MVT::i32);
11949
11950 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
11951 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
11952
11953 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
11954 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
11955
11956 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
11957
11958 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
11959 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
11960
11961 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
11962
11963 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
11964 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
11965}
11966
11967// Catch division cases where we can use shortcuts with rcp and rsq
11968// instructions.
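// Depending on the type and fast-math flags, 1.0 / x lowers to rcp(x),
// -1.0 / x to rcp(-x), and a general x / y to x * rcp(y).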
11969SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
11970 SelectionDAG &DAG) const {
11971 SDLoc SL(Op);
11972 SDValue LHS = Op.getOperand(0);
11973 SDValue RHS = Op.getOperand(1);
11974 EVT VT = Op.getValueType();
11975 const SDNodeFlags Flags = Op->getFlags();
11976
11977 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
11978
11979 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
11980 // Without !fpmath accuracy information, we can't do more because we don't
11981 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
11982 // f16 is always accurate enough
11983 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
11984 return SDValue();
11985
11986 if (CLHS->isExactlyValue(1.0)) {
11987 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
11988 // the CI documentation have a worst-case error of 1 ulp.
11989 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
11990 // use it as long as we aren't trying to use denormals.
11991 //
11992 // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
11993
11994 // 1.0 / sqrt(x) -> rsq(x)
11995
11996 // XXX - Is afn sufficient to do this for f64? The maximum ULP
11997 // error seems really high at 2^29 ULP.
11998 // 1.0 / x -> rcp(x)
11999 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12000 }
12001
12002 // Same as for 1.0, but expand the sign out of the constant.
12003 if (CLHS->isExactlyValue(-1.0)) {
12004 // -1.0 / x -> rcp (fneg x)
12005 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
12006 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
12007 }
12008 }
12009
12010 // For f16 and bf16 require afn or arcp.
12011 // For f32 require afn.
12012 if (!AllowInaccurateRcp &&
12013 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
12014 return SDValue();
12015
12016 // Turn into multiply by the reciprocal.
12017 // x / y -> x * (1.0 / y)
12018 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12019 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
12020}
12021
12022SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
12023 SelectionDAG &DAG) const {
12024 SDLoc SL(Op);
12025 SDValue X = Op.getOperand(0);
12026 SDValue Y = Op.getOperand(1);
12027 EVT VT = Op.getValueType();
12028 const SDNodeFlags Flags = Op->getFlags();
12029
12030 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
12031 if (!AllowInaccurateDiv)
12032 return SDValue();
12033
12034 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
12035 SDValue One = DAG.getConstantFP(1.0, SL, VT);
12036
12037 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
12038 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12039
12040 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
12041 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12042 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
12043 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
12044 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
12045 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
12046}
12047
12048static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12049 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
12050 SDNodeFlags Flags) {
12051 if (GlueChain->getNumValues() <= 1) {
12052 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
12053 }
12054
12055 assert(GlueChain->getNumValues() == 3);
12056
12057 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12058 switch (Opcode) {
12059 default:
12060 llvm_unreachable("no chain equivalent for opcode");
12061 case ISD::FMUL:
12062 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12063 break;
12064 }
12065
12066 return DAG.getNode(Opcode, SL, VTList,
12067 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
12068 Flags);
12069}
12070
12071static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12072 EVT VT, SDValue A, SDValue B, SDValue C,
12073 SDValue GlueChain, SDNodeFlags Flags) {
12074 if (GlueChain->getNumValues() <= 1) {
12075 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
12076 }
12077
12078 assert(GlueChain->getNumValues() == 3);
12079
12080 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12081 switch (Opcode) {
12082 default:
12083 llvm_unreachable("no chain equivalent for opcode");
12084 case ISD::FMA:
12085 Opcode = AMDGPUISD::FMA_W_CHAIN;
12086 break;
12087 }
12088
12089 return DAG.getNode(Opcode, SL, VTList,
12090 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
12091 Flags);
12092}
12093
12094SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
12095 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12096 return FastLowered;
12097
12098 SDLoc SL(Op);
12099 EVT VT = Op.getValueType();
12100 SDValue LHS = Op.getOperand(0);
12101 SDValue RHS = Op.getOperand(1);
12102
12103 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
12104 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
12105
12106 if (VT == MVT::bf16) {
12107 SDValue ExtDiv =
12108 DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
12109 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
12110 DAG.getTargetConstant(0, SL, MVT::i32));
12111 }
12112
12113 assert(VT == MVT::f16);
12114
12115 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
12116 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
12117 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
12118 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
12119 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12120 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
12121 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12122 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
12123 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
12124 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
12125 // q16.u = opx(V_CVT_F16_F32, q32.u);
12126 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
12127
12128 // We will use ISD::FMA on targets that don't support ISD::FMAD.
12129 unsigned FMADOpCode =
12130 isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
12131 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
12132 SDValue Rcp =
12133 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
12134 SDValue Quot =
12135 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
12136 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12137 Op->getFlags());
12138 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
12139 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12140 Op->getFlags());
12141 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
12142 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
12143 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
12144 DAG.getConstant(0xff800000, SL, MVT::i32));
12145 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
12146 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
12147 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
12148 DAG.getTargetConstant(0, SL, MVT::i32));
12149 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
12150 Op->getFlags());
12151}
12152
12153// Faster 2.5 ULP division that does not support denormals.
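// When |RHS| exceeds 2^96 the denominator is pre-scaled by 2^-32 before the
// rcp and the quotient is multiplied by the same factor afterwards, keeping
// the reciprocal in range.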
12154SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
12155 SDNodeFlags Flags = Op->getFlags();
12156 SDLoc SL(Op);
12157 SDValue LHS = Op.getOperand(1);
12158 SDValue RHS = Op.getOperand(2);
12159
12160 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
12161
12162 const APFloat K0Val(0x1p+96f);
12163 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
12164
12165 const APFloat K1Val(0x1p-32f);
12166 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
12167
12168 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12169
12170 EVT SetCCVT =
12171 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
12172
12173 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
12174
12175 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
12176
12177 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
12178
12179 // rcp does not support denormals.
12180 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
12181
12182 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
12183
12184 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
12185}
12186
12187// Returns immediate value for setting the F32 denorm mode when using the
12188// S_DENORM_MODE instruction.
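// The returned immediate packs the requested FP32 mode into bits [1:0] and
// preserves the function's FP64/FP16 denormal mode in bits [3:2].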
12189 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
12190 const SIMachineFunctionInfo *Info,
12191 const GCNSubtarget *ST) {
12192 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
12193 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
12194 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
12195 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
12196}
12197
12198SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
12199 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12200 return FastLowered;
12201
12202 // The selection matcher assumes anything with a chain selects to a
12203 // mayRaiseFPException machine instruction. Since we're introducing a chain
12204 // here, we need to explicitly report nofpexcept for the regular fdiv
12205 // lowering.
12206 SDNodeFlags Flags = Op->getFlags();
12207 Flags.setNoFPExcept(true);
12208
12209 SDLoc SL(Op);
12210 SDValue LHS = Op.getOperand(0);
12211 SDValue RHS = Op.getOperand(1);
12212
12213 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12214
12215 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
12216
12217 SDValue DenominatorScaled =
12218 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
12219 SDValue NumeratorScaled =
12220 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
12221
12222 // Denominator is scaled to not be denormal, so using rcp is ok.
12223 SDValue ApproxRcp =
12224 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
12225 SDValue NegDivScale0 =
12226 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
12227
12228 using namespace AMDGPU::Hwreg;
12229 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
12230 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
12231
12232 const MachineFunction &MF = DAG.getMachineFunction();
12233 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12234 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
12235
12236 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
12237 const bool HasDynamicDenormals =
12238 (DenormMode.Input == DenormalMode::Dynamic) ||
12239 (DenormMode.Output == DenormalMode::Dynamic);
12240
12241 SDValue SavedDenormMode;
12242
12243 if (!PreservesDenormals) {
12244 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
12245 // lowering. The chain dependence is insufficient, and we need glue. We do
12246 // not need the glue variants in a strictfp function.
12247
12248 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12249
12250 SDValue Glue = DAG.getEntryNode();
12251 if (HasDynamicDenormals) {
12252 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
12253 DAG.getVTList(MVT::i32, MVT::Glue),
12254 {BitField, Glue});
12255 SavedDenormMode = SDValue(GetReg, 0);
12256
12257 Glue = DAG.getMergeValues(
12258 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
12259 }
12260
12261 SDNode *EnableDenorm;
12262 if (Subtarget->hasDenormModeInst()) {
12263 const SDValue EnableDenormValue =
12264 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
12265
12266 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
12267 EnableDenormValue)
12268 .getNode();
12269 } else {
12270 const SDValue EnableDenormValue =
12271 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
12272 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12273 {EnableDenormValue, BitField, Glue});
12274 }
12275
12276 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
12277 SDValue(EnableDenorm, 1)};
12278
12279 NegDivScale0 = DAG.getMergeValues(Ops, SL);
12280 }
12281
12282 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
12283 ApproxRcp, One, NegDivScale0, Flags);
12284
12285 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
12286 ApproxRcp, Fma0, Flags);
12287
12288 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
12289 Fma1, Flags);
12290
12291 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
12292 NumeratorScaled, Mul, Flags);
12293
12294 SDValue Fma3 =
12295 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
12296
12297 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
12298 NumeratorScaled, Fma3, Flags);
12299
12300 if (!PreservesDenormals) {
12301 SDNode *DisableDenorm;
12302 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12303 const SDValue DisableDenormValue = getSPDenormModeValue(
12304 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
12305
12306 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12307 DisableDenorm =
12308 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
12309 Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
12310 .getNode();
12311 } else {
12312 assert(HasDynamicDenormals == (bool)SavedDenormMode);
12313 const SDValue DisableDenormValue =
12314 HasDynamicDenormals
12315 ? SavedDenormMode
12316 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
12317
12318 DisableDenorm = DAG.getMachineNode(
12319 AMDGPU::S_SETREG_B32, SL, MVT::Other,
12320 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
12321 }
12322
12323 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
12324 SDValue(DisableDenorm, 0), DAG.getRoot());
12325 DAG.setRoot(OutputChain);
12326 }
12327
12328 SDValue Scale = NumeratorScaled.getValue(1);
12329 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
12330 {Fma4, Fma1, Fma3, Scale}, Flags);
12331
12332 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
12333}
12334
12335SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
12336 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
12337 return FastLowered;
12338
12339 SDLoc SL(Op);
12340 SDValue X = Op.getOperand(0);
12341 SDValue Y = Op.getOperand(1);
12342
12343 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
12344
12345 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
12346
12347 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
12348
12349 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
12350
12351 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
12352
12353 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
12354
12355 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
12356
12357 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
12358
12359 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
12360
12361 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
12362 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
12363
12364 SDValue Fma4 =
12365 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
12366
12367 SDValue Scale;
12368
12369 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12370 // Work around a hardware bug on SI where the condition output from
12371 // div_scale is not usable.
12372
12373 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
12374
12375 // Figure out which scale to use for div_fmas.
12376 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
12377 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
12378 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
12379 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
12380
12381 SDValue NumHi =
12382 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
12383 SDValue DenHi =
12384 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
12385
12386 SDValue Scale0Hi =
12387 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
12388 SDValue Scale1Hi =
12389 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
12390
12391 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
12392 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
12393 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
12394 } else {
12395 Scale = DivScale1.getValue(1);
12396 }
12397
12398 SDValue Fmas =
12399 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
12400
12401 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
12402}
12403
12404SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
12405 EVT VT = Op.getValueType();
12406
12407 if (VT == MVT::f32)
12408 return LowerFDIV32(Op, DAG);
12409
12410 if (VT == MVT::f64)
12411 return LowerFDIV64(Op, DAG);
12412
12413 if (VT == MVT::f16 || VT == MVT::bf16)
12414 return LowerFDIV16(Op, DAG);
12415
12416 llvm_unreachable("Unexpected type for fdiv");
12417}
12418
12419SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
12420 SDLoc dl(Op);
12421 SDValue Val = Op.getOperand(0);
12422 EVT VT = Val.getValueType();
12423 EVT ResultExpVT = Op->getValueType(1);
12424 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12425
12426 SDValue Mant = DAG.getNode(
12427 ISD::INTRINSIC_WO_CHAIN, dl, VT,
12428 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
12429
12430 SDValue Exp = DAG.getNode(
12431 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
12432 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
12433
12434 if (Subtarget->hasFractBug()) {
12435 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
12436 SDValue Inf =
12437 DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT);
12438
12439 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
12440 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
12441 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
12442 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
12443 }
12444
12445 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
12446 return DAG.getMergeValues({Mant, CastExp}, dl);
12447}
12448
12449SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
12450 SDLoc DL(Op);
12451 StoreSDNode *Store = cast<StoreSDNode>(Op);
12452 EVT VT = Store->getMemoryVT();
12453
12454 if (VT == MVT::i1) {
12455 return DAG.getTruncStore(
12456 Store->getChain(), DL,
12457 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
12458 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
12459 }
12460
12461 assert(VT.isVector() &&
12462 Store->getValue().getValueType().getScalarType() == MVT::i32);
12463
12464 unsigned AS = Store->getAddressSpace();
12465 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
12466 Store->getAlign().value() < VT.getStoreSize() &&
12467 VT.getSizeInBits() > 32) {
12468 return SplitVectorStore(Op, DAG);
12469 }
12470
12471 MachineFunction &MF = DAG.getMachineFunction();
12472 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12473 // If there is a possibility that a flat instruction accesses scratch memory
12474 // then we need to use the same legalization rules we use for private.
12475 if (AS == AMDGPUAS::FLAT_ADDRESS &&
12476 !Subtarget->hasMultiDwordFlatScratchAddressing())
12477 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
12478 ? AMDGPUAS::PRIVATE_ADDRESS
12479 : AMDGPUAS::GLOBAL_ADDRESS;
12480
12481 unsigned NumElements = VT.getVectorNumElements();
12482 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
12483 if (NumElements > 4)
12484 return SplitVectorStore(Op, DAG);
12485 // v3 stores not supported on SI.
12486 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12487 return SplitVectorStore(Op, DAG);
12488
12489 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
12490 VT, *Store->getMemOperand()))
12491 return expandUnalignedStore(Store, DAG);
12492
12493 return SDValue();
12494 }
12495 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
12496 switch (Subtarget->getMaxPrivateElementSize()) {
12497 case 4:
12498 return scalarizeVectorStore(Store, DAG);
12499 case 8:
12500 if (NumElements > 2)
12501 return SplitVectorStore(Op, DAG);
12502 return SDValue();
12503 case 16:
12504 if (NumElements > 4 ||
12505 (NumElements == 3 && !Subtarget->enableFlatScratch()))
12506 return SplitVectorStore(Op, DAG);
12507 return SDValue();
12508 default:
12509 llvm_unreachable("unsupported private_element_size");
12510 }
12511 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
12512 unsigned Fast = 0;
12513 auto Flags = Store->getMemOperand()->getFlags();
12514 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
12515 Store->getAlign(), Flags, &Fast) &&
12516 Fast > 1)
12517 return SDValue();
12518
12519 if (VT.isVector())
12520 return SplitVectorStore(Op, DAG);
12521
12522 return expandUnalignedStore(Store, DAG);
12523 }
12524
12525 // Probably an invalid store. If so we'll end up emitting a selection error.
12526 return SDValue();
12527}
12528
12529// Avoid the full correct expansion for f32 sqrt when promoting from f16.
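// The f16 operand is extended to f32, lowered with the amdgcn_sqrt intrinsic,
// and the result is rounded back to f16.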
12530SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
12531 SDLoc SL(Op);
12532 assert(!Subtarget->has16BitInsts());
12533 SDNodeFlags Flags = Op->getFlags();
12534 SDValue Ext =
12535 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
12536
12537 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
12538 SDValue Sqrt =
12539 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
12540
12541 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
12542 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
12543}
12544
12545SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
12546 SDLoc DL(Op);
12547 SDNodeFlags Flags = Op->getFlags();
12548 MVT VT = Op.getValueType().getSimpleVT();
12549 const SDValue X = Op.getOperand(0);
12550
12551 if (allowApproxFunc(DAG, Flags)) {
12552 // Instruction is 1ulp but ignores denormals.
12553 return DAG.getNode(
12554 ISD::INTRINSIC_WO_CHAIN, DL, VT,
12555 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
12556 }
12557
12558 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
12559 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
12560
12561 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
12562
12563 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
12564
12565 SDValue SqrtX =
12566 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
12567
12568 SDValue SqrtS;
12569 if (needsDenormHandlingF32(DAG, X, Flags)) {
12570 SDValue SqrtID =
12571 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
12572 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
12573
12574 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
12575 SDValue SqrtSNextDownInt =
12576 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12577 DAG.getAllOnesConstant(DL, MVT::i32));
12578 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
12579
12580 SDValue NegSqrtSNextDown =
12581 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
12582
12583 SDValue SqrtVP =
12584 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
12585
12586 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12587 DAG.getConstant(1, DL, MVT::i32));
12588 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
12589
12590 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
12591 SDValue SqrtVS =
12592 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
12593
12594 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
12595 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
12596
12597 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
12598 Flags);
12599
12600 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
12601 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
12602 Flags);
12603 } else {
12604 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
12605
12606 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
12607
12608 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
12609 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
12610 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
12611
12612 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
12613 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
12614 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
12615
12616 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
12617 SDValue SqrtD =
12618 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
12619 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
12620 }
12621
12622 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
12623
12624 SDValue ScaledDown =
12625 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
12626
12627 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
12628 SDValue IsZeroOrInf =
12629 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12630 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12631
12632 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
12633}
12634
12635SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
12636 // For double type, the SQRT and RSQ instructions don't have required
12637 // precision, we apply Goldschmidt's algorithm to improve the result:
12638 //
12639 // y0 = rsq(x)
12640 // g0 = x * y0
12641 // h0 = 0.5 * y0
12642 //
12643 // r0 = 0.5 - h0 * g0
12644 // g1 = g0 * r0 + g0
12645 // h1 = h0 * r0 + h0
12646 //
12647 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
12648 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
12649 // h2 = h1 * r1 + h1
12650 //
12651 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
12652 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
12653 //
12654 // sqrt(x) = g3
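// Inputs below 2^-767 are scaled up by 2^256 via ldexp before the iteration
// and the result is scaled back down by 2^-128 (the square root of that
// factor) afterwards.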
12655
12656 SDNodeFlags Flags = Op->getFlags();
12657
12658 SDLoc DL(Op);
12659
12660 SDValue X = Op.getOperand(0);
12661 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
12662
12663 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
12664
12665 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
12666
12667 // Scale up input if it is too small.
12668 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
12669 SDValue ScaleUp =
12670 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
12671 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
12672
12673 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
12674
12675 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
12676
12677 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
12678 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
12679
12680 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
12681 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
12682
12683 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
12684
12685 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
12686
12687 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
12688 SDValue SqrtD0 =
12689 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
12690
12691 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
12692
12693 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
12694 SDValue SqrtD1 =
12695 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
12696
12697 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
12698
12699 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
12700 SDValue ScaleDown =
12701 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
12702 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
12703
12704 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
12705 // with finite only or nsz because rsq(+/-0) = +/-inf
12706
12707 // TODO: Check for DAZ and expand to subnormals
12708 SDValue IsZeroOrInf =
12709 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12710 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12711
12712 // If x is +INF, +0, or -0, use its original value
12713 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
12714 Flags);
12715}
12716
12717SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
12718 SDLoc DL(Op);
12719 EVT VT = Op.getValueType();
12720 SDValue Arg = Op.getOperand(0);
12721 SDValue TrigVal;
12722
12723 // Propagate fast-math flags so that the multiply we introduce can be folded
12724 // if Arg is already the result of a multiply by constant.
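// On subtargets with a reduced trig range, the argument scaled by 1/(2*pi) is
// additionally wrapped into [0, 1) with FRACT before being fed to SIN_HW or
// COS_HW.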
12725 auto Flags = Op->getFlags();
12726
12727 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
12728
12729 if (Subtarget->hasTrigReducedRange()) {
12730 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12731 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
12732 } else {
12733 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12734 }
12735
12736 switch (Op.getOpcode()) {
12737 case ISD::FCOS:
12738 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
12739 case ISD::FSIN:
12740 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
12741 default:
12742 llvm_unreachable("Wrong trig opcode");
12743 }
12744}
12745
12746SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
12747 SelectionDAG &DAG) const {
12748 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
12749 assert(AtomicNode->isCompareAndSwap());
12750 unsigned AS = AtomicNode->getAddressSpace();
12751
12752 // No custom lowering required for local address space
12753 if (!AMDGPU::isFlatGlobalAddrSpace(AS))
12754 return Op;
12755
12756 // Non-local address space requires custom lowering for atomic compare
12757 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
12758 SDLoc DL(Op);
12759 SDValue ChainIn = Op.getOperand(0);
12760 SDValue Addr = Op.getOperand(1);
12761 SDValue Old = Op.getOperand(2);
12762 SDValue New = Op.getOperand(3);
12763 EVT VT = Op.getValueType();
12764 MVT SimpleVT = VT.getSimpleVT();
12765 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
12766
12767 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
12768 SDValue Ops[] = {ChainIn, Addr, NewOld};
12769
12770 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
12771 Op->getVTList(), Ops, VT,
12772 AtomicNode->getMemOperand());
12773}
12774
12775//===----------------------------------------------------------------------===//
12776// Custom DAG optimizations
12777//===----------------------------------------------------------------------===//
12778
12779SDValue
12780SITargetLowering::performUCharToFloatCombine(SDNode *N,
12781 DAGCombinerInfo &DCI) const {
12782 EVT VT = N->getValueType(0);
12783 EVT ScalarVT = VT.getScalarType();
12784 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
12785 return SDValue();
12786
12787 SelectionDAG &DAG = DCI.DAG;
12788 SDLoc DL(N);
12789
12790 SDValue Src = N->getOperand(0);
12791 EVT SrcVT = Src.getValueType();
12792
12793 // TODO: We could try to match extracting the higher bytes, which would be
12794 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
12795 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
12796 // about in practice.
12797 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
12798 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
12799 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
12800 DCI.AddToWorklist(Cvt.getNode());
12801
12802 // For the f16 case, fold to a cast to f32 and then cast back to f16.
12803 if (ScalarVT != MVT::f32) {
12804 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
12805 DAG.getTargetConstant(0, DL, MVT::i32));
12806 }
12807 return Cvt;
12808 }
12809 }
12810
12811 return SDValue();
12812}
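// Worked example (illustrative IR): with
//   %b = and i32 %x, 255
//   %f = uitofp i32 %b to float
// the high 24 bits of %b are known zero, so after legalization the
// conversion is rewritten to (CVT_F32_UBYTE0 %b); an f16 result gets an
// extra fp_round from the f32 value.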
12813
12814SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
12815 DAGCombinerInfo &DCI) const {
12816 SDValue MagnitudeOp = N->getOperand(0);
12817 SDValue SignOp = N->getOperand(1);
12818
12819 // The generic combine for fcopysign + fp cast is too conservative with
12820 // vectors, and also gets confused by the splitting we will perform here, so
12821 // peek through FP casts.
12822 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
12823 SignOp.getOpcode() == ISD::FP_ROUND)
12824 SignOp = SignOp.getOperand(0);
12825
12826 SelectionDAG &DAG = DCI.DAG;
12827 SDLoc DL(N);
12828 EVT SignVT = SignOp.getValueType();
12829
12830 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
12831 // lower half with a copy.
12832 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
12833 EVT MagVT = MagnitudeOp.getValueType();
12834
12835 unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
12836
12837 if (MagVT.getScalarType() == MVT::f64) {
12838 EVT F32VT = MagVT.isVector()
12839 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12840 : MVT::v2f32;
12841
12842 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
12843
12844 SmallVector<SDValue, 8> NewElts;
12845 for (unsigned I = 0; I != NumElts; ++I) {
12846 SDValue MagLo =
12847 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
12848 DAG.getConstant(2 * I, DL, MVT::i32));
12849 SDValue MagHi =
12850 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
12851 DAG.getConstant(2 * I + 1, DL, MVT::i32));
12852
12853 SDValue SignOpElt =
12854 MagVT.isVector()
12855 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(),
12856 SignOp, DAG.getConstant(I, DL, MVT::i32))
12857 : SignOp;
12858
12859 SDValue HiOp =
12860 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
12861
12862 SDValue Vector =
12863 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
12864
12865 SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
12866 NewElts.push_back(NewElt);
12867 }
12868
12869 if (NewElts.size() == 1)
12870 return NewElts[0];
12871
12872 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
12873 }
12874
12875 if (SignVT.getScalarType() != MVT::f64)
12876 return SDValue();
12877
12878 // Reduce width of sign operand, we only need the highest bit.
12879 //
12880 // fcopysign f64:x, f64:y ->
12881 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
12882 // TODO: In some cases it might make sense to go all the way to f16.
12883
12884 EVT F32VT = MagVT.isVector()
12885 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12886 : MVT::v2f32;
12887
12888 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
12889
12890 SmallVector<SDValue, 8> F32Signs;
12891 for (unsigned I = 0; I != NumElts; ++I) {
12892 // Take sign from odd elements of cast vector
12893 SDValue SignAsF32 =
12894 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
12895 DAG.getConstant(2 * I + 1, DL, MVT::i32));
12896 F32Signs.push_back(SignAsF32);
12897 }
12898
12899 SDValue NewSign =
12900 NumElts == 1
12901 ? F32Signs.back()
12902 : DAG.getNode(ISD::BUILD_VECTOR, DL,
12903 EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
12904 F32Signs);
12905
12906 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
12907 NewSign);
12908}
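// Worked example (illustrative, scalar f64 case): across combine iterations
// the folds above turn
//   (fcopysign f64:x, f64:y)
// into roughly
//   xv = bitcast x to v2f32          yv = bitcast y to v2f32
//   hi = fcopysign f32 xv[1], yv[1]
//   result = bitcast (build_vector xv[0], hi) to f64
// so only the high 32 bits of either value take part in the sign copy.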
12909
12910// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
12911// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
12912// bits
12913
12914// This is a variant of
12915// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
12916//
12917// The normal DAG combiner will do this, but only if the add has one use since
12918// that would increase the number of instructions.
12919//
12920// This prevents us from seeing a constant offset that can be folded into a
12921// memory instruction's addressing mode. If we know the resulting add offset of
12922// a pointer can be folded into an addressing offset, we can replace the pointer
12923// operand with the add of new constant offset. This eliminates one of the uses,
12924// and may allow the remaining use to also be simplified.
12925//
12926SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
12927 EVT MemVT,
12928 DAGCombinerInfo &DCI) const {
12929 SDValue N0 = N->getOperand(0);
12930 SDValue N1 = N->getOperand(1);
12931
12932 // We only do this to handle cases where it's profitable when there are
12933 // multiple uses of the add, so defer to the standard combine.
12934 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
12935 N0->hasOneUse())
12936 return SDValue();
12937
12938 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
12939 if (!CN1)
12940 return SDValue();
12941
12942 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
12943 if (!CAdd)
12944 return SDValue();
12945
12946 SelectionDAG &DAG = DCI.DAG;
12947
12948 if (N0->getOpcode() == ISD::OR &&
12949 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
12950 return SDValue();
12951
12952 // If the resulting offset is too large, we can't fold it into the
12953 // addressing mode offset.
12954 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
12955 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
12956
12957 AddrMode AM;
12958 AM.HasBaseReg = true;
12959 AM.BaseOffs = Offset.getSExtValue();
12960 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
12961 return SDValue();
12962
12963 SDLoc SL(N);
12964 EVT VT = N->getValueType(0);
12965
12966 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
12967 SDValue COffset = DAG.getConstant(Offset, SL, VT);
12968
12969 SDNodeFlags Flags;
12970 Flags.setNoUnsignedWrap(
12971 N->getFlags().hasNoUnsignedWrap() &&
12972 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
12973
12974 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
12975}
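// Worked example (illustrative, assuming the addressing mode for this
// address space accepts an immediate offset of 64): when the inner add has
// more than one use,
//   (shl (add x, 16), 2)  -->  (add (shl x, 2), 64)
// which lets the constant 64 later fold into the memory instruction's
// offset field instead of living in a register.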
12976
12977/// MemSDNode::getBasePtr() does not work for intrinsics, whose pointer operand
12978/// is offset by the chain and intrinsic ID. Theoretically we would also need to
12979/// check the specific intrinsic, but they all place the pointer operand first.
12980static unsigned getBasePtrIndex(const MemSDNode *N) {
12981 switch (N->getOpcode()) {
12982 case ISD::STORE:
12983 case ISD::INTRINSIC_W_CHAIN:
12984 case ISD::INTRINSIC_VOID:
12985 return 2;
12986 default:
12987 return 1;
12988 }
12989}
12990
12991SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
12992 DAGCombinerInfo &DCI) const {
12993 SelectionDAG &DAG = DCI.DAG;
12994
12995 unsigned PtrIdx = getBasePtrIndex(N);
12996 SDValue Ptr = N->getOperand(PtrIdx);
12997
12998 // TODO: We could also do this for multiplies.
12999 if (Ptr.getOpcode() == ISD::SHL) {
13000 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
13001 N->getMemoryVT(), DCI);
13002 if (NewPtr) {
13003 SmallVector<SDValue, 8> NewOps(N->ops());
13004
13005 NewOps[PtrIdx] = NewPtr;
13006 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
13007 }
13008 }
13009
13010 return SDValue();
13011}
13012
13013static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
13014 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13015 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13016 (Opc == ISD::XOR && Val == 0);
13017}
13018
13019// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
13020// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
13021// integer combine opportunities since most 64-bit operations are decomposed
13022// this way. TODO: We won't want this for SALU especially if it is an inline
13023// immediate.
13024SDValue SITargetLowering::splitBinaryBitConstantOp(
13025 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
13026 const ConstantSDNode *CRHS) const {
13027 uint64_t Val = CRHS->getZExtValue();
13028 uint32_t ValLo = Lo_32(Val);
13029 uint32_t ValHi = Hi_32(Val);
13030 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13031
13032 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
13033 bitOpWithConstantIsReducible(Opc, ValHi)) &&
13034 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
13035 // We have 64-bit scalar and/or/xor, but do not have vector forms.
13036 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
13037 !CRHS->user_begin()->isDivergent())
13038 return SDValue();
13039
13040 // If we need to materialize a 64-bit immediate, it will be split up later
13041 // anyway. Avoid creating the harder to understand 64-bit immediate
13042 // materialization.
13043 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
13044 }
13045
13046 return SDValue();
13047}
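// Worked example (illustrative, assuming the constant is used only here):
// for a divergent
//   (and i64:x, 0x00000000ffffffff)
// ValLo is 0xffffffff and ValHi is 0, both reducible for AND, so the 64-bit
// operation is split into an i32 AND on each half; those fold to lo_32(x)
// and 0, and the two halves are recombined into the i64 result.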
13048
13049bool llvm::isBoolSGPR(SDValue V) {
13050 if (V.getValueType() != MVT::i1)
13051 return false;
13052 switch (V.getOpcode()) {
13053 default:
13054 break;
13055 case ISD::SETCC:
13056 case ISD::IS_FPCLASS:
13057 case AMDGPUISD::FP_CLASS:
13058 return true;
13059 case ISD::AND:
13060 case ISD::OR:
13061 case ISD::XOR:
13062 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
13063 case ISD::SADDO:
13064 case ISD::UADDO:
13065 case ISD::SSUBO:
13066 case ISD::USUBO:
13067 case ISD::SMULO:
13068 case ISD::UMULO:
13069 return V.getResNo() == 1;
13070 case ISD::INTRINSIC_WO_CHAIN: {
13071 unsigned IntrinsicID = V.getConstantOperandVal(0);
13072 switch (IntrinsicID) {
13073 case Intrinsic::amdgcn_is_shared:
13074 case Intrinsic::amdgcn_is_private:
13075 return true;
13076 default:
13077 return false;
13078 }
13079
13080 return false;
13081 }
13082 }
13083 return false;
13084}
13085
13086// If a constant has all zeroes or all ones within each byte return it.
13087// Otherwise return 0.
13088static uint32_t getConstantPermuteMask(uint32_t C) {
13089 // 0xff for any zero byte in the mask
13090 uint32_t ZeroByteMask = 0;
13091 if (!(C & 0x000000ff))
13092 ZeroByteMask |= 0x000000ff;
13093 if (!(C & 0x0000ff00))
13094 ZeroByteMask |= 0x0000ff00;
13095 if (!(C & 0x00ff0000))
13096 ZeroByteMask |= 0x00ff0000;
13097 if (!(C & 0xff000000))
13098 ZeroByteMask |= 0xff000000;
13099 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
13100 if ((NonZeroByteMask & C) != NonZeroByteMask)
13101 return 0; // Partial bytes selected.
13102 return C;
13103}
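// Examples of the values returned above (illustrative):
//   getConstantPermuteMask(0x00ff00ff) == 0x00ff00ff  // whole bytes only
//   getConstantPermuteMask(0xffff0000) == 0xffff0000
//   getConstantPermuteMask(0x00ff00f0) == 0           // byte 0 is partial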
13104
13105// Check if a node selects whole bytes from its operand 0 starting at a byte
13106// boundary while masking the rest. Returns the select mask as used by
13107// v_perm_b32, or ~0 if it did not succeed.
13108// Note byte select encoding:
13109// value 0-3 selects corresponding source byte;
13110// value 0xc selects zero;
13111// value 0xff selects 0xff.
13112static uint32_t getPermuteMask(SDValue V) {
13113 assert(V.getValueSizeInBits() == 32);
13114
13115 if (V.getNumOperands() != 2)
13116 return ~0;
13117
13118 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
13119 if (!N1)
13120 return ~0;
13121
13122 uint32_t C = N1->getZExtValue();
13123
13124 switch (V.getOpcode()) {
13125 default:
13126 break;
13127 case ISD::AND:
13128 if (uint32_t ConstMask = getConstantPermuteMask(C))
13129 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13130 break;
13131
13132 case ISD::OR:
13133 if (uint32_t ConstMask = getConstantPermuteMask(C))
13134 return (0x03020100 & ~ConstMask) | ConstMask;
13135 break;
13136
13137 case ISD::SHL:
13138 if (C % 8)
13139 return ~0;
13140
13141 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
13142
13143 case ISD::SRL:
13144 if (C % 8)
13145 return ~0;
13146
13147 return uint32_t(0x0c0c0c0c03020100ull >> C);
13148 }
13149
13150 return ~0;
13151}
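// Worked examples for the masks returned above (illustrative):
//   (and x, 0x0000ffff) -> 0x0c0c0100  bytes 0-1 of x, upper bytes zero
//   (srl x, 16)         -> 0x0c0c0302  bytes 2-3 move down, rest zero
//   (shl x, 8)          -> 0x0201000c  bytes move up, byte 0 becomes zero
// A shift that is not a byte multiple, or a mask with a partial byte,
// yields ~0 and the callers give up on forming v_perm.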
13152
13153SDValue SITargetLowering::performAndCombine(SDNode *N,
13154 DAGCombinerInfo &DCI) const {
13155 if (DCI.isBeforeLegalize())
13156 return SDValue();
13157
13158 SelectionDAG &DAG = DCI.DAG;
13159 EVT VT = N->getValueType(0);
13160 SDValue LHS = N->getOperand(0);
13161 SDValue RHS = N->getOperand(1);
13162
13163 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
13164 if (VT == MVT::i64 && CRHS) {
13165 if (SDValue Split =
13166 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
13167 return Split;
13168 }
13169
13170 if (CRHS && VT == MVT::i32) {
13171 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
13172 // nb = number of trailing zeroes in mask
13173 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
13174 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
13175 uint64_t Mask = CRHS->getZExtValue();
13176 unsigned Bits = llvm::popcount(Mask);
13177 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
13178 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
13179 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
13180 unsigned Shift = CShift->getZExtValue();
13181 unsigned NB = CRHS->getAPIntValue().countr_zero();
13182 unsigned Offset = NB + Shift;
13183 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
13184 SDLoc SL(N);
13185 SDValue BFE =
13186 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
13187 DAG.getConstant(Offset, SL, MVT::i32),
13188 DAG.getConstant(Bits, SL, MVT::i32));
13189 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
13190 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
13191 DAG.getValueType(NarrowVT));
13192 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
13193 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
13194 return Shl;
13195 }
13196 }
13197 }
13198
13199 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13200 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
13201 isa<ConstantSDNode>(LHS.getOperand(2))) {
13202 uint32_t Sel = getConstantPermuteMask(Mask);
13203 if (!Sel)
13204 return SDValue();
13205
13206 // Select 0xc for all zero bytes
13207 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13208 SDLoc DL(N);
13209 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13210 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13211 }
13212 }
13213
13214 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
13215 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
13216 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
13217 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13218 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
13219
13220 SDValue X = LHS.getOperand(0);
13221 SDValue Y = RHS.getOperand(0);
13222 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
13223 !isTypeLegal(X.getValueType()))
13224 return SDValue();
13225
13226 if (LCC == ISD::SETO) {
13227 if (X != LHS.getOperand(1))
13228 return SDValue();
13229
13230 if (RCC == ISD::SETUNE) {
13231 const ConstantFPSDNode *C1 =
13232 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
13233 if (!C1 || !C1->isInfinity() || C1->isNegative())
13234 return SDValue();
13235
13236 const uint32_t Mask = SIInstrFlags::N_NORMAL |
13237 SIInstrFlags::P_NORMAL | SIInstrFlags::N_ZERO |
13238 SIInstrFlags::P_ZERO | SIInstrFlags::N_SUBNORMAL |
13239 SIInstrFlags::P_SUBNORMAL;
13240
13241 static_assert(
13242 ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
13243 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
13244 0x3ff) == Mask,
13245 "mask not equal");
13246
13247 SDLoc DL(N);
13248 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
13249 DAG.getConstant(Mask, DL, MVT::i32));
13250 }
13251 }
13252 }
13253
13254 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
13255 std::swap(LHS, RHS);
13256
13257 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13258 RHS.hasOneUse()) {
13259 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13260 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
13261 // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
13262 // | n_nan)
13263 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13264 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
13265 (RHS.getOperand(0) == LHS.getOperand(0) &&
13266 LHS.getOperand(0) == LHS.getOperand(1))) {
13267 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
13268 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
13269 : Mask->getZExtValue() & OrdMask;
13270
13271 SDLoc DL(N);
13272 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
13273 DAG.getConstant(NewMask, DL, MVT::i32));
13274 }
13275 }
13276
13277 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
13278 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
13279 // and x, (sext cc from i1) => select cc, x, 0
13280 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
13281 std::swap(LHS, RHS);
13282 if (isBoolSGPR(RHS.getOperand(0)))
13283 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
13284 DAG.getConstant(0, SDLoc(N), MVT::i32));
13285 }
13286
13287 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13288 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13289 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13290 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13291 uint32_t LHSMask = getPermuteMask(LHS);
13292 uint32_t RHSMask = getPermuteMask(RHS);
13293 if (LHSMask != ~0u && RHSMask != ~0u) {
13294 // Canonicalize the expression in an attempt to have fewer unique masks
13295 // and therefore fewer registers used to hold the masks.
13296 if (LHSMask > RHSMask) {
13297 std::swap(LHSMask, RHSMask);
13298 std::swap(LHS, RHS);
13299 }
13300
13301 // Select 0xc for each lane used from source operand. Zero has 0xc mask
13302 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
13303 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13304 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13305
13306 // Check if we need to combine values from two sources within a byte.
13307 if (!(LHSUsedLanes & RHSUsedLanes) &&
13308 // If we select high and lower word keep it for SDWA.
13309 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
13310 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13311 // Each byte in each mask is either selector mask 0-3, or has higher
13312 // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
13313 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
13314 // the mask which is not 0xff wins. By ANDing both masks we get a correct
13315 // result, except that 0x0c must be corrected to give 0x0c only.
13316 uint32_t Mask = LHSMask & RHSMask;
13317 for (unsigned I = 0; I < 32; I += 8) {
13318 uint32_t ByteSel = 0xff << I;
13319 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13320 Mask &= (0x0c << I) & 0xffffffff;
13321 }
13322
13323 // Add 4 to each active LHS lane. It will not affect any existing 0xff
13324 // or 0x0c.
13325 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
13326 SDLoc DL(N);
13327
13328 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13329 RHS.getOperand(0),
13330 DAG.getConstant(Sel, DL, MVT::i32));
13331 }
13332 }
13333 }
13334
13335 return SDValue();
13336}
13337
13338// A key component of v_perm is a mapping between byte position of the src
13339// operands, and the byte position of the dest. To provide such, we need: 1. the
13340// node that provides x byte of the dest of the OR, and 2. the byte of the node
13341// used to provide that x byte. calculateByteProvider finds which node provides
13342// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
13343// and finds an ultimate src and byte position. For example, the supported
13344// LoadCombine pattern for vector loads is as follows
13345// t1
13346// or
13347// / \
13348// t2 t3
13349// zext shl
13350// | | \
13351// t4 t5 16
13352// or anyext
13353// / \ |
13354// t6 t7 t8
13355// srl shl or
13356// / | / \ / \
13357// t9 t10 t11 t12 t13 t14
13358// trunc* 8 trunc* 8 and and
13359// | | / | | \
13360// t15 t16 t17 t18 t19 t20
13361// trunc* 255 srl -256
13362// | / \
13363// t15 t15 16
13364//
13365// *In this example, the truncs are from i32->i16
13366//
13367// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
13368// respectively. calculateSrcByte would find (given node) -> ultimate src &
13369// byte position: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
13370// After finding the mapping, we can combine the tree into vperm t15, t16,
13371// 0x05000407
13372
13373// Find the source and byte position from a node.
13374// \p DestByte is the byte position of the dest of the or that the src
13375// ultimately provides. \p SrcIndex is the byte of the src that maps to this
13376// dest of the or byte. \p Depth tracks how many recursive iterations we have
13377// performed.
13378static const std::optional<ByteProvider<SDValue>>
13379calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
13380 unsigned Depth = 0) {
13381 // We may need to recursively traverse a series of SRLs
13382 if (Depth >= 6)
13383 return std::nullopt;
13384
13385 if (Op.getValueSizeInBits() < 8)
13386 return std::nullopt;
13387
13388 if (Op.getValueType().isVector())
13389 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13390
13391 switch (Op->getOpcode()) {
13392 case ISD::TRUNCATE: {
13393 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13394 }
13395
13396 case ISD::SIGN_EXTEND:
13397 case ISD::ZERO_EXTEND:
13398 case ISD::SIGN_EXTEND_INREG: {
13399 SDValue NarrowOp = Op->getOperand(0);
13400 auto NarrowVT = NarrowOp.getValueType();
13401 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
13402 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13403 NarrowVT = VTSign->getVT();
13404 }
13405 if (!NarrowVT.isByteSized())
13406 return std::nullopt;
13407 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
13408
13409 if (SrcIndex >= NarrowByteWidth)
13410 return std::nullopt;
13411 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13412 }
13413
13414 case ISD::SRA:
13415 case ISD::SRL: {
13416 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13417 if (!ShiftOp)
13418 return std::nullopt;
13419
13420 uint64_t BitShift = ShiftOp->getZExtValue();
13421
13422 if (BitShift % 8 != 0)
13423 return std::nullopt;
13424
13425 SrcIndex += BitShift / 8;
13426
13427 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13428 }
13429
13430 default: {
13431 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13432 }
13433 }
13434 llvm_unreachable("fully handled switch");
13435}
13436
13437// For a byte position in the result of an Or, traverse the tree and find the
13438// node (and the byte of the node) which ultimately provides this {Or,
13439// BytePosition}. \p Op is the operand we are currently examining. \p Index is
13440// the byte position of the Op that corresponds with the originally requested
13441// byte of the Or \p Depth tracks how many recursive iterations we have
13442// performed. \p StartingIndex is the originally requested byte of the Or
13443static const std::optional<ByteProvider<SDValue>>
13444calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
13445 unsigned StartingIndex = 0) {
13446 // Finding Src tree of RHS of or typically requires at least 1 additional
13447 // depth
13448 if (Depth > 6)
13449 return std::nullopt;
13450
13451 unsigned BitWidth = Op.getScalarValueSizeInBits();
13452 if (BitWidth % 8 != 0)
13453 return std::nullopt;
13454 if (Index > BitWidth / 8 - 1)
13455 return std::nullopt;
13456
13457 bool IsVec = Op.getValueType().isVector();
13458 switch (Op.getOpcode()) {
13459 case ISD::OR: {
13460 if (IsVec)
13461 return std::nullopt;
13462
13463 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
13464 StartingIndex);
13465 if (!RHS)
13466 return std::nullopt;
13467 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13468 StartingIndex);
13469 if (!LHS)
13470 return std::nullopt;
13471 // A well formed Or will have two ByteProviders for each byte, one of which
13472 // is constant zero
13473 if (!LHS->isConstantZero() && !RHS->isConstantZero())
13474 return std::nullopt;
13475 if (!LHS || LHS->isConstantZero())
13476 return RHS;
13477 if (!RHS || RHS->isConstantZero())
13478 return LHS;
13479 return std::nullopt;
13480 }
13481
13482 case ISD::AND: {
13483 if (IsVec)
13484 return std::nullopt;
13485
13486 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13487 if (!BitMaskOp)
13488 return std::nullopt;
13489
13490 uint32_t BitMask = BitMaskOp->getZExtValue();
13491 // Bits we expect for our StartingIndex
13492 uint32_t IndexMask = 0xFF << (Index * 8);
13493
13494 if ((IndexMask & BitMask) != IndexMask) {
13495 // If the result of the and partially provides the byte, then it
13496 // is not well formatted
13497 if (IndexMask & BitMask)
13498 return std::nullopt;
13499 return ByteProvider<SDValue>::getConstantZero();
13500 }
13501
13502 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
13503 }
13504
13505 case ISD::FSHR: {
13506 if (IsVec)
13507 return std::nullopt;
13508
13509 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
13510 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13511 if (!ShiftOp || Op.getValueType().isVector())
13512 return std::nullopt;
13513
13514 uint64_t BitsProvided = Op.getValueSizeInBits();
13515 if (BitsProvided % 8 != 0)
13516 return std::nullopt;
13517
13518 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13519 if (BitShift % 8)
13520 return std::nullopt;
13521
13522 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13523 uint64_t ByteShift = BitShift / 8;
13524
13525 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13526 uint64_t BytesProvided = BitsProvided / 8;
13527 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13528 NewIndex %= BytesProvided;
13529 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
13530 }
13531
13532 case ISD::SRA:
13533 case ISD::SRL: {
13534 if (IsVec)
13535 return std::nullopt;
13536
13537 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13538 if (!ShiftOp)
13539 return std::nullopt;
13540
13541 uint64_t BitShift = ShiftOp->getZExtValue();
13542 if (BitShift % 8)
13543 return std::nullopt;
13544
13545 auto BitsProvided = Op.getScalarValueSizeInBits();
13546 if (BitsProvided % 8 != 0)
13547 return std::nullopt;
13548
13549 uint64_t BytesProvided = BitsProvided / 8;
13550 uint64_t ByteShift = BitShift / 8;
13551 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
13552 // If the byte we are trying to provide (as tracked by index) falls in this
13553 // range, then the SRL provides the byte. The byte of interest of the src of
13554 // the SRL is Index + ByteShift
13555 return BytesProvided - ByteShift > Index
13556 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
13557 Index + ByteShift)
13558 : ByteProvider<SDValue>::getConstantZero();
13559 }
13560
13561 case ISD::SHL: {
13562 if (IsVec)
13563 return std::nullopt;
13564
13565 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13566 if (!ShiftOp)
13567 return std::nullopt;
13568
13569 uint64_t BitShift = ShiftOp->getZExtValue();
13570 if (BitShift % 8 != 0)
13571 return std::nullopt;
13572 uint64_t ByteShift = BitShift / 8;
13573
13574 // If we are shifting by an amount greater than (or equal to)
13575 // the index we are trying to provide, then it provides 0s. If not,
13576 // then these bytes are not definitively 0s, and the corresponding byte
13577 // of interest is Index - ByteShift of the src
13578 return Index < ByteShift
13579 ? ByteProvider<SDValue>::getConstantZero()
13580 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
13581 Depth + 1, StartingIndex);
13582 }
13583 case ISD::ANY_EXTEND:
13584 case ISD::SIGN_EXTEND:
13585 case ISD::ZERO_EXTEND:
13586 case ISD::SIGN_EXTEND_INREG:
13587 case ISD::AssertZext:
13588 case ISD::AssertSext: {
13589 if (IsVec)
13590 return std::nullopt;
13591
13592 SDValue NarrowOp = Op->getOperand(0);
13593 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
13594 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
13595 Op->getOpcode() == ISD::AssertZext ||
13596 Op->getOpcode() == ISD::AssertSext) {
13597 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13598 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13599 }
13600 if (NarrowBitWidth % 8 != 0)
13601 return std::nullopt;
13602 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13603
13604 if (Index >= NarrowByteWidth)
13605 return Op.getOpcode() == ISD::ZERO_EXTEND
13606 ? std::optional<ByteProvider<SDValue>>(
13607 ByteProvider<SDValue>::getConstantZero())
13608 : std::nullopt;
13609 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
13610 }
13611
13612 case ISD::TRUNCATE: {
13613 if (IsVec)
13614 return std::nullopt;
13615
13616 uint64_t NarrowByteWidth = BitWidth / 8;
13617
13618 if (NarrowByteWidth >= Index) {
13619 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13620 StartingIndex);
13621 }
13622
13623 return std::nullopt;
13624 }
13625
13626 case ISD::CopyFromReg: {
13627 if (BitWidth / 8 > Index)
13628 return calculateSrcByte(Op, StartingIndex, Index);
13629
13630 return std::nullopt;
13631 }
13632
13633 case ISD::LOAD: {
13634 auto *L = cast<LoadSDNode>(Op.getNode());
13635
13636 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13637 if (NarrowBitWidth % 8 != 0)
13638 return std::nullopt;
13639 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13640
13641 // If the width of the load does not reach the byte we are trying to provide
13642 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
13643 // question
13644 if (Index >= NarrowByteWidth) {
13645 return L->getExtensionType() == ISD::ZEXTLOAD
13646 ? std::optional<ByteProvider<SDValue>>(
13647 ByteProvider<SDValue>::getConstantZero())
13648 : std::nullopt;
13649 }
13650
13651 if (NarrowByteWidth > Index) {
13652 return calculateSrcByte(Op, StartingIndex, Index);
13653 }
13654
13655 return std::nullopt;
13656 }
13657
13658 case ISD::BSWAP: {
13659 if (IsVec)
13660 return std::nullopt;
13661
13662 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
13663 Depth + 1, StartingIndex);
13664 }
13665
13666 case ISD::EXTRACT_VECTOR_ELT: {
13667 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13668 if (!IdxOp)
13669 return std::nullopt;
13670 auto VecIdx = IdxOp->getZExtValue();
13671 auto ScalarSize = Op.getScalarValueSizeInBits();
13672 if (ScalarSize < 32)
13673 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13674 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
13675 StartingIndex, Index);
13676 }
13677
13678 case AMDGPUISD::PERM: {
13679 if (IsVec)
13680 return std::nullopt;
13681
13682 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13683 if (!PermMask)
13684 return std::nullopt;
13685
13686 auto IdxMask =
13687 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13688 if (IdxMask > 0x07 && IdxMask != 0x0c)
13689 return std::nullopt;
13690
13691 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13692 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13693
13694 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
13695 : ByteProvider<SDValue>(
13696 ByteProvider<SDValue>::getConstantZero());
13697 }
13698
13699 default: {
13700 return std::nullopt;
13701 }
13702 }
13703
13704 llvm_unreachable("fully handled switch");
13705}
13706
13707// Returns true if the Operand is a scalar and is 16 bits
13708static bool isExtendedFrom16Bits(SDValue &Operand) {
13709
13710 switch (Operand.getOpcode()) {
13711 case ISD::ANY_EXTEND:
13712 case ISD::SIGN_EXTEND:
13713 case ISD::ZERO_EXTEND: {
13714 auto OpVT = Operand.getOperand(0).getValueType();
13715 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
13716 }
13717 case ISD::LOAD: {
13718 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
13719 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
13720 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
13721 ExtType == ISD::EXTLOAD) {
13722 auto MemVT = L->getMemoryVT();
13723 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
13724 }
13725 return L->getMemoryVT().getSizeInBits() == 16;
13726 }
13727 default:
13728 return false;
13729 }
13730}
13731
13732// Returns true if the mask matches consecutive bytes, and the first byte
13733// begins at a power of 2 byte offset from 0th byte
13734static bool addresses16Bits(int Mask) {
13735 int Low8 = Mask & 0xff;
13736 int Hi8 = (Mask & 0xff00) >> 8;
13737
13738 assert(Low8 < 8 && Hi8 < 8);
13739 // Are the bytes contiguous in the order of increasing addresses.
13740 bool IsConsecutive = (Hi8 - Low8 == 1);
13741 // Is the first byte at location that is aligned for 16 bit instructions.
13742 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
13743 // In this case, we still need code to extract the 16 bit operand, so it
13744 // is better to use i8 v_perm
13745 bool Is16Aligned = !(Low8 % 2);
13746
13747 return IsConsecutive && Is16Aligned;
13748}
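// Examples (illustrative): addresses16Bits(0x0302) is true, since bytes 2
// and 3 are consecutive and start at an even byte, i.e. they form one
// 16-bit half. addresses16Bits(0x0201) is false: bytes 1 and 2 are
// consecutive but straddle a 16-bit boundary, so extracting them would
// still need shifts and v_perm remains the better choice.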
13749
13750// Do not lower into v_perm if the operands are actually 16 bit
13751// and the selected bits (based on PermMask) correspond with two
13752// easily addressable 16 bit operands.
13753static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
13754 SDValue &OtherOp) {
13755 int Low16 = PermMask & 0xffff;
13756 int Hi16 = (PermMask & 0xffff0000) >> 16;
13757
13758 auto TempOp = peekThroughBitcasts(Op);
13759 auto TempOtherOp = peekThroughBitcasts(OtherOp);
13760
13761 auto OpIs16Bit =
13762 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
13763 if (!OpIs16Bit)
13764 return true;
13765
13766 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13767 isExtendedFrom16Bits(TempOtherOp);
13768 if (!OtherOpIs16Bit)
13769 return true;
13770
13771 // Do we cleanly address both
13772 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
13773}
13774
13775static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
13776 unsigned DWordOffset) {
13777 SDValue Ret;
13778
13779 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
13780 // ByteProvider must be at least 8 bits
13781 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
13782
13783 if (TypeSize <= 32)
13784 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
13785
13786 if (Src.getValueType().isVector()) {
13787 auto ScalarTySize = Src.getScalarValueSizeInBits();
13788 auto ScalarTy = Src.getValueType().getScalarType();
13789 if (ScalarTySize == 32) {
13790 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
13791 DAG.getConstant(DWordOffset, SL, MVT::i32));
13792 }
13793 if (ScalarTySize > 32) {
13794 Ret = DAG.getNode(
13795 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
13796 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
13797 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
13798 if (ShiftVal)
13799 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
13800 DAG.getConstant(ShiftVal, SL, MVT::i32));
13801 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13802 }
13803
13804 assert(ScalarTySize < 32);
13805 auto NumElements = TypeSize / ScalarTySize;
13806 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
13807 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
13808 auto NumElementsIn32 = 32 / ScalarTySize;
13809 auto NumAvailElements = DWordOffset < Trunc32Elements
13810 ? NumElementsIn32
13811 : NumElements - NormalizedTrunc;
13812
13813 SmallVector<SDValue, 4> VecSrcs;
13814 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
13815 NumAvailElements);
13816
13817 Ret = DAG.getBuildVector(
13818 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
13819 VecSrcs);
13820 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13821 }
13822
13823 /// Scalar Type
13824 auto ShiftVal = 32 * DWordOffset;
13825 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
13826 DAG.getConstant(ShiftVal, SL, MVT::i32));
13827 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13828}
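// Sketch of the results above (illustrative): for an i64 source with
// DWordOffset == 1 this returns (trunc (srl Src, 32)); for a v4i16 source
// with DWordOffset == 1 it extracts elements 2 and 3, rebuilds a v2i16 and
// bitcasts it to i32; for a v2i32 source it is simply
// (extract_vector_elt Src, DWordOffset).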
13829
13830static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
13831 SelectionDAG &DAG = DCI.DAG;
13832 [[maybe_unused]] EVT VT = N->getValueType(0);
13833 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
13834
13835 // VT is known to be MVT::i32, so we need to provide 4 bytes.
13836 assert(VT == MVT::i32);
13837 for (int i = 0; i < 4; i++) {
13838 // Find the ByteProvider that provides the ith byte of the result of OR
13839 std::optional<ByteProvider<SDValue>> P =
13840 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
13841 // TODO support constantZero
13842 if (!P || P->isConstantZero())
13843 return SDValue();
13844
13845 PermNodes.push_back(*P);
13846 }
13847 if (PermNodes.size() != 4)
13848 return SDValue();
13849
13850 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
13851 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
13852 uint64_t PermMask = 0x00000000;
13853 for (size_t i = 0; i < PermNodes.size(); i++) {
13854 auto PermOp = PermNodes[i];
13855 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
13856 // by sizeof(Src2) = 4
13857 int SrcByteAdjust = 4;
13858
13859 // If the Src uses a byte from a different DWORD, then it corresponds
13860 // with a different source
13861 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
13862 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
13863 if (SecondSrc)
13864 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
13865 ((PermOp.SrcOffset / 4) != SecondSrc->second))
13866 return SDValue();
13867
13868 // Set the index of the second distinct Src node
13869 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
13870 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
13871 SrcByteAdjust = 0;
13872 }
13873 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
13874 assert(!DAG.getDataLayout().isBigEndian());
13875 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
13876 }
13877 SDLoc DL(N);
13878 SDValue Op = *PermNodes[FirstSrc.first].Src;
13879 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
13880 assert(Op.getValueSizeInBits() == 32);
13881
13882 // Check that we are not just extracting the bytes in order from an op
13883 if (!SecondSrc) {
13884 int Low16 = PermMask & 0xffff;
13885 int Hi16 = (PermMask & 0xffff0000) >> 16;
13886
13887 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
13888 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
13889
13890 // The perm op would really just produce Op. So combine into Op
13891 if (WellFormedLow && WellFormedHi)
13892 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
13893 }
13894
13895 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
13896
13897 if (SecondSrc) {
13898 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
13899 assert(OtherOp.getValueSizeInBits() == 32);
13900 }
13901
13902 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
13903
13904 assert(Op.getValueType().isByteSized() &&
13905 OtherOp.getValueType().isByteSized());
13906
13907 // If the ultimate src is less than 32 bits, then we will only be
13908 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
13909 // CalculateByteProvider would not have returned Op as source if we
13910 // used a byte that is outside its ValueType. Thus, we are free to
13911 // ANY_EXTEND as the extended bits are dont-cares.
13912 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
13913 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
13914
13915 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
13916 DAG.getConstant(PermMask, DL, MVT::i32));
13917 }
13918 return SDValue();
13919}
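// Worked example of the mask construction above (byte choices are
// illustrative): if result byte 0 comes from byte 2 of the first source
// (Op), byte 1 from byte 0 of the second source (OtherOp), byte 2 from byte
// 3 of Op and byte 3 from byte 1 of OtherOp, the per-byte selectors are
// 2+4, 0, 3+4 and 1, so PermMask = 0x01070006. Op occupies selector values
// 4-7 and OtherOp values 0-3, matching the Src1:Src2 offset applied in the
// loop.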
13920
13921SDValue SITargetLowering::performOrCombine(SDNode *N,
13922 DAGCombinerInfo &DCI) const {
13923 SelectionDAG &DAG = DCI.DAG;
13924 SDValue LHS = N->getOperand(0);
13925 SDValue RHS = N->getOperand(1);
13926
13927 EVT VT = N->getValueType(0);
13928 if (VT == MVT::i1) {
13929 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
13930 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13931 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
13932 SDValue Src = LHS.getOperand(0);
13933 if (Src != RHS.getOperand(0))
13934 return SDValue();
13935
13936 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
13937 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13938 if (!CLHS || !CRHS)
13939 return SDValue();
13940
13941 // Only 10 bits are used.
13942 static const uint32_t MaxMask = 0x3ff;
13943
13944 uint32_t NewMask =
13945 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
13946 SDLoc DL(N);
13947 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
13948 DAG.getConstant(NewMask, DL, MVT::i32));
13949 }
13950
13951 return SDValue();
13952 }
13953
13954 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13955 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
13956 LHS.getOpcode() == AMDGPUISD::PERM &&
13957 isa<ConstantSDNode>(LHS.getOperand(2))) {
13958 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
13959 if (!Sel)
13960 return SDValue();
13961
13962 Sel |= LHS.getConstantOperandVal(2);
13963 SDLoc DL(N);
13964 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13965 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13966 }
13967
13968 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13969 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13970 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13971 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13972
13973 // If all the uses of an or need to extract the individual elements, do not
13974 // attempt to lower into v_perm
13975 auto usesCombinedOperand = [](SDNode *OrUse) {
13976 // If we have any non-vectorized use, then it is a candidate for v_perm
13977 if (OrUse->getOpcode() != ISD::BITCAST ||
13978 !OrUse->getValueType(0).isVector())
13979 return true;
13980
13981 // If we have any non-vectorized use, then it is a candidate for v_perm
13982 for (auto *VUser : OrUse->users()) {
13983 if (!VUser->getValueType(0).isVector())
13984 return true;
13985
13986 // If the use of a vector is a store, then combining via a v_perm
13987 // is beneficial.
13988 // TODO -- whitelist more uses
13989 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
13990 if (VUser->getOpcode() == VectorwiseOp)
13991 return true;
13992 }
13993 return false;
13994 };
13995
13996 if (!any_of(N->users(), usesCombinedOperand))
13997 return SDValue();
13998
13999 uint32_t LHSMask = getPermuteMask(LHS);
14000 uint32_t RHSMask = getPermuteMask(RHS);
14001
14002 if (LHSMask != ~0u && RHSMask != ~0u) {
14003 // Canonicalize the expression in an attempt to have fewer unique masks
14004 // and therefore fewer registers used to hold the masks.
14005 if (LHSMask > RHSMask) {
14006 std::swap(LHSMask, RHSMask);
14007 std::swap(LHS, RHS);
14008 }
14009
14010 // Select 0xc for each lane used from source operand. Zero has 0xc mask
14011 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
14012 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14013 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14014
14015 // Check if we need to combine values from two sources within a byte.
14016 if (!(LHSUsedLanes & RHSUsedLanes) &&
14017 // If we select high and lower word keep it for SDWA.
14018 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
14019 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14020 // Kill zero bytes selected by other mask. Zero value is 0xc.
14021 LHSMask &= ~RHSUsedLanes;
14022 RHSMask &= ~LHSUsedLanes;
14023 // Add 4 to each active LHS lane
14024 LHSMask |= LHSUsedLanes & 0x04040404;
14025 // Combine masks
14026 uint32_t Sel = LHSMask | RHSMask;
14027 SDLoc DL(N);
14028
14029 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14030 RHS.getOperand(0),
14031 DAG.getConstant(Sel, DL, MVT::i32));
14032 }
14033 }
14034 if (LHSMask == ~0u || RHSMask == ~0u) {
14035 if (SDValue Perm = matchPERM(N, DCI))
14036 return Perm;
14037 }
14038 }
14039
14040 // Detect identity v2i32 OR and replace with identity source node.
14041 // Specifically an Or that has operands constructed from the same source node
14042 // via extract_vector_elt and build_vector. I.E.
14043 // v2i32 or(
14044 // v2i32 build_vector(
14045 // i32 extract_elt(%IdentitySrc, 0),
14046 // i32 0
14047 // ),
14048 // v2i32 build_vector(
14049 // i32 0,
14050 // i32 extract_elt(%IdentitySrc, 1)
14051 // ) )
14052 // =>
14053 // v2i32 %IdentitySrc
14054
14055 if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
14056 RHS->getOpcode() == ISD::BUILD_VECTOR) {
14057
14058 ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
14059 ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
14060
14061 // Test for and normalise build vectors.
14062 if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
14063
14064 // Get the extract_vector_element operands.
14065 SDValue LEVE = LHS->getOperand(0);
14066 SDValue REVE = RHS->getOperand(1);
14067
14068 if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
14069 REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
14070 // Check that different elements from the same vector are
14071 // extracted.
14072 if (LEVE->getOperand(0) == REVE->getOperand(0) &&
14073 LEVE->getOperand(1) != REVE->getOperand(1)) {
14074 SDValue IdentitySrc = LEVE.getOperand(0);
14075 return IdentitySrc;
14076 }
14077 }
14078 }
14079 }
14080
14081 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14082 return SDValue();
14083
14084 // TODO: This could be a generic combine with a predicate for extracting the
14085 // high half of an integer being free.
14086
14087 // (or i64:x, (zero_extend i32:y)) ->
14088 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
14089 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
14090 RHS.getOpcode() != ISD::ZERO_EXTEND)
14091 std::swap(LHS, RHS);
14092
14093 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
14094 SDValue ExtSrc = RHS.getOperand(0);
14095 EVT SrcVT = ExtSrc.getValueType();
14096 if (SrcVT == MVT::i32) {
14097 SDLoc SL(N);
14098 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
14099 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
14100
14101 DCI.AddToWorklist(LowOr.getNode());
14102 DCI.AddToWorklist(HiBits.getNode());
14103
14104 SDValue Vec =
14105 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
14106 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
14107 }
14108 }
14109
14110 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
14111 if (CRHS) {
14112 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
14113 N->getOperand(0), CRHS))
14114 return Split;
14115 }
14116
14117 return SDValue();
14118}
14119
14120SDValue SITargetLowering::performXorCombine(SDNode *N,
14121 DAGCombinerInfo &DCI) const {
14122 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
14123 return RV;
14124
14125 SDValue LHS = N->getOperand(0);
14126 SDValue RHS = N->getOperand(1);
14127
14128 const ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
14129 SelectionDAG &DAG = DCI.DAG;
14130
14131 EVT VT = N->getValueType(0);
14132 if (CRHS && VT == MVT::i64) {
14133 if (SDValue Split =
14134 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
14135 return Split;
14136 }
14137
14138 // v2i32 (xor (vselect cc, x, y), K) ->
14139 // (v2i32 vselect cc, (xor x, K), (xor y, K)). This enables the xor to be
14140 // replaced with source modifiers when the select is lowered to CNDMASK.
14141 unsigned Opc = LHS.getOpcode();
14142 if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
14143 (Opc == ISD::SELECT && VT == MVT::i64)) &&
14144 CRHS && CRHS->getAPIntValue().isSignMask()) {
14145 SDValue CC = LHS->getOperand(0);
14146 SDValue TRUE = LHS->getOperand(1);
14147 SDValue FALSE = LHS->getOperand(2);
14148 SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS);
14149 SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS);
14150 SDValue XSelect =
14151 DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
14152 return XSelect;
14153 }
14154
14155 // Make sure to apply the 64-bit constant splitting fold before trying to fold
14156 // fneg-like xors into 64-bit select.
14157 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
14158 // This looks like an fneg, try to fold as a source modifier.
14159 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
14160 shouldFoldFNegIntoSrc(N, LHS)) {
14161 // xor (select c, a, b), 0x80000000 ->
14162 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
14163 SDLoc DL(N);
14164 SDValue CastLHS =
14165 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
14166 SDValue CastRHS =
14167 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
14168 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
14169 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
14170 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
14171 LHS->getOperand(0), FNegLHS, FNegRHS);
14172 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
14173 }
14174 }
14175
14176 return SDValue();
14177}
14178
14179SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
14180 DAGCombinerInfo &DCI) const {
14181 if (!Subtarget->has16BitInsts() ||
14182 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14183 return SDValue();
14184
14185 EVT VT = N->getValueType(0);
14186 if (VT != MVT::i32)
14187 return SDValue();
14188
14189 SDValue Src = N->getOperand(0);
14190 if (Src.getValueType() != MVT::i16)
14191 return SDValue();
14192
14193 return SDValue();
14194}
14195
14196SDValue
14197SITargetLowering::performSignExtendInRegCombine(SDNode *N,
14198 DAGCombinerInfo &DCI) const {
14199 SDValue Src = N->getOperand(0);
14200 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
14201
14202 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
14203 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
14204 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
14205 VTSign->getVT() == MVT::i8) ||
14206 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
14207 VTSign->getVT() == MVT::i16))) {
14208 assert(Subtarget->hasScalarSubwordLoads() &&
14209 "s_buffer_load_{u8, i8} are supported "
14210 "in GFX12 (or newer) architectures.");
14211 EVT VT = Src.getValueType();
14212 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
14213 ? AMDGPUISD::SBUFFER_LOAD_BYTE
14214 : AMDGPUISD::SBUFFER_LOAD_SHORT;
14215 SDLoc DL(N);
14216 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
14217 SDValue Ops[] = {
14218 Src.getOperand(0), // source register
14219 Src.getOperand(1), // offset
14220 Src.getOperand(2) // cachePolicy
14221 };
14222 auto *M = cast<MemSDNode>(Src);
14223 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
14224 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14225 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
14226 return LoadVal;
14227 }
14228 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
14229 VTSign->getVT() == MVT::i8) ||
14230 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
14231 VTSign->getVT() == MVT::i16)) &&
14232 Src.hasOneUse()) {
14233 auto *M = cast<MemSDNode>(Src);
14234 SDValue Ops[] = {Src.getOperand(0), // Chain
14235 Src.getOperand(1), // rsrc
14236 Src.getOperand(2), // vindex
14237 Src.getOperand(3), // voffset
14238 Src.getOperand(4), // soffset
14239 Src.getOperand(5), // offset
14240 Src.getOperand(6), Src.getOperand(7)};
14241 // replace with BUFFER_LOAD_BYTE/SHORT
14242 SDVTList ResList =
14243 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
14244 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
14245 ? AMDGPUISD::BUFFER_LOAD_BYTE
14246 : AMDGPUISD::BUFFER_LOAD_SHORT;
14247 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
14248 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14249 return DCI.DAG.getMergeValues(
14250 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
14251 }
14252 return SDValue();
14253}
14254
14255SDValue SITargetLowering::performClassCombine(SDNode *N,
14256 DAGCombinerInfo &DCI) const {
14257 SelectionDAG &DAG = DCI.DAG;
14258 SDValue Mask = N->getOperand(1);
14259
14260 // fp_class x, 0 -> false
14261 if (isNullConstant(Mask))
14262 return DAG.getConstant(0, SDLoc(N), MVT::i1);
14263
14264 if (N->getOperand(0).isUndef())
14265 return DAG.getUNDEF(MVT::i1);
14266
14267 return SDValue();
14268}
14269
14270SDValue SITargetLowering::performRcpCombine(SDNode *N,
14271 DAGCombinerInfo &DCI) const {
14272 EVT VT = N->getValueType(0);
14273 SDValue N0 = N->getOperand(0);
14274
14275 if (N0.isUndef()) {
14276 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
14277 SDLoc(N), VT);
14278 }
14279
14280 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
14281 N0.getOpcode() == ISD::SINT_TO_FP)) {
14282 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
14283 N->getFlags());
14284 }
14285
14286 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
14287 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
14288 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
14289 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
14290 N->getFlags());
14291 }
14292
14293 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
14294 }
14295
14296bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
14297 unsigned MaxDepth) const {
14298 unsigned Opcode = Op.getOpcode();
14299 if (Opcode == ISD::FCANONICALIZE)
14300 return true;
14301
14302 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14303 const auto &F = CFP->getValueAPF();
14304 if (F.isNaN() && F.isSignaling())
14305 return false;
14306 if (!F.isDenormal())
14307 return true;
14308
14309 DenormalMode Mode =
14310 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
14311 return Mode == DenormalMode::getIEEE();
14312 }
14313
14314 // If source is a result of another standard FP operation it is already in
14315 // canonical form.
14316 if (MaxDepth == 0)
14317 return false;
14318
14319 switch (Opcode) {
14320 // These will flush denorms if required.
14321 case ISD::FADD:
14322 case ISD::FSUB:
14323 case ISD::FMUL:
14324 case ISD::FCEIL:
14325 case ISD::FFLOOR:
14326 case ISD::FMA:
14327 case ISD::FMAD:
14328 case ISD::FSQRT:
14329 case ISD::FDIV:
14330 case ISD::FREM:
14331 case ISD::FP_ROUND:
14332 case ISD::FP_EXTEND:
14333 case ISD::FP16_TO_FP:
14334 case ISD::FP_TO_FP16:
14335 case ISD::BF16_TO_FP:
14336 case ISD::FP_TO_BF16:
14337 case ISD::FLDEXP:
14340 case AMDGPUISD::RCP:
14341 case AMDGPUISD::RSQ:
14345 case AMDGPUISD::LOG:
14346 case AMDGPUISD::EXP:
14350 case AMDGPUISD::FRACT:
14357 case AMDGPUISD::SIN_HW:
14358 case AMDGPUISD::COS_HW:
14359 return true;
14360
14361 // It can/will be lowered or combined as a bit operation.
14362 // Need to check their input recursively to handle.
14363 case ISD::FNEG:
14364 case ISD::FABS:
14365 case ISD::FCOPYSIGN:
14366 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14367
14368 case ISD::AND:
14369 if (Op.getValueType() == MVT::i32) {
14370 // Be careful as we only know it is a bitcast floating point type. It
14371 // could be f32, v2f16, we have no way of knowing. Luckily the constant
14372 // value that we optimize for, which comes up in fp32 to bf16 conversions,
14373 // is valid to optimize for all types.
14374 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
14375 if (RHS->getZExtValue() == 0xffff0000) {
14376 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14377 }
14378 }
14379 }
14380 break;
14381
14382 case ISD::FSIN:
14383 case ISD::FCOS:
14384 case ISD::FSINCOS:
14385 return Op.getValueType().getScalarType() != MVT::f16;
14386
14387 case ISD::FMINNUM:
14388 case ISD::FMAXNUM:
14389 case ISD::FMINNUM_IEEE:
14390 case ISD::FMAXNUM_IEEE:
14391 case ISD::FMINIMUM:
14392 case ISD::FMAXIMUM:
14393 case ISD::FMINIMUMNUM:
14394 case ISD::FMAXIMUMNUM:
14395 case AMDGPUISD::CLAMP:
14396 case AMDGPUISD::FMED3:
14397 case AMDGPUISD::FMAX3:
14398 case AMDGPUISD::FMIN3:
14399 case AMDGPUISD::FMAXIMUM3:
14400 case AMDGPUISD::FMINIMUM3: {
14401 // FIXME: Shouldn't treat the generic operations differently based on these.
14402 // However, we aren't really required to flush the result from
14403 // minnum/maxnum..
14404
14405 // snans will be quieted, so we only need to worry about denormals.
14406 if (Subtarget->supportsMinMaxDenormModes() ||
14407 // FIXME: denormalsEnabledForType is broken for dynamic
14408 denormalsEnabledForType(DAG, Op.getValueType()))
14409 return true;
14410
14411 // Flushing may be required.
14412 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
14413 // targets need to check their input recursively.
14414
14415 // FIXME: Does this apply with clamp? It's implemented with max.
14416 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
14417 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
14418 return false;
14419 }
14420
14421 return true;
14422 }
14423 case ISD::SELECT: {
14424 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
14425 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
14426 }
14427 case ISD::BUILD_VECTOR: {
14428 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
14429 SDValue SrcOp = Op.getOperand(i);
14430 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
14431 return false;
14432 }
14433
14434 return true;
14435 }
14438 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14439 }
14441 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
14442 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
14443 }
14444 case ISD::UNDEF:
14445 // Could be anything.
14446 return false;
14447
14448 case ISD::BITCAST:
14449 // TODO: This is incorrect as it loses track of the operand's type. We may
14450 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
14451 // same bits that are canonicalized in one type need not be in the other.
14452 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14453 case ISD::TRUNCATE: {
14454 // Hack around the mess we make when legalizing extract_vector_elt.
14455 if (Op.getValueType() == MVT::i16) {
14456 SDValue TruncSrc = Op.getOperand(0);
14457 if (TruncSrc.getValueType() == MVT::i32 &&
14458 TruncSrc.getOpcode() == ISD::BITCAST &&
14459 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
14460 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
14461 }
14462 }
14463 return false;
14464 }
14465 case ISD::INTRINSIC_WO_CHAIN: {
14466 unsigned IntrinsicID = Op.getConstantOperandVal(0);
14467 // TODO: Handle more intrinsics
14468 switch (IntrinsicID) {
14469 case Intrinsic::amdgcn_cvt_pkrtz:
14470 case Intrinsic::amdgcn_cubeid:
14471 case Intrinsic::amdgcn_frexp_mant:
14472 case Intrinsic::amdgcn_fdot2:
14473 case Intrinsic::amdgcn_rcp:
14474 case Intrinsic::amdgcn_rsq:
14475 case Intrinsic::amdgcn_rsq_clamp:
14476 case Intrinsic::amdgcn_rcp_legacy:
14477 case Intrinsic::amdgcn_rsq_legacy:
14478 case Intrinsic::amdgcn_trig_preop:
14479 case Intrinsic::amdgcn_tanh:
14480 case Intrinsic::amdgcn_log:
14481 case Intrinsic::amdgcn_exp2:
14482 case Intrinsic::amdgcn_sqrt:
14483 return true;
14484 default:
14485 break;
14486 }
14487
14488 break;
14489 }
14490 default:
14491 break;
14492 }
14493
14494 // FIXME: denormalsEnabledForType is broken for dynamic
14495 return denormalsEnabledForType(DAG, Op.getValueType()) &&
14496 DAG.isKnownNeverSNaN(Op);
14497}
14498
14499 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
14500                                        unsigned MaxDepth) const {
14501 const MachineRegisterInfo &MRI = MF.getRegInfo();
14502 MachineInstr *MI = MRI.getVRegDef(Reg);
14503 unsigned Opcode = MI->getOpcode();
14504
14505 if (Opcode == AMDGPU::G_FCANONICALIZE)
14506 return true;
14507
14508 std::optional<FPValueAndVReg> FCR;
14509 // Constant splat (can be padded with undef) or scalar constant.
14510 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
14511 if (FCR->Value.isSignaling())
14512 return false;
14513 if (!FCR->Value.isDenormal())
14514 return true;
14515
14516 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
14517 return Mode == DenormalMode::getIEEE();
14518 }
14519
14520 if (MaxDepth == 0)
14521 return false;
14522
14523 switch (Opcode) {
14524 case AMDGPU::G_FADD:
14525 case AMDGPU::G_FSUB:
14526 case AMDGPU::G_FMUL:
14527 case AMDGPU::G_FCEIL:
14528 case AMDGPU::G_FFLOOR:
14529 case AMDGPU::G_FRINT:
14530 case AMDGPU::G_FNEARBYINT:
14531 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14532 case AMDGPU::G_INTRINSIC_TRUNC:
14533 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14534 case AMDGPU::G_FMA:
14535 case AMDGPU::G_FMAD:
14536 case AMDGPU::G_FSQRT:
14537 case AMDGPU::G_FDIV:
14538 case AMDGPU::G_FREM:
14539 case AMDGPU::G_FPOW:
14540 case AMDGPU::G_FPEXT:
14541 case AMDGPU::G_FLOG:
14542 case AMDGPU::G_FLOG2:
14543 case AMDGPU::G_FLOG10:
14544 case AMDGPU::G_FPTRUNC:
14545 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14546 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14547 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14548 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14549 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14550 return true;
14551 case AMDGPU::G_FNEG:
14552 case AMDGPU::G_FABS:
14553 case AMDGPU::G_FCOPYSIGN:
14554 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
14555 case AMDGPU::G_FMINNUM:
14556 case AMDGPU::G_FMAXNUM:
14557 case AMDGPU::G_FMINNUM_IEEE:
14558 case AMDGPU::G_FMAXNUM_IEEE:
14559 case AMDGPU::G_FMINIMUM:
14560 case AMDGPU::G_FMAXIMUM:
14561 case AMDGPU::G_FMINIMUMNUM:
14562 case AMDGPU::G_FMAXIMUMNUM: {
14563 if (Subtarget->supportsMinMaxDenormModes() ||
14564 // FIXME: denormalsEnabledForType is broken for dynamic
14565 denormalsEnabledForType(MRI.getType(Reg), MF))
14566 return true;
14567
14568 [[fallthrough]];
14569 }
14570 case AMDGPU::G_BUILD_VECTOR:
14571 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
14572 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
14573 return false;
14574 return true;
14575 case AMDGPU::G_INTRINSIC:
14576 case AMDGPU::G_INTRINSIC_CONVERGENT:
14577 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
14578 case Intrinsic::amdgcn_fmul_legacy:
14579 case Intrinsic::amdgcn_fmad_ftz:
14580 case Intrinsic::amdgcn_sqrt:
14581 case Intrinsic::amdgcn_fmed3:
14582 case Intrinsic::amdgcn_sin:
14583 case Intrinsic::amdgcn_cos:
14584 case Intrinsic::amdgcn_log:
14585 case Intrinsic::amdgcn_exp2:
14586 case Intrinsic::amdgcn_log_clamp:
14587 case Intrinsic::amdgcn_rcp:
14588 case Intrinsic::amdgcn_rcp_legacy:
14589 case Intrinsic::amdgcn_rsq:
14590 case Intrinsic::amdgcn_rsq_clamp:
14591 case Intrinsic::amdgcn_rsq_legacy:
14592 case Intrinsic::amdgcn_div_scale:
14593 case Intrinsic::amdgcn_div_fmas:
14594 case Intrinsic::amdgcn_div_fixup:
14595 case Intrinsic::amdgcn_fract:
14596 case Intrinsic::amdgcn_cvt_pkrtz:
14597 case Intrinsic::amdgcn_cubeid:
14598 case Intrinsic::amdgcn_cubema:
14599 case Intrinsic::amdgcn_cubesc:
14600 case Intrinsic::amdgcn_cubetc:
14601 case Intrinsic::amdgcn_frexp_mant:
14602 case Intrinsic::amdgcn_fdot2:
14603 case Intrinsic::amdgcn_trig_preop:
14604 case Intrinsic::amdgcn_tanh:
14605 return true;
14606 default:
14607 break;
14608 }
14609
14610 [[fallthrough]];
14611 default:
14612 return false;
14613 }
14614
14615 llvm_unreachable("invalid operation");
14616}
14617
14618// Constant fold canonicalize.
14619SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
14620 const SDLoc &SL, EVT VT,
14621 const APFloat &C) const {
14622 // Flush denormals to 0 if not enabled.
14623 if (C.isDenormal()) {
14624 DenormalMode Mode =
14625 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
14626 if (Mode == DenormalMode::getPreserveSign()) {
14627 return DAG.getConstantFP(
14628 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
14629 }
14630
14631 if (Mode != DenormalMode::getIEEE())
14632 return SDValue();
14633 }
14634
14635 if (C.isNaN()) {
14636 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
14637 if (C.isSignaling()) {
14638 // Quiet a signaling NaN.
14639 // FIXME: Is this supposed to preserve payload bits?
14640 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14641 }
14642
14643 // Make sure it is the canonical NaN bitpattern.
14644 //
14645 // TODO: Can we use -1 as the canonical NaN value since it's an inline
14646 // immediate?
14647 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
14648 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14649 }
14650
14651 // Already canonical.
14652 return DAG.getConstantFP(C, SL, VT);
14653}
14654
14656 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
14657}
14658
14659SDValue
14660SITargetLowering::performFCanonicalizeCombine(SDNode *N,
14661 DAGCombinerInfo &DCI) const {
14662 SelectionDAG &DAG = DCI.DAG;
14663 SDValue N0 = N->getOperand(0);
14664 EVT VT = N->getValueType(0);
14665
14666 // fcanonicalize undef -> qnan
14667 if (N0.isUndef()) {
14669 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
14670 }
14671
14672 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
14673 EVT VT = N->getValueType(0);
14674 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
14675 }
14676
14677 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
14678 // (fcanonicalize k)
14679 //
14680 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
14681
14682 // TODO: This could be better with wider vectors that will be split to v2f16,
14683 // and to consider uses since there aren't that many packed operations.
14684 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
14685 isTypeLegal(MVT::v2f16)) {
14686 SDLoc SL(N);
14687 SDValue NewElts[2];
14688 SDValue Lo = N0.getOperand(0);
14689 SDValue Hi = N0.getOperand(1);
14690 EVT EltVT = Lo.getValueType();
14691
14693 for (unsigned I = 0; I != 2; ++I) {
14694 SDValue Op = N0.getOperand(I);
14695 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14696 NewElts[I] =
14697 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14698 } else if (Op.isUndef()) {
14699 // Handled below based on what the other operand is.
14700 NewElts[I] = Op;
14701 } else {
14702 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
14703 }
14704 }
14705
14706 // If one half is undef, and one is constant, prefer a splat vector rather
14707 // than the normal qNaN. If it's a register, prefer 0.0 since that's
14708 // cheaper to use and may be free with a packed operation.
14709 if (NewElts[0].isUndef()) {
14710 if (isa<ConstantFPSDNode>(NewElts[1]))
14711 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
14712 ? NewElts[1]
14713 : DAG.getConstantFP(0.0f, SL, EltVT);
14714 }
14715
14716 if (NewElts[1].isUndef()) {
14717 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
14718 ? NewElts[0]
14719 : DAG.getConstantFP(0.0f, SL, EltVT);
14720 }
14721
14722 return DAG.getBuildVector(VT, SL, NewElts);
14723 }
14724 }
14725
14726 return SDValue();
14727}
14728
14729static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
14730 switch (Opc) {
14731 case ISD::FMAXNUM:
14732 case ISD::FMAXNUM_IEEE:
14733 case ISD::FMAXIMUMNUM:
14734 return AMDGPUISD::FMAX3;
14735 case ISD::FMAXIMUM:
14736 return AMDGPUISD::FMAXIMUM3;
14737 case ISD::SMAX:
14738 return AMDGPUISD::SMAX3;
14739 case ISD::UMAX:
14740 return AMDGPUISD::UMAX3;
14741 case ISD::FMINNUM:
14742 case ISD::FMINNUM_IEEE:
14743 case ISD::FMINIMUMNUM:
14744 return AMDGPUISD::FMIN3;
14745 case ISD::FMINIMUM:
14746 return AMDGPUISD::FMINIMUM3;
14747 case ISD::SMIN:
14748 return AMDGPUISD::SMIN3;
14749 case ISD::UMIN:
14750 return AMDGPUISD::UMIN3;
14751 default:
14752 llvm_unreachable("Not a min/max opcode");
14753 }
14754}
14755
14756SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
14757 const SDLoc &SL, SDValue Src,
14758 SDValue MinVal,
14759 SDValue MaxVal,
14760 bool Signed) const {
14761
14762 // med3 comes from
14763 // min(max(x, K0), K1), K0 < K1
14764 // max(min(x, K0), K1), K1 < K0
14765 //
14766 // "MinVal" and "MaxVal" respectively refer to the rhs of the
14767 // min/max op.
14768 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
14769 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
14770
14771 if (!MinK || !MaxK)
14772 return SDValue();
14773
14774 if (Signed) {
14775 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
14776 return SDValue();
14777 } else {
14778 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
14779 return SDValue();
14780 }
14781
14782 EVT VT = MinK->getValueType(0);
14783 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
14784 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
14785 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
14786
14787 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
14788 // not available, but this is unlikely to be profitable as constants
14789 // will often need to be materialized & extended, especially on
14790 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
14791 return SDValue();
14792}
14793
14794 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
14795 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
14796 return C;
14797
14798 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
14799 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
14800 return C;
14801 }
14802
14803 return nullptr;
14804}
14805
14806SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
14807 const SDLoc &SL, SDValue Op0,
14808 SDValue Op1) const {
14809 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
14810 if (!K1)
14811 return SDValue();
14812
14813 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
14814 if (!K0)
14815 return SDValue();
14816
14817 // Ordered >= (although NaN inputs should have folded away by now).
14818 if (K0->getValueAPF() > K1->getValueAPF())
14819 return SDValue();
14820
14821 // med3 with a nan input acts like
14822 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
14823 //
14824 // So the result depends on whether the IEEE mode bit is enabled or not with a
14825 // signaling nan input.
14826 // ieee=1
14827 // s0 snan: yields s2
14828 // s1 snan: yields s2
14829 // s2 snan: qnan
14830
14831 // s0 qnan: min(s1, s2)
14832 // s1 qnan: min(s0, s2)
14833 // s2 qnan: min(s0, s1)
14834
14835 // ieee=0
14836 // s0 snan: min(s1, s2)
14837 // s1 snan: min(s0, s2)
14838 // s2 snan: qnan
14839
14840 // s0 qnan: min(s1, s2)
14841 // s1 qnan: min(s0, s2)
14842 // s2 qnan: min(s0, s1)
14843 const MachineFunction &MF = DAG.getMachineFunction();
14844 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
14845
14846 // TODO: Check IEEE bit enabled. We can form fmed3 with IEEE=0 regardless of
14847 // whether the input is a signaling nan if op0 is fmaximum or fmaximumnum. We
14848 // can only form if op0 is fmaxnum_ieee if IEEE=1.
14849 EVT VT = Op0.getValueType();
14850 if (Info->getMode().DX10Clamp) {
14851 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
14852 // hardware fmed3 behavior converting to a min.
14853 // FIXME: Should this be allowing -0.0?
14854 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
14855 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
14856 }
14857
14858 // med3 for f16 is only available on gfx9+, and not available for v2f16.
14859 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
14860 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
14861 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
14862 // then give the other result, which is different from med3 with a NaN
14863 // input.
14864 SDValue Var = Op0.getOperand(0);
14865 if (!DAG.isKnownNeverSNaN(Var))
14866 return SDValue();
14867
14868 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14869
14870 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
14871 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
14872 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
14873 SDValue(K0, 0), SDValue(K1, 0));
14874 }
14875 }
14876
14877 return SDValue();
14878}
14879
14880/// \return true if the subtarget supports minimum3 and maximum3 with the given
14881/// base min/max opcode \p Opc for type \p VT.
14882static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
14883 EVT VT) {
14884 switch (Opc) {
14885 case ISD::FMINNUM:
14886 case ISD::FMAXNUM:
14887 case ISD::FMINNUM_IEEE:
14888 case ISD::FMAXNUM_IEEE:
14889 case ISD::FMINIMUMNUM:
14890 case ISD::FMAXIMUMNUM:
14893 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
14894 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
14895 case ISD::FMINIMUM:
14896 case ISD::FMAXIMUM:
14897 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
14898 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
14899 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
14900 case ISD::SMAX:
14901 case ISD::SMIN:
14902 case ISD::UMAX:
14903 case ISD::UMIN:
14904 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
14905 default:
14906 return false;
14907 }
14908
14909 llvm_unreachable("not a min/max opcode");
14910}
14911
14912SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
14913 DAGCombinerInfo &DCI) const {
14914 SelectionDAG &DAG = DCI.DAG;
14915
14916 EVT VT = N->getValueType(0);
14917 unsigned Opc = N->getOpcode();
14918 SDValue Op0 = N->getOperand(0);
14919 SDValue Op1 = N->getOperand(1);
14920
14921 // Only do this if the inner op has one use since this will just increase
14922 // register pressure for no benefit.
14923
14924 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
14925 // max(max(a, b), c) -> max3(a, b, c)
14926 // min(min(a, b), c) -> min3(a, b, c)
14927 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
14928 SDLoc DL(N);
14929 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14930 Op0.getOperand(0), Op0.getOperand(1), Op1);
14931 }
14932
14933 // Try commuted.
14934 // max(a, max(b, c)) -> max3(a, b, c)
14935 // min(a, min(b, c)) -> min3(a, b, c)
14936 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
14937 SDLoc DL(N);
14938 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14939 Op0, Op1.getOperand(0), Op1.getOperand(1));
14940 }
14941 }
14942
14943 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
14944 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
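// For example, smin(smax(x, -5), 7) clamps x to [-5, 7] and becomes a single
// v_med3_i32 of x, -5 and 7.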
14945 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
14946 if (SDValue Med3 = performIntMed3ImmCombine(
14947 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
14948 return Med3;
14949 }
14950 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
14951 if (SDValue Med3 = performIntMed3ImmCombine(
14952 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
14953 return Med3;
14954 }
14955
14956 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
14957 if (SDValue Med3 = performIntMed3ImmCombine(
14958 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
14959 return Med3;
14960 }
14961 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
14962 if (SDValue Med3 = performIntMed3ImmCombine(
14963 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
14964 return Med3;
14965 }
14966
14967 // if !is_snan(x):
14968 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14969 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14970 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14971 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14972 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
14973 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
14974 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
14975 (Opc == AMDGPUISD::FMIN_LEGACY &&
14976 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
14977 (VT == MVT::f32 || VT == MVT::f64 ||
14978 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
14979 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
14980 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
14981 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
14982 Op0.hasOneUse()) {
14983 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
14984 return Res;
14985 }
14986
14987 // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
14988 // for some types, but at a higher cost since it's implemented with a 3
14989 // operand form.
14990 const SDNodeFlags Flags = N->getFlags();
14991 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
14992 !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
14993 unsigned NewOpc =
14994 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
14995 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
14996 }
14997
14998 return SDValue();
14999}
15000
15001 static bool isClampZeroToOne(SDValue A, SDValue B) {
15002 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
15003 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
15004 // FIXME: Should this be allowing -0.0?
15005 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
15006 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
15007 }
15008 }
15009
15010 return false;
15011}
15012
15013// FIXME: Should only worry about snans for version with chain.
15014SDValue SITargetLowering::performFMed3Combine(SDNode *N,
15015 DAGCombinerInfo &DCI) const {
15016 EVT VT = N->getValueType(0);
15017 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
15018 // NaNs. With a NaN input, the order of the operands may change the result.
15019
15020 SelectionDAG &DAG = DCI.DAG;
15021 SDLoc SL(N);
15022
15023 SDValue Src0 = N->getOperand(0);
15024 SDValue Src1 = N->getOperand(1);
15025 SDValue Src2 = N->getOperand(2);
15026
15027 if (isClampZeroToOne(Src0, Src1)) {
15028 // const_a, const_b, x -> clamp is safe in all cases including signaling
15029 // nans.
15030 // FIXME: Should this be allowing -0.0?
15031 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
15032 }
15033
15034 const MachineFunction &MF = DAG.getMachineFunction();
15035 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15036
15037 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
15038 // handling no dx10-clamp?
15039 if (Info->getMode().DX10Clamp) {
15040 // If NaNs are clamped to 0, we are free to reorder the inputs.
15041
15042 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15043 std::swap(Src0, Src1);
15044
15045 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
15046 std::swap(Src1, Src2);
15047
15048 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15049 std::swap(Src0, Src1);
15050
15051 if (isClampZeroToOne(Src1, Src2))
15052 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
15053 }
15054
15055 return SDValue();
15056}
15057
15058SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
15059 DAGCombinerInfo &DCI) const {
15060 SDValue Src0 = N->getOperand(0);
15061 SDValue Src1 = N->getOperand(1);
15062 if (Src0.isUndef() && Src1.isUndef())
15063 return DCI.DAG.getUNDEF(N->getValueType(0));
15064 return SDValue();
15065}
15066
15067// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
15068// expanded into a set of cmp/select instructions.
15069 bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
15070                                                 unsigned NumElem,
15071 bool IsDivergentIdx,
15072 const GCNSubtarget *Subtarget) {
15073 if (UseDivergentRegisterIndexing)
15074 return false;
15075
15076 unsigned VecSize = EltSize * NumElem;
15077
15078 // Sub-dword vectors of size two dwords or less have a better implementation.
15079 if (VecSize <= 64 && EltSize < 32)
15080 return false;
15081
15082 // Always expand the remaining sub-dword cases, otherwise they will be
15083 // lowered via memory.
15084 if (EltSize < 32)
15085 return true;
15086
15087 // Always do this if var-idx is divergent, otherwise it will become a loop.
15088 if (IsDivergentIdx)
15089 return true;
15090
15091 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
15092 unsigned NumInsts = NumElem /* Number of compares */ +
15093 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
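// For example, a v8f32 with a uniform index needs 8 compares + 8 cndmasks
// = 16 instructions, which is still within the limit below when VGPR
// indexing would otherwise be used, but over the movrel limit.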
15094
15095 // On some architectures (GFX9) movrel is not available and it's better
15096 // to expand.
15097 if (Subtarget->useVGPRIndexMode())
15098 return NumInsts <= 16;
15099
15100 // If movrel is available, use it instead of expanding for vector of 8
15101 // elements.
15102 if (Subtarget->hasMovrel())
15103 return NumInsts <= 15;
15104
15105 return true;
15106}
15107
15108 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
15109 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
15110 if (isa<ConstantSDNode>(Idx))
15111 return false;
15112
15113 SDValue Vec = N->getOperand(0);
15114 EVT VecVT = Vec.getValueType();
15115 EVT EltVT = VecVT.getVectorElementType();
15116 unsigned EltSize = EltVT.getSizeInBits();
15117 unsigned NumElem = VecVT.getVectorNumElements();
15118
15119 return SITargetLowering::shouldExpandVectorDynExt(
15120     EltSize, NumElem, Idx->isDivergent(), getSubtarget());
15121}
15122
15123SDValue
15124SITargetLowering::performExtractVectorEltCombine(SDNode *N,
15125 DAGCombinerInfo &DCI) const {
15126 SDValue Vec = N->getOperand(0);
15127 SelectionDAG &DAG = DCI.DAG;
15128
15129 EVT VecVT = Vec.getValueType();
15130 EVT VecEltVT = VecVT.getVectorElementType();
15131 EVT ResVT = N->getValueType(0);
15132
15133 unsigned VecSize = VecVT.getSizeInBits();
15134 unsigned VecEltSize = VecEltVT.getSizeInBits();
15135
15136 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
15137 allUsesHaveSourceMods(N)) {
15138 SDLoc SL(N);
15139 SDValue Idx = N->getOperand(1);
15140 SDValue Elt =
15141 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
15142 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
15143 }
15144
15145 // (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
15146 // -> (and (extract_vector_element {y0, y1}, index), 0x1f)
15147 // There are optimisations to transform 64-bit shifts into 32-bit shifts
15148 // depending on the shift operand. See e.g. performSraCombine().
15149 // This combine ensures that the optimisation is compatible with v2i32
15150 // legalised AND.
15151 if (VecVT == MVT::v2i32 && Vec->getOpcode() == ISD::AND &&
15152 Vec->getOperand(1)->getOpcode() == ISD::BUILD_VECTOR) {
15153
15155 if (!C || C->getZExtValue() != 0x1f)
15156 return SDValue();
15157
15158 SDLoc SL(N);
15159 SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
15160 SDValue EVE = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
15161 Vec->getOperand(0), N->getOperand(1));
15162 SDValue A = DAG.getNode(ISD::AND, SL, MVT::i32, EVE, AndMask);
15163 DAG.ReplaceAllUsesWith(N, A.getNode());
15164 }
15165
15166 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
15167 // =>
15168 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
15169 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
15170 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
15171 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15172 SDLoc SL(N);
15173 SDValue Idx = N->getOperand(1);
15174 unsigned Opc = Vec.getOpcode();
15175
15176 switch (Opc) {
15177 default:
15178 break;
15179 // TODO: Support other binary operations.
15180 case ISD::FADD:
15181 case ISD::FSUB:
15182 case ISD::FMUL:
15183 case ISD::ADD:
15184 case ISD::UMIN:
15185 case ISD::UMAX:
15186 case ISD::SMIN:
15187 case ISD::SMAX:
15188 case ISD::FMAXNUM:
15189 case ISD::FMINNUM:
15190 case ISD::FMAXNUM_IEEE:
15191 case ISD::FMINNUM_IEEE:
15192 case ISD::FMAXIMUM:
15193 case ISD::FMINIMUM: {
15194 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15195 Vec.getOperand(0), Idx);
15196 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15197 Vec.getOperand(1), Idx);
15198
15199 DCI.AddToWorklist(Elt0.getNode());
15200 DCI.AddToWorklist(Elt1.getNode());
15201 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
15202 }
15203 }
15204 }
15205
15206 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
15207 if (shouldExpandVectorDynExt(N)) {
15208 SDLoc SL(N);
15209 SDValue Idx = N->getOperand(1);
15210 SDValue V;
15211 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15212 SDValue IC = DAG.getVectorIdxConstant(I, SL);
15213 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
15214 if (I == 0)
15215 V = Elt;
15216 else
15217 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
15218 }
15219 return V;
15220 }
15221
15222 if (!DCI.isBeforeLegalize())
15223 return SDValue();
15224
15225 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
15226 // elements. This exposes more load reduction opportunities by replacing
15227 // multiple small extract_vector_elements with a single 32-bit extract.
15228 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
15229 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
15230 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15231 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
15232
15233 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15234 unsigned EltIdx = BitIndex / 32;
15235 unsigned LeftoverBitIdx = BitIndex % 32;
15236 SDLoc SL(N);
15237
15238 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
15239 DCI.AddToWorklist(Cast.getNode());
15240
15241 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
15242 DAG.getConstant(EltIdx, SL, MVT::i32));
15243 DCI.AddToWorklist(Elt.getNode());
15244 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
15245 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
15246 DCI.AddToWorklist(Srl.getNode());
15247
15248 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
15249 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
15250 DCI.AddToWorklist(Trunc.getNode());
15251
15252 if (VecEltVT == ResVT) {
15253 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
15254 }
15255
15256 assert(ResVT.isScalarInteger());
15257 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
15258 }
15259
15260 return SDValue();
15261}
15262
15263SDValue
15264SITargetLowering::performInsertVectorEltCombine(SDNode *N,
15265 DAGCombinerInfo &DCI) const {
15266 SDValue Vec = N->getOperand(0);
15267 SDValue Idx = N->getOperand(2);
15268 EVT VecVT = Vec.getValueType();
15269 EVT EltVT = VecVT.getVectorElementType();
15270
15271 // INSERT_VECTOR_ELT (<n x e>, var-idx)
15272 // => BUILD_VECTOR n x select (e, const-idx)
15273 if (!shouldExpandVectorDynExt(N))
15274 return SDValue();
15275
15276 SelectionDAG &DAG = DCI.DAG;
15277 SDLoc SL(N);
15278 SDValue Ins = N->getOperand(1);
15279 EVT IdxVT = Idx.getValueType();
15280
15281 SmallVector<SDValue, 16> Ops;
15282 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15283 SDValue IC = DAG.getConstant(I, SL, IdxVT);
15284 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
15285 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
15286 Ops.push_back(V);
15287 }
15288
15289 return DAG.getBuildVector(VecVT, SL, Ops);
15290}
15291
15292/// Return the source of an fp_extend from f16 to f32, or a converted FP
15293/// constant.
15294 static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
15295 if (Src.getOpcode() == ISD::FP_EXTEND &&
15296 Src.getOperand(0).getValueType() == MVT::f16) {
15297 return Src.getOperand(0);
15298 }
15299
15300 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
15301 APFloat Val = CFP->getValueAPF();
15302 bool LosesInfo = true;
15303 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
15304 if (!LosesInfo)
15305 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
15306 }
15307
15308 return SDValue();
15309}
15310
15311SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
15312 DAGCombinerInfo &DCI) const {
15313 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15314 "combine only useful on gfx8");
15315
15316 SDValue TruncSrc = N->getOperand(0);
15317 EVT VT = N->getValueType(0);
15318 if (VT != MVT::f16)
15319 return SDValue();
15320
15321 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
15322 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
15323 return SDValue();
15324
15325 SelectionDAG &DAG = DCI.DAG;
15326 SDLoc SL(N);
15327
15328 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
15329 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
15330 // casting back.
15331
15332 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
15333 // fmin(fmax(a, b), fmax(fmin(a, b), c))
15334 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
15335 if (!A)
15336 return SDValue();
15337
15338 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
15339 if (!B)
15340 return SDValue();
15341
15342 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
15343 if (!C)
15344 return SDValue();
15345
15346 // This changes signaling nan behavior. If an input is a signaling nan, it
15347 // would have been quieted by the fpext originally. We don't care because
15348 // these are unconstrained ops. If we needed to insert quieting canonicalizes
15349 // we would be worse off than just doing the promotion.
15350 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
15351 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
15352 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
15353 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
15354}
15355
15356unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
15357 const SDNode *N0,
15358 const SDNode *N1) const {
15359 EVT VT = N0->getValueType(0);
15360
15361 // Only do this if we are not trying to support denormals. v_mad_f32 does not
15362 // support denormals ever.
15363 if (((VT == MVT::f32 &&
15364       denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
15365      (VT == MVT::f16 && Subtarget->hasMadF16() &&
15366       denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
15367     isOperationLegal(ISD::FMAD, VT))
15368 return ISD::FMAD;
15369
15370 const TargetOptions &Options = DAG.getTarget().Options;
15371 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
15372 (N0->getFlags().hasAllowContract() &&
15373 N1->getFlags().hasAllowContract())) &&
15374     isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
15375 return ISD::FMA;
15376 }
15377
15378 return 0;
15379}
15380
15381// For a reassociatable opcode perform:
15382// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
15383SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
15384 SelectionDAG &DAG) const {
15385 EVT VT = N->getValueType(0);
15386 if (VT != MVT::i32 && VT != MVT::i64)
15387 return SDValue();
15388
15389 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
15390 return SDValue();
15391
15392 unsigned Opc = N->getOpcode();
15393 SDValue Op0 = N->getOperand(0);
15394 SDValue Op1 = N->getOperand(1);
15395
15396 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
15397 return SDValue();
15398
15399 if (Op0->isDivergent())
15400 std::swap(Op0, Op1);
15401
15402 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
15403 return SDValue();
15404
15405 SDValue Op2 = Op1.getOperand(1);
15406 Op1 = Op1.getOperand(0);
15407 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
15408 return SDValue();
15409
15410 if (Op1->isDivergent())
15411 std::swap(Op1, Op2);
15412
15413 SDLoc SL(N);
15414 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
15415 return DAG.getNode(Opc, SL, VT, Add1, Op2);
15416}
15417
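// Emit a mad_[iu]64_[iu]32 (32 x 32 -> 64-bit multiply plus a 64-bit addend)
// and truncate its 64-bit result back to VT; the node's carry output is left
// unused.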
15418static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
15419 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
15420 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
15421 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
15422 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
15423 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
15424}
15425
15426// Fold
15427// y = lshr i64 x, 32
15428// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
15429// with Const.hi == -1
15430// To
15431 // res = mad_u64_u32 y.lo, Const.lo, x.lo
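// A sketch of why this is valid: with y = x >> 32,
//   y * Const + x
//     = x.hi * Const.lo + x.hi * (Const.hi + 1) * 2^32 + x.lo   (mod 2^64)
// and Const.hi == -1 makes Const.hi + 1 == 2^32, so the middle term vanishes
// modulo 2^64, leaving x.hi * Const.lo + x.lo, i.e. one unsigned mad_u64_u32.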
15432 static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
15433                                  SDValue MulLHS, SDValue MulRHS,
15434 SDValue AddRHS) {
15435 if (MulRHS.getOpcode() == ISD::SRL)
15436 std::swap(MulLHS, MulRHS);
15437
15438 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
15439 return SDValue();
15440
15441 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
15442 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
15443 MulLHS.getOperand(0) != AddRHS)
15444 return SDValue();
15445
15446 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(MulRHS);
15447 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
15448 return SDValue();
15449
15450 SDValue ConstMul =
15451 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
15452 return getMad64_32(DAG, SL, MVT::i64,
15453 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
15454 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
15455}
15456
15457// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
15458// multiplies, if any.
15459//
15460// Full 64-bit multiplies that feed into an addition are lowered here instead
15461// of using the generic expansion. The generic expansion ends up with
15462// a tree of ADD nodes that prevents us from using the "add" part of the
15463// MAD instruction. The expansion produced here results in a chain of ADDs
15464// instead of a tree.
15465SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
15466 DAGCombinerInfo &DCI) const {
15467 assert(N->isAnyAdd());
15468
15469 SelectionDAG &DAG = DCI.DAG;
15470 EVT VT = N->getValueType(0);
15471 SDLoc SL(N);
15472 SDValue LHS = N->getOperand(0);
15473 SDValue RHS = N->getOperand(1);
15474
15475 if (VT.isVector())
15476 return SDValue();
15477
15478 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
15479 // result in scalar registers for uniform values.
15480 if (!N->isDivergent() && Subtarget->hasSMulHi())
15481 return SDValue();
15482
15483 unsigned NumBits = VT.getScalarSizeInBits();
15484 if (NumBits <= 32 || NumBits > 64)
15485 return SDValue();
15486
15487 if (LHS.getOpcode() != ISD::MUL) {
15488 assert(RHS.getOpcode() == ISD::MUL);
15489 std::swap(LHS, RHS);
15490 }
15491
15492 // Avoid the fold if it would unduly increase the number of multiplies due to
15493 // multiple uses, except on hardware with full-rate multiply-add (which is
15494 // part of full-rate 64-bit ops).
15495 if (!Subtarget->hasFullRate64Ops()) {
15496 unsigned NumUsers = 0;
15497 for (SDNode *User : LHS->users()) {
15498 // There is a use that does not feed into addition, so the multiply can't
15499 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
15500 if (!User->isAnyAdd())
15501 return SDValue();
15502
15503 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
15504 // MUL + 3xADD + 3xADDC over 3xMAD.
15505 ++NumUsers;
15506 if (NumUsers >= 3)
15507 return SDValue();
15508 }
15509 }
15510
15511 SDValue MulLHS = LHS.getOperand(0);
15512 SDValue MulRHS = LHS.getOperand(1);
15513 SDValue AddRHS = RHS;
15514
15515 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
15516 return FoldedMAD;
15517
15518 // Always check whether operands are small unsigned values, since that
15519 // knowledge is useful in more cases. Check for small signed values only if
15520 // doing so can unlock a shorter code sequence.
15521 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
15522 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
15523
15524 bool MulSignedLo = false;
15525 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
15526 MulSignedLo =
15527 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
15528 }
15529
15530 // The operands and final result all have the same number of bits. If
15531 // operands need to be extended, they can be extended with garbage. The
15532 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
15533 // truncated away in the end.
15534 if (VT != MVT::i64) {
15535 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
15536 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
15537 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
15538 }
15539
15540 // The basic code generated is conceptually straightforward. Pseudo code:
15541 //
15542 // accum = mad_64_32 lhs.lo, rhs.lo, accum
15543 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
15544 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
15545 //
15546 // The second and third lines are optional, depending on whether the factors
15547 // are {sign,zero}-extended or not.
15548 //
15549 // The actual DAG is noisier than the pseudo code, but only due to
15550 // instructions that disassemble values into low and high parts, and
15551 // assemble the final result.
15552 SDValue One = DAG.getConstant(1, SL, MVT::i32);
15553
15554 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
15555 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
15556 SDValue Accum =
15557 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
15558
15559 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
15560 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
15561
15562 if (!MulLHSUnsigned32) {
15563 auto MulLHSHi =
15564 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
15565 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
15566 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15567 }
15568
15569 if (!MulRHSUnsigned32) {
15570 auto MulRHSHi =
15571 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
15572 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
15573 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15574 }
15575
15576 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
15577 Accum = DAG.getBitcast(MVT::i64, Accum);
15578 }
15579
15580 if (VT != MVT::i64)
15581 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
15582 return Accum;
15583}
15584
15585SDValue
15586SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
15587 DAGCombinerInfo &DCI) const {
15588 SDValue RHS = N->getOperand(1);
15589 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
15590 if (!CRHS)
15591 return SDValue();
15592
15593 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
15594 // common.
15595 uint64_t Val = CRHS->getZExtValue();
15596 if (countr_zero(Val) >= 32) {
15597 SelectionDAG &DAG = DCI.DAG;
15598 SDLoc SL(N);
15599 SDValue LHS = N->getOperand(0);
15600
15601 // Avoid carry machinery if we know the low half of the add does not
15602 // contribute to the final result.
15603 //
15604 // add i64:x, K if computeTrailingZeros(K) >= 32
15605 // => build_pair (add x.hi, K.hi), x.lo
15606
15607 // Breaking the 64-bit add here with this strange constant is unlikely
15608 // to interfere with addressing mode patterns.
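// For example, adding 0x1234567800000000 can only change the high half, so
// only x.hi gets an add of 0x12345678 and x.lo passes through unchanged.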
15609
15610 SDValue Hi = getHiHalf64(LHS, DAG);
15611 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
15612 unsigned Opcode = N->getOpcode();
15613 if (Opcode == ISD::PTRADD)
15614 Opcode = ISD::ADD;
15615 SDValue AddHi =
15616 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
15617
15618 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
15619 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
15620 }
15621
15622 return SDValue();
15623}
15624
15625// Collect the ultimate src of each of the mul node's operands, and confirm
15626 // each operand is effectively only 8 bits wide.
15627static std::optional<ByteProvider<SDValue>>
15628handleMulOperand(const SDValue &MulOperand) {
15629 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
15630 if (!Byte0 || Byte0->isConstantZero()) {
15631 return std::nullopt;
15632 }
15633 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
15634 if (Byte1 && !Byte1->isConstantZero()) {
15635 return std::nullopt;
15636 }
15637 return Byte0;
15638}
15639
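// Merge two v_perm byte-select masks whose real (non-0x0c) selectors occupy
// disjoint byte positions: each byte of the result keeps the real selector
// from whichever input provides one, and stays the constant-zero selector
// 0x0c where both inputs are zero selectors.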
15640static unsigned addPermMasks(unsigned First, unsigned Second) {
15641 unsigned FirstCs = First & 0x0c0c0c0c;
15642 unsigned SecondCs = Second & 0x0c0c0c0c;
15643 unsigned FirstNoCs = First & ~0x0c0c0c0c;
15644 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
15645
15646 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
15647 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
15648 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
15649 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
15650
15651 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
15652}
15653
15654 struct DotSrc {
15655 SDValue SrcOp;
15656 int64_t PermMask;
15657 int64_t DWordOffset;
15658 };
15659
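// Record which dword of which source value supplies the byte multiplied at
// this Step, folding the new byte-select into an existing DotSrc entry's
// perm mask when that source dword is already tracked, and appending new
// entries otherwise.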
15660 static void placeSources(ByteProvider<SDValue> &Src0,
15661                          ByteProvider<SDValue> &Src1,
15662                          SmallVectorImpl<DotSrc> &Src0s,
15663                          SmallVectorImpl<DotSrc> &Src1s, int Step) {
15664
15665 assert(Src0.Src.has_value() && Src1.Src.has_value());
15666 // Src0s and Src1s are empty, just place arbitrarily.
15667 if (Step == 0) {
15668 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
15669 Src0.SrcOffset / 4});
15670 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
15671 Src1.SrcOffset / 4});
15672 return;
15673 }
15674
15675 for (int BPI = 0; BPI < 2; BPI++) {
15676 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
15677 if (BPI == 1) {
15678 BPP = {Src1, Src0};
15679 }
15680 unsigned ZeroMask = 0x0c0c0c0c;
15681 unsigned FMask = 0xFF << (8 * (3 - Step));
15682
15683 unsigned FirstMask =
15684 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15685 unsigned SecondMask =
15686 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15687 // Attempt to find Src vector which contains our SDValue, if so, add our
15688 // perm mask to the existing one. If we are unable to find a match for the
15689 // first SDValue, attempt to find match for the second.
15690 int FirstGroup = -1;
15691 for (int I = 0; I < 2; I++) {
15692 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
15693 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
15694 return IterElt.SrcOp == *BPP.first.Src &&
15695 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15696 };
15697
15698 auto *Match = llvm::find_if(Srcs, MatchesFirst);
15699 if (Match != Srcs.end()) {
15700 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
15701 FirstGroup = I;
15702 break;
15703 }
15704 }
15705 if (FirstGroup != -1) {
15706 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
15707 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
15708 return IterElt.SrcOp == *BPP.second.Src &&
15709 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15710 };
15711 auto *Match = llvm::find_if(Srcs, MatchesSecond);
15712 if (Match != Srcs.end()) {
15713 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
15714 } else
15715 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15716 return;
15717 }
15718 }
15719
15720 // If we have made it here, then we could not find a match in Src0s or Src1s
15721 // for either Src0 or Src1, so just place them arbitrarily.
15722
15723 unsigned ZeroMask = 0x0c0c0c0c;
15724 unsigned FMask = 0xFF << (8 * (3 - Step));
15725
15726 Src0s.push_back(
15727 {*Src0.Src,
15728 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15729 Src0.SrcOffset / 4});
15730 Src1s.push_back(
15731 {*Src1.Src,
15732 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15733 Src1.SrcOffset / 4});
15734}
15735
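// Combine the byte selections collected in Srcs into a single i32 value by
// emitting a v_perm per pair of source dwords and OR'ing at most two partial
// results together.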
15736 static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
15737                               SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
15738 bool IsAny) {
15739
15740 // If we just have one source, just permute it accordingly.
15741 if (Srcs.size() == 1) {
15742 auto *Elt = Srcs.begin();
15743 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
15744
15745 // v_perm will produce the original value
15746 if (Elt->PermMask == 0x3020100)
15747 return EltOp;
15748
15749 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15750 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
15751 }
15752
15753 auto *FirstElt = Srcs.begin();
15754 auto *SecondElt = std::next(FirstElt);
15755
15756 SmallVector<SDValue, 2> Perms;
15757
15758 // If we have multiple sources in the chain, combine them via perms (using
15759 // calculated perm mask) and Ors.
15760 while (true) {
15761 auto FirstMask = FirstElt->PermMask;
15762 auto SecondMask = SecondElt->PermMask;
15763
15764 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15765 unsigned FirstPlusFour = FirstMask | 0x04040404;
15766 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
15767 // original 0x0C.
15768 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
15769
15770 auto PermMask = addPermMasks(FirstMask, SecondMask);
15771 auto FirstVal =
15772 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15773 auto SecondVal =
15774 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
15775
15776 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
15777 SecondVal,
15778 DAG.getConstant(PermMask, SL, MVT::i32)));
15779
15780 FirstElt = std::next(SecondElt);
15781 if (FirstElt == Srcs.end())
15782 break;
15783
15784 SecondElt = std::next(FirstElt);
15785 // If we only have a FirstElt, then just combine that into the cumulative
15786 // source node.
15787 if (SecondElt == Srcs.end()) {
15788 auto EltOp =
15789 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15790
15791 Perms.push_back(
15792 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15793 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
15794 break;
15795 }
15796 }
15797
15798 assert(Perms.size() == 1 || Perms.size() == 2);
15799 return Perms.size() == 2
15800 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
15801 : Perms[0];
15802}
15803
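// A chain shorter than 4 only populated the leading selector bytes; shift
// them down to the low byte lanes and fill the now-unused high lanes with the
// constant-zero selector 0x0c so they contribute nothing to the dot product.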
15804static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
15805 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15806 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15807 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15808 EntryMask += ZeroMask;
15809 }
15810}
15811
15812static bool isMul(const SDValue Op) {
15813 auto Opcode = Op.getOpcode();
15814
15815 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
15816 Opcode == AMDGPUISD::MUL_I24);
15817}
15818
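// Decide from the operands' known leading bits whether a dot4 formed from
// this multiply should use the signed or the unsigned intrinsic;
// std::nullopt means the signedness requirements conflict and the match
// should be abandoned.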
15819static std::optional<bool>
15820 checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
15821                        ByteProvider<SDValue> &Src1, const SDValue &S0Op,
15822 const SDValue &S1Op, const SelectionDAG &DAG) {
15823 // If both ops are i8s (pre legalize-dag), then the signedness semantics
15824 // of the dot4 are irrelevant.
15825 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
15826 return false;
15827
15828 auto Known0 = DAG.computeKnownBits(S0Op, 0);
15829 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
15830 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
15831 auto Known1 = DAG.computeKnownBits(S1Op, 0);
15832 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
15833 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
15834
15835 assert(!(S0IsUnsigned && S0IsSigned));
15836 assert(!(S1IsUnsigned && S1IsSigned));
15837
15838 // There are 9 possible permutations of
15839 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
15840
15841 // In two permutations, the sign bits are known to be the same for both Ops,
15842 // so simply return Signed / Unsigned corresponding to the MSB
15843
15844 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
15845 return S0IsSigned;
15846
15847 // In another two permutations, the sign bits are known to be opposite. In
15848 // this case return std::nullopt to indicate a bad match.
15849
15850 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
15851 return std::nullopt;
15852
15853 // In the remaining five permutations, we don't know the value of the sign
15854 // bit for at least one Op. Since we have a valid ByteProvider, we know that
15855 // the upper bits must be extension bits. Thus, the only way for the sign
15856 // bit to be unknown is if it was sign extended from an unknown value, or if
15857 // it was any extended. In either case, it is correct to use the signed
15858 // version of the signedness semantics of dot4.
15859
15860 // In two such permutations, we know the sign bit is set for
15861 // one op, and the other is unknown. It is okay to use the signed version of
15862 // dot4.
15863 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
15864 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
15865 return true;
15866
15867 // In one such permutation, we don't know either of the sign bits. It is okay
15868 // to use the signed version of dot4.
15869 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
15870 return true;
15871
15872 // In two such permutations, we know the sign bit is unset for
15873 // one op, and the other is unknown. Return std::nullopt to indicate a
15874 // bad match.
15875 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
15876 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
15877 return std::nullopt;
15878
15879 llvm_unreachable("Fully covered condition");
15880}
15881
15882SDValue SITargetLowering::performAddCombine(SDNode *N,
15883 DAGCombinerInfo &DCI) const {
15884 SelectionDAG &DAG = DCI.DAG;
15885 EVT VT = N->getValueType(0);
15886 SDLoc SL(N);
15887 SDValue LHS = N->getOperand(0);
15888 SDValue RHS = N->getOperand(1);
15889
15890 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
15891 if (Subtarget->hasMad64_32()) {
15892 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
15893 return Folded;
15894 }
15895 }
15896
15897 if (SDValue V = reassociateScalarOps(N, DAG)) {
15898 return V;
15899 }
15900
15901 if (VT == MVT::i64) {
15902 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15903 return Folded;
15904 }
15905
15906 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
15907 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
15908 SDValue TempNode(N, 0);
15909 std::optional<bool> IsSigned;
15910 SmallVector<DotSrc, 4> Src0s;
15911 SmallVector<DotSrc, 4> Src1s;
15912 SmallVector<SDValue, 4> Src2s;
15913
15914 // Match the v_dot4 tree, while collecting src nodes.
15915 int ChainLength = 0;
15916 for (int I = 0; I < 4; I++) {
15917 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
15918 if (MulIdx == -1)
15919 break;
15920 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
15921 if (!Src0)
15922 break;
15923 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
15924 if (!Src1)
15925 break;
15926
15927 auto IterIsSigned = checkDot4MulSignedness(
15928 TempNode->getOperand(MulIdx), *Src0, *Src1,
15929 TempNode->getOperand(MulIdx)->getOperand(0),
15930 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
15931 if (!IterIsSigned)
15932 break;
15933 if (!IsSigned)
15934 IsSigned = *IterIsSigned;
15935 if (*IterIsSigned != *IsSigned)
15936 break;
15937 placeSources(*Src0, *Src1, Src0s, Src1s, I);
15938 auto AddIdx = 1 - MulIdx;
15939 // Allow the special case where add (add (mul24, 0), mul24) was simplified
15940 // to add (mul24, mul24).
15941 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
15942 Src2s.push_back(TempNode->getOperand(AddIdx));
15943 auto Src0 =
15944 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
15945 if (!Src0)
15946 break;
15947 auto Src1 =
15948 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
15949 if (!Src1)
15950 break;
15951 auto IterIsSigned = checkDot4MulSignedness(
15952 TempNode->getOperand(AddIdx), *Src0, *Src1,
15953 TempNode->getOperand(AddIdx)->getOperand(0),
15954 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
15955 if (!IterIsSigned)
15956 break;
15957 assert(IsSigned);
15958 if (*IterIsSigned != *IsSigned)
15959 break;
15960 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
15961 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
15962 ChainLength = I + 2;
15963 break;
15964 }
15965
15966 TempNode = TempNode->getOperand(AddIdx);
15967 Src2s.push_back(TempNode);
15968 ChainLength = I + 1;
15969 if (TempNode->getNumOperands() < 2)
15970 break;
15971 LHS = TempNode->getOperand(0);
15972 RHS = TempNode->getOperand(1);
15973 }
15974
15975 if (ChainLength < 2)
15976 return SDValue();
15977
15978 // Masks were constructed with the assumption that we would find a chain of
15979 // length 4. If not, then we need to zero out the unused high bytes (via the
15980 // perm selector 0x0c) so they do not affect the dot calculation.
15981 if (ChainLength < 4) {
15982 fixMasks(Src0s, ChainLength);
15983 fixMasks(Src1s, ChainLength);
15984 }
15985
15986 SDValue Src0, Src1;
15987
15988 // If we are just using a single source for both, and have permuted the
15989 // bytes consistently, we can just use the sources without permuting
15990 // (commutation).
15991 bool UseOriginalSrc = false;
15992 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
15993 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
15994 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
15995 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
15996 SmallVector<unsigned, 4> SrcBytes;
15997 auto Src0Mask = Src0s.begin()->PermMask;
15998 SrcBytes.push_back(Src0Mask & 0xFF000000);
15999 bool UniqueEntries = true;
16000 for (auto I = 1; I < 4; I++) {
16001 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
16002
16003 if (is_contained(SrcBytes, NextByte)) {
16004 UniqueEntries = false;
16005 break;
16006 }
16007 SrcBytes.push_back(NextByte);
16008 }
16009
16010 if (UniqueEntries) {
16011 UseOriginalSrc = true;
16012
16013 auto *FirstElt = Src0s.begin();
16014 auto FirstEltOp =
16015 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
16016
16017 auto *SecondElt = Src1s.begin();
16018 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
16019 SecondElt->DWordOffset);
16020
16021 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
16022 MVT::getIntegerVT(32));
16023 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
16024 MVT::getIntegerVT(32));
16025 }
16026 }
16027
16028 if (!UseOriginalSrc) {
16029 Src0 = resolveSources(DAG, SL, Src0s, false, true);
16030 Src1 = resolveSources(DAG, SL, Src1s, false, true);
16031 }
16032
16033 assert(IsSigned);
16034 SDValue Src2 =
16035 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16036
16037 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
16038 : Intrinsic::amdgcn_udot4,
16039 SL, MVT::i64);
16040
16041 assert(!VT.isVector());
16042 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
16043 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
16044
16045 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
16046 }
16047
16048 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16049 return SDValue();
16050
16051 // add x, zext (setcc) => uaddo_carry x, 0, setcc
16052 // add x, sext (setcc) => usubo_carry x, 0, setcc
16053 unsigned Opc = LHS.getOpcode();
16054 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
16055     Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
16056 std::swap(RHS, LHS);
16057
16058 Opc = RHS.getOpcode();
16059 switch (Opc) {
16060 default:
16061 break;
16062 case ISD::ZERO_EXTEND:
16063 case ISD::SIGN_EXTEND:
16064 case ISD::ANY_EXTEND: {
16065 auto Cond = RHS.getOperand(0);
16066 // If this won't be a real VOPC output, we would still need to insert an
16067 // extra instruction anyway.
16068 if (!isBoolSGPR(Cond))
16069 break;
16070 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16071 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16072 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
16073 return DAG.getNode(Opc, SL, VTList, Args);
16074 }
16075 case ISD::UADDO_CARRY: {
16076 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
16077 if (!isNullConstant(RHS.getOperand(1)))
16078 break;
16079 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
16080 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
16081 }
16082 }
16083 return SDValue();
16084}
16085
16086SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
16087 DAGCombinerInfo &DCI) const {
16088 SelectionDAG &DAG = DCI.DAG;
16089 SDLoc DL(N);
16090 EVT VT = N->getValueType(0);
16091 SDValue N0 = N->getOperand(0);
16092 SDValue N1 = N->getOperand(1);
16093
16094 // The following folds transform PTRADDs into regular arithmetic in cases
16095 // where the PTRADD wouldn't be folded as an immediate offset into memory
16096 // instructions anyway. They are target-specific in that other targets might
16097 // prefer to not lose information about the pointer arithmetic.
16098
16099 // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
16100 // Adapted from DAGCombiner::visitADDLikeCommutative.
16101 SDValue V, K;
16102 if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
16103 SDNodeFlags ShlFlags = N1->getFlags();
16104 // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
16105 // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
16106 // preserved.
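// E.g. for 8-bit values and k == 2, nuw and nsw on the original shl force
// (0 - v) <= 0x1F, so either v == 0 or v >= 0xE1 (top three bits all ones);
// then v << 2 lies in [-124, -4] and cannot overflow, so nsw holds for the
// new shl of v as well.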
16107 SDNodeFlags NewShlFlags =
16108 ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
16109 ? SDNodeFlags::NoSignedWrap
16110 : SDNodeFlags();
16111 SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
16112 DCI.AddToWorklist(Inner.getNode());
16113 return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
16114 }
16115
16116 // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
16117 // performAddCombine.
16118 if (N1.getOpcode() == ISD::MUL) {
16119 if (Subtarget->hasMad64_32()) {
16120 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16121 return Folded;
16122 }
16123 }
16124
16125 // If the 32 low bits of the constant are all zero, there is nothing to fold
16126 // into an immediate offset, so it's better to eliminate the unnecessary
16127 // addition for the lower 32 bits than to preserve the PTRADD.
16128 // Analogous to a fold in performAddCombine.
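// E.g. for (ptradd p, 0x200000000) only the high half of the pointer changes,
// so adding 2 to the upper 32 bits directly is cheaper than keeping a 64-bit
// PTRADD whose offset could not be folded into an addressing mode anyway.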
16129 if (VT == MVT::i64) {
16130 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16131 return Folded;
16132 }
16133
16134 if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) {
16135 // Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c), v) with
16136 // global address GA and constant c, such that c can be folded into GA.
16137 SDValue GAValue = N0.getOperand(0);
16138 if (const GlobalAddressSDNode *GA =
16139 dyn_cast<GlobalAddressSDNode>(GAValue)) {
16140 if (DCI.isBeforeLegalizeOps() && isOffsetFoldingLegal(GA)) {
16141 // If both additions in the original were NUW, reassociation preserves
16142 // that.
16143 SDNodeFlags Flags =
16144 (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
16145 SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags);
16146 DCI.AddToWorklist(Inner.getNode());
16147 return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags);
16148 }
16149 }
16150 }
16151
16152 if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
16153 return SDValue();
16154
16155 // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
16156 // y is not, and (add y, z) is used only once.
16157 // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
16158 // z is not, and (add y, z) is used only once.
16159 // The goal is to move constant offsets to the outermost ptradd, to create
16160 // more opportunities to fold offsets into memory instructions.
16161 // Together with the generic combines in DAGCombiner.cpp, this also
16162 // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
16163 //
16164 // This transform is here instead of in the general DAGCombiner as it can
16165 // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
16166 // AArch64's CPA.
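// E.g. (ptradd x, (add y, 16)) becomes (ptradd (ptradd x, y), 16), and the
// trailing constant 16 can then be folded as an immediate offset into a
// dependent load or store.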
16167 SDValue X = N0;
16168 SDValue Y = N1.getOperand(0);
16169 SDValue Z = N1.getOperand(1);
16170 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
16171 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
16172
16173 // If both additions in the original were NUW, reassociation preserves that.
16174 SDNodeFlags ReassocFlags =
16175 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
16176
16177 if (ZIsConstant != YIsConstant) {
16178 if (YIsConstant)
16179 std::swap(Y, Z);
16180 SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
16181 DCI.AddToWorklist(Inner.getNode());
16182 return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags);
16183 }
16184
16185 // If one of Y and Z is constant, they have been handled above. If both were
16186 // constant, the addition would have been folded in SelectionDAG::getNode
16187 // already. This ensures that the generic DAG combines won't undo the
16188 // following reassociation.
16189 assert(!YIsConstant && !ZIsConstant);
16190
16191 if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) {
16192 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
16193 // y are uniform and z isn't.
16194 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
16195 // z are uniform and y isn't.
16196 // The goal is to push uniform operands up in the computation, so that they
16197 // can be handled with scalar operations. We can't use reassociateScalarOps
16198 // for this since it requires two identical commutative operations to
16199 // reassociate.
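// E.g. if x is a uniform base pointer, y a uniform offset and z the per-lane
// offset, (ptradd x, y) can be computed once with scalar instructions and only
// the final ptradd needs per-lane VALU work.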
16200 if (Y->isDivergent())
16201 std::swap(Y, Z);
16202 SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
16203 DCI.AddToWorklist(UniformInner.getNode());
16204 return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
16205 }
16206
16207 return SDValue();
16208}
16209
16210SDValue SITargetLowering::performSubCombine(SDNode *N,
16211 DAGCombinerInfo &DCI) const {
16212 SelectionDAG &DAG = DCI.DAG;
16213 EVT VT = N->getValueType(0);
16214
16215 if (VT == MVT::i64) {
16216 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16217 return Folded;
16218 }
16219
16220 if (VT != MVT::i32)
16221 return SDValue();
16222
16223 SDLoc SL(N);
16224 SDValue LHS = N->getOperand(0);
16225 SDValue RHS = N->getOperand(1);
16226
16227 // sub x, zext (setcc) => usubo_carry x, 0, setcc
16228 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
16229 unsigned Opc = RHS.getOpcode();
16230 switch (Opc) {
16231 default:
16232 break;
16233 case ISD::ZERO_EXTEND:
16234 case ISD::SIGN_EXTEND:
16235 case ISD::ANY_EXTEND: {
16236 auto Cond = RHS.getOperand(0);
16237 // If this won't be a real VOPC output, we would still need to insert an
16238 // extra instruction anyway.
16239 if (!isBoolSGPR(Cond))
16240 break;
16241 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16242 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16243 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
16244 return DAG.getNode(Opc, SL, VTList, Args);
16245 }
16246 }
16247
16248 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
16249 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
16250 if (!isNullConstant(LHS.getOperand(1)))
16251 return SDValue();
16252 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
16253 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
16254 }
16255 return SDValue();
16256}
16257
16258SDValue
16259SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
16260 DAGCombinerInfo &DCI) const {
16261
16262 if (N->getValueType(0) != MVT::i32)
16263 return SDValue();
16264
16265 if (!isNullConstant(N->getOperand(1)))
16266 return SDValue();
16267
16268 SelectionDAG &DAG = DCI.DAG;
16269 SDValue LHS = N->getOperand(0);
16270
16271 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
16272 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
16273 unsigned LHSOpc = LHS.getOpcode();
16274 unsigned Opc = N->getOpcode();
16275 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
16276 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
16277 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
16278 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
16279 }
16280 return SDValue();
16281}
16282
16283SDValue SITargetLowering::performFAddCombine(SDNode *N,
16284 DAGCombinerInfo &DCI) const {
16285 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16286 return SDValue();
16287
16288 SelectionDAG &DAG = DCI.DAG;
16289 EVT VT = N->getValueType(0);
16290
16291 SDLoc SL(N);
16292 SDValue LHS = N->getOperand(0);
16293 SDValue RHS = N->getOperand(1);
16294
16295 // These should really be instruction patterns, but writing patterns with
16296 // source modifiers is a pain.
16297
16298 // fadd (fadd (a, a), b) -> mad 2.0, a, b
16299 if (LHS.getOpcode() == ISD::FADD) {
16300 SDValue A = LHS.getOperand(0);
16301 if (A == LHS.getOperand(1)) {
16302 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16303 if (FusedOp != 0) {
16304 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16305 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
16306 }
16307 }
16308 }
16309
16310 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
16311 if (RHS.getOpcode() == ISD::FADD) {
16312 SDValue A = RHS.getOperand(0);
16313 if (A == RHS.getOperand(1)) {
16314 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16315 if (FusedOp != 0) {
16316 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16317 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
16318 }
16319 }
16320 }
16321
16322 return SDValue();
16323}
16324
16325SDValue SITargetLowering::performFSubCombine(SDNode *N,
16326 DAGCombinerInfo &DCI) const {
16327 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16328 return SDValue();
16329
16330 SelectionDAG &DAG = DCI.DAG;
16331 SDLoc SL(N);
16332 EVT VT = N->getValueType(0);
16333 assert(!VT.isVector());
16334
16335 // Try to get the fneg to fold into the source modifier. This undoes generic
16336 // DAG combines and folds them into the mad.
16337 //
16338 // Only do this if we are not trying to support denormals. v_mad_f32 does
16339 // not support denormals ever.
16340 SDValue LHS = N->getOperand(0);
16341 SDValue RHS = N->getOperand(1);
16342 if (LHS.getOpcode() == ISD::FADD) {
16343 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
16344 SDValue A = LHS.getOperand(0);
16345 if (A == LHS.getOperand(1)) {
16346 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16347 if (FusedOp != 0) {
16348 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16349 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
16350
16351 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
16352 }
16353 }
16354 }
16355
16356 if (RHS.getOpcode() == ISD::FADD) {
16357 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
16358
16359 SDValue A = RHS.getOperand(0);
16360 if (A == RHS.getOperand(1)) {
16361 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16362 if (FusedOp != 0) {
16363 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
16364 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
16365 }
16366 }
16367 }
16368
16369 return SDValue();
16370}
16371
16372SDValue SITargetLowering::performFDivCombine(SDNode *N,
16373 DAGCombinerInfo &DCI) const {
16374 SelectionDAG &DAG = DCI.DAG;
16375 SDLoc SL(N);
16376 EVT VT = N->getValueType(0);
16377 if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts())
16378 return SDValue();
16379
16380 SDValue LHS = N->getOperand(0);
16381 SDValue RHS = N->getOperand(1);
16382
16383 SDNodeFlags Flags = N->getFlags();
16384 SDNodeFlags RHSFlags = RHS->getFlags();
16385 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
16386 !RHS->hasOneUse())
16387 return SDValue();
16388
16389 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
16390 bool IsNegative = false;
16391 if (CLHS->isExactlyValue(1.0) ||
16392 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16393 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
16394 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
16395 if (RHS.getOpcode() == ISD::FSQRT) {
16396 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
16397 SDValue Rsq =
16398 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
16399 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
16400 }
16401 }
16402 }
16403
16404 return SDValue();
16405}
16406
16407SDValue SITargetLowering::performFMulCombine(SDNode *N,
16408 DAGCombinerInfo &DCI) const {
16409 SelectionDAG &DAG = DCI.DAG;
16410 EVT VT = N->getValueType(0);
16411 EVT ScalarVT = VT.getScalarType();
16412 EVT IntVT = VT.changeElementType(MVT::i32);
16413
16414 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
16415 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16416 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
16417 return SDValue();
16418 }
16419
16420 SDValue LHS = N->getOperand(0);
16421 SDValue RHS = N->getOperand(1);
16422
16423 // It is cheaper to materialize i32 inline constants than f16 or f64 (or
16424 // even non-inline f32) values; this can be done via ldexp, as shown
16425 // below:
16426 //
16427 // Given : A = 2^a & B = 2^b ; where a and b are integers.
16428 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
16429 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
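// E.g. fmul x, (select y, 0.5, 8.0) becomes fldexp(x, (select y, -1, 3)),
// since 0.5 == 2^-1 and 8.0 == 2^3; the small i32 exponents are inline
// constants even when the original FP immediates are not.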
16430 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16431 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
16432 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
16433 if (!TrueNode)
16434 return SDValue();
16435 const ConstantFPSDNode *FalseNode =
16436 isConstOrConstSplatFP(RHS.getOperand(2));
16437 if (!FalseNode)
16438 return SDValue();
16439
16440 if (TrueNode->isNegative() != FalseNode->isNegative())
16441 return SDValue();
16442
16443 // For f32, only non-inline constants should be transformed.
16444 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16445 if (ScalarVT == MVT::f32 &&
16446 TII->isInlineConstant(TrueNode->getValueAPF()) &&
16447 TII->isInlineConstant(FalseNode->getValueAPF()))
16448 return SDValue();
16449
16450 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
16451 if (TrueNodeExpVal == INT_MIN)
16452 return SDValue();
16453 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
16454 if (FalseNodeExpVal == INT_MIN)
16455 return SDValue();
16456
16457 SDLoc SL(N);
16458 SDValue SelectNode =
16459 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
16460 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
16461 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
16462
16463 LHS = TrueNode->isNegative()
16464 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
16465 : LHS;
16466
16467 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
16468 }
16469
16470 return SDValue();
16471}
16472
16473SDValue SITargetLowering::performFMACombine(SDNode *N,
16474 DAGCombinerInfo &DCI) const {
16475 SelectionDAG &DAG = DCI.DAG;
16476 EVT VT = N->getValueType(0);
16477 SDLoc SL(N);
16478
16479 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16480 return SDValue();
16481
16482 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
16483 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
16484 SDValue Op1 = N->getOperand(0);
16485 SDValue Op2 = N->getOperand(1);
16486 SDValue FMA = N->getOperand(2);
16487
16488 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
16489 Op2.getOpcode() != ISD::FP_EXTEND)
16490 return SDValue();
16491
16492 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
16493 // regardless of the denorm mode setting. Therefore,
16494 // fp-contract is sufficient to allow generating fdot2.
16495 const TargetOptions &Options = DAG.getTarget().Options;
16496 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
16497 (N->getFlags().hasAllowContract() &&
16498 FMA->getFlags().hasAllowContract())) {
16499 Op1 = Op1.getOperand(0);
16500 Op2 = Op2.getOperand(0);
16501 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16502 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16503 return SDValue();
16504
16505 SDValue Vec1 = Op1.getOperand(0);
16506 SDValue Idx1 = Op1.getOperand(1);
16507 SDValue Vec2 = Op2.getOperand(0);
16508
16509 SDValue FMAOp1 = FMA.getOperand(0);
16510 SDValue FMAOp2 = FMA.getOperand(1);
16511 SDValue FMAAcc = FMA.getOperand(2);
16512
16513 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
16514 FMAOp2.getOpcode() != ISD::FP_EXTEND)
16515 return SDValue();
16516
16517 FMAOp1 = FMAOp1.getOperand(0);
16518 FMAOp2 = FMAOp2.getOperand(0);
16519 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16520 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16521 return SDValue();
16522
16523 SDValue Vec3 = FMAOp1.getOperand(0);
16524 SDValue Vec4 = FMAOp2.getOperand(0);
16525 SDValue Idx2 = FMAOp1.getOperand(1);
16526
16527 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
16528 // Idx1 and Idx2 cannot be the same.
16529 Idx1 == Idx2)
16530 return SDValue();
16531
16532 if (Vec1 == Vec2 || Vec3 == Vec4)
16533 return SDValue();
16534
16535 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
16536 return SDValue();
16537
16538 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16539 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
16540 DAG.getTargetConstant(0, SL, MVT::i1));
16541 }
16542 }
16543 return SDValue();
16544}
16545
16546SDValue SITargetLowering::performSetCCCombine(SDNode *N,
16547 DAGCombinerInfo &DCI) const {
16548 SelectionDAG &DAG = DCI.DAG;
16549 SDLoc SL(N);
16550
16551 SDValue LHS = N->getOperand(0);
16552 SDValue RHS = N->getOperand(1);
16553 EVT VT = LHS.getValueType();
16554 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
16555
16556 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
16557 if (!CRHS) {
16558 CRHS = dyn_cast<ConstantSDNode>(LHS);
16559 if (CRHS) {
16560 std::swap(LHS, RHS);
16561 CC = getSetCCSwappedOperands(CC);
16562 }
16563 }
16564
16565 if (CRHS) {
16566 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
16567 isBoolSGPR(LHS.getOperand(0))) {
16568 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
16569 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
16570 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
16571 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
16572 if ((CRHS->isAllOnes() &&
16573 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
16574 (CRHS->isZero() &&
16575 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
16576 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16577 DAG.getAllOnesConstant(SL, MVT::i1));
16578 if ((CRHS->isAllOnes() &&
16579 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
16580 (CRHS->isZero() &&
16581 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
16582 return LHS.getOperand(0);
16583 }
16584
16585 const APInt &CRHSVal = CRHS->getAPIntValue();
16586 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
16587 LHS.getOpcode() == ISD::SELECT &&
16588 isa<ConstantSDNode>(LHS.getOperand(1)) &&
16589 isa<ConstantSDNode>(LHS.getOperand(2)) &&
16590 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
16591 isBoolSGPR(LHS.getOperand(0))) {
16592 // Given CT != FT:
16593 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
16594 // setcc (select cc, CT, CF), CF, ne => cc
16595 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
16596 // setcc (select cc, CT, CF), CT, eq => cc
16597 const APInt &CT = LHS.getConstantOperandAPInt(1);
16598 const APInt &CF = LHS.getConstantOperandAPInt(2);
16599
16600 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
16601 (CT == CRHSVal && CC == ISD::SETNE))
16602 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16603 DAG.getAllOnesConstant(SL, MVT::i1));
16604 if ((CF == CRHSVal && CC == ISD::SETNE) ||
16605 (CT == CRHSVal && CC == ISD::SETEQ))
16606 return LHS.getOperand(0);
16607 }
16608 }
16609
16610 if (VT != MVT::f32 && VT != MVT::f64 &&
16611 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16612 return SDValue();
16613
16614 // Match isinf/isfinite pattern
16615 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
16616 // (fcmp one (fabs x), inf) -> (fp_class x,
16617 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
16618 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
16619 LHS.getOpcode() == ISD::FABS) {
16620 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
16621 if (!CRHS)
16622 return SDValue();
16623
16624 const APFloat &APF = CRHS->getValueAPF();
16625 if (APF.isInfinity() && !APF.isNegative()) {
16626 const unsigned IsInfMask =
16627 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
16628 const unsigned IsFiniteMask =
16629 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
16630 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
16631 SIInstrFlags::P_SUBNORMAL;
16632 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
16633 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
16634 DAG.getConstant(Mask, SL, MVT::i32));
16635 }
16636 }
16637
16638 return SDValue();
16639}
16640
16641SDValue
16642SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
16643 DAGCombinerInfo &DCI) const {
16644 SelectionDAG &DAG = DCI.DAG;
16645 SDLoc SL(N);
16646 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
16647
16648 SDValue Src = N->getOperand(0);
16649 SDValue Shift = N->getOperand(0);
16650
16651 // TODO: Extend type shouldn't matter (assuming legal types).
16652 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
16653 Shift = Shift.getOperand(0);
16654
16655 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
16656 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
16657 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
16658 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
16659 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
16660 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
16661 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
16662 SDValue Shifted = DAG.getZExtOrTrunc(
16663 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
16664
16665 unsigned ShiftOffset = 8 * Offset;
16666 if (Shift.getOpcode() == ISD::SHL)
16667 ShiftOffset -= C->getZExtValue();
16668 else
16669 ShiftOffset += C->getZExtValue();
16670
16671 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16672 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
16673 MVT::f32, Shifted);
16674 }
16675 }
16676 }
16677
16678 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16679 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
16680 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
16681 // We simplified Src. If this node is not dead, visit it again so it is
16682 // folded properly.
16683 if (N->getOpcode() != ISD::DELETED_NODE)
16684 DCI.AddToWorklist(N);
16685 return SDValue(N, 0);
16686 }
16687
16688 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
16689 if (SDValue DemandedSrc =
16690 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
16691 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
16692
16693 return SDValue();
16694}
16695
16696SDValue SITargetLowering::performClampCombine(SDNode *N,
16697 DAGCombinerInfo &DCI) const {
16698 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
16699 if (!CSrc)
16700 return SDValue();
16701
16702 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16703 const APFloat &F = CSrc->getValueAPF();
16704 APFloat Zero = APFloat::getZero(F.getSemantics());
16705 if (F < Zero ||
16706 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16707 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
16708 }
16709
16710 APFloat One(F.getSemantics(), "1.0");
16711 if (F > One)
16712 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
16713
16714 return SDValue(CSrc, 0);
16715}
16716
16717SDValue SITargetLowering::performSelectCombine(SDNode *N,
16718 DAGCombinerInfo &DCI) const {
16719
16720 // Try to fold CMP + SELECT patterns with shared constants (both FP and
16721 // integer).
16722 // Detect when CMP and SELECT use the same constant and fold them to avoid
16723 // loading the constant twice. Specifically handles patterns like:
16724 // %cmp = icmp eq i32 %val, 4242
16725 // %sel = select i1 %cmp, i32 4242, i32 %other
16726 // It can be optimized to reuse %val instead of 4242 in select.
16727 SDValue Cond = N->getOperand(0);
16728 SDValue TrueVal = N->getOperand(1);
16729 SDValue FalseVal = N->getOperand(2);
16730
16731 // Check if condition is a comparison.
16732 if (Cond.getOpcode() != ISD::SETCC)
16733 return SDValue();
16734
16735 SDValue LHS = Cond.getOperand(0);
16736 SDValue RHS = Cond.getOperand(1);
16737 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
16738
16739 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
16740 bool isInteger = LHS.getValueType().isInteger();
16741
16742 // Handle simple floating-point and integer types only.
16743 if (!isFloatingPoint && !isInteger)
16744 return SDValue();
16745
16746 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
16747 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
16748 if (!isEquality && !isNonEquality)
16749 return SDValue();
16750
16751 SDValue ArgVal, ConstVal;
16752 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
16753 (isInteger && isa<ConstantSDNode>(RHS))) {
16754 ConstVal = RHS;
16755 ArgVal = LHS;
16756 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
16757 (isInteger && isa<ConstantSDNode>(LHS))) {
16758 ConstVal = LHS;
16759 ArgVal = RHS;
16760 } else {
16761 return SDValue();
16762 }
16763
16764 // Skip optimization for inlinable immediates.
16765 if (isFloatingPoint) {
16766 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
16767 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
16768 return SDValue();
16769 } else {
16770 if (AMDGPU::isInlinableIntLiteral(
16771 cast<ConstantSDNode>(ConstVal)->getSExtValue()))
16772 return SDValue();
16773 }
16774
16775 // For equality and non-equality comparisons, patterns:
16776 // select (setcc x, const), const, y -> select (setcc x, const), x, y
16777 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
16778 if (!(isEquality && TrueVal == ConstVal) &&
16779 !(isNonEquality && FalseVal == ConstVal))
16780 return SDValue();
16781
16782 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
16783 SDValue SelectRHS =
16784 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
16785 return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
16786 SelectLHS, SelectRHS);
16787}
16788
16789 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
16790 DAGCombinerInfo &DCI) const {
16791 switch (N->getOpcode()) {
16792 case ISD::ADD:
16793 case ISD::SUB:
16794 case ISD::SHL:
16795 case ISD::SRL:
16796 case ISD::SRA:
16797 case ISD::AND:
16798 case ISD::OR:
16799 case ISD::XOR:
16800 case ISD::MUL:
16801 case ISD::SETCC:
16802 case ISD::SELECT:
16803 case ISD::SMIN:
16804 case ISD::SMAX:
16805 case ISD::UMIN:
16806 case ISD::UMAX:
16807 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
16808 return Res;
16809 break;
16810 default:
16811 break;
16812 }
16813
16814 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
16815 return SDValue();
16816
16817 switch (N->getOpcode()) {
16818 case ISD::ADD:
16819 return performAddCombine(N, DCI);
16820 case ISD::PTRADD:
16821 return performPtrAddCombine(N, DCI);
16822 case ISD::SUB:
16823 return performSubCombine(N, DCI);
16824 case ISD::UADDO_CARRY:
16825 case ISD::USUBO_CARRY:
16826 return performAddCarrySubCarryCombine(N, DCI);
16827 case ISD::FADD:
16828 return performFAddCombine(N, DCI);
16829 case ISD::FSUB:
16830 return performFSubCombine(N, DCI);
16831 case ISD::FDIV:
16832 return performFDivCombine(N, DCI);
16833 case ISD::FMUL:
16834 return performFMulCombine(N, DCI);
16835 case ISD::SETCC:
16836 return performSetCCCombine(N, DCI);
16837 case ISD::SELECT:
16838 if (auto Res = performSelectCombine(N, DCI))
16839 return Res;
16840 break;
16841 case ISD::FMAXNUM:
16842 case ISD::FMINNUM:
16843 case ISD::FMAXNUM_IEEE:
16844 case ISD::FMINNUM_IEEE:
16845 case ISD::FMAXIMUM:
16846 case ISD::FMINIMUM:
16847 case ISD::FMAXIMUMNUM:
16848 case ISD::FMINIMUMNUM:
16849 case ISD::SMAX:
16850 case ISD::SMIN:
16851 case ISD::UMAX:
16852 case ISD::UMIN:
16853 case AMDGPUISD::FMIN_LEGACY:
16854 case AMDGPUISD::FMAX_LEGACY:
16855 return performMinMaxCombine(N, DCI);
16856 case ISD::FMA:
16857 return performFMACombine(N, DCI);
16858 case ISD::AND:
16859 return performAndCombine(N, DCI);
16860 case ISD::OR:
16861 return performOrCombine(N, DCI);
16862 case ISD::FSHR: {
16863 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16864 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
16865 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
16866 return matchPERM(N, DCI);
16867 }
16868 break;
16869 }
16870 case ISD::XOR:
16871 return performXorCombine(N, DCI);
16872 case ISD::ZERO_EXTEND:
16873 return performZeroExtendCombine(N, DCI);
16874 case ISD::SIGN_EXTEND_INREG:
16875 return performSignExtendInRegCombine(N, DCI);
16876 case AMDGPUISD::FP_CLASS:
16877 return performClassCombine(N, DCI);
16878 case ISD::FCANONICALIZE:
16879 return performFCanonicalizeCombine(N, DCI);
16880 case AMDGPUISD::RCP:
16881 return performRcpCombine(N, DCI);
16882 case ISD::FLDEXP:
16883 case AMDGPUISD::FRACT:
16884 case AMDGPUISD::RSQ:
16885 case AMDGPUISD::RCP_LEGACY:
16886 case AMDGPUISD::RCP_IFLAG:
16887 case AMDGPUISD::RSQ_CLAMP: {
16888 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
16889 SDValue Src = N->getOperand(0);
16890 if (Src.isUndef())
16891 return Src;
16892 break;
16893 }
16894 case ISD::SINT_TO_FP:
16895 case ISD::UINT_TO_FP:
16896 return performUCharToFloatCombine(N, DCI);
16897 case ISD::FCOPYSIGN:
16898 return performFCopySignCombine(N, DCI);
16899 case AMDGPUISD::CVT_F32_UBYTE0:
16900 case AMDGPUISD::CVT_F32_UBYTE1:
16901 case AMDGPUISD::CVT_F32_UBYTE2:
16902 case AMDGPUISD::CVT_F32_UBYTE3:
16903 return performCvtF32UByteNCombine(N, DCI);
16904 case AMDGPUISD::FMED3:
16905 return performFMed3Combine(N, DCI);
16906 case AMDGPUISD::CVT_PKRTZ_F16_F32:
16907 return performCvtPkRTZCombine(N, DCI);
16908 case AMDGPUISD::CLAMP:
16909 return performClampCombine(N, DCI);
16910 case ISD::SCALAR_TO_VECTOR: {
16911 SelectionDAG &DAG = DCI.DAG;
16912 EVT VT = N->getValueType(0);
16913
16914 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
16915 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
16916 SDLoc SL(N);
16917 SDValue Src = N->getOperand(0);
16918 EVT EltVT = Src.getValueType();
16919 if (EltVT != MVT::i16)
16920 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
16921
16922 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
16923 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
16924 }
16925
16926 break;
16927 }
16928 case ISD::EXTRACT_VECTOR_ELT:
16929 return performExtractVectorEltCombine(N, DCI);
16930 case ISD::INSERT_VECTOR_ELT:
16931 return performInsertVectorEltCombine(N, DCI);
16932 case ISD::FP_ROUND:
16933 return performFPRoundCombine(N, DCI);
16934 case ISD::LOAD: {
16935 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
16936 return Widened;
16937 [[fallthrough]];
16938 }
16939 default: {
16940 if (!DCI.isBeforeLegalize()) {
16941 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
16942 return performMemSDNodeCombine(MemNode, DCI);
16943 }
16944
16945 break;
16946 }
16947 }
16948
16949 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
16950}
16951
16952/// Helper function for adjustWritemask
16953static unsigned SubIdx2Lane(unsigned Idx) {
16954 switch (Idx) {
16955 default:
16956 return ~0u;
16957 case AMDGPU::sub0:
16958 return 0;
16959 case AMDGPU::sub1:
16960 return 1;
16961 case AMDGPU::sub2:
16962 return 2;
16963 case AMDGPU::sub3:
16964 return 3;
16965 case AMDGPU::sub4:
16966 return 4; // Possible with TFE/LWE
16967 }
16968}
16969
16970/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
16971SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
16972 SelectionDAG &DAG) const {
16973 unsigned Opcode = Node->getMachineOpcode();
16974
16975 // Subtract 1 because the vdata output is not a MachineSDNode operand.
16976 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
16977 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
16978 return Node; // not implemented for D16
16979
16980 SDNode *Users[5] = {nullptr};
16981 unsigned Lane = 0;
16982 unsigned DmaskIdx =
16983 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
16984 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
16985 unsigned NewDmask = 0;
16986 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
16987 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
16988 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
16989 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
16990 unsigned TFCLane = 0;
16991 bool HasChain = Node->getNumValues() > 1;
16992
16993 if (OldDmask == 0) {
16994 // These are folded out, but on the chance it happens don't assert.
16995 return Node;
16996 }
16997
16998 unsigned OldBitsSet = llvm::popcount(OldDmask);
16999 // Work out which is the TFE/LWE lane if that is enabled.
17000 if (UsesTFC) {
17001 TFCLane = OldBitsSet;
17002 }
17003
17004 // Try to figure out the used register components
17005 for (SDUse &Use : Node->uses()) {
17006
17007 // Don't look at users of the chain.
17008 if (Use.getResNo() != 0)
17009 continue;
17010
17011 SDNode *User = Use.getUser();
17012
17013 // Abort if we can't understand the usage
17014 if (!User->isMachineOpcode() ||
17015 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17016 return Node;
17017
17018 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
17019 // Note that subregs are packed, i.e. Lane==0 is the first bit set
17020 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
17021 // set, etc.
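// E.g. with OldDmask == 0b1011 the packed results are the x, y and w
// components; a use of sub2 therefore reads the w component and Comp below
// resolves to 3.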
17022 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
17023 if (Lane == ~0u)
17024 return Node;
17025
17026 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
17027 if (UsesTFC && Lane == TFCLane) {
17028 Users[Lane] = User;
17029 } else {
17030 // Set which texture component corresponds to the lane.
17031 unsigned Comp;
17032 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17033 Comp = llvm::countr_zero(Dmask);
17034 Dmask &= ~(1 << Comp);
17035 }
17036
17037 // Abort if we have more than one user per component.
17038 if (Users[Lane])
17039 return Node;
17040
17041 Users[Lane] = User;
17042 NewDmask |= 1 << Comp;
17043 }
17044 }
17045
17046 // Don't allow 0 dmask, as hardware assumes one channel enabled.
17047 bool NoChannels = !NewDmask;
17048 if (NoChannels) {
17049 if (!UsesTFC) {
17050 // No uses of the result and not using TFC. Then do nothing.
17051 return Node;
17052 }
17053 // If the original dmask has one channel - then nothing to do
17054 if (OldBitsSet == 1)
17055 return Node;
17056 // Use an arbitrary dmask - required for the instruction to work
17057 NewDmask = 1;
17058 }
17059 // Abort if there's no change
17060 if (NewDmask == OldDmask)
17061 return Node;
17062
17063 unsigned BitsSet = llvm::popcount(NewDmask);
17064
17065 // Check for TFE or LWE - increase the number of channels by one to account
17066 // for the extra return value
17067 // This will need adjustment for D16 if this is also included in
17068 // adjustWriteMask (this function) but at present D16 are excluded.
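// E.g. NewDmask == 0b0101 gives two data channels; with TFE enabled one more
// dword is appended for the status value, so three channels are needed
// (rounded up to a 4-element result type below).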
17069 unsigned NewChannels = BitsSet + UsesTFC;
17070
17071 int NewOpcode =
17072 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
17073 assert(NewOpcode != -1 &&
17074 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17075 "failed to find equivalent MIMG op");
17076
17077 // Adjust the writemask in the node
17078 SmallVector<SDValue, 12> Ops;
17079 llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
17080 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
17081 llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));
17082
17083 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17084
17085 MVT ResultVT = NewChannels == 1
17086 ? SVT
17087 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
17088 : NewChannels == 5 ? 8
17089 : NewChannels);
17090 SDVTList NewVTList =
17091 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
17092
17093 MachineSDNode *NewNode =
17094 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
17095
17096 if (HasChain) {
17097 // Update chain.
17098 DAG.setNodeMemRefs(NewNode, Node->memoperands());
17099 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
17100 }
17101
17102 if (NewChannels == 1) {
17103 assert(Node->hasNUsesOfValue(1, 0));
17104 SDNode *Copy =
17105 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
17106 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
17107 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
17108 return nullptr;
17109 }
17110
17111 // Update the users of the node with the new indices
17112 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17113 SDNode *User = Users[i];
17114 if (!User) {
17115 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
17116 // Users[0] is still nullptr because channel 0 doesn't really have a use.
17117 if (i || !NoChannels)
17118 continue;
17119 } else {
17120 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
17121 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
17122 if (NewUser != User) {
17123 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
17124 DAG.RemoveDeadNode(User);
17125 }
17126 }
17127
17128 switch (Idx) {
17129 default:
17130 break;
17131 case AMDGPU::sub0:
17132 Idx = AMDGPU::sub1;
17133 break;
17134 case AMDGPU::sub1:
17135 Idx = AMDGPU::sub2;
17136 break;
17137 case AMDGPU::sub2:
17138 Idx = AMDGPU::sub3;
17139 break;
17140 case AMDGPU::sub3:
17141 Idx = AMDGPU::sub4;
17142 break;
17143 }
17144 }
17145
17146 DAG.RemoveDeadNode(Node);
17147 return nullptr;
17148}
17149
17150 static bool isFrameIndexOp(SDValue Op) {
17151 if (Op.getOpcode() == ISD::AssertZext)
17152 Op = Op.getOperand(0);
17153
17154 return isa<FrameIndexSDNode>(Op);
17155}
17156
17157/// Legalize target independent instructions (e.g. INSERT_SUBREG)
17158/// with frame index operands.
17159 /// LLVM assumes that inputs to these instructions are registers.
17160SDNode *
17161 SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
17162 SelectionDAG &DAG) const {
17163 if (Node->getOpcode() == ISD::CopyToReg) {
17164 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
17165 SDValue SrcVal = Node->getOperand(2);
17166
17167 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
17168 // to try understanding copies to physical registers.
17169 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
17170 SDLoc SL(Node);
17171 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17172 SDValue VReg = DAG.getRegister(
17173 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17174
17175 SDNode *Glued = Node->getGluedNode();
17176 SDValue ToVReg = DAG.getCopyToReg(
17177 Node->getOperand(0), SL, VReg, SrcVal,
17178 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
17179 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
17180 VReg, ToVReg.getValue(1));
17181 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
17182 DAG.RemoveDeadNode(Node);
17183 return ToResultReg.getNode();
17184 }
17185 }
17186
17187 SmallVector<SDValue, 8> Ops;
17188 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17189 if (!isFrameIndexOp(Node->getOperand(i))) {
17190 Ops.push_back(Node->getOperand(i));
17191 continue;
17192 }
17193
17194 SDLoc DL(Node);
17195 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
17196 Node->getOperand(i).getValueType(),
17197 Node->getOperand(i)),
17198 0));
17199 }
17200
17201 return DAG.UpdateNodeOperands(Node, Ops);
17202}
17203
17204/// Fold the instructions after selecting them.
17205/// Returns null if users were already updated.
17206 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
17207 SelectionDAG &DAG) const {
17208 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17209 unsigned Opcode = Node->getMachineOpcode();
17210
17211 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17212 !TII->isGather4(Opcode) &&
17213 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
17214 return adjustWritemask(Node, DAG);
17215 }
17216
17217 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17218 legalizeTargetIndependentNode(Node, DAG);
17219 return Node;
17220 }
17221
17222 switch (Opcode) {
17223 case AMDGPU::V_DIV_SCALE_F32_e64:
17224 case AMDGPU::V_DIV_SCALE_F64_e64: {
17225 // Satisfy the operand register constraint when one of the inputs is
17226 // undefined. Ordinarily each undef value will have its own implicit_def of
17227 // a vreg, so force these to use a single register.
17228 SDValue Src0 = Node->getOperand(1);
17229 SDValue Src1 = Node->getOperand(3);
17230 SDValue Src2 = Node->getOperand(5);
17231
17232 if ((Src0.isMachineOpcode() &&
17233 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
17234 (Src0 == Src1 || Src0 == Src2))
17235 break;
17236
17237 MVT VT = Src0.getValueType().getSimpleVT();
17238 const TargetRegisterClass *RC =
17239 getRegClassFor(VT, Src0.getNode()->isDivergent());
17240
17241 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17242 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
17243
17244 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
17245 Src0, SDValue());
17246
17247 // src0 must be the same register as src1 or src2, even if the value is
17248 // undefined, so make sure we don't violate this constraint.
17249 if (Src0.isMachineOpcode() &&
17250 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
17251 if (Src1.isMachineOpcode() &&
17252 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17253 Src0 = Src1;
17254 else if (Src2.isMachineOpcode() &&
17255 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17256 Src0 = Src2;
17257 else {
17258 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
17259 Src0 = UndefReg;
17260 Src1 = UndefReg;
17261 }
17262 } else
17263 break;
17264
17265 SmallVector<SDValue, 9> Ops(Node->ops());
17266 Ops[1] = Src0;
17267 Ops[3] = Src1;
17268 Ops[5] = Src2;
17269 Ops.push_back(ImpDef.getValue(1));
17270 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
17271 }
17272 default:
17273 break;
17274 }
17275
17276 return Node;
17277}
17278
17279// Any MIMG instructions that use tfe or lwe require an initialization of the
17280// result register that will be written in the case of a memory access failure.
17281// The required code is also added to tie this init code to the result of the
17282// img instruction.
17283 void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
17284 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17285 const SIRegisterInfo &TRI = TII->getRegisterInfo();
17286 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
17287 MachineBasicBlock &MBB = *MI.getParent();
17288
17289 int DstIdx =
17290 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
17291 unsigned InitIdx = 0;
17292
17293 if (TII->isImage(MI)) {
17294 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
17295 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
17296 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
17297
17298 if (!TFE && !LWE) // intersect_ray
17299 return;
17300
17301 unsigned TFEVal = TFE ? TFE->getImm() : 0;
17302 unsigned LWEVal = LWE ? LWE->getImm() : 0;
17303 unsigned D16Val = D16 ? D16->getImm() : 0;
17304
17305 if (!TFEVal && !LWEVal)
17306 return;
17307
17308 // At least one of TFE or LWE is non-zero.
17309 // We have to insert a suitable initialization of the result value and
17310 // tie this to the dest of the image instruction.
17311
17312 // Calculate which dword we have to initialize to 0.
17313 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
17314
17315 // check that dmask operand is found.
17316 assert(MO_Dmask && "Expected dmask operand in instruction");
17317
17318 unsigned dmask = MO_Dmask->getImm();
17319 // Determine the number of active lanes taking into account the
17320 // Gather4 special case
17321 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
17322
17323 bool Packed = !Subtarget->hasUnpackedD16VMem();
17324
17325 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
17326
17327 // Abandon attempt if the dst size isn't large enough
17328 // - this is in fact an error but this is picked up elsewhere and
17329 // reported correctly.
17330 uint32_t DstSize =
17331 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17332 if (DstSize < InitIdx)
17333 return;
17334 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
17335 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17336 } else {
17337 return;
17338 }
17339
17340 const DebugLoc &DL = MI.getDebugLoc();
17341
17342 // Create a register for the initialization value.
17343 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
17344 unsigned NewDst = 0; // Final initialized value will be in here
17345
17346 // If PRTStrictNull feature is enabled (the default) then initialize
17347 // all the result registers to 0, otherwise just the error indication
17348 // register (VGPRn+1)
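// E.g. with InitIdx == 4: under PRTStrictNull all four result dwords are
// zeroed (SizeLeft == 4 starting at dword 0); otherwise only the TFE/LWE
// status dword is cleared (SizeLeft == 1 starting at dword 3).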
17349 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17350 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
17351
17352 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
17353 for (; SizeLeft; SizeLeft--, CurrIdx++) {
17354 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
17355 // Initialize dword
17356 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
17357 // clang-format off
17358 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
17359 .addImm(0);
17360 // clang-format on
17361 // Insert into the super-reg
17362 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
17363 .addReg(PrevDst)
17364 .addReg(SubReg)
17365 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
17366
17367 PrevDst = NewDst;
17368 }
17369
17370 // Add as an implicit operand
17371 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
17372
17373 // Tie the just added implicit operand to the dst
17374 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
17375}
17376
17377/// Assign the register class depending on the number of
17378/// bits set in the writemask
17379 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
17380 SDNode *Node) const {
17381 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17382
17383 MachineFunction *MF = MI.getParent()->getParent();
17384 MachineRegisterInfo &MRI = MF->getRegInfo();
17385 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
17386
17387 if (TII->isVOP3(MI.getOpcode())) {
17388 // Make sure constant bus requirements are respected.
17389 TII->legalizeOperandsVOP3(MRI, MI);
17390
17391 // Prefer VGPRs over AGPRs in mAI instructions where possible.
17392 // This saves a chain-copy of registers and better balance register
17393 // use between vgpr and agpr as agpr tuples tend to be big.
17394 if (!MI.getDesc().operands().empty()) {
17395 unsigned Opc = MI.getOpcode();
17396 bool HasAGPRs = Info->mayNeedAGPRs();
17397 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17398 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
17399 for (auto I :
17400 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
17401 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
17402 if (I == -1)
17403 break;
17404 if ((I == Src2Idx) && (HasAGPRs))
17405 break;
17406 MachineOperand &Op = MI.getOperand(I);
17407 if (!Op.isReg() || !Op.getReg().isVirtual())
17408 continue;
17409 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
17410 if (!TRI->hasAGPRs(RC))
17411 continue;
17412 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
17413 if (!Src || !Src->isCopy() ||
17414 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
17415 continue;
17416 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
17417 // All uses of agpr64 and agpr32 can also accept vgpr except for
17418 // v_accvgpr_read, but we do not produce agpr reads during selection,
17419 // so no use checks are needed.
17420 MRI.setRegClass(Op.getReg(), NewRC);
17421 }
17422
17423 if (TII->isMAI(MI)) {
17424 // The ordinary src0, src1, src2 were legalized above.
17425 //
17426 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
17427 // as a separate instruction.
17428 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17429 AMDGPU::OpName::scale_src0);
17430 if (Src0Idx != -1) {
17431 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17432 AMDGPU::OpName::scale_src1);
17433 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17434 TII->usesConstantBus(MRI, MI, Src1Idx))
17435 TII->legalizeOpWithMove(MI, Src1Idx);
17436 }
17437 }
17438
17439 if (!HasAGPRs)
17440 return;
17441
17442 // Resolve the rest of AV operands to AGPRs.
17443 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
17444 if (Src2->isReg() && Src2->getReg().isVirtual()) {
17445 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
17446 if (TRI->isVectorSuperClass(RC)) {
17447 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
17448 MRI.setRegClass(Src2->getReg(), NewRC);
17449 if (Src2->isTied())
17450 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
17451 }
17452 }
17453 }
17454 }
17455
17456 return;
17457 }
17458
17459 if (TII->isImage(MI))
17460 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
17461}
17462
17463 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
17464 uint64_t Val) {
17465 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
17466 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
17467}
17468
17469 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
17470 const SDLoc &DL,
17471 SDValue Ptr) const {
17472 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17473
17474 // Build the half of the subregister with the constants before building the
17475 // full 128-bit register. If we are building multiple resource descriptors,
17476 // this will allow CSEing of the 2-component register.
17477 const SDValue Ops0[] = {
17478 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
17479 buildSMovImm32(DAG, DL, 0),
17480 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17481 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
17482 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
17483
17484 SDValue SubRegHi = SDValue(
17485 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
17486
17487 // Combine the constants and the pointer.
17488 const SDValue Ops1[] = {
17489 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
17490 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
17491 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
17492
17493 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
17494}
17495
17496/// Return a resource descriptor with the 'Add TID' bit enabled
17497/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
17498/// of the resource descriptor) to create an offset, which is added to
17499/// the resource pointer.
17500 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
17501 SDValue Ptr, uint32_t RsrcDword1,
17502 uint64_t RsrcDword2And3) const {
17503 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
17504 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
17505 if (RsrcDword1) {
17506 PtrHi =
17507 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
17508 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
17509 0);
17510 }
17511
17512 SDValue DataLo =
17513 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
17514 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
17515
17516 const SDValue Ops[] = {
17517 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
17518 PtrLo,
17519 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17520 PtrHi,
17521 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
17522 DataLo,
17523 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
17524 DataHi,
17525 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
17526
17527 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
17528}
17529
17530//===----------------------------------------------------------------------===//
17531// SI Inline Assembly Support
17532//===----------------------------------------------------------------------===//
17533
17534std::pair<unsigned, const TargetRegisterClass *>
17535 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
17536 StringRef Constraint,
17537 MVT VT) const {
17538 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
17539
17540 const TargetRegisterClass *RC = nullptr;
17541 if (Constraint.size() == 1) {
17542 // Check if we cannot determine the bit size of the given value type. This
17543 // can happen, for example, in this situation where we have an empty struct
17544 // (size 0): `call void asm "", "v"({} poison)`-
17545 if (VT == MVT::Other)
17546 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17547 const unsigned BitWidth = VT.getSizeInBits();
17548 switch (Constraint[0]) {
17549 default:
17550 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17551 case 's':
17552 case 'r':
17553 switch (BitWidth) {
17554 case 16:
17555 RC = &AMDGPU::SReg_32RegClass;
17556 break;
17557 case 64:
17558 RC = &AMDGPU::SGPR_64RegClass;
17559 break;
17560 default:
17561 RC = TRI->getSGPRClassForBitWidth(BitWidth);
17562 if (!RC)
17563 return std::pair(0U, nullptr);
17564 break;
17565 }
17566 break;
17567 case 'v':
17568 switch (BitWidth) {
17569 case 16:
17570 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17571 : &AMDGPU::VGPR_32_Lo256RegClass;
17572 break;
17573 default:
17574 RC = Subtarget->has1024AddressableVGPRs()
17575 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17576 : TRI->getVGPRClassForBitWidth(BitWidth);
17577 if (!RC)
17578 return std::pair(0U, nullptr);
17579 break;
17580 }
17581 break;
17582 case 'a':
17583 if (!Subtarget->hasMAIInsts())
17584 break;
17585 switch (BitWidth) {
17586 case 16:
17587 RC = &AMDGPU::AGPR_32RegClass;
17588 break;
17589 default:
17590 RC = TRI->getAGPRClassForBitWidth(BitWidth);
17591 if (!RC)
17592 return std::pair(0U, nullptr);
17593 break;
17594 }
17595 break;
17596 }
17597 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17598 const unsigned BitWidth = VT.getSizeInBits();
17599 switch (BitWidth) {
17600 case 16:
17601 RC = &AMDGPU::AV_32RegClass;
17602 break;
17603 default:
17604 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17605 if (!RC)
17606 return std::pair(0U, nullptr);
17607 break;
17608 }
17609 }
17610
17611 // We actually support i128, i16 and f16 as inline parameters
17612 // even if they are not reported as legal
17613 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
17614 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
17615 return std::pair(0U, RC);
17616
17617 auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
17618 if (Kind != '\0') {
17619 if (Kind == 'v') {
17620 RC = &AMDGPU::VGPR_32_Lo256RegClass;
17621 } else if (Kind == 's') {
17622 RC = &AMDGPU::SGPR_32RegClass;
17623 } else if (Kind == 'a') {
17624 RC = &AMDGPU::AGPR_32RegClass;
17625 }
17626
17627 if (RC) {
17628 if (NumRegs > 1) {
17629 if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
17630 return std::pair(0U, nullptr);
17631
17632 uint32_t Width = NumRegs * 32;
17633 // Prohibit constraints for register ranges with a width that does not
17634 // match the required type.
17635 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
17636 return std::pair(0U, nullptr);
17637
17638 MCRegister Reg = RC->getRegister(Idx);
17639 if (SIRegisterInfo::isVGPRClass(RC))
17640 RC = TRI->getVGPRClassForBitWidth(Width);
17641 else if (SIRegisterInfo::isSGPRClass(RC))
17642 RC = TRI->getSGPRClassForBitWidth(Width);
17643 else if (SIRegisterInfo::isAGPRClass(RC))
17644 RC = TRI->getAGPRClassForBitWidth(Width);
17645 if (RC) {
17646 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17647 if (!Reg) {
17648 // The register class does not contain the requested register,
17649 // e.g., because it is an SGPR pair that would violate alignment
17650 // requirements.
17651 return std::pair(0U, nullptr);
17652 }
17653 return std::pair(Reg, RC);
17654 }
17655 }
17656
17657 // Check for lossy scalar/vector conversions.
17658 if (VT.isVector() && VT.getSizeInBits() != 32)
17659 return std::pair(0U, nullptr);
17660 if (Idx < RC->getNumRegs())
17661 return std::pair(RC->getRegister(Idx), RC);
17662 return std::pair(0U, nullptr);
17663 }
17664 }
17665
17666 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17667 if (Ret.first)
17668 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
17669
17670 return Ret;
17671}
17672
17673static bool isImmConstraint(StringRef Constraint) {
17674 if (Constraint.size() == 1) {
17675 switch (Constraint[0]) {
17676 default:
17677 break;
17678 case 'I':
17679 case 'J':
17680 case 'A':
17681 case 'B':
17682 case 'C':
17683 return true;
17684 }
17685 } else if (Constraint == "DA" || Constraint == "DB") {
17686 return true;
17687 }
17688 return false;
17689}
17690
17691 SITargetLowering::ConstraintType
17692 SITargetLowering::getConstraintType(StringRef Constraint) const {
17693 if (Constraint.size() == 1) {
17694 switch (Constraint[0]) {
17695 default:
17696 break;
17697 case 's':
17698 case 'v':
17699 case 'a':
17700 return C_RegisterClass;
17701 }
17702 } else if (Constraint.size() == 2) {
17703 if (Constraint == "VA")
17704 return C_RegisterClass;
17705 }
17706 if (isImmConstraint(Constraint)) {
17707 return C_Other;
17708 }
17709 return TargetLowering::getConstraintType(Constraint);
17710}
17711
17712static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
17713 if (Size < 64) {
17714 Val = Val & maskTrailingOnes<uint64_t>(Size);
17715 }
17716 return Val;
17717}
17718
17719 void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
17720 StringRef Constraint,
17721 std::vector<SDValue> &Ops,
17722 SelectionDAG &DAG) const {
17723 if (isImmConstraint(Constraint)) {
17724 uint64_t Val;
17725 if (getAsmOperandConstVal(Op, Val) &&
17726 checkAsmConstraintVal(Op, Constraint, Val)) {
17727 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
17728 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
17729 }
17730 } else {
17731 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
17732 }
17733}
17734
17735bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
17736 unsigned Size = Op.getScalarValueSizeInBits();
17737 if (Size > 64)
17738 return false;
17739
17740 if (Size == 16 && !Subtarget->has16BitInsts())
17741 return false;
17742
17743 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
17744 Val = C->getSExtValue();
17745 return true;
17746 }
17747 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
17748 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17749 return true;
17750 }
17751 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
17752 if (Size != 16 || Op.getNumOperands() != 2)
17753 return false;
17754 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17755 return false;
17756 if (ConstantSDNode *C = V->getConstantSplatNode()) {
17757 Val = C->getSExtValue();
17758 return true;
17759 }
17760 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
17761 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17762 return true;
17763 }
17764 }
17765
17766 return false;
17767}
17768
17769bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
17770 uint64_t Val) const {
17771 if (Constraint.size() == 1) {
17772 switch (Constraint[0]) {
17773 case 'I':
17774 return AMDGPU::isInlinableIntLiteral(Val);
17775 case 'J':
17776 return isInt<16>(Val);
17777 case 'A':
17778 return checkAsmConstraintValA(Op, Val);
17779 case 'B':
17780 return isInt<32>(Val);
17781 case 'C':
17782 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
17783 AMDGPU::isInlinableIntLiteral(Val);
17784 default:
17785 break;
17786 }
17787 } else if (Constraint.size() == 2) {
17788 if (Constraint == "DA") {
17789 int64_t HiBits = static_cast<int32_t>(Val >> 32);
17790 int64_t LoBits = static_cast<int32_t>(Val);
17791 return checkAsmConstraintValA(Op, HiBits, 32) &&
17792 checkAsmConstraintValA(Op, LoBits, 32);
17793 }
17794 if (Constraint == "DB") {
17795 return true;
17796 }
17797 }
17798 llvm_unreachable("Invalid asm constraint");
17799}
17800
17801bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
17802 unsigned MaxSize) const {
17803 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
17804 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
17805 if (Size == 16) {
17806 MVT VT = Op.getSimpleValueType();
17807 switch (VT.SimpleTy) {
17808 default:
17809 return false;
17810 case MVT::i16:
17811 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
17812 case MVT::f16:
17813 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
17814 case MVT::bf16:
17815 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
17816 case MVT::v2i16:
17817 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
17818 case MVT::v2f16:
17819 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
17820 case MVT::v2bf16:
17821 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
17822 }
17823 }
17824 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
17825 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
17826 return true;
17827 return false;
17828}
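// Editor's note (illustrative, not in the upstream source): for an f32
// operand, the 'A' check above accepts the bit pattern of an inline constant
// such as 1.0f (0x3f800000) or a small integer like 64, but rejects an
// arbitrary literal such as 1.5f (0x3fc00000), which has no inline encoding.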
17829
17830static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
17831 switch (UnalignedClassID) {
17832 case AMDGPU::VReg_64RegClassID:
17833 return AMDGPU::VReg_64_Align2RegClassID;
17834 case AMDGPU::VReg_96RegClassID:
17835 return AMDGPU::VReg_96_Align2RegClassID;
17836 case AMDGPU::VReg_128RegClassID:
17837 return AMDGPU::VReg_128_Align2RegClassID;
17838 case AMDGPU::VReg_160RegClassID:
17839 return AMDGPU::VReg_160_Align2RegClassID;
17840 case AMDGPU::VReg_192RegClassID:
17841 return AMDGPU::VReg_192_Align2RegClassID;
17842 case AMDGPU::VReg_224RegClassID:
17843 return AMDGPU::VReg_224_Align2RegClassID;
17844 case AMDGPU::VReg_256RegClassID:
17845 return AMDGPU::VReg_256_Align2RegClassID;
17846 case AMDGPU::VReg_288RegClassID:
17847 return AMDGPU::VReg_288_Align2RegClassID;
17848 case AMDGPU::VReg_320RegClassID:
17849 return AMDGPU::VReg_320_Align2RegClassID;
17850 case AMDGPU::VReg_352RegClassID:
17851 return AMDGPU::VReg_352_Align2RegClassID;
17852 case AMDGPU::VReg_384RegClassID:
17853 return AMDGPU::VReg_384_Align2RegClassID;
17854 case AMDGPU::VReg_512RegClassID:
17855 return AMDGPU::VReg_512_Align2RegClassID;
17856 case AMDGPU::VReg_1024RegClassID:
17857 return AMDGPU::VReg_1024_Align2RegClassID;
17858 case AMDGPU::AReg_64RegClassID:
17859 return AMDGPU::AReg_64_Align2RegClassID;
17860 case AMDGPU::AReg_96RegClassID:
17861 return AMDGPU::AReg_96_Align2RegClassID;
17862 case AMDGPU::AReg_128RegClassID:
17863 return AMDGPU::AReg_128_Align2RegClassID;
17864 case AMDGPU::AReg_160RegClassID:
17865 return AMDGPU::AReg_160_Align2RegClassID;
17866 case AMDGPU::AReg_192RegClassID:
17867 return AMDGPU::AReg_192_Align2RegClassID;
17868 case AMDGPU::AReg_256RegClassID:
17869 return AMDGPU::AReg_256_Align2RegClassID;
17870 case AMDGPU::AReg_512RegClassID:
17871 return AMDGPU::AReg_512_Align2RegClassID;
17872 case AMDGPU::AReg_1024RegClassID:
17873 return AMDGPU::AReg_1024_Align2RegClassID;
17874 default:
17875 return -1;
17876 }
17877}
17878
17879// Figure out which registers should be reserved for stack access. Only after
17880// the function is legalized do we know all of the non-spill stack objects or if
17881// calls are present.
17882void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
17883 MachineRegisterInfo &MRI = MF.getRegInfo();
17884 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
17885 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
17886 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17887 const SIInstrInfo *TII = ST.getInstrInfo();
17888
17889 if (Info->isEntryFunction()) {
17890 // Callable functions have fixed registers used for stack access.
17891 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
17892 }
17893
17894 // TODO: Move this logic to getReservedRegs()
17895 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
17896 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
17897 Register SReg = ST.isWave32()
17898 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
17899 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
17900 &AMDGPU::SGPR_64RegClass);
17901 Info->setSGPRForEXECCopy(SReg);
17902
17903 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
17904 Info->getStackPtrOffsetReg()));
17905 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
17906 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
17907
17908 // We need to worry about replacing the default register with itself in case
17909 // of MIR testcases missing the MFI.
17910 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
17911 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
17912
17913 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
17914 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
17915
17916 Info->limitOccupancy(MF);
17917
17918 if (ST.isWave32() && !MF.empty()) {
17919 for (auto &MBB : MF) {
17920 for (auto &MI : MBB) {
17921 TII->fixImplicitOperands(MI);
17922 }
17923 }
17924 }
17925
17926 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
17927 // classes if required. Ideally the register class constraints would differ
17928 // per-subtarget, but there's no easy way to achieve that right now. This is
17929 // not a problem for VGPRs because the correctly aligned VGPR class is implied
17930 // from using them as the register class for legal types.
17931 if (ST.needsAlignedVGPRs()) {
17932 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
17933 const Register Reg = Register::index2VirtReg(I);
17934 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
17935 if (!RC)
17936 continue;
17937 int NewClassID = getAlignedAGPRClassID(RC->getID());
17938 if (NewClassID != -1)
17939 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
17940 }
17941 }
17942
17943 TargetLowering::finalizeLowering(MF);
17944}
17945
17946void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
17947 KnownBits &Known,
17948 const APInt &DemandedElts,
17949 const SelectionDAG &DAG,
17950 unsigned Depth) const {
17951 Known.resetAll();
17952 unsigned Opc = Op.getOpcode();
17953 switch (Opc) {
17954 case ISD::INTRINSIC_WO_CHAIN: {
17955 unsigned IID = Op.getConstantOperandVal(0);
17956 switch (IID) {
17957 case Intrinsic::amdgcn_mbcnt_lo:
17958 case Intrinsic::amdgcn_mbcnt_hi: {
17959 const GCNSubtarget &ST =
17960 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
17961 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
17962 // most 31 + src1.
17963 Known.Zero.setBitsFrom(
17964 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
17965 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
17966 Known = KnownBits::add(Known, Known2);
17967 return;
17968 }
17969 }
17970 break;
17971 }
17972 }
17973 return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
17974 Op, Known, DemandedElts, DAG, Depth);
17975}
17976
17977void SITargetLowering::computeKnownBitsForFrameIndex(
17978 const int FI, KnownBits &Known, const MachineFunction &MF) const {
17979 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
17980
17981 // Set the high bits to zero based on the maximum allowed scratch size per
17982 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
17983 // calculation won't overflow, so assume the sign bit is never set.
17984 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
17985}
17986
17987static void knownBitsForWorkitemID(const GCNSubtarget &ST,
17988 GISelValueTracking &VT, KnownBits &Known,
17989 unsigned Dim) {
17990 unsigned MaxValue =
17991 ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
17992 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
17993}
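// Editor's worked example (illustrative, not in the upstream source): if the
// maximum workitem ID in the queried dimension is 1023, countl_zero(1023) on a
// 32-bit value is 22, so the top 22 bits of the result are marked known zero.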
17994
17995static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT,
17996 KnownBits &Known, const APInt &DemandedElts,
17997 unsigned BFEWidth, bool SExt, unsigned Depth) {
17999 const MachineOperand &Src1 = MI.getOperand(2);
18000
18001 unsigned Src1Cst = 0;
18002 if (Src1.isImm()) {
18003 Src1Cst = Src1.getImm();
18004 } else if (Src1.isReg()) {
18005 auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
18006 if (!Cst)
18007 return;
18008 Src1Cst = Cst->Value.getZExtValue();
18009 } else {
18010 return;
18011 }
18012
18013 // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
18014 // Width is always [22:16].
18015 const unsigned Offset =
18016 Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
18017 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
18018
18019 if (Width >= BFEWidth) // Ill-formed.
18020 return;
18021
18022 VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
18023 Depth + 1);
18024
18025 Known = Known.extractBits(Width, Offset);
18026
18027 if (SExt)
18028 Known = Known.sext(BFEWidth);
18029 else
18030 Known = Known.zext(BFEWidth);
18031}
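// Editor's worked example (illustrative, not in the upstream source): for
// S_BFE_U32 with an immediate src1 of 0x00040008, the decode above gives
// Offset = 8 (bits [4:0]) and Width = 4 (bits [22:16]); Known becomes the four
// bits of src0 starting at bit 8, zero-extended, so bits [31:4] of the result
// are known zero.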
18032
18033void SITargetLowering::computeKnownBitsForTargetInstr(
18034 GISelValueTracking &VT, Register R, KnownBits &Known,
18035 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
18036 unsigned Depth) const {
18037 Known.resetAll();
18038 const MachineInstr *MI = MRI.getVRegDef(R);
18039 switch (MI->getOpcode()) {
18040 case AMDGPU::S_BFE_I32:
18041 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18042 /*SExt=*/true, Depth);
18043 case AMDGPU::S_BFE_U32:
18044 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18045 /*SExt=*/false, Depth);
18046 case AMDGPU::S_BFE_I64:
18047 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18048 /*SExt=*/true, Depth);
18049 case AMDGPU::S_BFE_U64:
18050 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18051 /*SExt=*/false, Depth);
18052 case AMDGPU::G_INTRINSIC:
18053 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18054 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
18055 switch (IID) {
18056 case Intrinsic::amdgcn_workitem_id_x:
18057 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
18058 break;
18059 case Intrinsic::amdgcn_workitem_id_y:
18060 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
18061 break;
18062 case Intrinsic::amdgcn_workitem_id_z:
18063 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
18064 break;
18065 case Intrinsic::amdgcn_mbcnt_lo:
18066 case Intrinsic::amdgcn_mbcnt_hi: {
18067 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18068 // most 31 + src1.
18069 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
18070 ? getSubtarget()->getWavefrontSizeLog2()
18071 : 5);
18072 KnownBits Known2;
18073 VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
18074 Depth + 1);
18075 Known = KnownBits::add(Known, Known2);
18076 break;
18077 }
18078 case Intrinsic::amdgcn_groupstaticsize: {
18079 // We can report everything over the maximum size as 0. We can't report
18080 // based on the actual size because we don't know if it's accurate or not
18081 // at any given point.
18082 Known.Zero.setHighBits(
18083 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
18084 break;
18085 }
18086 }
18087 break;
18088 }
18089 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18090 Known.Zero.setHighBits(24);
18091 break;
18092 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18093 Known.Zero.setHighBits(16);
18094 break;
18095 case AMDGPU::G_AMDGPU_SMED3:
18096 case AMDGPU::G_AMDGPU_UMED3: {
18097 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18098
18099 KnownBits Known2;
18100 VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
18101 if (Known2.isUnknown())
18102 break;
18103
18104 KnownBits Known1;
18105 VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
18106 if (Known1.isUnknown())
18107 break;
18108
18109 KnownBits Known0;
18110 VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
18111 if (Known0.isUnknown())
18112 break;
18113
18114 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
18115 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
18116 Known.One = Known0.One & Known1.One & Known2.One;
18117 break;
18118 }
18119 }
18120}
18121
18122Align SITargetLowering::computeKnownAlignForTargetInstr(
18123 GISelValueTracking &VT, Register R, const MachineRegisterInfo &MRI,
18124 unsigned Depth) const {
18125 const MachineInstr *MI = MRI.getVRegDef(R);
18126 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
18127 // FIXME: Can this move to generic code? What about the case where the call
18128 // site specifies a lower alignment?
18129 Intrinsic::ID IID = GI->getIntrinsicID();
18131 AttributeList Attrs =
18132 Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
18133 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18134 return *RetAlign;
18135 }
18136 return Align(1);
18137}
18138
18139Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
18140 const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
18141 const Align CacheLineAlign = Align(64);
18142
18143 // Pre-GFX10 targets did not benefit from loop alignment.
18144 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
18145 getSubtarget()->hasInstFwdPrefetchBug())
18146 return PrefAlign;
18147
18148 // On GFX10 I$ is 4 x 64 bytes cache lines.
18149 // By default prefetcher keeps one cache line behind and reads two ahead.
18150 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
18151 // behind and one ahead.
18152 // Therefore we can benefit from aligning loop headers if the loop fits in 192 bytes.
18153 // If the loop fits in 64 bytes it always spans no more than two cache lines
18154 // and does not need any alignment.
18155 // Otherwise, if the loop is at most 128 bytes we do not need to modify the
18156 // prefetch settings; if it is at most 192 bytes we need two lines behind.
18157
18158 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
18159 const MachineBasicBlock *Header = ML->getHeader();
18160 if (Header->getAlignment() != PrefAlign)
18161 return Header->getAlignment(); // Already processed.
18162
18163 unsigned LoopSize = 0;
18164 for (const MachineBasicBlock *MBB : ML->blocks()) {
18165 // If an inner loop block is aligned, assume that on average half of the
18166 // alignment size is added as nops.
18167 if (MBB != Header)
18168 LoopSize += MBB->getAlignment().value() / 2;
18169
18170 for (const MachineInstr &MI : *MBB) {
18171 LoopSize += TII->getInstSizeInBytes(MI);
18172 if (LoopSize > 192)
18173 return PrefAlign;
18174 }
18175 }
18176
18177 if (LoopSize <= 64)
18178 return PrefAlign;
18179
18180 if (LoopSize <= 128)
18181 return CacheLineAlign;
18182
18183 // If any of the parent loops is surrounded by prefetch instructions, do not
18184 // insert new ones for the inner loop, as that would reset the parent's settings.
18185 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
18186 if (MachineBasicBlock *Exit = P->getExitBlock()) {
18187 auto I = Exit->getFirstNonDebugInstr();
18188 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18189 return CacheLineAlign;
18190 }
18191 }
18192
18193 MachineBasicBlock *Pre = ML->getLoopPreheader();
18194 MachineBasicBlock *Exit = ML->getExitBlock();
18195
18196 if (Pre && Exit) {
18197 auto PreTerm = Pre->getFirstTerminator();
18198 if (PreTerm == Pre->begin() ||
18199 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18200 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18201 .addImm(1); // prefetch 2 lines behind PC
18202
18203 auto ExitHead = Exit->getFirstNonDebugInstr();
18204 if (ExitHead == Exit->end() ||
18205 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18206 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18207 .addImm(2); // prefetch 1 line behind PC
18208 }
18209
18210 return CacheLineAlign;
18211}
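// Editor's worked example (illustrative, not in the upstream source): on a
// GFX10-class target with instruction prefetch, an 80-byte loop is aligned to
// the 64-byte cache line without touching the prefetch mode, while a 180-byte
// loop is additionally bracketed by S_INST_PREFETCH 1 in the preheader (two
// lines behind the PC inside the loop) and S_INST_PREFETCH 2 after the exit;
// loops larger than 192 bytes keep the default preferred alignment.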
18212
18214static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
18215 assert(N->getOpcode() == ISD::CopyFromReg);
18216 do {
18217 // Follow the chain until we find an INLINEASM node.
18218 N = N->getOperand(0).getNode();
18219 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
18220 return true;
18221 } while (N->getOpcode() == ISD::CopyFromReg);
18222 return false;
18223}
18224
18225bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
18226 FunctionLoweringInfo *FLI,
18227 UniformityInfo *UA) const {
18228 switch (N->getOpcode()) {
18229 case ISD::CopyFromReg: {
18230 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
18231 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
18232 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18233 Register Reg = R->getReg();
18234
18235 // FIXME: Why does this need to consider isLiveIn?
18236 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18237 return !TRI->isSGPRReg(MRI, Reg);
18238
18239 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
18240 return UA->isDivergent(V);
18241
18243 return !TRI->isSGPRReg(MRI, Reg);
18244 }
18245 case ISD::LOAD: {
18246 const LoadSDNode *L = cast<LoadSDNode>(N);
18247 unsigned AS = L->getAddressSpace();
18248 // A flat load may access private memory.
18249 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
18250 }
18251 case ISD::CALLSEQ_END:
18252 return true;
18254 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
18256 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
18275 // Target-specific read-modify-write atomics are sources of divergence.
18276 return true;
18277 default:
18278 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
18279 // Generic read-modify-write atomics are sources of divergence.
18280 return A->readMem() && A->writeMem();
18281 }
18282 return false;
18283 }
18284}
18285
18287 EVT VT) const {
18288 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
18289 case MVT::f32:
18291 case MVT::f64:
18292 case MVT::f16:
18294 default:
18295 return false;
18296 }
18297}
18298
18300 LLT Ty, const MachineFunction &MF) const {
18301 switch (Ty.getScalarSizeInBits()) {
18302 case 32:
18303 return !denormalModeIsFlushAllF32(MF);
18304 case 64:
18305 case 16:
18306 return !denormalModeIsFlushAllF64F16(MF);
18307 default:
18308 return false;
18309 }
18310}
18311
18313 const APInt &DemandedElts,
18314 const SelectionDAG &DAG,
18315 bool SNaN,
18316 unsigned Depth) const {
18317 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
18318 const MachineFunction &MF = DAG.getMachineFunction();
18320
18321 if (Info->getMode().DX10Clamp)
18322 return true; // Clamped to 0.
18323 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
18324 }
18325
18327 DAG, SNaN, Depth);
18328}
18329
18330// On older subtargets, global FP atomic instructions have a hardcoded FP mode
18331// that does not support FP32 denormals; only v2f16/f64 denormals are supported.
18332static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
18333 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
18334 return true;
18335
18337 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
18338 if (DenormMode == DenormalMode::getPreserveSign())
18339 return true;
18340
18341 // TODO: Remove this.
18342 return RMW->getFunction()
18343 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
18344 .getValueAsBool();
18345}
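// Editor's note (illustrative IR, not in the upstream source):
//   atomicrmw fadd ptr addrspace(1) %p, float %v seq_cst, !amdgpu.ignore.denormal.mode !0
// carries the metadata checked first above, so the query returns true without
// consulting the function's denormal mode.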
18346
18347static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
18348 LLVMContext &Ctx = RMW->getContext();
18349 StringRef MemScope =
18350 Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");
18351
18352 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
18353 << "Hardware instruction generated for atomic "
18354 << RMW->getOperationName(RMW->getOperation())
18355 << " operation at memory scope " << MemScope;
18356}
18357
18358static bool isV2F16OrV2BF16(Type *Ty) {
18359 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
18360 Type *EltTy = VT->getElementType();
18361 return VT->getNumElements() == 2 &&
18362 (EltTy->isHalfTy() || EltTy->isBFloatTy());
18363 }
18364
18365 return false;
18366}
18367
18368static bool isV2F16(Type *Ty) {
18369 auto *VT = dyn_cast<FixedVectorType>(Ty);
18370 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
18371}
18372
18373static bool isV2BF16(Type *Ty) {
18374 auto *VT = dyn_cast<FixedVectorType>(Ty);
18375 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
18376}
18377
18378/// \return true if atomicrmw integer ops work for the type.
18379static bool isAtomicRMWLegalIntTy(Type *Ty) {
18380 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
18381 unsigned BW = IT->getBitWidth();
18382 return BW == 32 || BW == 64;
18383 }
18384
18385 return false;
18386}
18387
18388/// \return true if this atomicrmw xchg type can be selected.
18389static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
18390 Type *Ty = RMW->getType();
18391 if (isAtomicRMWLegalIntTy(Ty))
18392 return true;
18393
18394 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
18395 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
18396 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18397 return BW == 32 || BW == 64;
18398 }
18399
18400 if (Ty->isFloatTy() || Ty->isDoubleTy())
18401 return true;
18402
18403 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
18404 return VT->getNumElements() == 2 &&
18405 VT->getElementType()->getPrimitiveSizeInBits() == 16;
18406 }
18407
18408 return false;
18409}
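// Editor's note (illustrative, not in the upstream source): under this check an
// atomicrmw xchg of i32, i64, a 32- or 64-bit pointer, float, double, or a
// two-element 16-bit vector such as <2 x half> is considered selectable, while
// e.g. an i16 xchg is not covered by this helper.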
18410
18411/// \returns true if it's valid to emit a native instruction for \p RMW, based
18412/// on the properties of the target memory.
18413static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
18414 const AtomicRMWInst *RMW,
18415 bool HasSystemScope) {
18416 // The remote/fine-grained access logic is different from the integer
18417 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
18418 // fine-grained access does not work, even for a device local allocation.
18419 //
18420 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
18421 // allocations work.
18422 if (HasSystemScope) {
18424 RMW->hasMetadata("amdgpu.no.remote.memory"))
18425 return true;
18426 if (Subtarget.hasEmulatedSystemScopeAtomics())
18427 return true;
18429 return true;
18430
18431 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18432}
18433
18434/// \return Action to perform on AtomicRMWInsts for integer operations.
18441
18442/// Return if a flat address space atomicrmw can access private memory.
18443static bool flatInstrMayAccessPrivate(const Instruction *I) {
18444 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18445 return !MD ||
18447}
18448
18456
18459 unsigned AS = RMW->getPointerAddressSpace();
18460 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
18462
18463 // 64-bit flat atomics that dynamically reside in private memory will silently
18464 // be dropped.
18465 //
18466 // Note that we will emit a new copy of the original atomic in the expansion,
18467 // which will be incrementally relegalized.
18468 const DataLayout &DL = RMW->getFunction()->getDataLayout();
18469 if (AS == AMDGPUAS::FLAT_ADDRESS &&
18470 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18473
18474 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
18476 ORE.emit([=]() {
18477 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
18478 });
18479 return Kind;
18480 };
18481
18482 auto SSID = RMW->getSyncScopeID();
18483 bool HasSystemScope =
18484 SSID == SyncScope::System ||
18485 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
18486
18487 auto Op = RMW->getOperation();
18488 switch (Op) {
18490 // PCIe supports add and xchg for system atomics.
18491 return isAtomicRMWLegalXChgTy(RMW)
18494 case AtomicRMWInst::Add:
18495 // PCIe supports add and xchg for system atomics.
18497 case AtomicRMWInst::Sub:
18498 case AtomicRMWInst::And:
18499 case AtomicRMWInst::Or:
18500 case AtomicRMWInst::Xor:
18501 case AtomicRMWInst::Max:
18502 case AtomicRMWInst::Min:
18509 if (Subtarget->hasEmulatedSystemScopeAtomics())
18511
18512 // On most subtargets, for atomicrmw operations other than add/xchg,
18513 // whether or not the instructions will behave correctly depends on where
18514 // the address physically resides and what interconnect is used in the
18515 // system configuration. On some targets the instruction will nop,
18516 // and in others synchronization will only occur at degraded device scope.
18517 //
18518 // If the allocation is known local to the device, the instructions should
18519 // work correctly.
18520 if (RMW->hasMetadata("amdgpu.no.remote.memory"))
18522
18523 // If fine-grained remote memory works at device scope, we don't need to
18524 // do anything.
18525 if (!HasSystemScope &&
18526 Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18528
18529 // If we are targeting a remote allocated address, it depends what kind of
18530 // allocation the address belongs to.
18531 //
18532 // If the allocation is fine-grained (in host memory, or in PCIe peer
18533 // device memory), the operation will fail depending on the target.
18534 //
18535 // Note fine-grained host memory access does work on APUs or if XGMI is
18536 // used, but we do not know if we are targeting an APU or the system
18537 // configuration from the ISA version/target-cpu.
18538 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
18540
18543 // Atomic sub/or/xor do not work over PCI express, but atomic add
18544 // does. InstCombine transforms these with 0 to or, so undo that.
18545 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
18546 ConstVal && ConstVal->isNullValue())
18548 }
18549
18550 // If the allocation could be in remote, fine-grained memory, the rmw
18551 // instructions may fail. cmpxchg should work, so emit that. On some
18552 // system configurations, PCIe atomics aren't supported so cmpxchg won't
18553 // even work, so you're out of luck anyway.
18554
18555 // In summary:
18556 //
18557 // Cases that may fail:
18558 // - fine-grained pinned host memory
18559 // - fine-grained migratable host memory
18560 // - fine-grained PCIe peer device
18561 //
18562 // Cases that should work, but may be treated overly conservatively.
18563 // - fine-grained host memory on an APU
18564 // - fine-grained XGMI peer device
18566 }
18567
18569 }
18570 case AtomicRMWInst::FAdd: {
18571 Type *Ty = RMW->getType();
18572
18573 // TODO: Handle REGION_ADDRESS
18574 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18575 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
18576 // is fixed to round-to-nearest-even.
18577 //
18578 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
18579 // round-to-nearest-even.
18580 //
18581 // We ignore the rounding mode problem, even in strictfp. The C++ standard
18582 // suggests it is OK if the floating-point mode may not match the calling
18583 // thread.
18584 if (Ty->isFloatTy()) {
18585 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
18587 }
18588
18589 if (Ty->isDoubleTy()) {
18590 // Ignores denormal mode, but we don't consider flushing mandatory.
18591 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
18593 }
18594
18595 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18597
18599 }
18600
18601 // LDS atomics respect the denormal mode from the mode register.
18602 //
18603 // Traditionally f32 global/buffer memory atomics would unconditionally
18604 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
18605 // flush.
18606 //
18607 // On targets with flat atomic fadd, denormals would flush depending on
18608 // whether the target address resides in LDS or global memory. We consider
18609 // this flat-maybe-flush as will-flush.
18610 if (Ty->isFloatTy() &&
18611 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18614
18615 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
18616 // safe. The message phrasing also should be better.
18617 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18618 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18619 // gfx942, gfx12
18620 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18621 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18622 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
18623 // gfx90a, gfx942, gfx12
18624 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18625 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18626
18627 // gfx942, gfx12
18628 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18629 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18630 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
18631 // gfx90a, gfx942, gfx12
18632 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18633 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18634
18635 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
18636 // buffer. gfx12 does have the buffer version.
18637 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18638 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18639 }
18640
18641 // global and flat atomic fadd f64: gfx90a, gfx942.
18642 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18643 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18644
18645 if (AS != AMDGPUAS::FLAT_ADDRESS) {
18646 if (Ty->isFloatTy()) {
18647 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
18648 // gfx11+.
18649 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18650 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18651 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
18652 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18653 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18654 } else {
18655 // gfx908
18656 if (RMW->use_empty() &&
18657 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18658 isV2F16(Ty))
18659 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18660 }
18661 }
18662
18663 // flat atomic fadd f32: gfx942, gfx11+.
18664 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
18665 if (Subtarget->hasFlatAtomicFaddF32Inst())
18666 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18667
18668 // If it is in flat address space, and the type is float, we will try to
18669 // expand it, if the target supports global and lds atomic fadd. The
18670 // reason we need that is, in the expansion, we emit the check of
18671 // address space. If it is in global address space, we emit the global
18672 // atomic fadd; if it is in shared address space, we emit the LDS atomic
18673 // fadd.
18674 if (Subtarget->hasLDSFPAtomicAddF32()) {
18675 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18677 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18679 }
18680 }
18681 }
18682
18684 }
18686 case AtomicRMWInst::FMax: {
18687 Type *Ty = RMW->getType();
18688
18689 // LDS float and double fmin/fmax were always supported.
18690 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18691 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
18693 }
18694
18695 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18696 // For flat and global cases:
18697 // float, double in gfx7. Manual claims denormal support.
18698 // Removed in gfx8.
18699 // float, double restored in gfx10.
18700 // double removed again in gfx11, so only f32 for gfx11/gfx12.
18701 //
18702 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
18703 // no f32.
18704 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18705 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18706 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18707 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18708 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18709 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
18711 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18712 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18713 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18714 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18715 }
18716 }
18717
18719 }
18722 default:
18724 }
18725
18726 llvm_unreachable("covered atomicrmw op switch");
18727}
18728
18735
18742
18745 unsigned AddrSpace = CmpX->getPointerAddressSpace();
18746 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
18748
18749 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
18751
18752 const DataLayout &DL = CmpX->getDataLayout();
18753
18754 Type *ValTy = CmpX->getNewValOperand()->getType();
18755
18756 // If a 64-bit flat atomic may alias private, we need to avoid using the
18757 // atomic in the private case.
18758 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
18760}
18761
18762const TargetRegisterClass *
18763SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
18765 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18766 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18767 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18768 : &AMDGPU::SReg_32RegClass;
18769 if (!TRI->isSGPRClass(RC) && !isDivergent)
18770 return TRI->getEquivalentSGPRClass(RC);
18771 if (TRI->isSGPRClass(RC) && isDivergent)
18772 return TRI->getEquivalentVGPRClass(RC);
18773
18774 return RC;
18775}
18776
18777// FIXME: This is a workaround for DivergenceAnalysis not understanding always
18778// uniform values (as produced by the mask results of control flow intrinsics)
18779// used outside of divergent blocks. The phi users need to also be treated as
18780// always uniform.
18781//
18782// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
18783static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
18784 unsigned WaveSize) {
18785 // FIXME: We assume we never cast the mask results of a control flow
18786 // intrinsic.
18787 // Early exit if the type won't be consistent as a compile time hack.
18788 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
18789 if (!IT || IT->getBitWidth() != WaveSize)
18790 return false;
18791
18792 if (!isa<Instruction>(V))
18793 return false;
18794 if (!Visited.insert(V).second)
18795 return false;
18796 bool Result = false;
18797 for (const auto *U : V->users()) {
18799 if (V == U->getOperand(1)) {
18800 switch (Intrinsic->getIntrinsicID()) {
18801 default:
18802 Result = false;
18803 break;
18804 case Intrinsic::amdgcn_if_break:
18805 case Intrinsic::amdgcn_if:
18806 case Intrinsic::amdgcn_else:
18807 Result = true;
18808 break;
18809 }
18810 }
18811 if (V == U->getOperand(0)) {
18812 switch (Intrinsic->getIntrinsicID()) {
18813 default:
18814 Result = false;
18815 break;
18816 case Intrinsic::amdgcn_end_cf:
18817 case Intrinsic::amdgcn_loop:
18818 Result = true;
18819 break;
18820 }
18821 }
18822 } else {
18823 Result = hasCFUser(U, Visited, WaveSize);
18824 }
18825 if (Result)
18826 break;
18827 }
18828 return Result;
18829}
18830
18832 const Value *V) const {
18833 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
18834 if (CI->isInlineAsm()) {
18835 // FIXME: This cannot give a correct answer. This should only trigger in
18836 // the case where inline asm returns mixed SGPR and VGPR results, used
18837 // outside the defining block. We don't have a specific result to
18838 // consider, so this assumes if any value is SGPR, the overall register
18839 // also needs to be SGPR.
18840 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
18842 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
18843 for (auto &TC : TargetConstraints) {
18844 if (TC.Type == InlineAsm::isOutput) {
18846 const TargetRegisterClass *RC =
18847 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
18848 TC.ConstraintVT)
18849 .second;
18850 if (RC && SIRI->isSGPRClass(RC))
18851 return true;
18852 }
18853 }
18854 }
18855 }
18857 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
18858}
18859
18861 for (SDUse &Use : N->uses()) {
18863 if (getBasePtrIndex(M) == Use.getOperandNo())
18864 return true;
18865 }
18866 }
18867 return false;
18868}
18869
18871 SDValue N1) const {
18872 if (!N0.hasOneUse())
18873 return false;
18874 // Take care of the opportunity to keep N0 uniform
18875 if (N0->isDivergent() || !N1->isDivergent())
18876 return true;
18877 // Check if we have a good chance to form the memory access pattern with the
18878 // base and offset
18879 return (DAG.isBaseWithConstantOffset(N0) &&
18881}
18882
18884 Register N0, Register N1) const {
18885 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
18886}
18887
18890 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
18892 if (I.getMetadata("amdgpu.noclobber"))
18893 Flags |= MONoClobber;
18894 if (I.getMetadata("amdgpu.last.use"))
18895 Flags |= MOLastUse;
18896 return Flags;
18897}
18898
18900 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
18901 const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const {
18902 if (User->getOpcode() != ISD::CopyToReg)
18903 return false;
18904 if (!Def->isMachineOpcode())
18905 return false;
18907 if (!MDef)
18908 return false;
18909
18910 unsigned ResNo = User->getOperand(Op).getResNo();
18911 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
18912 return false;
18913 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
18914 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
18915 PhysReg = AMDGPU::SCC;
18916 const TargetRegisterClass *RC =
18917 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
18918 Cost = RC->getCopyCost();
18919 return true;
18920 }
18921 return false;
18922}
18923
18925 Instruction *AI) const {
18926 // Given: atomicrmw fadd ptr %addr, float %val ordering
18927 //
18928 // With this expansion we produce the following code:
18929 // [...]
18930 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
18931 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
18932 //
18933 // atomicrmw.shared:
18934 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
18935 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
18936 // float %val ordering
18937 // br label %atomicrmw.phi
18938 //
18939 // atomicrmw.check.private:
18940 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
18941 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
18942 //
18943 // atomicrmw.private:
18944 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
18945 // %loaded.private = load float, ptr addrspace(5) %cast.private
18946 // %val.new = fadd float %loaded.private, %val
18947 // store float %val.new, ptr addrspace(5) %cast.private
18948 // br label %atomicrmw.phi
18949 //
18950 // atomicrmw.global:
18951 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
18952 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
18953 // float %val ordering
18954 // br label %atomicrmw.phi
18955 //
18956 // atomicrmw.phi:
18957 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
18958 // [ %loaded.private, %atomicrmw.private ],
18959 // [ %loaded.global, %atomicrmw.global ]
18960 // br label %atomicrmw.end
18961 //
18962 // atomicrmw.end:
18963 // [...]
18964 //
18965 //
18966 // For 64-bit atomics which may reside in private memory, we perform a simpler
18967 // version that only inserts the private check, and uses the flat operation.
18968
18969 IRBuilder<> Builder(AI);
18970 LLVMContext &Ctx = Builder.getContext();
18971
18972 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
18973 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
18975 Value *Addr = AI->getOperand(PtrOpIdx);
18976
18977 /// TODO: Only need to check private, then emit flat-known-not private (no
18978 /// need for shared block, or cast to global).
18980
18981 Align Alignment;
18982 if (RMW)
18983 Alignment = RMW->getAlign();
18984 else if (CX)
18985 Alignment = CX->getAlign();
18986 else
18987 llvm_unreachable("unhandled atomic operation");
18988
18989 // FullFlatEmulation is true if we need to issue the private, shared, and
18990 // global cases.
18991 //
18992 // If this is false, we are only dealing with the flat-targeting-private case,
18993 // where we only insert a check for private and still use the flat instruction
18994 // for global and shared.
18995
18996 bool FullFlatEmulation =
18997 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
18998 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
18999 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
19000 RMW->getType()->isDoubleTy()));
19001
19002 // If the return value isn't used, do not introduce a false use in the phi.
19003 bool ReturnValueIsUsed = !AI->use_empty();
19004
19005 BasicBlock *BB = Builder.GetInsertBlock();
19006 Function *F = BB->getParent();
19007 BasicBlock *ExitBB =
19008 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
19009 BasicBlock *SharedBB = nullptr;
19010
19011 BasicBlock *CheckPrivateBB = BB;
19012 if (FullFlatEmulation) {
19013 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
19014 CheckPrivateBB =
19015 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
19016 }
19017
19018 BasicBlock *PrivateBB =
19019 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
19020 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
19021 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
19022
19023 std::prev(BB->end())->eraseFromParent();
19024 Builder.SetInsertPoint(BB);
19025
19026 Value *LoadedShared = nullptr;
19027 if (FullFlatEmulation) {
19028 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
19029 {Addr}, nullptr, "is.shared");
19030 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
19031 Builder.SetInsertPoint(SharedBB);
19032 Value *CastToLocal = Builder.CreateAddrSpaceCast(
19034
19035 Instruction *Clone = AI->clone();
19036 Clone->insertInto(SharedBB, SharedBB->end());
19037 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
19038 LoadedShared = Clone;
19039
19040 Builder.CreateBr(PhiBB);
19041 Builder.SetInsertPoint(CheckPrivateBB);
19042 }
19043
19044 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19045 {Addr}, nullptr, "is.private");
19046 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19047
19048 Builder.SetInsertPoint(PrivateBB);
19049
19050 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19052
19053 Value *LoadedPrivate;
19054 if (RMW) {
19055 LoadedPrivate = Builder.CreateAlignedLoad(
19056 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
19057
19058 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
19059 LoadedPrivate, RMW->getValOperand());
19060
19061 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19062 } else {
19063 auto [ResultLoad, Equal] =
19064 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
19065 CX->getNewValOperand(), CX->getAlign());
19066
19067 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
19068 ResultLoad, 0);
19069 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19070 }
19071
19072 Builder.CreateBr(PhiBB);
19073
19074 Builder.SetInsertPoint(GlobalBB);
19075
19076 // Continue using a flat instruction if we only emitted the check for private.
19077 Instruction *LoadedGlobal = AI;
19078 if (FullFlatEmulation) {
19079 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19081 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
19082 }
19083
19084 AI->removeFromParent();
19085 AI->insertInto(GlobalBB, GlobalBB->end());
19086
19087 // The new atomicrmw may go through another round of legalization later.
19088 if (!FullFlatEmulation) {
19089 // We inserted the runtime check already, make sure we do not try to
19090 // re-expand this.
19091 // TODO: Should union with any existing metadata.
19092 MDBuilder MDB(F->getContext());
19093 MDNode *RangeNotPrivate =
19096 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19097 RangeNotPrivate);
19098 }
19099
19100 Builder.CreateBr(PhiBB);
19101
19102 Builder.SetInsertPoint(PhiBB);
19103
19104 if (ReturnValueIsUsed) {
19105 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
19106 AI->replaceAllUsesWith(Loaded);
19107 if (FullFlatEmulation)
19108 Loaded->addIncoming(LoadedShared, SharedBB);
19109 Loaded->addIncoming(LoadedPrivate, PrivateBB);
19110 Loaded->addIncoming(LoadedGlobal, GlobalBB);
19111 Loaded->takeName(AI);
19112 }
19113
19114 Builder.CreateBr(ExitBB);
19115}
19116
19118 unsigned PtrOpIdx) {
19119 Value *PtrOp = I->getOperand(PtrOpIdx);
19122
19123 Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
19124 Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
19125 I->getIterator());
19126 I->setOperand(PtrOpIdx, ASCast);
19127}
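// Editor's illustration (not in the upstream source): a scratch atomic such as
//   %v = load atomic i32, ptr addrspace(5) %p seq_cst, align 4
// is rewritten by the helper above into
//   %scratch.ascast = addrspacecast ptr addrspace(5) %p to ptr
//   %v = load atomic i32, ptr %scratch.ascast seq_cst, align 4
// i.e. only the pointer operand is cast to the flat address space in place.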
19128
19131
19134
19137 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
19138 ConstVal && ConstVal->isNullValue()) {
19139 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
19141
19142 // We may still need the private-alias-flat handling below.
19143
19144 // TODO: Skip this for cases where we cannot access remote memory.
19145 }
19146 }
19147
19148 // The non-flat expansions should only perform the de-canonicalization of
19149 // identity values.
19151 return;
19152
19154}
19155
19162
19166
19168 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19169}
19170
19172 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19173 return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
19174
19176 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19177}
19178
19179LoadInst *
19181 IRBuilder<> Builder(AI);
19182 auto Order = AI->getOrdering();
19183
19184 // The optimization removes the store aspect of the atomicrmw. Therefore, the
19185 // cache must be flushed if the atomic ordering had release semantics. This is
19186 // not necessarily a fence; a release fence just happens to do that flush.
19187 // Avoid replacing an atomicrmw that has release semantics.
19188 if (isReleaseOrStronger(Order))
19189 return nullptr;
19190
19191 LoadInst *LI = Builder.CreateAlignedLoad(
19192 AI->getType(), AI->getPointerOperand(), AI->getAlign());
19193 LI->setAtomic(Order, AI->getSyncScopeID());
19194 LI->copyMetadata(*AI);
19195 LI->takeName(AI);
19196 AI->replaceAllUsesWith(LI);
19197 AI->eraseFromParent();
19198 return LI;
19199}
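// Editor's illustration (not in the upstream source): an idempotent RMW such as
//   %old = atomicrmw or ptr addrspace(1) %p, i32 0 monotonic
// is rewritten by the routine above into
//   %old = load atomic i32, ptr addrspace(1) %p monotonic, align 4
// while release (or stronger) orderings are left alone, since dropping the
// store side would also drop the cache flush the release implies.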
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:167
#define LLVM_DEBUG(...)
Definition Debug.h:114
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1120
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:6057
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1497
bool isNegative() const
Definition APFloat.h:1449
bool isNormal() const
Definition APFloat.h:1453
APInt bitcastToAPInt() const
Definition APFloat.h:1353
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1138
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1098
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1079
bool isInfinity() const
Definition APFloat.h:1446
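For orientation, a minimal sketch (not taken from this file; the helper name is illustrative) of how these APFloat factories are typically used when materializing an FP immediate:
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
using namespace llvm;

static APInt negInfSingleBits() {
  // Build -inf in IEEE single precision and return its raw 32-bit encoding
  // (0xFF800000), the kind of constant lowering code bitcasts into an i32.
  APFloat NegInf = APFloat::getInf(APFloat::IEEEsingle(), /*Negative=*/true);
  return NegInf.bitcastToAPInt();
}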
Class for arbitrary precision integers.
Definition APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1385
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:466
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
Definition APInt.h:366
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1237
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1221
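A small sketch (illustrative names, not from this file) showing what the bit-set helpers above produce for a 32-bit value:
#include "llvm/ADT/APInt.h"
using namespace llvm;

static void apintMaskExamples() {
  APInt Mid = APInt::getBitsSet(32, 8, 16);   // bits [8, 16)  -> 0x0000FF00
  APInt Hi = APInt::getHighBitsSet(32, 4);    // top 4 bits    -> 0xF0000000
  APInt V(32, 0);
  V.setBitsFrom(24);                          // bits [24, 32) -> 0xFF000000
  unsigned TZ = Mid.countr_zero();            // 8
  (void)Hi; (void)TZ;
}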
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition Function.cpp:339
const Function * getParent() const
Definition Argument.h:44
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Align getAlign() const
Return the alignment of the memory location accessed by the instruction.
static unsigned getPointerOperandIndex()
An instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory location accessed by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
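To make the "*p = old OP v" semantics above concrete, a hedged sketch (not from this file) of emitting one of these operations through IRBuilder; Builder, Ptr and Val are assumed to come from the caller, and the alignment is an arbitrary example:
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

static AtomicRMWInst *emitAtomicAdd(IRBuilder<> &Builder, Value *Ptr,
                                    Value *Val) {
  // atomicrmw add: *p = old + v, with sequentially consistent ordering.
  return Builder.CreateAtomicRMW(AtomicRMWInst::Add, Ptr, Val, MaybeAlign(4),
                                 AtomicOrdering::SequentiallyConsistent);
}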
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents a cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:472
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
BitVector & set()
Definition BitVector.h:351
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ ICMP_NE
not equal
Definition InstrTypes.h:700
bool isSigned() const
Definition InstrTypes.h:932
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:772
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:778
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:214
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string and methods for querying it.
Definition DataLayout.h:63
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isBigEndian() const
Definition DataLayout.h:199
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
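As a brief usage sketch (illustrative, not from this file), querying ABI alignment and allocation size for a type:
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
using namespace llvm;

static uint64_t paddedSize(const DataLayout &DL, Type *Ty) {
  Align A = DL.getABITypeAlign(Ty);               // minimum ABI alignment
  (void)A;
  return DL.getTypeAllocSize(Ty).getFixedValue(); // size including padding
}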
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowering::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
iterator_range< arg_iterator > args()
Definition Function.h:890
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition Function.cpp:803
Argument * getArg(unsigned i) const
Definition Function.h:884
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMadF16() const
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasPrivateSegmentBuffer() const
const MachineFunction & getMachineFunction() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2780
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
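A minimal sketch (not from this file; the include path shown is the current one and differs in older releases) of constructing and reshaping LLTs:
#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;

static LLT widenVectorElts() {
  LLT P5 = LLT::pointer(5, 32);         // 32-bit pointer in addrspace(5)
  LLT V2S16 = LLT::fixed_vector(2, 16);
  (void)P5;
  return V2S16.changeElementSize(32);   // <2 x s32>
}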
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:33
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96
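A hedged sketch (not from this file) of attaching a [0, 1024) range to an instruction's result with createRange; I and Ctx are assumed to be provided by the caller:
#include "llvm/ADT/APInt.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
using namespace llvm;

static void annotateRange(Instruction *I, LLVMContext &Ctx) {
  MDBuilder MDB(Ctx);
  MDNode *Range = MDB.createRange(APInt(32, 0), APInt(32, 1024)); // [0, 1024)
  I->setMetadata(LLVMContext::MD_range, Range);
}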
Metadata node.
Definition Metadata.h:1077
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1445
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the vector type has a power-of-2 number of elements.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
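For reference, a small sketch (illustrative, not from this file; the include path is the current one and may differ across versions) of building and inspecting an MVT:
#include "llvm/CodeGenTypes/MachineValueType.h"
using namespace llvm;

static MVT scalarOfV4F32() {
  MVT V4F32 = MVT::getVectorVT(MVT::f32, 4);              // MVT::v4f32
  uint64_t Bits = V4F32.getSizeInBits().getFixedValue();  // 128
  (void)Bits;
  return V4F32.getScalarType();                           // MVT::f32
}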
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
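A hedged sketch (not from this file) of or'ing these flags together and building an MMO with the getMachineMemOperand overload listed above; MF, PtrInfo and MemTy are assumed inputs and the alignment is arbitrary:
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
using namespace llvm;

static MachineMemOperand *buildInvariantLoadMMO(MachineFunction &MF,
                                                MachinePointerInfo PtrInfo,
                                                LLT MemTy) {
  auto Flags = MachineMemOperand::MOLoad |
               MachineMemOperand::MODereferenceable |
               MachineMemOperand::MOInvariant;
  return MF.getMachineMemOperand(PtrInfo, Flags, MemTy, Align(4));
}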
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition ModRef.h:221
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:215
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition ModRef.h:218
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition Register.h:67
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:78
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of an SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if N can be combined to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store using a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load using a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion using a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
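A hedged sketch (the helper name is hypothetical, not code from this file) combining ExtractVectorElements with getBuildVector: scalarize a fixed-width vector value, reverse the scalars, and rebuild the vector.

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/CodeGen/SelectionDAG.h"
  #include <algorithm>
  using namespace llvm;

  // Illustrative only: reverse the elements of a fixed-width vector SDValue.
  static SDValue reverseVectorElts(SelectionDAG &DAG, const SDLoc &DL,
                                   SDValue Vec) {
    SmallVector<SDValue, 8> Elts;
    DAG.ExtractVectorElements(Vec, Elts);   // one scalar SDValue per element
    std::reverse(Elts.begin(), Elts.end());
    return DAG.getBuildVector(Vec.getValueType(), DL, Elts);
  }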
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
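A minimal sketch (not code from this file) of how getSetCC and getSelect compose: clamp a value to be non-negative as select(x > 0, x, 0). The helper name is hypothetical, and a real lowering would use TLI.getSetCCResultType() rather than hard-coding MVT::i1 for the comparison result.

  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;

  // Illustrative only: select(x > 0, x, 0).
  static SDValue clampToNonNegative(SelectionDAG &DAG, const SDLoc &DL,
                                    SDValue X) {
    EVT VT = X.getValueType();
    SDValue Zero = DAG.getConstant(0, DL, VT);
    SDValue IsPos = DAG.getSetCC(DL, MVT::i1, X, Zero, ISD::SETGT);
    return DAG.getSelect(DL, VT, IsPos, X, Zero);
  }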
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns the sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
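A hedged sketch of the load/store helpers above (the helper name and the 4-byte offset are illustrative, not from this file): load an i32 through Ptr and store it four bytes higher, threading the load's output chain (result #1) into the store so the two accesses stay ordered.

  #include "llvm/CodeGen/MachineMemOperand.h"
  #include "llvm/CodeGen/SelectionDAG.h"
  #include "llvm/Support/Alignment.h"
  #include "llvm/Support/TypeSize.h"
  using namespace llvm;

  // Illustrative only: *(Ptr + 4) = *Ptr for an i32, with unknown aliasing info.
  static SDValue copyWordUpFourBytes(SelectionDAG &DAG, const SDLoc &DL,
                                     SDValue Chain, SDValue Ptr) {
    SDValue Val = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo());
    SDValue Dst = DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(4), DL);
    return DAG.getStore(Val.getValue(1), DL, Val, Dst, MachinePointerInfo(),
                        Align(4));
  }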
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, which starts a new call frame in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
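For illustration only (hypothetical helper name): coerce an arbitrary-width integer SDValue to i32 with getSExtOrTrunc, which picks sign-extension or truncation based on the source width.

  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;

  // Illustrative only: normalize an integer value to i32.
  static SDValue coerceToI32(SelectionDAG &DAG, const SDLoc &DL, SDValue V) {
    return DAG.getSExtOrTrunc(V, DL, MVT::i32);
  }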
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
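getNode is the generic factory behind most of the helpers in this list. A minimal, hypothetical example: building a*b + c as a single ISD::FMA node.

  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;

  // Illustrative only: fuse a multiply-add into one FMA node.
  static SDValue buildFMA(SelectionDAG &DAG, const SDLoc &DL, SDValue A,
                          SDValue B, SDValue C) {
    return DAG.getNode(ISD::FMA, DL, A.getValueType(), A, B, C);
  }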
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
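A small sketch (names are illustrative, not from this file) of SplitScalar paired with ISD::BUILD_PAIR: split an i64 into its i32 halves and reassemble it.

  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;

  // Illustrative only: split an i64 and glue the halves back together.
  static SDValue splitAndRejoinI64(SelectionDAG &DAG, const SDLoc &DL,
                                   SDValue V64) {
    auto [Lo, Hi] = DAG.SplitScalar(V64, DL, MVT::i32, MVT::i32);
    return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi);
  }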
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.

This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition StringRef.h:862
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:154
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
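A self-contained sketch of the StringSwitch pattern; the strings and the helper name are illustrative, not taken from this file.

  #include "llvm/ADT/StringRef.h"
  #include "llvm/ADT/StringSwitch.h"

  // Illustrative only: map a feature string to a wave size, defaulting to 0.
  static int parseWaveSize(llvm::StringRef S) {
    return llvm::StringSwitch<int>(S)
        .Case("wavefrontsize32", 32)
        .Case("wavefrontsize64", 64)
        .Default(0);
  }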
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
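The setters above are protected members of TargetLoweringBase, so they are normally called from a target's TargetLowering subclass constructor. The class below is purely hypothetical and only shows where such calls live; a real target also registers its register classes, calls computeRegisterProperties(), and configures far more operations.

  #include "llvm/CodeGen/TargetLowering.h"
  using namespace llvm;

  // Hypothetical target lowering, shown only to place the calls in context.
  class ExampleTLI : public TargetLowering {
  public:
    explicit ExampleTLI(const TargetMachine &TM) : TargetLowering(TM) {
      setBooleanContents(ZeroOrOneBooleanContent);
      // f16 adds are promoted; record the type they are promoted to.
      setOperationAction(ISD::FADD, MVT::f16, Promote);
      AddPromotedToType(ISD::FADD, MVT::f16, MVT::f32);
      // Truncating f32 -> f16 stores are not supported natively.
      setTruncStoreAction(MVT::f32, MVT::f16, Expand);
      // Request PerformDAGCombine callbacks for FADD nodes.
      setTargetDAGCombine(ISD::FADD);
    }
  };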
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:420
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition Type.h:258
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:107
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI void set(Value *Val)
Definition Value.h:905
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
const Use & getOperandUse(unsigned i) const
Definition User.h:245
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1101
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:396
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isZero() const
Definition TypeSize.h:154
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:134
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ GlobalAddress
Definition ISDOpcodes.h:88
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:656
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:773
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:528
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:535
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns the current rounding mode: -1 = Undefined, 0 = Round to 0, 1 = Round to nearest (ties to even), 2 = Round to ...
Definition ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
Offsets
Offsets in bytes from the start of the input buffer.
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
Definition MathExtras.h:54
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:310
@ Offset
Definition DWP.cpp:477
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition Analysis.cpp:241
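A trivial, hypothetical example of getICmpCondCode: the IR signed-less-than predicate maps to ISD::SETLT.

  #include "llvm/CodeGen/Analysis.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Illustrative only: translate an IR integer predicate to an ISD condition code.
  static ISD::CondCode signedLessThanCC() {
    return getICmpCondCode(ICmpInst::ICMP_SLT); // yields ISD::SETLT
  }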
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:833
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
@ Done
Definition Threading.h:60
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition MathExtras.h:232
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:289
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2118
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and congruent to Skew modulo Align.
Definition MathExtras.h:557
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
Definition ModRef.h:296
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:396
int countr_zero(T Val)
Count the number of zero bits from the least significant bit towards the most significant, stopping at the first 1.
Definition bit.h:186
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:282
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1714
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count the number of zero bits from the most significant bit towards the least significant, stopping at the first 1.
Definition bit.h:222
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition Analysis.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
unsigned getUndefRegState(bool B)
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:155
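A self-contained sketch of several of the bit and alignment helpers above; the values and the helper name are illustrative.

  #include "llvm/ADT/bit.h"
  #include "llvm/Support/Alignment.h"
  #include "llvm/Support/MathExtras.h"
  #include <cstdint>

  // Illustrative only: expected results are noted in the comments.
  static void mathHelpersDemo() {
    uint64_t Wide = (uint64_t(1) << 32) | 4;
    (void)llvm::Hi_32(Wide);                           // 1
    (void)llvm::Lo_32(Wide);                           // 4
    (void)llvm::countr_zero(0x80u);                    // 7
    (void)llvm::isPowerOf2_32(64);                     // true
    (void)llvm::Log2_32(64);                           // 6
    (void)llvm::alignTo(/*Size=*/10, llvm::Align(8));  // 16
  }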
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition MathExtras.h:241
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1740
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1879
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:212
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:86
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853
#define N
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:304
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
Definition APFloat.cpp:264
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:85
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition SCCPSolver.h:42
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
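A minimal sketch (hypothetical helper name) of a DenormalMode query: does the given mode flush denormal outputs to a signed zero?

  #include "llvm/ADT/FloatingPointMode.h"

  // Illustrative only: compare against the preserve-sign denormal mode.
  static bool preservesSignOnFlush(const llvm::DenormalMode &Mode) {
    return Mode == llvm::DenormalMode::getPreserveSign();
  }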
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:113
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
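A small, hypothetical example of the EVT factory and query methods above: build v4i32 and inspect it. The values in the comments are what these calls return for that type.

  #include "llvm/CodeGen/ValueTypes.h"
  #include "llvm/IR/LLVMContext.h"

  // Illustrative only.
  static void evtDemo(llvm::LLVMContext &Ctx) {
    llvm::EVT Elt = llvm::EVT::getIntegerVT(Ctx, 32);
    llvm::EVT Vec = llvm::EVT::getVectorVT(Ctx, Elt, 4);
    (void)Vec.isVector();              // true
    (void)Vec.getVectorNumElements();  // 4
    (void)Vec.getSizeInBits();         // 128 bits
    (void)Vec.changeTypeToInteger();   // already integer, stays v4i32
  }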
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
unsigned getOrigArgIndex() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:165
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:218
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:173
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:340
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:241
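A short sketch (illustrative helper name) combining KnownBits::add with countMinLeadingZeros: propagate known bits through an addition and ask how many leading zeros the result is guaranteed to have.

  #include "llvm/Support/KnownBits.h"

  // Illustrative only.
  static unsigned minLeadingZerosOfSum(const llvm::KnownBits &L,
                                       const llvm::KnownBits &R) {
    llvm::KnownBits Sum = llvm::KnownBits::add(L, R);
    return Sum.countMinLeadingZeros();
  }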
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
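A minimal, hypothetical example of MachinePointerInfo::getFixedStack: describe an access to frame index FI at byte offset 8, e.g. for attaching to a spill or reload node.

  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/CodeGen/MachineMemOperand.h"

  // Illustrative only.
  static llvm::MachinePointerInfo frameSlotInfo(llvm::MachineFunction &MF,
                                                int FI) {
    return llvm::MachinePointerInfo::getFixedStack(MF, FI, /*Offset=*/8);
  }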
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs