SIISelLowering.cpp
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/IRBuilder.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
42#include "llvm/IR/MDBuilder.h"
45#include "llvm/Support/ModRef.h"
47#include <optional>
48
49using namespace llvm;
50using namespace llvm::SDPatternMatch;
51
52#define DEBUG_TYPE "si-lower"
53
54STATISTIC(NumTailCalls, "Number of tail calls");
55
56static cl::opt<bool>
57 DisableLoopAlignment("amdgpu-disable-loop-alignment",
58 cl::desc("Do not align and prefetch loops"),
59 cl::init(false));
60
62 "amdgpu-use-divergent-register-indexing", cl::Hidden,
63 cl::desc("Use indirect register addressing for divergent indexes"),
64 cl::init(false));
65
66// TODO: This option should be removed once we switch to always using PTRADD in
67// the SelectionDAG.
69 "amdgpu-use-sdag-ptradd", cl::Hidden,
70 cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the "
71 "SelectionDAG ISel"),
72 cl::init(false));
73
76 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
77}
78
81 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
82}
83
84static unsigned findFirstFreeSGPR(CCState &CCInfo) {
85 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
86 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
87 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
88 return AMDGPU::SGPR0 + Reg;
89 }
90 }
91 llvm_unreachable("Cannot allocate sgpr");
92}
93
95 const GCNSubtarget &STI)
96 : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
97 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
98 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
99
100 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
101 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
102
103 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
104
105 const SIRegisterInfo *TRI = STI.getRegisterInfo();
106 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
107
108 addRegisterClass(MVT::f64, V64RegClass);
109 addRegisterClass(MVT::v2f32, V64RegClass);
110 addRegisterClass(MVT::Untyped, V64RegClass);
111
112 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
113 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
114
115 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
116 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
117
118 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
119 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
120
121 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
122 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
123
124 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
125 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
126
127 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
128 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
129
130 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
131 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
132
133 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
134 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
135
136 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
137 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
138
139 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
140 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
141
142 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
143 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
144
145 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
146 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
147
148 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
149 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
150
151 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
152 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
153
154 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
155 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
156
157 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
158 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
159
160 if (Subtarget->has16BitInsts()) {
161 if (Subtarget->useRealTrue16Insts()) {
162 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
163 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
164 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
165 } else {
166 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
167 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
168 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
169 }
170
 171 // Unless there are also VOP3P operations, not all operations are really legal.
172 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
173 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
174 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
175 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
176 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
177 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
178 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
179 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
180 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
181 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
182 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
183 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
184 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
185 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
186 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
187 }
188
189 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
190 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
191
192 computeRegisterProperties(Subtarget->getRegisterInfo());
193
194 // The boolean content concept here is too inflexible. Compares only ever
195 // really produce a 1-bit result. Any copy/extend from these will turn into a
196 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
197 // it's what most targets use.
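 // For example, a zero extension of a compare result becomes
 // (select cond, 1, 0) and a sign extension becomes (select cond, -1, 0); both
 // lower to a single v_cndmask_b32 with an inline constant, so the 0/1 choice
 // costs nothing extra.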
200
201 // We need to custom lower vector stores from local memory
202 setOperationAction(ISD::LOAD,
203 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
204 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
205 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
206 MVT::i1, MVT::v32i32},
207 Custom);
208
209 setOperationAction(ISD::STORE,
210 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
211 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
212 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
213 MVT::i1, MVT::v32i32},
214 Custom);
215
216 if (isTypeLegal(MVT::bf16)) {
217 for (unsigned Opc :
219 ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
220 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
221 ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
222 ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
223 ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
224 ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
225 ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
226 ISD::SETCC}) {
 227 // FIXME: The promoted-to type shouldn't need to be explicit
228 setOperationAction(Opc, MVT::bf16, Promote);
229 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
230 }
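 // Promoting these means, for example, that a bf16 fadd is widened to f32,
 // performed there, and the result rounded back to bf16, rather than being
 // expanded or turned into a libcall.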
231
233
235 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
236
237 setOperationAction(ISD::FABS, MVT::bf16, Legal);
238 setOperationAction(ISD::FNEG, MVT::bf16, Legal);
240
241 // We only need to custom lower because we can't specify an action for bf16
242 // sources.
245 }
246
247 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
248 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
249 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
250 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
251 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
252 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
253 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
254 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
255 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
256 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
257 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
258 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
259 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
260 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
261 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
262 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
263
264 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
265 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
266 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
267 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
268 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
269 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
270 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
271
272 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
273
277 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
278
279 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
280
282 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
283
285 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
286 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
287
289 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
290 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
291 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
292 Expand);
294 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
295 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
296 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
297 Expand);
298
300 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
301 MVT::v3i16, MVT::v4i16, MVT::Other},
302 Custom);
303
304 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
305 setOperationAction(ISD::BR_CC,
306 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
307
309
311
313 Expand);
314
315#if 0
317#endif
318
319 // We only support LOAD/STORE and vector manipulation ops for vectors
320 // with > 4 elements.
321 for (MVT VT :
322 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
323 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
324 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
325 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
326 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
327 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
328 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
329 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
330 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
331 switch (Op) {
332 case ISD::LOAD:
333 case ISD::STORE:
335 case ISD::BITCAST:
336 case ISD::UNDEF:
340 case ISD::IS_FPCLASS:
341 break;
346 break;
347 default:
349 break;
350 }
351 }
352 }
353
354 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
355
356 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
357 // is expanded to avoid having two separate loops in case the index is a VGPR.
358
359 // Most operations are naturally 32-bit vector operations. We only support
360 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
361 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
363 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
364
366 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
367
369 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
370
372 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
373 }
374
375 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
377 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
378
380 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
381
383 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
384
386 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
387 }
388
389 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
391 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
392
394 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
395
397 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
398
400 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
401 }
402
403 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
405 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
406
408 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
409
411 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
412
414 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
415 }
416
417 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
419 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
420
422 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
423
425 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
426
428 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
429 }
430
432 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
433 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
434 Custom);
435
436 if (Subtarget->hasPkMovB32()) {
437 // TODO: 16-bit element vectors should be legal with even aligned elements.
438 // TODO: Can be legal with wider source types than the result with
439 // subregister extracts.
440 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
441 }
442
444 // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
445 // instead lower to cndmask in SITargetLowering::LowerSELECT().
447 // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
448 // alignbit.
449 setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
450
451 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
452 Custom);
453
454 // Avoid stack access for these.
455 // TODO: Generalize to more vector types.
457 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
458 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
459 Custom);
460
461 // Deal with vec3 vector operations when widened to vec4.
463 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
464
465 // Deal with vec5/6/7 vector operations when widened to vec8.
467 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
468 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
469 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
470 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
471 Custom);
472
473 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
474 // and output demarshalling
475 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
476
 477 // We can't return success/failure, only the old value;
 478 // let LLVM add the comparison.
479 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64},
480 Expand);
481
482 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
483
484 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
485
486 // FIXME: This should be narrowed to i32, but that only happens if i64 is
487 // illegal.
488 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
489 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
490
 491 // This is s_memtime on SI and s_memrealtime on VI.
492 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
493
494 if (Subtarget->hasSMemRealTime() ||
495 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
496 setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
497 setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom);
498
499 if (Subtarget->has16BitInsts()) {
500 setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
501 setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
502 } else {
503 setOperationAction(ISD::FSQRT, MVT::f16, Custom);
504 }
505
506 if (Subtarget->hasMadMacF32Insts())
508
509 if (!Subtarget->hasBFI())
510 // fcopysign can be done in a single instruction with BFI.
511 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
512
513 if (!Subtarget->hasBCNT(32))
515
516 if (!Subtarget->hasBCNT(64))
518
519 if (Subtarget->hasFFBH())
521
522 if (Subtarget->hasFFBL())
524
525 // We only really have 32-bit BFE instructions (and 16-bit on VI).
526 //
527 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
528 // effort to match them now. We want this to be false for i64 cases when the
529 // extraction isn't restricted to the upper or lower half. Ideally we would
530 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
531 // span the midpoint are probably relatively rare, so don't worry about them
532 // for now.
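 // For example, an extract of bits [40,48) of an i64 can be done as a 32-bit
 // BFE on the high half, but an extract of bits [28,36) spans the midpoint and
 // would need the unmatched 64-bit scalar BFE or a shift-and-mask sequence.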
533 if (Subtarget->hasBFE())
535
536 // Clamp modifier on add/sub
537 if (Subtarget->hasIntClamp())
539
540 if (Subtarget->hasAddNoCarry())
541 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
542 Legal);
543
545 {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
546 {MVT::f32, MVT::f64}, Custom);
547
548 // These are really only legal for ieee_mode functions. We should be avoiding
549 // them for functions that don't have ieee_mode enabled, so just say they are
550 // legal.
551 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
552 {MVT::f32, MVT::f64}, Legal);
553
554 if (Subtarget->haveRoundOpsF64())
555 setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64,
556 Legal);
557 else
558 setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
559 MVT::f64, Custom);
560
561 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
562 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
563 Legal);
564 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
565
566 setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
568
569 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
570 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
571
572 // Custom lower these because we can't specify a rule based on an illegal
573 // source bf16.
574 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom);
575 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom);
576
577 if (Subtarget->has16BitInsts()) {
580 MVT::i16, Legal);
581
582 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
583
585 MVT::i16, Expand);
586
590 ISD::CTPOP},
591 MVT::i16, Promote);
592
593 setOperationAction(ISD::LOAD, MVT::i16, Custom);
594
595 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
596
597 setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
598 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
599 setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
600 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
601
605
607
608 // F16 - Constant Actions.
611
612 // F16 - Load/Store Actions.
613 setOperationAction(ISD::LOAD, MVT::f16, Promote);
614 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
615 setOperationAction(ISD::STORE, MVT::f16, Promote);
616 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
617
618 // BF16 - Load/Store Actions.
619 setOperationAction(ISD::LOAD, MVT::bf16, Promote);
620 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
621 setOperationAction(ISD::STORE, MVT::bf16, Promote);
622 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
623
624 // F16 - VOP1 Actions.
626 ISD::FSIN, ISD::FROUND},
627 MVT::f16, Custom);
628
629 // BF16 - VOP1 Actions.
630 if (Subtarget->hasBF16TransInsts())
631 setOperationAction({ISD::FCOS, ISD::FSIN, ISD::FDIV}, MVT::bf16, Custom);
632
635
636 // F16 - VOP2 Actions.
637 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
638 Expand);
639 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);
640 setOperationAction(ISD::FFREXP, MVT::f16, Custom);
642
643 // F16 - VOP3 Actions.
645 if (STI.hasMadF16())
647
648 for (MVT VT :
649 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
650 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
651 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
652 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
653 switch (Op) {
654 case ISD::LOAD:
655 case ISD::STORE:
657 case ISD::BITCAST:
658 case ISD::UNDEF:
663 case ISD::IS_FPCLASS:
664 break;
668 break;
669 default:
671 break;
672 }
673 }
674 }
675
676 // v_perm_b32 can handle either of these.
677 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
679
680 // XXX - Do these do anything? Vector constants turn into build_vector.
681 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
682
683 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
684 Legal);
685
686 setOperationAction(ISD::STORE, MVT::v2i16, Promote);
687 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
688 setOperationAction(ISD::STORE, MVT::v2f16, Promote);
689 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
690
691 setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
692 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
693 setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
694 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
695
696 setOperationAction(ISD::AND, MVT::v2i16, Promote);
697 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
698 setOperationAction(ISD::OR, MVT::v2i16, Promote);
699 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
700 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
701 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
702
703 setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
704 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
705 setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
706 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
707 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
708 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
709
710 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
711 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
712 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
713 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
714 setOperationAction(ISD::STORE, MVT::v4bf16, Promote);
715 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
716
717 setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
718 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
719 setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
720 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
721 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
722 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
723
724 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
725 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
726 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
727 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
728
729 setOperationAction(ISD::STORE, MVT::v8i16, Promote);
730 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
731 setOperationAction(ISD::STORE, MVT::v8f16, Promote);
732 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
733 setOperationAction(ISD::STORE, MVT::v8bf16, Promote);
734 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
735
736 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
737 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
738 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
739 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
740 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
741 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
742
743 setOperationAction(ISD::STORE, MVT::v16i16, Promote);
744 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
745 setOperationAction(ISD::STORE, MVT::v16f16, Promote);
746 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
747 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
748 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
749
750 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
751 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
752 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
753 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
754 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
755 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
756
757 setOperationAction(ISD::STORE, MVT::v32i16, Promote);
758 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
759 setOperationAction(ISD::STORE, MVT::v32f16, Promote);
760 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
761 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
762 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
763
765 MVT::v2i32, Expand);
766 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
767
769 MVT::v4i32, Expand);
770
772 MVT::v8i32, Expand);
773
774 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
775 Subtarget->hasVOP3PInsts() ? Legal : Custom);
776
777 setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
778 // This isn't really legal, but this avoids the legalizer unrolling it (and
779 // allows matching fneg (fabs x) patterns)
780 setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);
781
782 // Can do this in one BFI plus a constant materialize.
784 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
785 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
786 MVT::v32f16, MVT::v32bf16},
787 Custom);
788
790 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
791 MVT::f16, Custom);
792 setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
793
794 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
795 ISD::FMAXIMUMNUM},
796 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
797 Custom);
798
799 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
800 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
801 Expand);
802
803 for (MVT Vec16 :
804 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
805 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
808 Vec16, Custom);
810 }
811 }
812
813 if (Subtarget->hasVOP3PInsts()) {
817 MVT::v2i16, Legal);
818
819 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
820 ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
821 MVT::v2f16, Legal);
822
824 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
825
827 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
828 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
829 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
830 Custom);
831
832 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
833 // Split vector operations.
838 VT, Custom);
839
840 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
841 // Split vector operations.
843 VT, Custom);
844
846 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
847 {MVT::v2f16, MVT::v4f16}, Custom);
848
849 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
850 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
851 Custom);
852
853 if (Subtarget->hasPackedFP32Ops()) {
855 MVT::v2f32, Legal);
857 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
858 Custom);
859 }
860 }
861
862 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);
863
864 if (Subtarget->has16BitInsts()) {
866 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
868 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
869 } else {
870 // Legalization hack.
871 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
872
873 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
874 }
875
877 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
878 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
879 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
880 MVT::v32f16, MVT::v32bf16},
881 Custom);
882
884
885 if (Subtarget->hasVectorMulU64())
887 else if (Subtarget->hasScalarSMulU64())
889
890 if (Subtarget->hasMad64_32())
892
893 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
894 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
895
896 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
897 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
898 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
899 } else {
900 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
901 if (Subtarget->hasMinimum3Maximum3F32())
902 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);
903
904 if (Subtarget->hasMinimum3Maximum3PKF16()) {
905 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal);
906
907 // If only the vector form is available, we need to widen to a vector.
908 if (!Subtarget->hasMinimum3Maximum3F16())
909 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16, Custom);
910 }
911 }
912
913 if (Subtarget->hasVOP3PInsts()) {
914 // We want to break these into v2f16 pieces, not scalarize.
915 setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
916 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
917 Custom);
918 }
919
920 if (Subtarget->hasIntMinMax64())
922 Legal);
923
925 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
926 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
927 MVT::i8},
928 Custom);
929
931 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
932 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
933 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
934 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
935 Custom);
936
938 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
939 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
940 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
941 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
942 Custom);
943
944 setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
946 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
947 setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
948 setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);
949
950 // TODO: Could move this to custom lowering, could benefit from combines on
951 // extract of relevant bits.
952 setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
953
955
956 if (Subtarget->hasBF16ConversionInsts()) {
957 setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
959 }
960
961 if (Subtarget->hasBF16PackedInsts()) {
963 {ISD::FADD, ISD::FMUL, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMA},
964 MVT::v2bf16, Legal);
965 }
966
967 if (Subtarget->hasBF16TransInsts()) {
968 setOperationAction({ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, MVT::bf16, Legal);
969 }
970
971 if (Subtarget->hasCvtPkF16F32Inst()) {
973 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
974 Custom);
975 }
976
978 ISD::PTRADD,
980 ISD::SUB,
982 ISD::MUL,
983 ISD::FADD,
984 ISD::FSUB,
985 ISD::FDIV,
986 ISD::FMUL,
987 ISD::FMINNUM,
988 ISD::FMAXNUM,
989 ISD::FMINNUM_IEEE,
990 ISD::FMAXNUM_IEEE,
991 ISD::FMINIMUM,
992 ISD::FMAXIMUM,
993 ISD::FMINIMUMNUM,
994 ISD::FMAXIMUMNUM,
995 ISD::FMA,
996 ISD::SMIN,
997 ISD::SMAX,
998 ISD::UMIN,
999 ISD::UMAX,
1000 ISD::SETCC,
1002 ISD::SMIN,
1003 ISD::SMAX,
1004 ISD::UMIN,
1005 ISD::UMAX,
1006 ISD::AND,
1007 ISD::OR,
1008 ISD::XOR,
1009 ISD::SHL,
1010 ISD::SRL,
1011 ISD::SRA,
1012 ISD::FSHR,
1022
1023 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1025
1026 // All memory operations. Some folding on the pointer operand is done to help
1027 // matching the constant offsets in the addressing modes.
1028 setTargetDAGCombine({ISD::LOAD,
1029 ISD::STORE,
1030 ISD::ATOMIC_LOAD,
1031 ISD::ATOMIC_STORE,
1032 ISD::ATOMIC_CMP_SWAP,
1033 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
1034 ISD::ATOMIC_SWAP,
1035 ISD::ATOMIC_LOAD_ADD,
1036 ISD::ATOMIC_LOAD_SUB,
1037 ISD::ATOMIC_LOAD_AND,
1038 ISD::ATOMIC_LOAD_OR,
1039 ISD::ATOMIC_LOAD_XOR,
1040 ISD::ATOMIC_LOAD_NAND,
1041 ISD::ATOMIC_LOAD_MIN,
1042 ISD::ATOMIC_LOAD_MAX,
1043 ISD::ATOMIC_LOAD_UMIN,
1044 ISD::ATOMIC_LOAD_UMAX,
1045 ISD::ATOMIC_LOAD_FADD,
1046 ISD::ATOMIC_LOAD_FMIN,
1047 ISD::ATOMIC_LOAD_FMAX,
1048 ISD::ATOMIC_LOAD_UINC_WRAP,
1049 ISD::ATOMIC_LOAD_UDEC_WRAP,
1052
1053 // FIXME: In other contexts we pretend this is a per-function property.
1055
1057}
1058
1059const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1060
1062 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1063 return RCRegs;
1064}
1065
1066//===----------------------------------------------------------------------===//
1067// TargetLowering queries
1068//===----------------------------------------------------------------------===//
1069
1070// v_mad_mix* support a conversion from f16 to f32.
1071//
1072// There is only one special case, when denormals are enabled, that we do not
1073// currently handle where this would still be OK to use.
1074bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1075 EVT DestVT, EVT SrcVT) const {
1076 return DestVT.getScalarType() == MVT::f32 &&
1077 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1078 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1079 SrcVT.getScalarType() == MVT::f16) ||
1080 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1081 SrcVT.getScalarType() == MVT::bf16)) &&
1082 // TODO: This probably only requires no input flushing?
1084}
1085
1087 LLT DestTy, LLT SrcTy) const {
1088 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1089 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1090 DestTy.getScalarSizeInBits() == 32 &&
1091 SrcTy.getScalarSizeInBits() == 16 &&
1092 // TODO: This probably only requires no input flushing?
1093 denormalModeIsFlushAllF32(*MI.getMF());
1094}
1095
1097 // SI has some legal vector types, but no legal vector operations. Say no
1098 // shuffles are legal in order to prefer scalarizing some vector operations.
1099 return false;
1100}
1101
1103 CallingConv::ID CC,
1104 EVT VT) const {
1106 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1107
1108 if (VT.isVector()) {
1109 EVT ScalarVT = VT.getScalarType();
1110 unsigned Size = ScalarVT.getSizeInBits();
1111 if (Size == 16) {
1112 if (Subtarget->has16BitInsts()) {
1113 if (VT.isInteger())
1114 return MVT::v2i16;
1115 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1116 }
1117 return VT.isInteger() ? MVT::i32 : MVT::f32;
1118 }
1119
1120 if (Size < 16)
1121 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1122 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1123 }
1124
1125 if (VT.getSizeInBits() > 32)
1126 return MVT::i32;
1127
1128 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1129}
1130
1132 CallingConv::ID CC,
1133 EVT VT) const {
1135 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1136
1137 if (VT.isVector()) {
1138 unsigned NumElts = VT.getVectorNumElements();
1139 EVT ScalarVT = VT.getScalarType();
1140 unsigned Size = ScalarVT.getSizeInBits();
1141
1142 // FIXME: Should probably promote 8-bit vectors to i16.
1143 if (Size == 16 && Subtarget->has16BitInsts())
1144 return (NumElts + 1) / 2;
1145
1146 if (Size <= 32)
1147 return NumElts;
1148
1149 if (Size > 32)
1150 return NumElts * ((Size + 31) / 32);
1151 } else if (VT.getSizeInBits() > 32)
1152 return (VT.getSizeInBits() + 31) / 32;
1153
1154 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1155}
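// For example, with a non-kernel calling convention a v3i32 argument takes 3
// registers and an i64 takes 2.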
1156
1158 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1159 unsigned &NumIntermediates, MVT &RegisterVT) const {
1160 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1161 unsigned NumElts = VT.getVectorNumElements();
1162 EVT ScalarVT = VT.getScalarType();
1163 unsigned Size = ScalarVT.getSizeInBits();
1164 // FIXME: We should fix the ABI to be the same on targets without 16-bit
 1165 // support, but unless we can properly handle 3-vectors, it will still be
 1166 // inconsistent.
1167 if (Size == 16 && Subtarget->has16BitInsts()) {
1168 if (ScalarVT == MVT::bf16) {
1169 RegisterVT = MVT::i32;
1170 IntermediateVT = MVT::v2bf16;
1171 } else {
1172 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1173 IntermediateVT = RegisterVT;
1174 }
1175 NumIntermediates = (NumElts + 1) / 2;
1176 return NumIntermediates;
1177 }
1178
1179 if (Size == 32) {
1180 RegisterVT = ScalarVT.getSimpleVT();
1181 IntermediateVT = RegisterVT;
1182 NumIntermediates = NumElts;
1183 return NumIntermediates;
1184 }
1185
1186 if (Size < 16 && Subtarget->has16BitInsts()) {
1187 // FIXME: Should probably form v2i16 pieces
1188 RegisterVT = MVT::i16;
1189 IntermediateVT = ScalarVT;
1190 NumIntermediates = NumElts;
1191 return NumIntermediates;
1192 }
1193
1194 if (Size != 16 && Size <= 32) {
1195 RegisterVT = MVT::i32;
1196 IntermediateVT = ScalarVT;
1197 NumIntermediates = NumElts;
1198 return NumIntermediates;
1199 }
1200
1201 if (Size > 32) {
1202 RegisterVT = MVT::i32;
1203 IntermediateVT = RegisterVT;
1204 NumIntermediates = NumElts * ((Size + 31) / 32);
1205 return NumIntermediates;
1206 }
1207 }
1208
1210 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1211}
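// Example breakdown for a non-kernel call on a target with 16-bit instructions:
// v5f16 gives IntermediateVT = RegisterVT = v2f16 and NumIntermediates = 3,
// i.e. the value is passed in three packed-half registers.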
1212
1214 const DataLayout &DL, Type *Ty,
1215 unsigned MaxNumLanes) {
1216 assert(MaxNumLanes != 0);
1217
1218 LLVMContext &Ctx = Ty->getContext();
1219 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1220 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1221 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1222 NumElts);
1223 }
1224
1225 return TLI.getValueType(DL, Ty);
1226}
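// E.g. an IR return type of <4 x float> with MaxNumLanes = 2 yields a memory
// VT of v2f32 here; non-vector types are passed through unchanged.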
1227
1228// Peek through TFE struct returns to only use the data size.
1230 const DataLayout &DL, Type *Ty,
1231 unsigned MaxNumLanes) {
1232 auto *ST = dyn_cast<StructType>(Ty);
1233 if (!ST)
1234 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1235
1236 // TFE intrinsics return an aggregate type.
1237 assert(ST->getNumContainedTypes() == 2 &&
1238 ST->getContainedType(1)->isIntegerTy(32));
1239 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1240}
1241
1242/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1243/// in-memory representation. This return value is a custom type because there
1244/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1245/// could cause issues during codegen, these address space 7 pointers will be
1246/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1247/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1248/// for cost modeling, to work. (This also sets us up decently for doing the
1249/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1251 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1252 return MVT::amdgpuBufferFatPointer;
1254 DL.getPointerSizeInBits(AS) == 192)
1255 return MVT::amdgpuBufferStridedPointer;
1257}
1258/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1259/// v8i32 when padding is added.
1260/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1261/// also v8i32 with padding.
1263 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1264 DL.getPointerSizeInBits(AS) == 160) ||
1266 DL.getPointerSizeInBits(AS) == 192))
1267 return MVT::v8i32;
1269}
1270
1271static unsigned getIntrMemWidth(unsigned IntrID) {
1272 switch (IntrID) {
1273 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1274 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1275 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1276 return 8;
1277 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1278 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1279 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1280 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1281 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1282 return 32;
1283 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1284 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1285 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1286 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1287 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1288 return 64;
1289 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1290 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1291 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1292 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1293 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1294 return 128;
1295 default:
1296 llvm_unreachable("Unknown width");
1297 }
1298}
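// Callers wrap this width in EVT::getIntegerVT, so e.g. the *_b64 variants
// above end up with an i64 memory VT on their memory operand.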
1299
1300static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad,
1302 Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2);
1303 unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
1304 switch (AtomicOrderingCABI(Ord)) {
1307 break;
1310 break;
1313 break;
1314 default:
1316 break;
1317 }
1318
1319 Info.flags =
1321 Info.flags |= MOCooperative;
1322
1323 MDNode *ScopeMD = cast<MDNode>(
1324 cast<MetadataAsValue>(CI.getArgOperand(IsLoad ? 2 : 3))->getMetadata());
1325 StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
1326 Info.ssid = CI.getContext().getOrInsertSyncScopeID(Scope);
1327}
1328
1330 const CallInst &CI,
1331 MachineFunction &MF,
1332 unsigned IntrID) const {
1333 Info.flags = MachineMemOperand::MONone;
1334 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1335 Info.flags |= MachineMemOperand::MOInvariant;
1336 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1338 Info.flags |= getTargetMMOFlags(CI);
1339
1340 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1342 AttributeSet Attr =
1344 MemoryEffects ME = Attr.getMemoryEffects();
1345 if (ME.doesNotAccessMemory())
1346 return false;
1347
1348 // TODO: Should images get their own address space?
1349 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1350
1351 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1352 if (RsrcIntr->IsImage) {
1353 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1355 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1356 Info.align.reset();
1357 }
1358
1359 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1360 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1361 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1362 // We conservatively set the memory operand of a buffer intrinsic to the
1363 // base resource pointer, so that we can access alias information about
1364 // those pointers. Cases like "this points at the same value
1365 // but with a different offset" are handled in
1366 // areMemAccessesTriviallyDisjoint.
1367 Info.ptrVal = RsrcArg;
1368 }
1369
1370 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1371 if (!IsSPrefetch) {
1372 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1373 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1374 Info.flags |= MachineMemOperand::MOVolatile;
1375 }
1376
1378 if (ME.onlyReadsMemory()) {
1379 if (RsrcIntr->IsImage) {
1380 unsigned MaxNumLanes = 4;
1381
1382 if (!BaseOpcode->Gather4) {
1383 // If this isn't a gather, we may have excess loaded elements in the
1384 // IR type. Check the dmask for the real number of elements loaded.
1385 unsigned DMask =
1386 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1387 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1388 }
1389
1390 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1391 CI.getType(), MaxNumLanes);
1392 } else {
1393 Info.memVT =
1395 std::numeric_limits<unsigned>::max());
1396 }
1397
1398 // FIXME: What does alignment mean for an image?
1399 Info.opc = ISD::INTRINSIC_W_CHAIN;
1400 Info.flags |= MachineMemOperand::MOLoad;
1401 } else if (ME.onlyWritesMemory()) {
1402 Info.opc = ISD::INTRINSIC_VOID;
1403
1404 Type *DataTy = CI.getArgOperand(0)->getType();
1405 if (RsrcIntr->IsImage) {
1406 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1407 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1408 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1409 DMaskLanes);
1410 } else
1411 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1412
1413 Info.flags |= MachineMemOperand::MOStore;
1414 } else {
1415 // Atomic, NoReturn Sampler or prefetch
1416 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1418 Info.flags |=
1420
1421 if (!IsSPrefetch)
1422 Info.flags |= MachineMemOperand::MOStore;
1423
1424 switch (IntrID) {
1425 default:
1426 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1427 // Fake memory access type for no return sampler intrinsics
1428 Info.memVT = MVT::i32;
1429 } else {
1430 // XXX - Should this be volatile without known ordering?
1431 Info.flags |= MachineMemOperand::MOVolatile;
1432 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1433 }
1434 break;
1435 case Intrinsic::amdgcn_raw_buffer_load_lds:
1436 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1437 case Intrinsic::amdgcn_struct_buffer_load_lds:
1438 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1439 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1440 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1441 Info.ptrVal = CI.getArgOperand(1);
1442 return true;
1443 }
1444 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1445 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1446 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1447 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1448 Info.memVT =
1450 std::numeric_limits<unsigned>::max());
1451 Info.flags &= ~MachineMemOperand::MOStore;
1452 return true;
1453 }
1454 }
1455 }
1456 return true;
1457 }
1458
1459 switch (IntrID) {
1460 case Intrinsic::amdgcn_ds_ordered_add:
1461 case Intrinsic::amdgcn_ds_ordered_swap: {
1462 Info.opc = ISD::INTRINSIC_W_CHAIN;
1463 Info.memVT = MVT::getVT(CI.getType());
1464 Info.ptrVal = CI.getOperand(0);
1465 Info.align.reset();
1467
1468 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1469 if (!Vol->isZero())
1470 Info.flags |= MachineMemOperand::MOVolatile;
1471
1472 return true;
1473 }
1474 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1475 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1476 Info.opc = ISD::INTRINSIC_W_CHAIN;
1477 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1478 Info.ptrVal = nullptr;
1479 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1481 return true;
1482 }
1483 case Intrinsic::amdgcn_ds_append:
1484 case Intrinsic::amdgcn_ds_consume: {
1485 Info.opc = ISD::INTRINSIC_W_CHAIN;
1486 Info.memVT = MVT::getVT(CI.getType());
1487 Info.ptrVal = CI.getOperand(0);
1488 Info.align.reset();
1490
1491 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1492 if (!Vol->isZero())
1493 Info.flags |= MachineMemOperand::MOVolatile;
1494
1495 return true;
1496 }
1497 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1498 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1499 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1502 Info.memVT = MVT::getVT(CI.getType());
1503 Info.ptrVal = CI.getOperand(0);
1504 Info.memVT = MVT::i64;
1505 Info.size = 8;
1506 Info.align.reset();
1508 return true;
1509 }
1510 case Intrinsic::amdgcn_global_atomic_csub: {
1511 Info.opc = ISD::INTRINSIC_W_CHAIN;
1512 Info.memVT = MVT::getVT(CI.getType());
1513 Info.ptrVal = CI.getOperand(0);
1514 Info.align.reset();
1517 return true;
1518 }
1519 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1520 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1521 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1522 Info.opc = ISD::INTRINSIC_W_CHAIN;
1523 Info.memVT =
1524 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1525 ? CI.getType()
1527 ->getElementType(0)); // XXX: what is correct VT?
1528
1529 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1530 Info.align.reset();
1531 Info.flags |=
1533 return true;
1534 }
1535 case Intrinsic::amdgcn_global_atomic_fmin_num:
1536 case Intrinsic::amdgcn_global_atomic_fmax_num:
1537 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1538 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1539 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1540 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1541 Info.opc = ISD::INTRINSIC_W_CHAIN;
1542 Info.memVT = MVT::getVT(CI.getType());
1543 Info.ptrVal = CI.getOperand(0);
1544 Info.align.reset();
1548 return true;
1549 }
1550 case Intrinsic::amdgcn_flat_load_monitor_b32:
1551 case Intrinsic::amdgcn_flat_load_monitor_b64:
1552 case Intrinsic::amdgcn_flat_load_monitor_b128:
1553 case Intrinsic::amdgcn_global_load_monitor_b32:
1554 case Intrinsic::amdgcn_global_load_monitor_b64:
1555 case Intrinsic::amdgcn_global_load_monitor_b128:
1556 case Intrinsic::amdgcn_cluster_load_b32:
1557 case Intrinsic::amdgcn_cluster_load_b64:
1558 case Intrinsic::amdgcn_cluster_load_b128:
1559 case Intrinsic::amdgcn_ds_load_tr6_b96:
1560 case Intrinsic::amdgcn_ds_load_tr4_b64:
1561 case Intrinsic::amdgcn_ds_load_tr8_b64:
1562 case Intrinsic::amdgcn_ds_load_tr16_b128:
1563 case Intrinsic::amdgcn_global_load_tr6_b96:
1564 case Intrinsic::amdgcn_global_load_tr4_b64:
1565 case Intrinsic::amdgcn_global_load_tr_b64:
1566 case Intrinsic::amdgcn_global_load_tr_b128:
1567 case Intrinsic::amdgcn_ds_read_tr4_b64:
1568 case Intrinsic::amdgcn_ds_read_tr6_b96:
1569 case Intrinsic::amdgcn_ds_read_tr8_b64:
1570 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1571 Info.opc = ISD::INTRINSIC_W_CHAIN;
1572 Info.memVT = MVT::getVT(CI.getType());
1573 Info.ptrVal = CI.getOperand(0);
1574 Info.align.reset();
1575 Info.flags |= MachineMemOperand::MOLoad;
1576 return true;
1577 }
1578 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1579 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1580 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1581 Info.opc = ISD::INTRINSIC_W_CHAIN;
1582 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1583 Info.ptrVal = CI.getOperand(0);
1584 Info.align.reset();
1585 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/true, Info);
1586 return true;
1587 }
1588 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1589 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1590 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1591 Info.opc = ISD::INTRINSIC_VOID;
1592 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1593 Info.ptrVal = CI.getArgOperand(0);
1594 Info.align.reset();
1595 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/false, Info);
1596 return true;
1597 }
1598 case Intrinsic::amdgcn_ds_gws_init:
1599 case Intrinsic::amdgcn_ds_gws_barrier:
1600 case Intrinsic::amdgcn_ds_gws_sema_v:
1601 case Intrinsic::amdgcn_ds_gws_sema_br:
1602 case Intrinsic::amdgcn_ds_gws_sema_p:
1603 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1604 Info.opc = ISD::INTRINSIC_VOID;
1605
1606 const GCNTargetMachine &TM =
1607 static_cast<const GCNTargetMachine &>(getTargetMachine());
1608
1610 Info.ptrVal = MFI->getGWSPSV(TM);
1611
1612 // This is an abstract access, but we need to specify a type and size.
1613 Info.memVT = MVT::i32;
1614 Info.size = 4;
1615 Info.align = Align(4);
1616
1617 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1618 Info.flags |= MachineMemOperand::MOLoad;
1619 else
1620 Info.flags |= MachineMemOperand::MOStore;
1621 return true;
1622 }
1623 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1624 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1625 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1626 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1627 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1628 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1629 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1630 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1631 Info.opc = ISD::INTRINSIC_VOID;
1632 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1633 Info.ptrVal = CI.getArgOperand(1);
1635 return true;
1636 }
1637 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1638 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1639 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1640 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1641 Info.opc = ISD::INTRINSIC_VOID;
1642 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1643 Info.ptrVal = CI.getArgOperand(0);
1645 return true;
1646 }
1647 case Intrinsic::amdgcn_load_to_lds:
1648 case Intrinsic::amdgcn_global_load_lds: {
1649 Info.opc = ISD::INTRINSIC_VOID;
1650 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1651 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1652 Info.ptrVal = CI.getArgOperand(1);
1654 return true;
1655 }
1656 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1657 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1658 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1659 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1660 Info.opc = ISD::INTRINSIC_W_CHAIN;
1661
1662 const GCNTargetMachine &TM =
1663 static_cast<const GCNTargetMachine &>(getTargetMachine());
1664
1666 Info.ptrVal = MFI->getGWSPSV(TM);
1667
1668 // This is an abstract access, but we need to specify a type and size.
1669 Info.memVT = MVT::i32;
1670 Info.size = 4;
1671 Info.align = Align(4);
1672
1674 return true;
1675 }
1676 case Intrinsic::amdgcn_s_prefetch_data:
1677 case Intrinsic::amdgcn_flat_prefetch:
1678 case Intrinsic::amdgcn_global_prefetch: {
1679 Info.opc = ISD::INTRINSIC_VOID;
1680 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1681 Info.ptrVal = CI.getArgOperand(0);
1682 Info.flags |= MachineMemOperand::MOLoad;
1683 return true;
1684 }
1685 default:
1686 return false;
1687 }
1688}
1689
1691 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1693 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1694 // The DAG's ValueType loses the addrspaces.
1695 // Add them as 2 extra Constant operands "from" and "to".
1696 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1697 unsigned DstAS = I.getType()->getPointerAddressSpace();
1698 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1699 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1700 break;
1701 }
1702 default:
1703 break;
1704 }
1705}
1706
1709 Type *&AccessTy) const {
1710 Value *Ptr = nullptr;
1711 switch (II->getIntrinsicID()) {
1712 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1713 case Intrinsic::amdgcn_cluster_load_b128:
1714 case Intrinsic::amdgcn_cluster_load_b64:
1715 case Intrinsic::amdgcn_cluster_load_b32:
1716 case Intrinsic::amdgcn_ds_append:
1717 case Intrinsic::amdgcn_ds_consume:
1718 case Intrinsic::amdgcn_ds_load_tr8_b64:
1719 case Intrinsic::amdgcn_ds_load_tr16_b128:
1720 case Intrinsic::amdgcn_ds_load_tr4_b64:
1721 case Intrinsic::amdgcn_ds_load_tr6_b96:
1722 case Intrinsic::amdgcn_ds_read_tr4_b64:
1723 case Intrinsic::amdgcn_ds_read_tr6_b96:
1724 case Intrinsic::amdgcn_ds_read_tr8_b64:
1725 case Intrinsic::amdgcn_ds_read_tr16_b64:
1726 case Intrinsic::amdgcn_ds_ordered_add:
1727 case Intrinsic::amdgcn_ds_ordered_swap:
1728 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1729 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1730 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1731 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1732 case Intrinsic::amdgcn_flat_load_monitor_b128:
1733 case Intrinsic::amdgcn_flat_load_monitor_b32:
1734 case Intrinsic::amdgcn_flat_load_monitor_b64:
1735 case Intrinsic::amdgcn_global_atomic_csub:
1736 case Intrinsic::amdgcn_global_atomic_fmax_num:
1737 case Intrinsic::amdgcn_global_atomic_fmin_num:
1738 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1739 case Intrinsic::amdgcn_global_load_monitor_b128:
1740 case Intrinsic::amdgcn_global_load_monitor_b32:
1741 case Intrinsic::amdgcn_global_load_monitor_b64:
1742 case Intrinsic::amdgcn_global_load_tr_b64:
1743 case Intrinsic::amdgcn_global_load_tr_b128:
1744 case Intrinsic::amdgcn_global_load_tr4_b64:
1745 case Intrinsic::amdgcn_global_load_tr6_b96:
1746 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1747 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1748 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1749 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1750 Ptr = II->getArgOperand(0);
1751 break;
1752 case Intrinsic::amdgcn_load_to_lds:
1753 case Intrinsic::amdgcn_global_load_lds:
1754 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1755 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1756 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1757 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1758 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1759 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1760 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1761 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1762 Ptr = II->getArgOperand(1);
1763 break;
1764 default:
1765 return false;
1766 }
1767 AccessTy = II->getType();
1768 Ops.push_back(Ptr);
1769 return true;
1770}
1771
1773 unsigned AddrSpace) const {
1774 if (!Subtarget->hasFlatInstOffsets()) {
1775 // Flat instructions do not have offsets, and only have the register
1776 // address.
1777 return AM.BaseOffs == 0 && AM.Scale == 0;
1778 }
1779
1780 decltype(SIInstrFlags::FLAT) FlatVariant =
1784
1785 return AM.Scale == 0 &&
1786 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1787 AM.BaseOffs, AddrSpace, FlatVariant));
1788}
1789
1791 if (Subtarget->hasFlatGlobalInsts())
1793
1794 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1795 // Assume that we will use FLAT for all global memory accesses
1796 // on VI.
1797 // FIXME: This assumption is currently wrong. On VI we still use
1798 // MUBUF instructions for the r + i addressing mode. As currently
1799 // implemented, the MUBUF instructions only work on buffer < 4GB.
1800 // It may be possible to support > 4GB buffers with MUBUF instructions,
1801 // by setting the stride value in the resource descriptor which would
1802 // increase the size limit to (stride * 4GB). However, this is risky,
1803 // because it has never been validated.
1805 }
1806
1807 return isLegalMUBUFAddressingMode(AM);
1808}
1809
1810bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1811 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1812 // additionally can do r + r + i with addr64. 32-bit has more addressing
1813 // mode options. Depending on the resource constant, it can also do
1814 // (i64 r0) + (i32 r1) * (i14 i).
1815 //
1816 // Private arrays end up using a scratch buffer most of the time, so also
1817 // assume those use MUBUF instructions. Scratch loads / stores are currently
1818 // implemented as mubuf instructions with the offen bit set, so they are
1819 // slightly different from the normal addr64.
1820 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1821 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1822 return false;
1823
1824 // FIXME: Since we can split immediate into soffset and immediate offset,
1825 // would it make sense to allow any immediate?
1826
1827 switch (AM.Scale) {
1828 case 0: // r + i or just i, depending on HasBaseReg.
1829 return true;
1830 case 1:
1831 return true; // We have r + r or r + i.
1832 case 2:
1833 if (AM.HasBaseReg) {
1834 // Reject 2 * r + r.
1835 return false;
1836 }
1837
1838 // Allow 2 * r as r + r,
1839 // and 2 * r + i as r + r + i.
1840 return true;
1841 default: // Don't allow n * r
1842 return false;
1843 }
1844}
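// Illustrative summary (not itself part of the lowering) of what the switch
// above accepts, assuming TII->isLegalMUBUFImmOffset(AM.BaseOffs) has already
// passed:
//   Scale == 0: "r + i" or plain "i"              -> legal
//   Scale == 1: "r + r" or "r + i"                -> legal
//   Scale == 2 without a base reg: 2*r or 2*r + i -> legal (folded to r + r [+ i])
//   Scale == 2 with a base reg, or any Scale > 2  -> rejected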
1845
1847 const AddrMode &AM, Type *Ty,
1848 unsigned AS,
1849 Instruction *I) const {
1850 // No global is ever allowed as a base.
1851 if (AM.BaseGV)
1852 return false;
1853
1854 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1855 return isLegalGlobalAddressingMode(AM);
1856
1857 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1861 // If the offset isn't a multiple of 4, it probably isn't going to be
1862 // correctly aligned.
1863 // FIXME: Can we get the real alignment here?
1864 if (AM.BaseOffs % 4 != 0)
1865 return isLegalMUBUFAddressingMode(AM);
1866
1867 if (!Subtarget->hasScalarSubwordLoads()) {
1868 // There are no SMRD extloads, so if we have to do a small type access we
1869 // will use a MUBUF load.
1870 // FIXME?: We also need to do this if unaligned, but we don't know the
1871 // alignment here.
1872 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1873 return isLegalGlobalAddressingMode(AM);
1874 }
1875
1876 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1877 // SMRD instructions have an 8-bit, dword offset on SI.
1878 if (!isUInt<8>(AM.BaseOffs / 4))
1879 return false;
1880 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1881 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1882 // in 8-bits, it can use a smaller encoding.
1883 if (!isUInt<32>(AM.BaseOffs / 4))
1884 return false;
1885 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1886 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1887 if (!isUInt<20>(AM.BaseOffs))
1888 return false;
1889 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1890 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1891 // for S_BUFFER_* instructions).
1892 if (!isInt<21>(AM.BaseOffs))
1893 return false;
1894 } else {
1895 // On GFX12, all offsets are signed 24-bit in bytes.
1896 if (!isInt<24>(AM.BaseOffs))
1897 return false;
1898 }
1899
1900 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1902 AM.BaseOffs < 0) {
1903 // Scalar (non-buffer) loads can only use a negative offset if
1904 // soffset+offset is non-negative. Since the compiler can only prove that
1905 // in a few special cases, it is safer to claim that negative offsets are
1906 // not supported.
1907 return false;
1908 }
1909
1910 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1911 return true;
1912
1913 if (AM.Scale == 1 && AM.HasBaseReg)
1914 return true;
1915
1916 return false;
1917 }
1918
1919 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1920 return Subtarget->enableFlatScratch()
1922 : isLegalMUBUFAddressingMode(AM);
1923
1924 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1925 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1926 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1927 // field.
1928 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1929 // an 8-bit dword offset but we don't know the alignment here.
1930 if (!isUInt<16>(AM.BaseOffs))
1931 return false;
1932
1933 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1934 return true;
1935
1936 if (AM.Scale == 1 && AM.HasBaseReg)
1937 return true;
1938
1939 return false;
1940 }
1941
1943 // For an unknown address space, this usually means that this is for some
1944 // reason being used for pure arithmetic, and not based on some addressing
1945 // computation. We don't have instructions that compute pointers with any
1946 // addressing modes, so treat them as having no offset like flat
1947 // instructions.
1949 }
1950
1951 // Assume a user alias of global for unknown address spaces.
1952 return isLegalGlobalAddressingMode(AM);
1953}
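// Worked example (illustrative only) of the generation-dependent SMRD offset
// checks above: a constant-address access with AM.BaseOffs == 1020 is a dword
// offset of 255, which fits isUInt<8> and is therefore legal on SI; the same
// byte offset also passes the isUInt<32> (CI), isUInt<20> (VI), isInt<21>
// (GFX9..GFX11) and isInt<24> (GFX12) checks, while a negative BaseOffs on a
// scalar constant-address load is rejected by the separate check above.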
1954
1956 const MachineFunction &MF) const {
1958 return (MemVT.getSizeInBits() <= 4 * 32);
1959 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1960 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1961 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1962 }
1964 return (MemVT.getSizeInBits() <= 2 * 32);
1965 return true;
1966}
1967
1969 unsigned Size, unsigned AddrSpace, Align Alignment,
1970 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1971 if (IsFast)
1972 *IsFast = 0;
1973
1974 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1975 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1976 // Check if alignment requirements for ds_read/write instructions are
1977 // disabled.
1978 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1979 return false;
1980
1981 Align RequiredAlignment(
1982 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
1983 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1984 Alignment < RequiredAlignment)
1985 return false;
1986
1987 // Either the alignment requirements are "enabled", or there is an
1988 // unaligned-LDS-access hardware bug even though the alignment requirements
1989 // are "disabled". In either case, we need to check for proper alignment
1990 // requirements.
1991 //
1992 switch (Size) {
1993 case 64:
1994 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1995 // address is negative, then the instruction is incorrectly treated as
1996 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1997 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1998 // load later in the SILoadStoreOptimizer.
1999 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
2000 return false;
2001
2002 // An 8-byte access via ds_read/write_b64 requires 8-byte alignment, but we
2003 // can do a 4-byte aligned, 8-byte access in a single operation using
2004 // ds_read2/write2_b32 with adjacent offsets.
2005 RequiredAlignment = Align(4);
2006
2007 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2008 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
2009 // ds_write2_b32 depending on the alignment. In either case with either
2010 // alignment there is no faster way of doing this.
2011
2012 // The numbers returned here and below are not additive; they form a 'speed
2013 // rank'. They are only meant to be compared to decide whether one way of
2014 // lowering an operation is faster than another. For that purpose a
2015 // naturally aligned operation gets its bitsize to indicate that "it
2016 // operates with a speed comparable to an N-bit wide load". With full
2017 // alignment ds128 is slower than ds96, for example. If underaligned, it
2018 // is comparable to the speed of a single dword access, which would then
2019 // mean 32 < 128 and it is faster to issue a wide load regardless.
2020 // 1 simply means "slow, don't do it"; i.e. when comparing an aligned load
2021 // to a wider load that will no longer be aligned, the latter is slower.
2022 if (IsFast)
2023 *IsFast = (Alignment >= RequiredAlignment) ? 64
2024 : (Alignment < Align(4)) ? 32
2025 : 1;
2026 return true;
2027 }
2028
2029 break;
2030 case 96:
2031 if (!Subtarget->hasDS96AndDS128())
2032 return false;
2033
2034 // A 12-byte access via ds_read/write_b96 requires 16-byte alignment on
2035 // gfx8 and older.
2036
2037 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2038 // Naturally aligned access is fastest. However, also report it is Fast
2039 // if memory is aligned to less than a DWORD. A narrow load or store will
2040 // be equally slow as a single ds_read_b96/ds_write_b96, but there will
2041 // be more of them, so overall we will pay less penalty issuing a single
2042 // instruction.
2043
2044 // See comment on the values above.
2045 if (IsFast)
2046 *IsFast = (Alignment >= RequiredAlignment) ? 96
2047 : (Alignment < Align(4)) ? 32
2048 : 1;
2049 return true;
2050 }
2051
2052 break;
2053 case 128:
2054 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2055 return false;
2056
2057 // A 16-byte access via ds_read/write_b128 requires 16-byte alignment on
2058 // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
2059 // single operation using ds_read2/write2_b64.
2060 RequiredAlignment = Align(8);
2061
2062 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2063 // Naturally aligned access is fastest. However, also report it is Fast
2064 // if memory is aligned to less than a DWORD. A narrow load or store will
2065 // be equally slow as a single ds_read_b128/ds_write_b128, but there
2066 // will be more of them, so overall we will pay less penalty issuing a
2067 // single instruction.
2068
2069 // See comment on the values above.
2070 if (IsFast)
2071 *IsFast = (Alignment >= RequiredAlignment) ? 128
2072 : (Alignment < Align(4)) ? 32
2073 : 1;
2074 return true;
2075 }
2076
2077 break;
2078 default:
2079 if (Size > 32)
2080 return false;
2081
2082 break;
2083 }
2084
2085 // See comment on the values above.
2086 // Note that we have a single-dword or sub-dword access here, so if it is
2087 // underaligned it is the slowest possible access, hence the returned value is 0.
2088 if (IsFast)
2089 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2090
2091 return Alignment >= RequiredAlignment ||
2092 Subtarget->hasUnalignedDSAccessEnabled();
2093 }
2094
2095 // FIXME: We have to be conservative here and assume that flat operations
2096 // will access scratch. If we had access to the IR function, then we
2097 // could determine if any private memory was used in the function.
2098 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2099 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2100 bool AlignedBy4 = Alignment >= Align(4);
2101 if (IsFast)
2102 *IsFast = AlignedBy4;
2103
2104 return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
2105 }
2106
2107 // So long as they are correct, wide global memory operations perform better
2108 // than multiple smaller memory ops -- even when misaligned.
2109 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
2110 if (IsFast)
2111 *IsFast = Size;
2112
2113 return Alignment >= Align(4) ||
2114 Subtarget->hasUnalignedBufferAccessEnabled();
2115 }
2116
2117 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
2118 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
2119 // out-of-bounds behavior, but in the edge case where an access starts
2120 // out-of-bounds and then enters in-bounds, the entire access would be treated
2121 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2122 // natural alignment of buffer accesses.
2123 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2124 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2125 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2126 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2127 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
2128 return false;
2129 }
2130
2131 // Values smaller than a dword must be aligned.
2132 if (Size < 32)
2133 return false;
2134
2135 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2136 // byte-address are ignored, thus forcing Dword alignment.
2137 // This applies to private, global, and constant memory.
2138 if (IsFast)
2139 *IsFast = 1;
2140
2141 return Size >= 32 && Alignment >= Align(4);
2142}
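// Illustrative speed-rank values produced above for LDS accesses when
// unaligned DS access is enabled (assuming a subtarget with ds_read_b96/b128
// and without the LDS misaligned-access bug):
//   Size 64, Alignment >= 4 -> *IsFast = 64 (ds_read_b64 or ds_read2_b32)
//   Size 64, Alignment <  4 -> *IsFast = 32 (underaligned, but one wide load
//                                            still beats several narrow ones)
//   Size 96, Alignment == 4 -> *IsFast = 1  (well below its natural rank of 96)
// The values are only ever compared against each other; they are not additive.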
2143
2145 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2146 unsigned *IsFast) const {
2148 Alignment, Flags, IsFast);
2149}
2150
2152 LLVMContext &Context, const MemOp &Op,
2153 const AttributeList &FuncAttributes) const {
2154 // FIXME: Should account for address space here.
2155
2156 // The default fallback uses the private pointer size as a guess for a type to
2157 // use. Make sure we switch these to 64-bit accesses.
2158
2159 if (Op.size() >= 16 &&
2160 Op.isDstAligned(Align(4))) // XXX: Should only do for global
2161 return MVT::v4i32;
2162
2163 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2164 return MVT::v2i32;
2165
2166 // Use the default.
2167 return MVT::Other;
2168}
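// Illustrative behaviour of the selection above for a hypothetical memcpy-style
// MemOp: a 32-byte copy whose destination is 4-byte aligned is widened to
// v4i32 (16-byte chunks), an 8..15-byte copy with the same alignment uses
// v2i32, and anything smaller or less aligned falls back to MVT::Other, i.e.
// the generic heuristic.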
2169
2171 const MemSDNode *MemNode = cast<MemSDNode>(N);
2172 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2173}
2174
2179
2181 unsigned DestAS) const {
2182 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2183 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2184 Subtarget->hasGloballyAddressableScratch()) {
2185 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2186 return false;
2187 }
2188
2189 // Flat -> private/local is a simple truncate.
2190 // Flat -> global is no-op
2191 return true;
2192 }
2193
2194 const GCNTargetMachine &TM =
2195 static_cast<const GCNTargetMachine &>(getTargetMachine());
2196 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2197}
2198
2206
2208 Type *Ty) const {
2209 // FIXME: Could be smarter if called for vector constants.
2210 return true;
2211}
2212
2214 unsigned Index) const {
2216 return false;
2217
2218 // TODO: Add more cases that are cheap.
2219 return Index == 0;
2220}
2221
2222bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2223 // TODO: This should be more aggressive, particular for 16-bit element
2224 // vectors. However there are some mixed improvements and regressions.
2225 EVT EltTy = VT.getVectorElementType();
2226 return EltTy.getSizeInBits() % 32 == 0;
2227}
2228
2230 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2231 switch (Op) {
2232 case ISD::LOAD:
2233 case ISD::STORE:
2234 return true;
2235 default:
2236 return false;
2237 }
2238 }
2239
2240 // SimplifySetCC uses this function to determine whether or not it should
2241 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2242 if (VT == MVT::i1 && Op == ISD::SETCC)
2243 return false;
2244
2246}
2247
2248SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2249 const SDLoc &SL,
2250 SDValue Chain,
2251 uint64_t Offset) const {
2252 const DataLayout &DL = DAG.getDataLayout();
2256
2257 auto [InputPtrReg, RC, ArgTy] =
2258 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2259
2260 // We may not have the kernarg segment argument if we have no kernel
2261 // arguments.
2262 if (!InputPtrReg)
2263 return DAG.getConstant(Offset, SL, PtrVT);
2264
2266 SDValue BasePtr = DAG.getCopyFromReg(
2267 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2268
2269 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2270}
2271
2272SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2273 const SDLoc &SL) const {
2276 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2277}
2278
2279SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2280 const SDLoc &SL) const {
2281
2283 std::optional<uint32_t> KnownSize =
2285 if (KnownSize.has_value())
2286 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2287 return SDValue();
2288}
2289
2290SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2291 const SDLoc &SL, SDValue Val,
2292 bool Signed,
2293 const ISD::InputArg *Arg) const {
2294 // First, if it is a widened vector, narrow it.
2295 if (VT.isVector() &&
2297 EVT NarrowedVT =
2300 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2301 DAG.getConstant(0, SL, MVT::i32));
2302 }
2303
2304 // Then convert the vector elements or scalar value.
2305 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2306 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2307 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2308 }
2309
2310 if (MemVT.isFloatingPoint())
2311 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2312 else if (Signed)
2313 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2314 else
2315 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2316
2317 return Val;
2318}
2319
2320SDValue SITargetLowering::lowerKernargMemParameter(
2321 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2322 uint64_t Offset, Align Alignment, bool Signed,
2323 const ISD::InputArg *Arg) const {
2324 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2325
2326 // Try to avoid using an extload by loading earlier than the argument address,
2327 // and extracting the relevant bits. The load should hopefully be merged with
2328 // the previous argument.
2329 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2330 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2331 int64_t AlignDownOffset = alignDown(Offset, 4);
2332 int64_t OffsetDiff = Offset - AlignDownOffset;
2333
2334 EVT IntVT = MemVT.changeTypeToInteger();
2335
2336 // TODO: If we passed in the base kernel offset we could have a better
2337 // alignment than 4, but we don't really need it.
2338 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2339 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2342
2343 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2344 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2345
2346 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2347 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2348 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2349
2350 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2351 }
2352
2353 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2354 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2357
2358 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2359 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2360}
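// Worked example (hypothetical offsets) of the small-type path above: an i16
// kernarg at byte Offset = 2 with Alignment = 2 gives AlignDownOffset = 0 and
// OffsetDiff = 2, so a single dword-aligned i32 load is issued at offset 0,
// shifted right by OffsetDiff * 8 = 16 bits, truncated and bitcast to the i16
// memory type, and finally run through convertArgType, avoiding an extload.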
2361
2362/// Coerce an argument which was passed in a different ABI type to the original
2363/// expected value type.
2364SDValue SITargetLowering::convertABITypeToValueType(SelectionDAG &DAG,
2365 SDValue Val,
2366 CCValAssign &VA,
2367 const SDLoc &SL) const {
2368 EVT ValVT = VA.getValVT();
2369
2370 // If this is an 8 or 16-bit value, it is really passed promoted
2371 // to 32 bits. Insert an assert[sz]ext to capture this, then
2372 // truncate to the right size.
2373 switch (VA.getLocInfo()) {
2374 case CCValAssign::Full:
2375 return Val;
2376 case CCValAssign::BCvt:
2377 return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);
2378 case CCValAssign::SExt:
2379 Val = DAG.getNode(ISD::AssertSext, SL, VA.getLocVT(), Val,
2380 DAG.getValueType(ValVT));
2381 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2382 case CCValAssign::ZExt:
2383 Val = DAG.getNode(ISD::AssertZext, SL, VA.getLocVT(), Val,
2384 DAG.getValueType(ValVT));
2385 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2386 case CCValAssign::AExt:
2387 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2388 default:
2389 llvm_unreachable("Unknown loc info!");
2390 }
2391}
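// Illustrative example of the SExt case above: an i16 value promoted to an i32
// location is lowered as
//   AssertSext i32 (ValueType i16)  ->  TRUNCATE to i16
// which records the known sign extension for the DAG before narrowing back to
// the original value type; ZExt and AExt follow the same pattern with
// AssertZext or a plain truncate.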
2392
2393SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2394 CCValAssign &VA, const SDLoc &SL,
2395 SDValue Chain,
2396 const ISD::InputArg &Arg) const {
2397 MachineFunction &MF = DAG.getMachineFunction();
2398 MachineFrameInfo &MFI = MF.getFrameInfo();
2399
2400 if (Arg.Flags.isByVal()) {
2401 unsigned Size = Arg.Flags.getByValSize();
2402 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2403 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2404 }
2405
2406 unsigned ArgOffset = VA.getLocMemOffset();
2407 unsigned ArgSize = VA.getValVT().getStoreSize();
2408
2409 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2410
2411 // Create load nodes to retrieve arguments from the stack.
2412 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2413
2414 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2416 MVT MemVT = VA.getValVT();
2417
2418 switch (VA.getLocInfo()) {
2419 default:
2420 break;
2421 case CCValAssign::BCvt:
2422 MemVT = VA.getLocVT();
2423 break;
2424 case CCValAssign::SExt:
2425 ExtType = ISD::SEXTLOAD;
2426 break;
2427 case CCValAssign::ZExt:
2428 ExtType = ISD::ZEXTLOAD;
2429 break;
2430 case CCValAssign::AExt:
2431 ExtType = ISD::EXTLOAD;
2432 break;
2433 }
2434
2435 SDValue ArgValue = DAG.getExtLoad(
2436 ExtType, SL, VA.getLocVT(), Chain, FIN,
2438
2439 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2440 if (ConvertedVal == ArgValue)
2441 return ConvertedVal;
2442
2443 return DAG.getMergeValues({ConvertedVal, ArgValue.getValue(1)}, SL);
2444}
2445
2446SDValue SITargetLowering::lowerWorkGroupId(
2447 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2450 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
2451 if (!Subtarget->hasClusters())
2452 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2453
2454 // Clusters are supported. Return the global position in the grid. If clusters
2455 // are enabled, WorkGroupIdPV returns the cluster ID, not the workgroup ID.
2456
2457 // WorkGroupIdXYZ = ClusterId == 0 ?
2458 // ClusterIdXYZ :
2459 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
2460 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2461 SDLoc SL(ClusterIdXYZ);
2462 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2463 SDValue One = DAG.getConstant(1, SL, VT);
2464 SDValue ClusterSizeXYZ = DAG.getNode(ISD::ADD, SL, VT, ClusterMaxIdXYZ, One);
2465 SDValue ClusterWorkGroupIdXYZ =
2466 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2467 SDValue GlobalIdXYZ =
2468 DAG.getNode(ISD::ADD, SL, VT, ClusterWorkGroupIdXYZ,
2469 DAG.getNode(ISD::MUL, SL, VT, ClusterIdXYZ, ClusterSizeXYZ));
2470
2471 switch (MFI.getClusterDims().getKind()) {
2474 return GlobalIdXYZ;
2476 return ClusterIdXYZ;
2478 using namespace AMDGPU::Hwreg;
2479 SDValue ClusterIdField =
2480 DAG.getTargetConstant(HwregEncoding::encode(ID_IB_STS2, 6, 4), SL, VT);
2481 SDNode *GetReg =
2482 DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2483 SDValue ClusterId(GetReg, 0);
2484 SDValue Zero = DAG.getConstant(0, SL, VT);
2485 return DAG.getNode(ISD::SELECT_CC, SL, VT, ClusterId, Zero, ClusterIdXYZ,
2486 GlobalIdXYZ, DAG.getCondCode(ISD::SETEQ));
2487 }
2488 }
2489
2490 llvm_unreachable("nothing should reach here");
2491}
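// Worked example (illustrative numbers) of the formula above: with
// ClusterIdX = 2, ClusterMaxIdX = 3 (four workgroups per cluster in X) and
// ClusterWorkGroupIdX = 1, the lowering computes
//   WorkGroupIdX = 2 * (3 + 1) + 1 = 9,
// i.e. the flat workgroup position in the grid. When the dispatch uses no
// clusters (ClusterId == 0 in the S_GETREG result), ClusterIdXYZ already holds
// the workgroup ID and is selected unchanged.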
2492
2493SDValue SITargetLowering::getPreloadedValue(
2494 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2496 const ArgDescriptor *Reg = nullptr;
2497 const TargetRegisterClass *RC;
2498 LLT Ty;
2499
2501 const ArgDescriptor WorkGroupIDX =
2502 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2503 // If GridZ is not programmed in an entry function then the hardware will set
2504 // it to all zeros, so there is no need to mask the GridY value in the low
2505 // order bits.
2506 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2507 AMDGPU::TTMP7,
2508 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2509 const ArgDescriptor WorkGroupIDZ =
2510 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2511 const ArgDescriptor ClusterWorkGroupIDX =
2512 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
2513 const ArgDescriptor ClusterWorkGroupIDY =
2514 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
2515 const ArgDescriptor ClusterWorkGroupIDZ =
2516 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
2517 const ArgDescriptor ClusterWorkGroupMaxIDX =
2518 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
2519 const ArgDescriptor ClusterWorkGroupMaxIDY =
2520 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
2521 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2522 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
2523 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2524 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
2525
2526 auto LoadConstant = [&](unsigned N) {
2527 return DAG.getConstant(N, SDLoc(), VT);
2528 };
2529
2530 if (Subtarget->hasArchitectedSGPRs() &&
2532 AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims();
2533 bool HasFixedDims = ClusterDims.isFixedDims();
2534
2535 switch (PVID) {
2537 Reg = &WorkGroupIDX;
2538 RC = &AMDGPU::SReg_32RegClass;
2539 Ty = LLT::scalar(32);
2540 break;
2542 Reg = &WorkGroupIDY;
2543 RC = &AMDGPU::SReg_32RegClass;
2544 Ty = LLT::scalar(32);
2545 break;
2547 Reg = &WorkGroupIDZ;
2548 RC = &AMDGPU::SReg_32RegClass;
2549 Ty = LLT::scalar(32);
2550 break;
2552 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
2553 return LoadConstant(0);
2554 Reg = &ClusterWorkGroupIDX;
2555 RC = &AMDGPU::SReg_32RegClass;
2556 Ty = LLT::scalar(32);
2557 break;
2559 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
2560 return LoadConstant(0);
2561 Reg = &ClusterWorkGroupIDY;
2562 RC = &AMDGPU::SReg_32RegClass;
2563 Ty = LLT::scalar(32);
2564 break;
2566 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
2567 return LoadConstant(0);
2568 Reg = &ClusterWorkGroupIDZ;
2569 RC = &AMDGPU::SReg_32RegClass;
2570 Ty = LLT::scalar(32);
2571 break;
2573 if (HasFixedDims)
2574 return LoadConstant(ClusterDims.getDims()[0] - 1);
2575 Reg = &ClusterWorkGroupMaxIDX;
2576 RC = &AMDGPU::SReg_32RegClass;
2577 Ty = LLT::scalar(32);
2578 break;
2580 if (HasFixedDims)
2581 return LoadConstant(ClusterDims.getDims()[1] - 1);
2582 Reg = &ClusterWorkGroupMaxIDY;
2583 RC = &AMDGPU::SReg_32RegClass;
2584 Ty = LLT::scalar(32);
2585 break;
2587 if (HasFixedDims)
2588 return LoadConstant(ClusterDims.getDims()[2] - 1);
2589 Reg = &ClusterWorkGroupMaxIDZ;
2590 RC = &AMDGPU::SReg_32RegClass;
2591 Ty = LLT::scalar(32);
2592 break;
2594 Reg = &ClusterWorkGroupMaxFlatID;
2595 RC = &AMDGPU::SReg_32RegClass;
2596 Ty = LLT::scalar(32);
2597 break;
2598 default:
2599 break;
2600 }
2601 }
2602
2603 if (!Reg)
2604 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2605 if (!Reg) {
2607 // It's possible for a kernarg intrinsic call to appear in a kernel with
2608 // no allocated segment, in which case we do not add the user sgpr
2609 // argument, so just return null.
2610 return DAG.getConstant(0, SDLoc(), VT);
2611 }
2612
2613 // It's undefined behavior if a function marked with the amdgpu-no-*
2614 // attributes uses the corresponding intrinsic.
2615 return DAG.getPOISON(VT);
2616 }
2617
2618 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2619}
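// Illustrative decode (not part of the lowering) of the TTMP6 bitfields used
// by the ArgDescriptors above when architected SGPRs carry cluster info:
//   bits [ 3: 0] ClusterWorkGroupIDX      bits [19:16] ClusterWorkGroupMaxIDY
//   bits [ 7: 4] ClusterWorkGroupIDY      bits [23:20] ClusterWorkGroupMaxIDZ
//   bits [11: 8] ClusterWorkGroupIDZ      bits [27:24] ClusterWorkGroupMaxFlatID
//   bits [15:12] ClusterWorkGroupMaxIDX
// matching the 0x0000000F .. 0x0F000000 masks passed to createRegister.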
2620
2622 CallingConv::ID CallConv,
2623 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2624 FunctionType *FType,
2626 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2627 const ISD::InputArg *Arg = &Ins[I];
2628
2629 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2630 "vector type argument should have been split");
2631
2632 // First check if it's a PS input addr.
2633 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2634 PSInputNum <= 15) {
2635 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2636
2637 // Inconveniently only the first part of the split is marked as isSplit,
2638 // so skip to the end. We only want to increment PSInputNum once for the
2639 // entire split argument.
2640 if (Arg->Flags.isSplit()) {
2641 while (!Arg->Flags.isSplitEnd()) {
2642 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2643 "unexpected vector split in ps argument type");
2644 if (!SkipArg)
2645 Splits.push_back(*Arg);
2646 Arg = &Ins[++I];
2647 }
2648 }
2649
2650 if (SkipArg) {
2651 // We can safely skip PS inputs.
2652 Skipped.set(Arg->getOrigArgIndex());
2653 ++PSInputNum;
2654 continue;
2655 }
2656
2657 Info->markPSInputAllocated(PSInputNum);
2658 if (Arg->Used)
2659 Info->markPSInputEnabled(PSInputNum);
2660
2661 ++PSInputNum;
2662 }
2663
2664 Splits.push_back(*Arg);
2665 }
2666}
2667
2668// Allocate special inputs passed in VGPRs.
2670 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2671 SIMachineFunctionInfo &Info) const {
2672 const LLT S32 = LLT::scalar(32);
2674
2675 if (Info.hasWorkItemIDX()) {
2676 Register Reg = AMDGPU::VGPR0;
2677 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2678
2679 CCInfo.AllocateReg(Reg);
2680 unsigned Mask =
2681 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2682 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2683 }
2684
2685 if (Info.hasWorkItemIDY()) {
2686 assert(Info.hasWorkItemIDX());
2687 if (Subtarget->hasPackedTID()) {
2688 Info.setWorkItemIDY(
2689 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2690 } else {
2691 unsigned Reg = AMDGPU::VGPR1;
2692 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2693
2694 CCInfo.AllocateReg(Reg);
2695 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2696 }
2697 }
2698
2699 if (Info.hasWorkItemIDZ()) {
2700 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2701 if (Subtarget->hasPackedTID()) {
2702 Info.setWorkItemIDZ(
2703 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2704 } else {
2705 unsigned Reg = AMDGPU::VGPR2;
2706 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2707
2708 CCInfo.AllocateReg(Reg);
2709 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2710 }
2711 }
2712}
2713
2714 // Try to allocate a VGPR at the end of the argument list, or, if no argument
2715 // VGPRs are left, allocate a stack slot instead.
2716 // If \p Mask is given, it indicates the bitfield position in the register.
2717 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2718static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2719 ArgDescriptor Arg = ArgDescriptor()) {
2720 if (Arg.isSet())
2721 return ArgDescriptor::createArg(Arg, Mask);
2722
2723 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2724 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2725 if (RegIdx == ArgVGPRs.size()) {
2726 // Spill to stack required.
2727 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2728
2729 return ArgDescriptor::createStack(Offset, Mask);
2730 }
2731
2732 unsigned Reg = ArgVGPRs[RegIdx];
2733 Reg = CCInfo.AllocateReg(Reg);
2734 assert(Reg != AMDGPU::NoRegister);
2735
2736 MachineFunction &MF = CCInfo.getMachineFunction();
2737 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2738 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2739 return ArgDescriptor::createRegister(Reg, Mask);
2740}
2741
2743 const TargetRegisterClass *RC,
2744 unsigned NumArgRegs) {
2745 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2746 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2747 if (RegIdx == ArgSGPRs.size())
2748 report_fatal_error("ran out of SGPRs for arguments");
2749
2750 unsigned Reg = ArgSGPRs[RegIdx];
2751 Reg = CCInfo.AllocateReg(Reg);
2752 assert(Reg != AMDGPU::NoRegister);
2753
2754 MachineFunction &MF = CCInfo.getMachineFunction();
2755 MF.addLiveIn(Reg, RC);
2757}
2758
2759// If this has a fixed position, we still should allocate the register in the
2760// CCInfo state. Technically we could get away with this for values passed
2761// outside of the normal argument range.
2763 const TargetRegisterClass *RC,
2764 MCRegister Reg) {
2765 Reg = CCInfo.AllocateReg(Reg);
2766 assert(Reg != AMDGPU::NoRegister);
2767 MachineFunction &MF = CCInfo.getMachineFunction();
2768 MF.addLiveIn(Reg, RC);
2769}
2770
2771static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2772 if (Arg) {
2773 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2774 Arg.getRegister());
2775 } else
2776 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2777}
2778
2779static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2780 if (Arg) {
2781 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2782 Arg.getRegister());
2783 } else
2784 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2785}
2786
2787/// Allocate implicit function VGPR arguments at the end of allocated user
2788/// arguments.
2790 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2791 SIMachineFunctionInfo &Info) const {
2792 const unsigned Mask = 0x3ff;
2793 ArgDescriptor Arg;
2794
2795 if (Info.hasWorkItemIDX()) {
2796 Arg = allocateVGPR32Input(CCInfo, Mask);
2797 Info.setWorkItemIDX(Arg);
2798 }
2799
2800 if (Info.hasWorkItemIDY()) {
2801 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2802 Info.setWorkItemIDY(Arg);
2803 }
2804
2805 if (Info.hasWorkItemIDZ())
2806 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2807}
2808
2809/// Allocate implicit function VGPR arguments in fixed registers.
2811 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2812 SIMachineFunctionInfo &Info) const {
2813 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2814 if (!Reg)
2815 report_fatal_error("failed to allocate VGPR for implicit arguments");
2816
2817 const unsigned Mask = 0x3ff;
2818 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2819 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2820 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2821}
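// Illustrative layout of the packed workitem IDs set up above: all three IDs
// share VGPR31, with X in bits [9:0], Y in bits [19:10] and Z in bits [29:20],
// matching the Mask, Mask << 10 and Mask << 20 descriptors.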
2822
2824 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2825 SIMachineFunctionInfo &Info) const {
2826 auto &ArgInfo = Info.getArgInfo();
2827 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2828
2829 // TODO: Unify handling with private memory pointers.
2830 if (UserSGPRInfo.hasDispatchPtr())
2831 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2832
2833 if (UserSGPRInfo.hasQueuePtr())
2834 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2835
2836 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2837 // constant offset from the kernarg segment.
2838 if (Info.hasImplicitArgPtr())
2839 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2840
2841 if (UserSGPRInfo.hasDispatchID())
2842 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2843
2844 // flat_scratch_init is not applicable for non-kernel functions.
2845
2846 if (Info.hasWorkGroupIDX())
2847 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2848
2849 if (Info.hasWorkGroupIDY())
2850 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2851
2852 if (Info.hasWorkGroupIDZ())
2853 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2854
2855 if (Info.hasLDSKernelId())
2856 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2857}
2858
2859// Allocate special inputs passed in user SGPRs.
2861 MachineFunction &MF,
2862 const SIRegisterInfo &TRI,
2863 SIMachineFunctionInfo &Info) const {
2864 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2865 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2866 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2867 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2868 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2869 }
2870
2871 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2872 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2873 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2874 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2875 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2876 }
2877
2878 if (UserSGPRInfo.hasDispatchPtr()) {
2879 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2880 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2881 CCInfo.AllocateReg(DispatchPtrReg);
2882 }
2883
2884 if (UserSGPRInfo.hasQueuePtr()) {
2885 Register QueuePtrReg = Info.addQueuePtr(TRI);
2886 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2887 CCInfo.AllocateReg(QueuePtrReg);
2888 }
2889
2890 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2892 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2893 CCInfo.AllocateReg(InputPtrReg);
2894
2895 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2896 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2897 }
2898
2899 if (UserSGPRInfo.hasDispatchID()) {
2900 Register DispatchIDReg = Info.addDispatchID(TRI);
2901 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2902 CCInfo.AllocateReg(DispatchIDReg);
2903 }
2904
2905 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2906 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2907 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2908 CCInfo.AllocateReg(FlatScratchInitReg);
2909 }
2910
2911 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2912 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2913 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2914 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2915 }
2916
2917 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2918 // these from the dispatch pointer.
2919}
2920
2921 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2922 // sequential, starting from the first argument.
2924 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2926 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2927 Function &F = MF.getFunction();
2928 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2929 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2930 bool InPreloadSequence = true;
2931 unsigned InIdx = 0;
2932 bool AlignedForImplictArgs = false;
2933 unsigned ImplicitArgOffset = 0;
2934 for (auto &Arg : F.args()) {
2935 if (!InPreloadSequence || !Arg.hasInRegAttr())
2936 break;
2937
2938 unsigned ArgIdx = Arg.getArgNo();
2939 // Don't preload non-original args or parts not in the current preload
2940 // sequence.
2941 if (InIdx < Ins.size() &&
2942 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2943 break;
2944
2945 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2946 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2947 InIdx++) {
2948 assert(ArgLocs[ArgIdx].isMemLoc());
2949 auto &ArgLoc = ArgLocs[InIdx];
2950 const Align KernelArgBaseAlign = Align(16);
2951 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2952 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2953 unsigned NumAllocSGPRs =
2954 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2955
2956 // Fix alignment for hidden arguments.
2957 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2958 if (!AlignedForImplictArgs) {
2959 ImplicitArgOffset =
2960 alignTo(LastExplicitArgOffset,
2961 Subtarget->getAlignmentForImplicitArgPtr()) -
2962 LastExplicitArgOffset;
2963 AlignedForImplictArgs = true;
2964 }
2965 ArgOffset += ImplicitArgOffset;
2966 }
2967
2968 // Arg is preloaded into the previous SGPR.
2969 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2970 assert(InIdx >= 1 && "No previous SGPR");
2971 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2972 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2973 continue;
2974 }
2975
2976 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2977 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2978 // Check for free user SGPRs for preloading.
2979 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2980 InPreloadSequence = false;
2981 break;
2982 }
2983
2984 // Preload this argument.
2985 const TargetRegisterClass *RC =
2986 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2987 SmallVectorImpl<MCRegister> *PreloadRegs =
2988 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2989
2990 if (PreloadRegs->size() > 1)
2991 RC = &AMDGPU::SGPR_32RegClass;
2992 for (auto &Reg : *PreloadRegs) {
2993 assert(Reg);
2994 MF.addLiveIn(Reg, RC);
2995 CCInfo.AllocateReg(Reg);
2996 }
2997
2998 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2999 }
3000 }
3001}
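// Worked example (hypothetical offsets) of the padding computation above: if
// the previous explicit argument ended at LastExplicitArgOffset = 4 and the
// next preloadable argument sits at ArgOffset = 12, then Padding = 8 bytes and
// PaddingSGPRs = 2, so two user SGPRs are accounted for as padding before the
// argument's own NumAllocSGPRs are reserved. Preloading stops as soon as
// PaddingSGPRs + NumAllocSGPRs would exceed the free user-SGPR budget.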
3002
3004 const SIRegisterInfo &TRI,
3005 SIMachineFunctionInfo &Info) const {
3006 // Always allocate this last since it is a synthetic preload.
3007 if (Info.hasLDSKernelId()) {
3008 Register Reg = Info.addLDSKernelId();
3009 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3010 CCInfo.AllocateReg(Reg);
3011 }
3012}
3013
3014// Allocate special input registers that are initialized per-wave.
3017 CallingConv::ID CallConv,
3018 bool IsShader) const {
3019 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3020 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
3021 // Note: user SGPRs are handled by the front-end for graphics shaders
3022 // Pad up the used user SGPRs with dead inputs.
3023
3024 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
3025 // before enabling architected SGPRs for workgroup IDs.
3026 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
3027
3028 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3029 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
3030 // rely on it to reach 16 since if we end up having no stack usage, it will
3031 // not really be added.
3032 unsigned NumRequiredSystemSGPRs =
3033 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3034 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3035 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3036 Register Reg = Info.addReservedUserSGPR();
3037 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3038 CCInfo.AllocateReg(Reg);
3039 }
3040 }
3041
3042 if (!HasArchitectedSGPRs) {
3043 if (Info.hasWorkGroupIDX()) {
3044 Register Reg = Info.addWorkGroupIDX();
3045 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3046 CCInfo.AllocateReg(Reg);
3047 }
3048
3049 if (Info.hasWorkGroupIDY()) {
3050 Register Reg = Info.addWorkGroupIDY();
3051 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3052 CCInfo.AllocateReg(Reg);
3053 }
3054
3055 if (Info.hasWorkGroupIDZ()) {
3056 Register Reg = Info.addWorkGroupIDZ();
3057 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3058 CCInfo.AllocateReg(Reg);
3059 }
3060 }
3061
3062 if (Info.hasWorkGroupInfo()) {
3063 Register Reg = Info.addWorkGroupInfo();
3064 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3065 CCInfo.AllocateReg(Reg);
3066 }
3067
3068 if (Info.hasPrivateSegmentWaveByteOffset()) {
3069 // Scratch wave offset passed in system SGPR.
3070 unsigned PrivateSegmentWaveByteOffsetReg;
3071
3072 if (IsShader) {
3073 PrivateSegmentWaveByteOffsetReg =
3074 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3075
3076 // This is true if the scratch wave byte offset doesn't have a fixed
3077 // location.
3078 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3079 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
3080 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3081 }
3082 } else
3083 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3084
3085 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3086 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
3087 }
3088
3089 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
3090 Info.getNumPreloadedSGPRs() >= 16);
3091}
3092
3094 MachineFunction &MF,
3095 const SIRegisterInfo &TRI,
3097 // Now that we've figured out where the scratch register inputs are, see if
3098 // we should reserve the arguments and use them directly.
3099 MachineFrameInfo &MFI = MF.getFrameInfo();
3100 bool HasStackObjects = MFI.hasStackObjects();
3101 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3102
3103 // Record that we know we have non-spill stack objects so we don't need to
3104 // check all stack objects later.
3105 if (HasStackObjects)
3106 Info.setHasNonSpillStackObjects(true);
3107
3108 // Everything live out of a block is spilled with fast regalloc, so it's
3109 // almost certain that spilling will be required.
3110 if (TM.getOptLevel() == CodeGenOptLevel::None)
3111 HasStackObjects = true;
3112
3113 // For now assume stack access is needed in any callee functions, so we need
3114 // to pass in the scratch registers.
3115 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
3116
3117 if (!ST.enableFlatScratch()) {
3118 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
3119 // If we have stack objects, we unquestionably need the private buffer
3120 // resource. For the Code Object V2 ABI, this will be the first 4 user
3121 // SGPR inputs. We can reserve those and use them directly.
3122
3123 Register PrivateSegmentBufferReg =
3125 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3126 } else {
3127 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
3128 // We tentatively reserve the last available registers (skipping those
3129 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
3130 // we'll replace these with the ones immediately after those which were
3131 // really allocated. In the prologue, copies will be inserted from the
3132 // argument to these reserved registers.
3133
3134 // Without HSA, relocations are used for the scratch pointer and the
3135 // buffer resource setup is always inserted in the prologue. Scratch wave
3136 // offset is still in an input SGPR.
3137 Info.setScratchRSrcReg(ReservedBufferReg);
3138 }
3139 }
3140
3142
3143 // For entry functions we have to set up the stack pointer if we use it,
3144 // whereas non-entry functions get this "for free". This means there is no
3145 // intrinsic advantage to using S32 over S34 in cases where we do not have
3146 // calls but do need a frame pointer (i.e. if we are requested to have one
3147 // because frame pointer elimination is disabled). To keep things simple we
3148 // only ever use S32 as the call ABI stack pointer, and so using it does not
3149 // imply we need a separate frame pointer.
3150 //
3151 // Try to use s32 as the SP, but move it if it would interfere with input
3152 // arguments. This won't work with calls though.
3153 //
3154 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
3155 // registers.
3156 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
3157 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3158 } else {
3160
3161 if (MFI.hasCalls())
3162 report_fatal_error("call in graphics shader with too many input SGPRs");
3163
3164 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3165 if (!MRI.isLiveIn(Reg)) {
3166 Info.setStackPtrOffsetReg(Reg);
3167 break;
3168 }
3169 }
3170
3171 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3172 report_fatal_error("failed to find register for SP");
3173 }
3174
3175 // hasFP should be accurate for entry functions even before the frame is
3176 // finalized, because it does not rely on the known stack size, only
3177 // properties like whether variable sized objects are present.
3178 if (ST.getFrameLowering()->hasFP(MF)) {
3179 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3180 }
3181}
3182
3185 return !Info->isEntryFunction();
3186}
3187
3189
3191 MachineBasicBlock *Entry,
3192 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
3194
3195 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3196 if (!IStart)
3197 return;
3198
3199 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3200 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
3201 MachineBasicBlock::iterator MBBI = Entry->begin();
3202 for (const MCPhysReg *I = IStart; *I; ++I) {
3203 const TargetRegisterClass *RC = nullptr;
3204 if (AMDGPU::SReg_64RegClass.contains(*I))
3205 RC = &AMDGPU::SGPR_64RegClass;
3206 else if (AMDGPU::SReg_32RegClass.contains(*I))
3207 RC = &AMDGPU::SGPR_32RegClass;
3208 else
3209 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3210
3211 Register NewVR = MRI->createVirtualRegister(RC);
3212 // Create copy from CSR to a virtual register.
3213 Entry->addLiveIn(*I);
3214 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
3215 .addReg(*I);
3216
3217 // Insert the copy-back instructions right before the terminator.
3218 for (auto *Exit : Exits)
3219 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
3220 TII->get(TargetOpcode::COPY), *I)
3221 .addReg(NewVR);
3222 }
3223}
3224
3226 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3227 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3228 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3230
3232 const Function &Fn = MF.getFunction();
3235 bool IsError = false;
3236
3237 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
3239 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3240 IsError = true;
3241 }
3242
3245 BitVector Skipped(Ins.size());
3246 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3247 *DAG.getContext());
3248
3249 bool IsGraphics = AMDGPU::isGraphics(CallConv);
3250 bool IsKernel = AMDGPU::isKernel(CallConv);
3251 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
3252
3253 if (IsGraphics) {
3254 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3255 assert(!UserSGPRInfo.hasDispatchPtr() &&
3256 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3257 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3258 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3259 (void)UserSGPRInfo;
3260 if (!Subtarget->enableFlatScratch())
3261 assert(!UserSGPRInfo.hasFlatScratchInit());
3262 if ((CallConv != CallingConv::AMDGPU_CS &&
3263 CallConv != CallingConv::AMDGPU_Gfx &&
3264 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3265 !Subtarget->hasArchitectedSGPRs())
3266 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3267 !Info->hasWorkGroupIDZ());
3268 }
3269
3270 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3271
3272 if (CallConv == CallingConv::AMDGPU_PS) {
3273 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3274
3275 // At least one interpolation mode must be enabled or else the GPU will
3276 // hang.
3277 //
3278 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3279 // set PSInputAddr, the user wants to enable some bits after the compilation
3280 // based on run-time states. Since we can't know what the final PSInputEna
3282 // will look like, we shouldn't do anything here; the user should take
3282 // responsibility for the correct programming.
3283 //
3284 // Otherwise, the following restrictions apply:
3285 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3286 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3287 // enabled too.
3288 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3289 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3290 CCInfo.AllocateReg(AMDGPU::VGPR0);
3291 CCInfo.AllocateReg(AMDGPU::VGPR1);
3292 Info->markPSInputAllocated(0);
3293 Info->markPSInputEnabled(0);
3294 }
3295 if (Subtarget->isAmdPalOS()) {
3296 // For isAmdPalOS, the user does not enable some bits after compilation
3297 // based on run-time states; the register values being generated here are
3298 // the final ones set in hardware. Therefore we need to apply the
3299 // workaround to PSInputAddr and PSInputEnable together. (The case where
3300 // a bit is set in PSInputAddr but not PSInputEnable is where the
3301 // frontend set up an input arg for a particular interpolation mode, but
3302 // nothing uses that input arg. Really we should have an earlier pass
3303 // that removes such an arg.)
3304 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3305 if ((PsInputBits & 0x7F) == 0 ||
3306 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3307 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
3308 }
3309 } else if (IsKernel) {
3310 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3311 } else {
3312 Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3313 Ins.end());
3314 }
3315
3316 if (IsKernel)
3317 analyzeFormalArgumentsCompute(CCInfo, Ins);
3318
3319 if (IsEntryFunc) {
3320 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
3321 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
3322 if (IsKernel && Subtarget->hasKernargPreload())
3323 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
3324
3325 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
3326 } else if (!IsGraphics) {
3327 // For the fixed ABI, pass workitem IDs in the last argument register.
3328 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3329
3330 // FIXME: Sink this into allocateSpecialInputSGPRs
3331 if (!Subtarget->enableFlatScratch())
3332 CCInfo.AllocateReg(Info->getScratchRSrcReg());
3333
3334 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3335 }
3336
3337 if (!IsKernel) {
3338 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3339 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3340
3341 // This assumes the registers are allocated by CCInfo in ascending order
3342 // with no gaps.
3343 Info->setNumWaveDispatchSGPRs(
3344 CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3345 Info->setNumWaveDispatchVGPRs(
3346 CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3347 } else if (Info->getNumKernargPreloadedSGPRs()) {
3348 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3349 }
3350
3352
3353 if (IsWholeWaveFunc) {
3355 {MVT::i1, MVT::Other}, Chain);
3356 InVals.push_back(Setup.getValue(0));
3357 Chains.push_back(Setup.getValue(1));
3358 }
3359
3360 // FIXME: This is the minimum kernel argument alignment. We should improve
3361 // this to the maximum alignment of the arguments.
3362 //
3363 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3364 // kern arg offset.
3365 const Align KernelArgBaseAlign = Align(16);
3366
3367 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3368 ++i) {
3369 const ISD::InputArg &Arg = Ins[i];
3370 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3371 InVals.push_back(DAG.getPOISON(Arg.VT));
3372 continue;
3373 }
3374
3375 CCValAssign &VA = ArgLocs[ArgIdx++];
3376 MVT VT = VA.getLocVT();
3377
3378 if (IsEntryFunc && VA.isMemLoc()) {
3379 VT = Ins[i].VT;
3380 EVT MemVT = VA.getLocVT();
3381
3382 const uint64_t Offset = VA.getLocMemOffset();
3383 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
3384
3385 if (Arg.Flags.isByRef()) {
3386 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3387
3388 const GCNTargetMachine &TM =
3389 static_cast<const GCNTargetMachine &>(getTargetMachine());
3390 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3391 Arg.Flags.getPointerAddrSpace())) {
3394 }
3395
3396 InVals.push_back(Ptr);
3397 continue;
3398 }
3399
3400 SDValue NewArg;
3401 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3402 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3403 // In this case the argument is packed into the previous preload SGPR.
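// E.g. a 2-byte argument at kernarg offset 6: AlignDownOffset = 4,
// OffsetDiff = 2, so the value is recovered from the SGPR holding bytes
// [4,8) as (SGPR >> 16), truncated to 16 bits and bitcast to MemVT.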
3404 int64_t AlignDownOffset = alignDown(Offset, 4);
3405 int64_t OffsetDiff = Offset - AlignDownOffset;
3406 EVT IntVT = MemVT.changeTypeToInteger();
3407
3408 const SIMachineFunctionInfo *Info =
3411 Register Reg =
3412 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3413
3414 assert(Reg);
3415 Register VReg = MRI.getLiveInVirtReg(Reg);
3416 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3417
3418 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3419 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3420
3421 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3422 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3423 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3424 Ins[i].Flags.isSExt(), &Ins[i]);
3425
3426 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3427 } else {
3428 const SIMachineFunctionInfo *Info =
3431 const SmallVectorImpl<MCRegister> &PreloadRegs =
3432 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3433
3434 SDValue Copy;
3435 if (PreloadRegs.size() == 1) {
3436 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3437 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3438 NewArg = DAG.getCopyFromReg(
3439 Chain, DL, VReg,
3441 TRI->getRegSizeInBits(*RC)));
3442
3443 } else {
3444 // If the kernarg alignment does not match the alignment of the SGPR
3445 // tuple RC that can accommodate this argument, it will be built up
3446 // via copies from the individual SGPRs that the argument was
3447 // preloaded to.
3449 for (auto Reg : PreloadRegs) {
3450 Register VReg = MRI.getLiveInVirtReg(Reg);
3451 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3452 Elts.push_back(Copy);
3453 }
3454 NewArg =
3455 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3456 PreloadRegs.size()),
3457 DL, Elts);
3458 }
3459
3460 // If the argument was preloaded to multiple consecutive 32-bit
3461 // registers because of misalignment between addressable SGPR tuples
3462 // and the argument size, we can still assume, because of kernarg
3463 // segment alignment restrictions, that NewArg's size is the same as
3464 // MemVT and just do a bitcast. If MemVT is less than 32 bits we add a
3465 // truncate since we cannot preload to less than a single SGPR and the
3466 // MemVT may be smaller.
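// E.g. an i16 argument preloaded into a full SGPR comes back as an i32
// copy, is truncated to i16 and then bitcast to MemVT, while a 96-bit
// argument spread across three SGPRs is rebuilt as a v3i32 build_vector
// and bitcast directly with no truncate.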
3467 EVT MemVTInt =
3469 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3470 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3471
3472 NewArg = DAG.getBitcast(MemVT, NewArg);
3473 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3474 Ins[i].Flags.isSExt(), &Ins[i]);
3475 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3476 }
3477 } else {
3478 // Hidden arguments that are in the kernel signature must be preloaded
3479 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3480 // the argument list and is not preloaded.
3481 if (Arg.isOrigArg()) {
3482 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3483 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3485 *OrigArg->getParent(),
3486 "hidden argument in kernel signature was not preloaded",
3487 DL.getDebugLoc()));
3488 }
3489 }
3490
3491 NewArg =
3492 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3493 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3494 }
3495 Chains.push_back(NewArg.getValue(1));
3496
3497 auto *ParamTy =
3498 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3499 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3500 ParamTy &&
3501 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3502 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3503 // On SI, local pointers are just offsets into LDS, so they always
3504 // fit in 16 bits. On CI and newer they could potentially be
3505 // real pointers, so we can't guarantee their size.
3506 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3507 DAG.getValueType(MVT::i16));
3508 }
3509
3510 InVals.push_back(NewArg);
3511 continue;
3512 }
3513 if (!IsEntryFunc && VA.isMemLoc()) {
3514 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3515 InVals.push_back(Val);
3516 if (!Arg.Flags.isByVal())
3517 Chains.push_back(Val.getValue(1));
3518 continue;
3519 }
3520
3521 assert(VA.isRegLoc() && "Parameter must be in a register!");
3522
3523 Register Reg = VA.getLocReg();
3524 const TargetRegisterClass *RC = nullptr;
3525 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3526 RC = &AMDGPU::VGPR_32RegClass;
3527 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3528 RC = &AMDGPU::SGPR_32RegClass;
3529 else
3530 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3531
3532 Reg = MF.addLiveIn(Reg, RC);
3533 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3534
3535 if (Arg.Flags.isSRet()) {
3536 // The return object should be reasonably addressable.
3537
3538 // FIXME: This helps when the return is a real sret. If it is an
3539 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3540 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3541 unsigned NumBits =
3543 Val = DAG.getNode(
3544 ISD::AssertZext, DL, VT, Val,
3545 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3546 }
3547
3548 Val = convertABITypeToValueType(DAG, Val, VA, DL);
3549 InVals.push_back(Val);
3550 }
3551
3552 // Start adding system SGPRs.
3553 if (IsEntryFunc)
3554 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3555
3556 // DAG.getPass() returns nullptr when using new pass manager.
3557 // TODO: Use DAG.getMFAM() to access analysis result.
3558 if (DAG.getPass()) {
3559 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3560 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3561 }
3562
3563 unsigned StackArgSize = CCInfo.getStackSize();
3564 Info->setBytesInStackArgArea(StackArgSize);
3565
3566 return Chains.empty() ? Chain
3567 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3568}
3569
3570// TODO: If return values can't fit in registers, we should return as many as
3571 // possible in registers before passing the rest on the stack.
3573 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3574 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3575 const Type *RetTy) const {
3576 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3577 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3578 // for shaders. Vector types should be explicitly handled by CC.
3579 if (AMDGPU::isEntryFunctionCC(CallConv))
3580 return true;
3581
3583 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3584 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3585 return false;
3586
3587 // We must use the stack if return would require unavailable registers.
3588 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3589 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3590 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3591 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3592 return false;
3593
3594 return true;
3595}
3596
3597SDValue
3599 bool isVarArg,
3601 const SmallVectorImpl<SDValue> &OutVals,
3602 const SDLoc &DL, SelectionDAG &DAG) const {
3606
3607 if (AMDGPU::isKernel(CallConv)) {
3608 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3609 OutVals, DL, DAG);
3610 }
3611
3612 bool IsShader = AMDGPU::isShader(CallConv);
3613
3614 Info->setIfReturnsVoid(Outs.empty());
3615 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3616
3617 // CCValAssign - represent the assignment of the return value to a location.
3619
3620 // CCState - Info about the registers and stack slots.
3621 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3622 *DAG.getContext());
3623
3624 // Analyze outgoing return values.
3625 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3626
3627 SDValue Glue;
3629 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3630
3631 SDValue ReadFirstLane =
3632 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3633 // Copy the result values into the output registers.
3634 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3635 ++I, ++RealRVLocIdx) {
3636 CCValAssign &VA = RVLocs[I];
3637 assert(VA.isRegLoc() && "Can only return in registers!");
3638 // TODO: Partially return in registers if return values don't fit.
3639 SDValue Arg = OutVals[RealRVLocIdx];
3640
3641 // Copied from other backends.
3642 switch (VA.getLocInfo()) {
3643 case CCValAssign::Full:
3644 break;
3645 case CCValAssign::BCvt:
3646 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3647 break;
3648 case CCValAssign::SExt:
3649 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3650 break;
3651 case CCValAssign::ZExt:
3652 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3653 break;
3654 case CCValAssign::AExt:
3655 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3656 break;
3657 default:
3658 llvm_unreachable("Unknown loc info!");
3659 }
3660 if (TRI->isSGPRPhysReg(VA.getLocReg()))
3662 ReadFirstLane, Arg);
3663 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3664 Glue = Chain.getValue(1);
3665 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3666 }
3667
3668 // FIXME: Does sret work properly?
3669 if (!Info->isEntryFunction()) {
3670 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3671 const MCPhysReg *I =
3672 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3673 if (I) {
3674 for (; *I; ++I) {
3675 if (AMDGPU::SReg_64RegClass.contains(*I))
3676 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3677 else if (AMDGPU::SReg_32RegClass.contains(*I))
3678 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3679 else
3680 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3681 }
3682 }
3683 }
3684
3685 // Update chain and glue.
3686 RetOps[0] = Chain;
3687 if (Glue.getNode())
3688 RetOps.push_back(Glue);
3689
3690 unsigned Opc = AMDGPUISD::ENDPGM;
3691 if (!IsWaveEnd)
3692 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3693 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3695 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3696}
3697
3699 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3700 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3701 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3702 SDValue ThisVal) const {
3703 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3704
3705 // Assign locations to each value returned by this call.
3707 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3708 *DAG.getContext());
3709 CCInfo.AnalyzeCallResult(Ins, RetCC);
3710
3711 // Copy all of the result registers out of their specified physreg.
3712 for (CCValAssign VA : RVLocs) {
3713 SDValue Val;
3714
3715 if (VA.isRegLoc()) {
3716 Val =
3717 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3718 Chain = Val.getValue(1);
3719 InGlue = Val.getValue(2);
3720 } else if (VA.isMemLoc()) {
3721 report_fatal_error("TODO: return values in memory");
3722 } else
3723 llvm_unreachable("unknown argument location type");
3724
3725 switch (VA.getLocInfo()) {
3726 case CCValAssign::Full:
3727 break;
3728 case CCValAssign::BCvt:
3729 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3730 break;
3731 case CCValAssign::ZExt:
3732 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3733 DAG.getValueType(VA.getValVT()));
3734 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3735 break;
3736 case CCValAssign::SExt:
3737 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3738 DAG.getValueType(VA.getValVT()));
3739 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3740 break;
3741 case CCValAssign::AExt:
3742 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3743 break;
3744 default:
3745 llvm_unreachable("Unknown loc info!");
3746 }
3747
3748 InVals.push_back(Val);
3749 }
3750
3751 return Chain;
3752}
3753
3754// Add code to pass special inputs required depending on used features separate
3755// from the explicit user arguments present in the IR.
3757 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3758 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3759 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3760 // If we don't have a call site, this was a call inserted by
3761 // legalization. These can never use special inputs.
3762 if (!CLI.CB)
3763 return;
3764
3765 SelectionDAG &DAG = CLI.DAG;
3766 const SDLoc &DL = CLI.DL;
3767 const Function &F = DAG.getMachineFunction().getFunction();
3768
3769 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3770 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3771
3772 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3774 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3775 // DAG.getPass() returns nullptr when using new pass manager.
3776 // TODO: Use DAG.getMFAM() to access analysis result.
3777 if (DAG.getPass()) {
3778 auto &ArgUsageInfo =
3780 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3781 }
3782 }
3783
3784 // TODO: Unify with private memory register handling. This is complicated by
3785 // the fact that at least in kernels, the input argument is not necessarily
3786 // in the same location as the input.
3787 // clang-format off
3788 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3790 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3791 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr"},
3792 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3793 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3794 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3795 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y"},
3796 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z"},
3797 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, "amdgpu-no-lds-kernel-id"},
3798 };
3799 // clang-format on
3800
3801 for (auto [InputID, Attr] : ImplicitAttrs) {
3802 // If the callee does not use the attribute value, skip copying the value.
3803 if (CLI.CB->hasFnAttr(Attr))
3804 continue;
3805
3806 const auto [OutgoingArg, ArgRC, ArgTy] =
3807 CalleeArgInfo->getPreloadedValue(InputID);
3808 if (!OutgoingArg)
3809 continue;
3810
3811 const auto [IncomingArg, IncomingArgRC, Ty] =
3812 CallerArgInfo.getPreloadedValue(InputID);
3813 assert(IncomingArgRC == ArgRC);
3814
3815 // All special arguments are ints for now.
3816 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3817 SDValue InputReg;
3818
3819 if (IncomingArg) {
3820 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3821 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3822 // The implicit arg ptr is special because it doesn't have a corresponding
3823 // input for kernels, and is computed from the kernarg segment pointer.
3824 InputReg = getImplicitArgPtr(DAG, DL);
3825 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3826 std::optional<uint32_t> Id =
3828 if (Id.has_value()) {
3829 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3830 } else {
3831 InputReg = DAG.getPOISON(ArgVT);
3832 }
3833 } else {
3834 // We may have proven the input wasn't needed, although the ABI still
3835 // requires it. We just need to allocate the register appropriately.
3836 InputReg = DAG.getPOISON(ArgVT);
3837 }
3838
3839 if (OutgoingArg->isRegister()) {
3840 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3841 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3842 report_fatal_error("failed to allocate implicit input argument");
3843 } else {
3844 unsigned SpecialArgOffset =
3845 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3846 SDValue ArgStore =
3847 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3848 MemOpChains.push_back(ArgStore);
3849 }
3850 }
3851
3852 // Pack workitem IDs into a single register, or pass them as-is if already
3853 // packed.
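// The packed layout places X in bits [9:0], Y in bits [19:10] and Z in
// bits [29:20], hence the shifts by 10 and 20 below.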
3854
3855 auto [OutgoingArg, ArgRC, Ty] =
3857 if (!OutgoingArg)
3858 std::tie(OutgoingArg, ArgRC, Ty) =
3860 if (!OutgoingArg)
3861 std::tie(OutgoingArg, ArgRC, Ty) =
3863 if (!OutgoingArg)
3864 return;
3865
3866 const ArgDescriptor *IncomingArgX = std::get<0>(
3868 const ArgDescriptor *IncomingArgY = std::get<0>(
3870 const ArgDescriptor *IncomingArgZ = std::get<0>(
3872
3873 SDValue InputReg;
3874 SDLoc SL;
3875
3876 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3877 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3878 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3879
3880 // If incoming ids are not packed we need to pack them.
3881 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3882 NeedWorkItemIDX) {
3883 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3884 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3885 } else {
3886 InputReg = DAG.getConstant(0, DL, MVT::i32);
3887 }
3888 }
3889
3890 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3891 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3892 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3893 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3894 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3895 InputReg = InputReg.getNode()
3896 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3897 : Y;
3898 }
3899
3900 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3901 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3902 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3903 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3904 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3905 InputReg = InputReg.getNode()
3906 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3907 : Z;
3908 }
3909
3910 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3911 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3912 // We're in a situation where the outgoing function requires the workitem
3913 // ID, but the calling function does not have it (e.g. a graphics function
3914 // calling a C calling convention function). This is illegal, but we need
3915 // to produce something.
3916 InputReg = DAG.getPOISON(MVT::i32);
3917 } else {
3918 // Workitem ids are already packed; any present incoming argument
3919 // will carry all required fields.
3920 ArgDescriptor IncomingArg =
3921 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3922 : IncomingArgY ? *IncomingArgY
3923 : *IncomingArgZ,
3924 ~0u);
3925 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3926 }
3927 }
3928
3929 if (OutgoingArg->isRegister()) {
3930 if (InputReg)
3931 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3932
3933 CCInfo.AllocateReg(OutgoingArg->getRegister());
3934 } else {
3935 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3936 if (InputReg) {
3937 SDValue ArgStore =
3938 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3939 MemOpChains.push_back(ArgStore);
3940 }
3941 }
3942}
3943
3945 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3947 const SmallVectorImpl<SDValue> &OutVals,
3948 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3949 if (AMDGPU::isChainCC(CalleeCC))
3950 return true;
3951
3952 if (!AMDGPU::mayTailCallThisCC(CalleeCC))
3953 return false;
3954
3955 // For a divergent call target, we need to do a waterfall loop over the
3956 // possible callees, which precludes us from using a simple jump.
3957 if (Callee->isDivergent())
3958 return false;
3959
3961 const Function &CallerF = MF.getFunction();
3962 CallingConv::ID CallerCC = CallerF.getCallingConv();
3964 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3965
3966 // Kernels aren't callable, and don't have a live-in return address, so it
3967 // doesn't make sense to do a tail call with entry functions.
3968 if (!CallerPreserved)
3969 return false;
3970
3971 bool CCMatch = CallerCC == CalleeCC;
3972
3974 if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
3975 return true;
3976 return false;
3977 }
3978
3979 // TODO: Can we handle var args?
3980 if (IsVarArg)
3981 return false;
3982
3983 for (const Argument &Arg : CallerF.args()) {
3984 if (Arg.hasByValAttr())
3985 return false;
3986 }
3987
3988 LLVMContext &Ctx = *DAG.getContext();
3989
3990 // Check that the call results are passed in the same way.
3991 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3992 CCAssignFnForCall(CalleeCC, IsVarArg),
3993 CCAssignFnForCall(CallerCC, IsVarArg)))
3994 return false;
3995
3996 // The callee has to preserve all registers the caller needs to preserve.
3997 if (!CCMatch) {
3998 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3999 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4000 return false;
4001 }
4002
4003 // Nothing more to check if the callee is taking no arguments.
4004 if (Outs.empty())
4005 return true;
4006
4008 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4009
4010 // FIXME: We are not allocating special input registers, so we will be
4011 // deciding based on incorrect register assignments.
4012 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
4013
4014 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4015 // If the stack arguments for this call do not fit into our own save area then
4016 // the call cannot be made a tail call.
4017 // TODO: Is this really necessary?
4018 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
4019 return false;
4020
4021 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
4022 // FIXME: What about inreg arguments that end up passed in memory?
4023 if (!CCVA.isRegLoc())
4024 continue;
4025
4026 // If we are passing an argument in an SGPR, and the value is divergent,
4027 // this call requires a waterfall loop.
4028 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4029 LLVM_DEBUG(
4030 dbgs() << "Cannot tail call due to divergent outgoing argument in "
4031 << printReg(CCVA.getLocReg(), TRI) << '\n');
4032 return false;
4033 }
4034 }
4035
4036 const MachineRegisterInfo &MRI = MF.getRegInfo();
4037 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
4038}
4039
4041 if (!CI->isTailCall())
4042 return false;
4043
4044 const Function *ParentFn = CI->getParent()->getParent();
4046 return false;
4047 return true;
4048}
4049
4050namespace {
4051 // Chain calls have special arguments that we need to handle. These
4052 // tag along at the end of the argument list(s), after the SGPR and VGPR
4053 // arguments (indices 0 and 1 respectively).
4054enum ChainCallArgIdx {
4055 Exec = 2,
4056 Flags,
4057 NumVGPRs,
4058 FallbackExec,
4059 FallbackCallee
4060};
4061} // anonymous namespace
4062
4063// The wave scratch offset register is used as the global base pointer.
4065 SmallVectorImpl<SDValue> &InVals) const {
4066 CallingConv::ID CallConv = CLI.CallConv;
4067 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
4068
4069 SelectionDAG &DAG = CLI.DAG;
4070
4071 const SDLoc &DL = CLI.DL;
4072 SDValue Chain = CLI.Chain;
4073 SDValue Callee = CLI.Callee;
4074
4075 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
4076 bool UsesDynamicVGPRs = false;
4077 if (IsChainCallConv) {
4078 // The last arguments should be the value that we need to put in EXEC,
4079 // followed by the flags and any other arguments with special meanings.
4080 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
4081 // we don't treat them like the "real" arguments.
4082 auto RequestedExecIt =
4083 llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
4084 return Arg.OrigArgIndex == 2;
4085 });
4086 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
4087
4088 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
4089 CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
4090 CLI.OutVals.end());
4091 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
4092
4093 assert(CLI.Outs.back().OrigArgIndex < 2 &&
4094 "Haven't popped all the special args");
4095
4096 TargetLowering::ArgListEntry RequestedExecArg =
4097 CLI.Args[ChainCallArgIdx::Exec];
4098 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
4099 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
4100
4101 // Convert constants into TargetConstants, so they become immediate operands
4102 // instead of being selected into S_MOV.
4103 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
4104 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
4105 ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
4106 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
4107 } else
4108 ChainCallSpecialArgs.push_back(Arg.Node);
4109 };
4110
4111 PushNodeOrTargetConstant(RequestedExecArg);
4112
4113 // Process any other special arguments depending on the value of the flags.
4114 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
4115
4116 const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
4117 if (FlagsValue.isZero()) {
4118 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
4119 return lowerUnhandledCall(CLI, InVals,
4120 "no additional args allowed if flags == 0");
4121 } else if (FlagsValue.isOneBitSet(0)) {
4122 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4123 return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
4124 }
4125
4126 if (!Subtarget->isWave32()) {
4127 return lowerUnhandledCall(
4128 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
4129 }
4130
4131 UsesDynamicVGPRs = true;
4132 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
4133 CLI.Args.end(), PushNodeOrTargetConstant);
4134 }
4135 }
4136
4138 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
4140 bool &IsTailCall = CLI.IsTailCall;
4141 bool IsVarArg = CLI.IsVarArg;
4142 bool IsSibCall = false;
4144
4145 if (Callee.isUndef() || isNullConstant(Callee)) {
4146 if (!CLI.IsTailCall) {
4147 for (ISD::InputArg &Arg : CLI.Ins)
4148 InVals.push_back(DAG.getPOISON(Arg.VT));
4149 }
4150
4151 return Chain;
4152 }
4153
4154 if (IsVarArg) {
4155 return lowerUnhandledCall(CLI, InVals,
4156 "unsupported call to variadic function ");
4157 }
4158
4159 if (!CLI.CB)
4160 return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
4161
4162 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
4163 return lowerUnhandledCall(CLI, InVals,
4164 "unsupported required tail call to function ");
4165 }
4166
4167 if (IsTailCall) {
4168 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
4169 Outs, OutVals, Ins, DAG);
4170 if (!IsTailCall &&
4171 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
4172 report_fatal_error("failed to perform tail call elimination on a call "
4173 "site marked musttail or on llvm.amdgcn.cs.chain");
4174 }
4175
4176 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4177
4178 // A sibling call is one where we're under the usual C ABI and not planning
4179 // to change that but can still do a tail call:
4180 if (!TailCallOpt && IsTailCall)
4181 IsSibCall = true;
4182
4183 if (IsTailCall)
4184 ++NumTailCalls;
4185 }
4186
4189 SmallVector<SDValue, 8> MemOpChains;
4190
4191 // Analyze operands of the call, assigning locations to each operand.
4193 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4194 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
4195
4196 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
4198 // With a fixed ABI, allocate fixed registers before user arguments.
4199 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
4200 }
4201
4202 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
4203
4204 // Get a count of how many bytes are to be pushed on the stack.
4205 unsigned NumBytes = CCInfo.getStackSize();
4206
4207 if (IsSibCall) {
4208 // Since we're not changing the ABI to make this a tail call, the memory
4209 // operands are already available in the caller's incoming argument space.
4210 NumBytes = 0;
4211 }
4212
4213 // FPDiff is the byte offset of the call's argument area from the callee's.
4214 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4215 // by this amount for a tail call. In a sibling call it must be 0 because the
4216 // caller will deallocate the entire stack and the callee still expects its
4217 // arguments to begin at SP+0. Completely unused for non-tail calls.
4218 int32_t FPDiff = 0;
4219 MachineFrameInfo &MFI = MF.getFrameInfo();
4220 auto *TRI = Subtarget->getRegisterInfo();
4221
4222 // Adjust the stack pointer for the new arguments...
4223 // These operations are automatically eliminated by the prolog/epilog pass
4224 if (!IsSibCall)
4225 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
4226
4227 if (!IsSibCall || IsChainCallConv) {
4228 if (!Subtarget->enableFlatScratch()) {
4229 SmallVector<SDValue, 4> CopyFromChains;
4230
4231 // In the HSA case, this should be an identity copy.
4232 SDValue ScratchRSrcReg =
4233 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
4234 RegsToPass.emplace_back(IsChainCallConv
4235 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4236 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4237 ScratchRSrcReg);
4238 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
4239 Chain = DAG.getTokenFactor(DL, CopyFromChains);
4240 }
4241 }
4242
4243 const unsigned NumSpecialInputs = RegsToPass.size();
4244
4245 MVT PtrVT = MVT::i32;
4246
4247 // Walk the register/memloc assignments, inserting copies/loads.
4248 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4249 CCValAssign &VA = ArgLocs[i];
4250 SDValue Arg = OutVals[i];
4251
4252 // Promote the value if needed.
4253 switch (VA.getLocInfo()) {
4254 case CCValAssign::Full:
4255 break;
4256 case CCValAssign::BCvt:
4257 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4258 break;
4259 case CCValAssign::ZExt:
4260 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4261 break;
4262 case CCValAssign::SExt:
4263 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
4264 break;
4265 case CCValAssign::AExt:
4266 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
4267 break;
4268 case CCValAssign::FPExt:
4269 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
4270 break;
4271 default:
4272 llvm_unreachable("Unknown loc info!");
4273 }
4274
4275 if (VA.isRegLoc()) {
4276 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
4277 } else {
4278 assert(VA.isMemLoc());
4279
4280 SDValue DstAddr;
4281 MachinePointerInfo DstInfo;
4282
4283 unsigned LocMemOffset = VA.getLocMemOffset();
4284 int32_t Offset = LocMemOffset;
4285
4286 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
4287 MaybeAlign Alignment;
4288
4289 if (IsTailCall) {
4290 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4291 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4292 : VA.getValVT().getStoreSize();
4293
4294 // FIXME: We can have better than the minimum byval required alignment.
4295 Alignment =
4296 Flags.isByVal()
4297 ? Flags.getNonZeroByValAlign()
4298 : commonAlignment(Subtarget->getStackAlignment(), Offset);
4299
4300 Offset = Offset + FPDiff;
4301 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
4302
4303 DstAddr = DAG.getFrameIndex(FI, PtrVT);
4304 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4305
4306 // Make sure any stack arguments overlapping with where we're storing
4307 // are loaded before this eventual operation. Otherwise they'll be
4308 // clobbered.
4309
4310 // FIXME: Why is this really necessary? This seems to just result in a
4311 // lot of code to copy the stack and write them back to the same
4312 // locations, which are supposed to be immutable?
4313 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
4314 } else {
4315 // Stores to the argument stack area are relative to the stack pointer.
4316 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
4317 MVT::i32);
4318 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
4319 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
4320 Alignment =
4321 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
4322 }
4323
4324 if (Outs[i].Flags.isByVal()) {
4325 SDValue SizeNode =
4326 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4327 SDValue Cpy =
4328 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
4329 Outs[i].Flags.getNonZeroByValAlign(),
4330 /*isVol = */ false, /*AlwaysInline = */ true,
4331 /*CI=*/nullptr, std::nullopt, DstInfo,
4333
4334 MemOpChains.push_back(Cpy);
4335 } else {
4336 SDValue Store =
4337 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4338 MemOpChains.push_back(Store);
4339 }
4340 }
4341 }
4342
4343 if (!MemOpChains.empty())
4344 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4345
4346 SDValue ReadFirstLaneID =
4347 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4348
4349 SDValue TokenGlue;
4350 if (CLI.ConvergenceControlToken) {
4351 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4353 }
4354
4355 // Build a sequence of copy-to-reg nodes chained together with token chain
4356 // and flag operands which copy the outgoing args into the appropriate regs.
4357 SDValue InGlue;
4358
4359 unsigned ArgIdx = 0;
4360 for (auto [Reg, Val] : RegsToPass) {
4361 if (ArgIdx++ >= NumSpecialInputs &&
4362 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4363 // For chain calls, the inreg arguments are required to be
4364 // uniform. Speculatively insert a readfirstlane in case we cannot prove
4365 // they are uniform.
4366 //
4367 // For other calls, if an inreg argument is known to be uniform,
4368 // speculatively insert a readfirstlane in case it is in a VGPR.
4369 //
4370 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4371 // value, so let that continue to produce invalid code.
4372
4373 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4374 if (TokenGlue)
4375 ReadfirstlaneArgs.push_back(TokenGlue);
4377 ReadfirstlaneArgs);
4378 }
4379
4380 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
4381 InGlue = Chain.getValue(1);
4382 }
4383
4384 // We don't usually want to end the call-sequence here because we would tidy
4385 // the frame up *after* the call; however, in the ABI-changing tail-call case
4386 // we've carefully laid out the parameters so that when sp is reset they'll be
4387 // in the correct location.
4388 if (IsTailCall && !IsSibCall) {
4389 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
4390 InGlue = Chain.getValue(1);
4391 }
4392
4393 std::vector<SDValue> Ops({Chain});
4394
4395 // Add a redundant copy of the callee global which will not be legalized, as
4396 // we need direct access to the callee later.
4398 const GlobalValue *GV = GSD->getGlobal();
4399 Ops.push_back(Callee);
4400 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
4401 } else {
4402 if (IsTailCall) {
4403 // isEligibleForTailCallOptimization considered whether the call target is
4404 // divergent, but we may still end up with a uniform value in a VGPR.
4405 // Insert a readfirstlane just in case.
4406 SDValue ReadFirstLaneID =
4407 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4408
4409 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4410 if (TokenGlue)
4411 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
4412 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
4413 ReadfirstlaneArgs);
4414 }
4415
4416 Ops.push_back(Callee);
4417 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
4418 }
4419
4420 if (IsTailCall) {
4421 // Each tail call may have to adjust the stack by a different amount, so
4422 // this information must travel along with the operation for eventual
4423 // consumption by emitEpilogue.
4424 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4425 }
4426
4427 if (IsChainCallConv)
4428 llvm::append_range(Ops, ChainCallSpecialArgs);
4429
4430 // Add argument registers to the end of the list so that they are known live
4431 // into the call.
4432 for (auto &[Reg, Val] : RegsToPass)
4433 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
4434
4435 // Add a register mask operand representing the call-preserved registers.
4436 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4437 assert(Mask && "Missing call preserved mask for calling convention");
4438 Ops.push_back(DAG.getRegisterMask(Mask));
4439
4440 if (SDValue Token = CLI.ConvergenceControlToken) {
4442 GlueOps.push_back(Token);
4443 if (InGlue)
4444 GlueOps.push_back(InGlue);
4445
4446 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
4447 MVT::Glue, GlueOps),
4448 0);
4449 }
4450
4451 if (InGlue)
4452 Ops.push_back(InGlue);
4453
4454 // If we're doing a tail call, use a TC_RETURN here rather than an
4455 // actual call instruction.
4456 if (IsTailCall) {
4457 MFI.setHasTailCall();
4458 unsigned OPC = AMDGPUISD::TC_RETURN;
4459 switch (CallConv) {
4462 break;
4465 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4467 break;
4468 }
4469
4470 // If the caller is a whole wave function, we need to use a special opcode
4471 // so we can patch up EXEC.
4472 if (Info->isWholeWaveFunction())
4474
4475 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4476 }
4477
4478 // Returns a chain and a flag for retval copy to use.
4479 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4480 Chain = Call.getValue(0);
4481 InGlue = Call.getValue(1);
4482
4483 uint64_t CalleePopBytes = NumBytes;
4484 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4485 if (!Ins.empty())
4486 InGlue = Chain.getValue(1);
4487
4488 // Handle result values, copying them out of physregs into vregs that we
4489 // return.
4490 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4491 InVals, /*IsThisReturn=*/false, SDValue());
4492}
4493
4494// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4495// except for:
4496 // 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4497 // 2. Size scaling, where scale = wave-reduction(alloca-size) * wave-size
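// E.g. on a wave64 target a constant per-lane alloca of 32 bytes bumps the
// wave-level SP by 32 << 6 = 2048 bytes; for a divergent size the wave-wide
// umax is taken first, so every lane reserves the worst-case amount and the
// new SP stays uniform after the readfirstlane below.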
4499 SelectionDAG &DAG) const {
4500 const MachineFunction &MF = DAG.getMachineFunction();
4502
4503 SDLoc dl(Op);
4504 EVT VT = Op.getValueType();
4505 SDValue Chain = Op.getOperand(0);
4506 Register SPReg = Info->getStackPtrOffsetReg();
4507
4508 // Chain the dynamic stack allocation so that it doesn't modify the stack
4509 // pointer when other instructions are using the stack.
4510 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4511
4512 SDValue Size = Op.getOperand(1);
4513 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4514 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4515
4516 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4518 "Stack grows upwards for AMDGPU");
4519
4520 Chain = BaseAddr.getValue(1);
4521 Align StackAlign = TFL->getStackAlign();
4522 if (Alignment > StackAlign) {
4523 uint64_t ScaledAlignment = Alignment.value()
4524 << Subtarget->getWavefrontSizeLog2();
4525 uint64_t StackAlignMask = ScaledAlignment - 1;
4526 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4527 DAG.getConstant(StackAlignMask, dl, VT));
4528 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4529 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4530 }
4531
4532 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4533 SDValue NewSP;
4535 // For a constant-sized alloca, scale the alloca size by the wave size
4536 SDValue ScaledSize = DAG.getNode(
4537 ISD::SHL, dl, VT, Size,
4538 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4539 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4540 } else {
4541 // For a dynamically sized alloca, perform a wave-wide reduction to get the
4542 // maximum of the (divergent) alloca size and then scale it by the wave size
4543 SDValue WaveReduction =
4544 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4545 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4546 Size, DAG.getConstant(0, dl, MVT::i32));
4547 SDValue ScaledSize = DAG.getNode(
4548 ISD::SHL, dl, VT, Size,
4549 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4550 NewSP =
4551 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4552 SDValue ReadFirstLaneID =
4553 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4554 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4555 NewSP);
4556 }
4557
4558 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4559 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4560
4561 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4562}
4563
4565 if (Op.getValueType() != MVT::i32)
4566 return Op; // Defer to cannot select error.
4567
4569 SDLoc SL(Op);
4570
4571 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4572
4573 // Convert from wave uniform to swizzled vector address. This should protect
4574 // from any edge cases where the stacksave result isn't directly used with
4575 // stackrestore.
4576 SDValue VectorAddress =
4577 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4578 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4579}
4580
4582 SelectionDAG &DAG) const {
4583 SDLoc SL(Op);
4584 assert(Op.getValueType() == MVT::i32);
4585
4586 uint32_t BothRoundHwReg =
4588 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4589
4590 SDValue IntrinID =
4591 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4592 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4593 Op.getOperand(0), IntrinID, GetRoundBothImm);
4594
4595 // There are two rounding modes, one for f32 and one for f64/f16. We only
4596 // report in the standard value range if both are the same.
4597 //
4598 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4599 // ties away from zero is not supported, and the other values are rotated by
4600 // 1.
4601 //
4602 // If the two rounding modes are not the same, report a target defined value.
4603
4604 // Mode register rounding mode fields:
4605 //
4606 // [1:0] Single-precision round mode.
4607 // [3:2] Double/Half-precision round mode.
4608 //
4609 // 0 = nearest even, 1 = +infinity, 2 = -infinity, 3 = toward zero.
4610 //
4611 //                 Hardware   Spec
4612 //   Toward-0         3         0
4613 //   Nearest Even     0         1
4614 //   +Inf             1         2
4615 //   -Inf             2         3
4616 //   NearestAway0    N/A        4
4617 //
4618 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4619 // table we can index by the raw hardware mode.
4620 //
4621 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
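// For example, a raw mode of 0b0000 (both fields nearest-even) selects
// nibble 0 of the table, which holds the standard FLT_ROUNDS value 1, and
// 0b1111 (both toward-zero) yields 0; mixed modes select entries >= 4,
// which are then offset by 4 into the extended, target-defined range.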
4622
4623 SDValue BitTable =
4625
4626 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4627 SDValue RoundModeTimesNumBits =
4628 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4629
4630 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4631 // knew only one mode was demanded.
4632 SDValue TableValue =
4633 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4634 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4635
4636 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4637 SDValue TableEntry =
4638 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4639
4640 // There's a gap between the 4-bit encoded table entries and the actual enum
4641 // values, so offset if it's an extended value.
4642 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4643 SDValue IsStandardValue =
4644 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4645 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4646 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4647 TableEntry, EnumOffset);
4648
4649 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4650}
4651
4653 SelectionDAG &DAG) const {
4654 SDLoc SL(Op);
4655
4656 SDValue NewMode = Op.getOperand(1);
4657 assert(NewMode.getValueType() == MVT::i32);
4658
4659 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4660 // hardware MODE.fp_round values.
4661 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4662 uint32_t ClampedVal = std::min(
4663 static_cast<uint32_t>(ConstMode->getZExtValue()),
4665 NewMode = DAG.getConstant(
4666 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4667 } else {
4668 // If we know the input can only be one of the supported standard modes in
4669 // the range 0-3, we can use a simplified mapping to hardware values.
4670 KnownBits KB = DAG.computeKnownBits(NewMode);
4671 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4672 // The supported standard values are 0-3. The extended values start at 8. We
4673 // need to offset by 4 if the value is in the extended range.
4674
4675 if (UseReducedTable) {
4676 // Truncate to the low 32-bits.
4677 SDValue BitTable = DAG.getConstant(
4678 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4679
4680 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4681 SDValue RoundModeTimesNumBits =
4682 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4683
4684 NewMode =
4685 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4686
4687 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4688 // the table extracted bits into inline immediates.
4689 } else {
4690 // table_index = umin(value, value - 4)
4691 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
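// E.g. the standard value 1 (nearest-even) gives umin(1, 0xFFFFFFFD) = 1,
// while the first extended value 8 gives umin(8, 4) = 4, so the unused
// range 4-7 is squeezed out before indexing the table.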
4692 SDValue BitTable =
4694
4695 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4696 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4697 SDValue IndexVal =
4698 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4699
4700 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4701 SDValue RoundModeTimesNumBits =
4702 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4703
4704 SDValue TableValue =
4705 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4706 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4707
4708 // No need to mask out the high bits since the setreg will ignore them
4709 // anyway.
4710 NewMode = TruncTable;
4711 }
4712
4713 // Insert a readfirstlane in case the value is in a VGPR. We could do this
4714 // earlier and keep more operations scalar, but that interferes with
4715 // combining the source.
4716 SDValue ReadFirstLaneID =
4717 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4718 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4719 ReadFirstLaneID, NewMode);
4720 }
4721
4722 // N.B. The setreg will be later folded into s_round_mode on supported
4723 // targets.
4724 SDValue IntrinID =
4725 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4726 uint32_t BothRoundHwReg =
4728 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4729
4730 SDValue SetReg =
4731 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4732 IntrinID, RoundBothImm, NewMode);
4733
4734 return SetReg;
4735}
4736
4738 if (Op->isDivergent() &&
4739 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4740 // Cannot do I$ prefetch with divergent pointer.
4741 return SDValue();
4742
4743 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4747 break;
4749 if (Subtarget->hasSafeSmemPrefetch())
4750 break;
4751 [[fallthrough]];
4752 default:
4753 return SDValue();
4754 }
4755
4756 // I$ prefetch
4757 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4758 return SDValue();
4759
4760 return Op;
4761}
4762
4763// Work around DAG legality rules only based on the result type.
4765 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4766 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4767 EVT SrcVT = Src.getValueType();
4768
4769 if (SrcVT.getScalarType() != MVT::bf16)
4770 return Op;
4771
4772 SDLoc SL(Op);
4773 SDValue BitCast =
4774 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4775
4776 EVT DstVT = Op.getValueType();
4777 if (IsStrict)
4778 llvm_unreachable("Need STRICT_BF16_TO_FP");
4779
4780 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4781}
4782
4784 SDLoc SL(Op);
4785 if (Op.getValueType() != MVT::i64)
4786 return Op;
4787
4788 uint32_t ModeHwReg =
4790 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4791 uint32_t TrapHwReg =
4793 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4794
4795 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4796 SDValue IntrinID =
4797 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4798 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4799 Op.getOperand(0), IntrinID, ModeHwRegImm);
4800 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4801 Op.getOperand(0), IntrinID, TrapHwRegImm);
4802 SDValue TokenReg =
4803 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4804 GetTrapReg.getValue(1));
4805
4806 SDValue CvtPtr =
4807 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4808 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4809
4810 return DAG.getMergeValues({Result, TokenReg}, SL);
4811}
4812
4814 SDLoc SL(Op);
4815 if (Op.getOperand(1).getValueType() != MVT::i64)
4816 return Op;
4817
4818 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4819 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4820 DAG.getConstant(0, SL, MVT::i32));
4821 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4822 DAG.getConstant(1, SL, MVT::i32));
4823
4824 SDValue ReadFirstLaneID =
4825 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4826 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4827 ReadFirstLaneID, NewModeReg);
4828 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4829 ReadFirstLaneID, NewTrapReg);
4830
4831 unsigned ModeHwReg =
4833 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4834 unsigned TrapHwReg =
4836 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4837
4838 SDValue IntrinID =
4839 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4840 SDValue SetModeReg =
4841 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4842 IntrinID, ModeHwRegImm, NewModeReg);
4843 SDValue SetTrapReg =
4844 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4845 IntrinID, TrapHwRegImm, NewTrapReg);
4846 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4847}
4848
4850 const MachineFunction &MF) const {
4851 const Function &Fn = MF.getFunction();
4852
4854 .Case("m0", AMDGPU::M0)
4855 .Case("exec", AMDGPU::EXEC)
4856 .Case("exec_lo", AMDGPU::EXEC_LO)
4857 .Case("exec_hi", AMDGPU::EXEC_HI)
4858 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4859 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4860 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4861 .Default(Register());
4862 if (!Reg)
4863 return Reg;
4864
4865 if (!Subtarget->hasFlatScrRegister() &&
4866 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4867 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
4868 "\" for subtarget."));
4869 }
4870
4871 switch (Reg) {
4872 case AMDGPU::M0:
4873 case AMDGPU::EXEC_LO:
4874 case AMDGPU::EXEC_HI:
4875 case AMDGPU::FLAT_SCR_LO:
4876 case AMDGPU::FLAT_SCR_HI:
4877 if (VT.getSizeInBits() == 32)
4878 return Reg;
4879 break;
4880 case AMDGPU::EXEC:
4881 case AMDGPU::FLAT_SCR:
4882 if (VT.getSizeInBits() == 64)
4883 return Reg;
4884 break;
4885 default:
4886 llvm_unreachable("missing register type checking");
4887 }
4888
4890 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4891}
4892
4893// If kill is not the last instruction, split the block so kill is always a
4894// proper terminator.
4897 MachineBasicBlock *BB) const {
4898 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
4900 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4901 return SplitBB;
4902}
4903
4904 // Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4905// \p MI will be the only instruction in the loop body block. Otherwise, it will
4906// be the first instruction in the remainder block.
4907//
4908/// \returns { LoopBody, Remainder }
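// The resulting control flow is roughly:
//
//   MBB --> LoopBB --> RemainderBB --> (MBB's original successors)
//            ^   |
//            +---+        (loop back edge)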
4909static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4911 MachineFunction *MF = MBB.getParent();
4913
4914 // To insert the loop we need to split the block. Move everything after this
4915 // point to a new block, and insert a new empty block between the two.
4917 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4919 ++MBBI;
4920
4921 MF->insert(MBBI, LoopBB);
4922 MF->insert(MBBI, RemainderBB);
4923
4924 LoopBB->addSuccessor(LoopBB);
4925 LoopBB->addSuccessor(RemainderBB);
4926
4927 // Move the rest of the block into a new block.
4928 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4929
4930 if (InstInLoop) {
4931 auto Next = std::next(I);
4932
4933 // Move instruction to loop body.
4934 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4935
4936 // Move the rest of the block.
4937 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4938 } else {
4939 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4940 }
4941
4942 MBB.addSuccessor(LoopBB);
4943
4944 return std::pair(LoopBB, RemainderBB);
4945}
4946
4947/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4949 MachineBasicBlock *MBB = MI.getParent();
4951 auto I = MI.getIterator();
4952 auto E = std::next(I);
4953
4954 // clang-format off
4955 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4956 .addImm(0);
4957 // clang-format on
4958
4959 MIBundleBuilder Bundler(*MBB, I, E);
4960 finalizeBundle(*MBB, Bundler.begin());
4961}
4962
4965 MachineBasicBlock *BB) const {
4966 const DebugLoc &DL = MI.getDebugLoc();
4967
4969
4971
4972 // Apparently kill flags are only valid if the def is in the same block?
4973 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4974 Src->setIsKill(false);
4975
4976 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
4977
4978 MachineBasicBlock::iterator I = LoopBB->end();
4979
4980 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4982
4983 // Clear TRAP_STS.MEM_VIOL
4984 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4985 .addImm(0)
4986 .addImm(EncodedReg);
4987
4989
4990 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4991
4992 // Load and check TRAP_STS.MEM_VIOL
4993 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4994 .addImm(EncodedReg);
4995
4996 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4997 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4998 .addReg(Reg, RegState::Kill)
4999 .addImm(0);
5000 // clang-format off
5001 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5002 .addMBB(LoopBB);
5003 // clang-format on
5004
5005 return RemainderBB;
5006}
5007
5008// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
5009// wavefront. If the value is uniform and just happens to be in a VGPR, this
5010// will only do one iteration. In the worst case, this will loop 64 times.
5011//
5012// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
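//
// Roughly, the loop built below behaves like the following pseudocode sketch
// (the real code manipulates EXEC rather than looping over lanes explicitly):
//
//   do {
//     CurrentIdx = readfirstlane(Idx)      // pick one candidate index
//     Cond       = (Idx == CurrentIdx)     // lanes that share that index
//     Saved = EXEC; EXEC &= Cond           // run only those lanes
//     M0 (or SGPRIdxReg) = CurrentIdx + Offset
//     ... the caller inserts the indexed access here ...
//     EXEC = Saved & ~Cond                 // retire the lanes just handled
//   } while (EXEC != 0)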
5015 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
5016 const DebugLoc &DL, const MachineOperand &Idx,
5017 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
5018 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
5019 Register &SGPRIdxReg) {
5020
5021 MachineFunction *MF = OrigBB.getParent();
5022 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5023 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5025
5026 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5027 Register PhiExec = MRI.createVirtualRegister(BoolRC);
5028 Register NewExec = MRI.createVirtualRegister(BoolRC);
5029 Register CurrentIdxReg =
5030 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5031 Register CondReg = MRI.createVirtualRegister(BoolRC);
5032
5033 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
5034 .addReg(InitReg)
5035 .addMBB(&OrigBB)
5036 .addReg(ResultReg)
5037 .addMBB(&LoopBB);
5038
5039 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
5040 .addReg(InitSaveExecReg)
5041 .addMBB(&OrigBB)
5042 .addReg(NewExec)
5043 .addMBB(&LoopBB);
5044
5045 // Read the next variant <- also loop target.
5046 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5047 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
5048
5049 // Compare the just read M0 value to all possible Idx values.
5050 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5051 .addReg(CurrentIdxReg)
5052 .addReg(Idx.getReg(), 0, Idx.getSubReg());
5053
5054 // Update EXEC, save the original EXEC value to VCC.
5055 BuildMI(LoopBB, I, DL,
5056 TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
5057 : AMDGPU::S_AND_SAVEEXEC_B64),
5058 NewExec)
5059 .addReg(CondReg, RegState::Kill);
5060
5061 MRI.setSimpleHint(NewExec, CondReg);
5062
5063 if (UseGPRIdxMode) {
5064 if (Offset == 0) {
5065 SGPRIdxReg = CurrentIdxReg;
5066 } else {
5067 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5068 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5069 .addReg(CurrentIdxReg, RegState::Kill)
5070 .addImm(Offset);
5071 }
5072 } else {
5073 // Move the index into M0.
5074 if (Offset == 0) {
5075 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5076 .addReg(CurrentIdxReg, RegState::Kill);
5077 } else {
5078 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5079 .addReg(CurrentIdxReg, RegState::Kill)
5080 .addImm(Offset);
5081 }
5082 }
5083
5084 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
5085 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5086 MachineInstr *InsertPt =
5087 BuildMI(LoopBB, I, DL,
5088 TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
5089 : AMDGPU::S_XOR_B64_term),
5090 Exec)
5091 .addReg(Exec)
5092 .addReg(NewExec);
5093
5094 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
5095 // s_cbranch_scc0?
5096
5097 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
5098 // clang-format off
5099 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5100 .addMBB(&LoopBB);
5101 // clang-format on
5102
5103 return InsertPt->getIterator();
5104}
5105
5106// This has slightly sub-optimal regalloc when the source vector is killed by
5107// the read. The register allocator does not understand that the kill is
5108// per-workitem, so the vector is kept alive for the whole loop and we end up
5109// not reusing a subregister from it, using 1 more VGPR than necessary. This
5110// was avoided when this was expanded after register allocation.
5113 unsigned InitResultReg, unsigned PhiReg, int Offset,
5114 bool UseGPRIdxMode, Register &SGPRIdxReg) {
5115 MachineFunction *MF = MBB.getParent();
5116 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5117 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5119 const DebugLoc &DL = MI.getDebugLoc();
5121
5122 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
5123 Register DstReg = MI.getOperand(0).getReg();
5124 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
5125 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
5126 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5127 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5128
5129 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
5130
5131 // Save the EXEC mask
5132 // clang-format off
5133 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
5134 .addReg(Exec);
5135 // clang-format on
5136
5137 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
5138
5139 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5140
5141 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
5142 InitResultReg, DstReg, PhiReg, TmpExec,
5143 Offset, UseGPRIdxMode, SGPRIdxReg);
5144
5145 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
5147 ++MBBI;
5148 MF->insert(MBBI, LandingPad);
5149 LoopBB->removeSuccessor(RemainderBB);
5150 LandingPad->addSuccessor(RemainderBB);
5151 LoopBB->addSuccessor(LandingPad);
5152 MachineBasicBlock::iterator First = LandingPad->begin();
5153 // clang-format off
5154 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
5155 .addReg(SaveExec);
5156 // clang-format on
5157
5158 return InsPt;
5159}
5160
5161// Returns subreg index, offset
5162static std::pair<unsigned, int>
5164 const TargetRegisterClass *SuperRC, unsigned VecReg,
5165 int Offset) {
5166 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
5167
5168 // Skip out of bounds offsets, or else we would end up using an undefined
5169 // register.
5170 if (Offset >= NumElts || Offset < 0)
5171 return std::pair(AMDGPU::sub0, Offset);
5172
5173 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
5174}
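
// A minimal scalar sketch of the mapping above, using plain integers instead
// of register classes (illustrative only; this helper is not part of the
// lowering and is unused).
[[maybe_unused]] static std::pair<int, int>
sketchIndirectRegAndOffset(int NumElts, int Offset) {
  // An in-bounds constant offset is folded into the subregister index, so the
  // remaining dynamic offset becomes 0.
  if (Offset >= 0 && Offset < NumElts)
    return {Offset, 0}; // Offset selects sub<Offset>.
  // Out-of-bounds offsets fall back to sub0 and keep the raw offset.
  return {0, Offset};
}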
5175
5178 int Offset) {
5179 MachineBasicBlock *MBB = MI.getParent();
5180 const DebugLoc &DL = MI.getDebugLoc();
5182
5183 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5184
5185 assert(Idx->getReg() != AMDGPU::NoRegister);
5186
5187 if (Offset == 0) {
5188 // clang-format off
5189 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5190 .add(*Idx);
5191 // clang-format on
5192 } else {
5193 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5194 .add(*Idx)
5195 .addImm(Offset);
5196 }
5197}
5198
5201 int Offset) {
5202 MachineBasicBlock *MBB = MI.getParent();
5203 const DebugLoc &DL = MI.getDebugLoc();
5205
5206 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5207
5208 if (Offset == 0)
5209 return Idx->getReg();
5210
5211 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5212 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
5213 .add(*Idx)
5214 .addImm(Offset);
5215 return Tmp;
5216}
5217
5220 const GCNSubtarget &ST) {
5221 const SIInstrInfo *TII = ST.getInstrInfo();
5222 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5223 MachineFunction *MF = MBB.getParent();
5225
5226 Register Dst = MI.getOperand(0).getReg();
5227 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5228 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5229 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5230
5231 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
5232 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5233
5234 unsigned SubReg;
5235 std::tie(SubReg, Offset) =
5236 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
5237
5238 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5239
5240 // Check for a SGPR index.
5241 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5243 const DebugLoc &DL = MI.getDebugLoc();
5244
5245 if (UseGPRIdxMode) {
5246 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5247 // to avoid interfering with other uses, so probably requires a new
5248 // optimization pass.
5250
5251 const MCInstrDesc &GPRIDXDesc =
5252 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5253 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5254 .addReg(SrcReg)
5255 .addReg(Idx)
5256 .addImm(SubReg);
5257 } else {
5259
5260 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5261 .addReg(SrcReg, 0, SubReg)
5262 .addReg(SrcReg, RegState::Implicit);
5263 }
5264
5265 MI.eraseFromParent();
5266
5267 return &MBB;
5268 }
5269
5270 // Control flow needs to be inserted if indexing with a VGPR.
5271 const DebugLoc &DL = MI.getDebugLoc();
5273
5274 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5275 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5276
5277 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
5278
5279 Register SGPRIdxReg;
5280 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
5281 UseGPRIdxMode, SGPRIdxReg);
5282
5283 MachineBasicBlock *LoopBB = InsPt->getParent();
5284
5285 if (UseGPRIdxMode) {
5286 const MCInstrDesc &GPRIDXDesc =
5287 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5288
5289 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5290 .addReg(SrcReg)
5291 .addReg(SGPRIdxReg)
5292 .addImm(SubReg);
5293 } else {
5294 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5295 .addReg(SrcReg, 0, SubReg)
5296 .addReg(SrcReg, RegState::Implicit);
5297 }
5298
5299 MI.eraseFromParent();
5300
5301 return LoopBB;
5302}
5303
5306 const GCNSubtarget &ST) {
5307 const SIInstrInfo *TII = ST.getInstrInfo();
5308 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5309 MachineFunction *MF = MBB.getParent();
5311
5312 Register Dst = MI.getOperand(0).getReg();
5313 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
5314 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5315 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
5316 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5317 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
5318 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5319
5320 // This can be an immediate, but will be folded later.
5321 assert(Val->getReg());
5322
5323 unsigned SubReg;
5324 std::tie(SubReg, Offset) =
5325 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
5326 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5327
5328 if (Idx->getReg() == AMDGPU::NoRegister) {
5330 const DebugLoc &DL = MI.getDebugLoc();
5331
5332 assert(Offset == 0);
5333
5334 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
5335 .add(*SrcVec)
5336 .add(*Val)
5337 .addImm(SubReg);
5338
5339 MI.eraseFromParent();
5340 return &MBB;
5341 }
5342
5343 // Check for a SGPR index.
5344 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5346 const DebugLoc &DL = MI.getDebugLoc();
5347
5348 if (UseGPRIdxMode) {
5350
5351 const MCInstrDesc &GPRIDXDesc =
5352 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5353 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5354 .addReg(SrcVec->getReg())
5355 .add(*Val)
5356 .addReg(Idx)
5357 .addImm(SubReg);
5358 } else {
5360
5361 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5362 TRI.getRegSizeInBits(*VecRC), 32, false);
5363 BuildMI(MBB, I, DL, MovRelDesc, Dst)
5364 .addReg(SrcVec->getReg())
5365 .add(*Val)
5366 .addImm(SubReg);
5367 }
5368 MI.eraseFromParent();
5369 return &MBB;
5370 }
5371
5372 // Control flow needs to be inserted if indexing with a VGPR.
5373 if (Val->isReg())
5374 MRI.clearKillFlags(Val->getReg());
5375
5376 const DebugLoc &DL = MI.getDebugLoc();
5377
5378 Register PhiReg = MRI.createVirtualRegister(VecRC);
5379
5380 Register SGPRIdxReg;
5381 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5382 UseGPRIdxMode, SGPRIdxReg);
5383 MachineBasicBlock *LoopBB = InsPt->getParent();
5384
5385 if (UseGPRIdxMode) {
5386 const MCInstrDesc &GPRIDXDesc =
5387 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5388
5389 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5390 .addReg(PhiReg)
5391 .add(*Val)
5392 .addReg(SGPRIdxReg)
5393 .addImm(SubReg);
5394 } else {
5395 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5396 TRI.getRegSizeInBits(*VecRC), 32, false);
5397 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5398 .addReg(PhiReg)
5399 .add(*Val)
5400 .addImm(SubReg);
5401 }
5402
5403 MI.eraseFromParent();
5404 return LoopBB;
5405}
5406
5408 MachineBasicBlock *BB) {
5409 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5410 // For GFX12, we emit s_add_u64 and s_sub_u64.
5411 MachineFunction *MF = BB->getParent();
5412 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5413 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5415 const DebugLoc &DL = MI.getDebugLoc();
5416 MachineOperand &Dest = MI.getOperand(0);
5417 MachineOperand &Src0 = MI.getOperand(1);
5418 MachineOperand &Src1 = MI.getOperand(2);
5419 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5420 if (ST.hasScalarAddSub64()) {
5421 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5422 // clang-format off
5423 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5424 .add(Src0)
5425 .add(Src1);
5426 // clang-format on
5427 } else {
5428 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5429 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5430
5431 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5432 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5433
5434 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5435 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5436 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5437 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5438
5439 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5440 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5441 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5442 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5443
5444 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5445 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5446 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
5447 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
5448 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5449 .addReg(DestSub0)
5450 .addImm(AMDGPU::sub0)
5451 .addReg(DestSub1)
5452 .addImm(AMDGPU::sub1);
5453 }
5454 MI.eraseFromParent();
5455 return BB;
5456}
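
// A scalar model of the pre-GFX12 path above: a 32-bit low add plus a
// carry-propagating high add, reassembled into 64 bits (illustrative sketch
// only; this helper is not part of the lowering and is unused).
[[maybe_unused]] static uint64_t sketchAddU64ViaU32(uint64_t A, uint64_t B) {
  uint32_t Lo = uint32_t(A) + uint32_t(B);       // S_ADD_U32, carry goes to SCC
  uint32_t Carry = Lo < uint32_t(A);             // carry out of the low half
  uint32_t Hi = uint32_t(A >> 32) + uint32_t(B >> 32) + Carry; // S_ADDC_U32
  return (uint64_t(Hi) << 32) | Lo;              // REG_SEQUENCE of sub0/sub1
}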
5457
5459 switch (Opc) {
5460 case AMDGPU::S_MIN_U32:
5461 return std::numeric_limits<uint32_t>::max();
5462 case AMDGPU::S_MIN_I32:
5463 return std::numeric_limits<int32_t>::max();
5464 case AMDGPU::S_MAX_U32:
5465 return std::numeric_limits<uint32_t>::min();
5466 case AMDGPU::S_MAX_I32:
5467 return std::numeric_limits<int32_t>::min();
5468 case AMDGPU::S_ADD_I32:
5469 case AMDGPU::S_SUB_I32:
5470 case AMDGPU::S_OR_B32:
5471 case AMDGPU::S_XOR_B32:
5472 return std::numeric_limits<uint32_t>::min();
5473 case AMDGPU::S_AND_B32:
5474 return std::numeric_limits<uint32_t>::max();
5475 default:
5477 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5478 }
5479}
5480
5482 switch (Opc) {
5483 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5484 return std::numeric_limits<uint64_t>::max();
5485 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5486 return std::numeric_limits<int64_t>::max();
5487 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5488 return std::numeric_limits<uint64_t>::min();
5489 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5490 return std::numeric_limits<int64_t>::min();
5491 case AMDGPU::S_ADD_U64_PSEUDO:
5492 case AMDGPU::S_SUB_U64_PSEUDO:
5493 case AMDGPU::S_OR_B64:
5494 case AMDGPU::S_XOR_B64:
5495 return std::numeric_limits<uint64_t>::min();
5496 case AMDGPU::S_AND_B64:
5497 return std::numeric_limits<uint64_t>::max();
5498 default:
5500 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5501 }
5502}
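
// The values above are the identities (neutral starting values) of the
// corresponding reductions: combining them with any lane value leaves that
// value unchanged. A few spot checks on plain integers (illustrative only):
static_assert((42u | 0u) == 42u, "0 is the identity for or/xor/add");
static_assert((42u & ~0u) == 42u, "all-ones is the identity for and");
static_assert((42u < ~0u ? 42u : ~0u) == 42u,
              "unsigned max is the identity for umin");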
5503
5504static bool is32bitWaveReduceOperation(unsigned Opc) {
5505 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5506 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5507 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5508 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5509 Opc == AMDGPU::S_XOR_B32;
5510}
5511
5514 const GCNSubtarget &ST,
5515 unsigned Opc) {
5517 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5518 const DebugLoc &DL = MI.getDebugLoc();
5519 const SIInstrInfo *TII = ST.getInstrInfo();
5520
5521 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5522 Register SrcReg = MI.getOperand(1).getReg();
5523 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5524 Register DstReg = MI.getOperand(0).getReg();
5525 MachineBasicBlock *RetBB = nullptr;
5526 if (isSGPR) {
5527 switch (Opc) {
5528 case AMDGPU::S_MIN_U32:
5529 case AMDGPU::S_MIN_I32:
5530 case AMDGPU::S_MAX_U32:
5531 case AMDGPU::S_MAX_I32:
5532 case AMDGPU::S_AND_B32:
5533 case AMDGPU::S_OR_B32: {
5534 // Idempotent operations.
5535 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5536 RetBB = &BB;
5537 break;
5538 }
5539 case AMDGPU::V_CMP_LT_U64_e64: // umin
5540 case AMDGPU::V_CMP_LT_I64_e64: // min
5541 case AMDGPU::V_CMP_GT_U64_e64: // umax
5542 case AMDGPU::V_CMP_GT_I64_e64: // max
5543 case AMDGPU::S_AND_B64:
5544 case AMDGPU::S_OR_B64: {
5545 // Idempotent operations.
5546 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
5547 RetBB = &BB;
5548 break;
5549 }
5550 case AMDGPU::S_XOR_B32:
5551 case AMDGPU::S_XOR_B64:
5552 case AMDGPU::S_ADD_I32:
5553 case AMDGPU::S_ADD_U64_PSEUDO:
5554 case AMDGPU::S_SUB_I32:
5555 case AMDGPU::S_SUB_U64_PSEUDO: {
5556 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5557 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5558 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5559 Register NumActiveLanes =
5560 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5561
5562 bool IsWave32 = ST.isWave32();
5563 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5564 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5565 unsigned BitCountOpc =
5566 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5567
5568 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5569
5570 auto NewAccumulator =
5571 BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
5572 .addReg(ExecMask);
5573
5574 switch (Opc) {
5575 case AMDGPU::S_XOR_B32:
5576 case AMDGPU::S_XOR_B64: {
5577 // Performing an XOR operation on a uniform value
5578 // depends on the parity of the number of active lanes.
5579 // For even parity, the result will be 0, for odd
5580 // parity the result will be the same as the input value.
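 // For example, with three active lanes x ^ x ^ x == x, while with four
 // x ^ x ^ x ^ x == 0; so the result is SrcReg * (NumActiveLanes & 1).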
5581 Register ParityRegister =
5582 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5583
5584 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5585 .addReg(NewAccumulator->getOperand(0).getReg())
5586 .addImm(1)
5587 .setOperandDead(3); // Dead scc
5588 if (Opc == AMDGPU::S_XOR_B32) {
5589 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5590 .addReg(SrcReg)
5591 .addReg(ParityRegister);
5592 } else {
5593 Register DestSub0 =
5594 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5595 Register DestSub1 =
5596 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5597
5598 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5599 const TargetRegisterClass *SrcSubRC =
5600 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5601
5602 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5603 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5604 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5605 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5606
5607 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5608 .add(Op1L)
5609 .addReg(ParityRegister);
5610
5611 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5612 .add(Op1H)
5613 .addReg(ParityRegister);
5614
5615 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5616 .addReg(DestSub0)
5617 .addImm(AMDGPU::sub0)
5618 .addReg(DestSub1)
5619 .addImm(AMDGPU::sub1);
5620 }
5621 break;
5622 }
5623 case AMDGPU::S_SUB_I32: {
5624 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5625
5626 // Take the negation of the source operand.
5627 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5628 .addImm(0)
5629 .addReg(SrcReg);
5630 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5631 .addReg(NegatedVal)
5632 .addReg(NewAccumulator->getOperand(0).getReg());
5633 break;
5634 }
5635 case AMDGPU::S_ADD_I32: {
5636 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5637 .addReg(SrcReg)
5638 .addReg(NewAccumulator->getOperand(0).getReg());
5639 break;
5640 }
5641 case AMDGPU::S_ADD_U64_PSEUDO:
5642 case AMDGPU::S_SUB_U64_PSEUDO: {
5643 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5644 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5645 Register Op1H_Op0L_Reg =
5646 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5647 Register Op1L_Op0H_Reg =
5648 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5649 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5650 Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5651 Register NegatedValLo =
5652 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5653 Register NegatedValHi =
5654 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5655
5656 const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
5657 const TargetRegisterClass *Src1SubRC =
5658 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5659
5660 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5661 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5662 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5663 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5664
5665 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5666 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5667 .addImm(0)
5668 .addReg(NewAccumulator->getOperand(0).getReg())
5669 .setOperandDead(3); // Dead scc
5670 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5671 .addReg(NegatedValLo)
5672 .addImm(31)
5673 .setOperandDead(3); // Dead scc
5674 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5675 .add(Op1L)
5676 .addReg(NegatedValHi);
5677 }
5678 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5679 ? NegatedValLo
5680 : NewAccumulator->getOperand(0).getReg();
5681 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5682 .add(Op1L)
5683 .addReg(LowOpcode);
5684 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5685 .add(Op1L)
5686 .addReg(LowOpcode);
5687 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5688 .add(Op1H)
5689 .addReg(LowOpcode);
5690
5691 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5692 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5693 .addReg(CarryReg)
5694 .addReg(Op1H_Op0L_Reg)
5695 .setOperandDead(3); // Dead scc
5696
5697 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5698 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5699 .addReg(HiVal)
5700 .addReg(Op1L_Op0H_Reg)
5701 .setOperandDead(3); // Dead scc
5702 }
5703 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5704 .addReg(DestSub0)
5705 .addImm(AMDGPU::sub0)
5706 .addReg(DestSub1)
5707 .addImm(AMDGPU::sub1);
5708 break;
5709 }
5710 }
5711 RetBB = &BB;
5712 }
5713 }
5714 } else {
5715 // TODO: Implement the DPP strategy and switch based on the immediate
5716 // strategy operand. For now, for all the cases (default, Iterative and
5717 // DPP), we use the iterative approach.
5718
5719 // To reduce the VGPR using the iterative approach, we need to iterate
5720 // over all the active lanes. Lowering consists of a ComputeLoop which
5721 // iterates over only the active lanes. We use a copy of the EXEC register
5722 // as the induction variable, and each iteration clears the current lane's
5723 // bit with bitset0 so that the next iteration picks the next active lane.
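 //
 // In pseudocode, the ComputeLoop constructed below is:
 //
 //   Accumulator = identity value for Opc
 //   ActiveBits  = copy of EXEC
 //   do {
 //     Lane        = find_first_set(ActiveBits)   // S_FF1
 //     LaneValue   = v_readlane(Src, Lane)
 //     Accumulator = Opc(Accumulator, LaneValue)
 //     ActiveBits  = bitset0(ActiveBits, Lane)
 //   } while (ActiveBits != 0)
 //   Dst = Accumulator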
5725 Register SrcReg = MI.getOperand(1).getReg();
5726 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5727
5728 // Create control flow for the loop:
5729 // split MI's machine basic block into the loop structure.
5730 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
5731
5732 // Create virtual registers required for lowering.
5733 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5734 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5735 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5736 Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
5737 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5738 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5739 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5740 Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5741 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
5742
5743 bool IsWave32 = ST.isWave32();
5744 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5745 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5746
5747 // Create the initial values of the induction variable (from EXEC) and the
5748 // accumulator, and insert a branch to the newly created ComputeLoop block.
5749 BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
5750 if (is32BitOpc) {
5752 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5753 .addImm(IdentityValue);
5754 } else {
5756 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
5757 .addImm(IdentityValue);
5758 }
5759 // clang-format off
5760 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5761 .addMBB(ComputeLoop);
5762 // clang-format on
5763
5764 // Start constructing ComputeLoop
5765 I = ComputeLoop->begin();
5766 auto Accumulator =
5767 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5768 .addReg(IdentityValReg)
5769 .addMBB(&BB);
5770 auto ActiveBits =
5771 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5772 .addReg(LoopIterator)
5773 .addMBB(&BB);
5774
5775 I = ComputeLoop->end();
5776 MachineInstr *NewAccumulator;
5777 // Perform the computations
5778 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5779 BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5780 .addReg(ActiveBitsReg);
5781 if (is32BitOpc) {
5782 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5783 LaneValueReg)
5784 .addReg(SrcReg)
5785 .addReg(FF1Reg);
5786 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5787 .addReg(Accumulator->getOperand(0).getReg())
5788 .addReg(LaneValueReg);
5789 } else {
5790 Register LaneValueLoReg =
5791 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5792 Register LaneValueHiReg =
5793 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5794 Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5795 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5796 const TargetRegisterClass *SrcSubRC =
5797 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5798 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5799 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5800 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5801 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5802 // The lane value input should be in an SGPR.
5803 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5804 LaneValueLoReg)
5805 .add(Op1L)
5806 .addReg(FF1Reg);
5807 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5808 LaneValueHiReg)
5809 .add(Op1H)
5810 .addReg(FF1Reg);
5811 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5812 TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
5813 .addReg(LaneValueLoReg)
5814 .addImm(AMDGPU::sub0)
5815 .addReg(LaneValueHiReg)
5816 .addImm(AMDGPU::sub1);
5817 switch (Opc) {
5818 case AMDGPU::S_OR_B64:
5819 case AMDGPU::S_AND_B64:
5820 case AMDGPU::S_XOR_B64: {
5821 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5822 .addReg(Accumulator->getOperand(0).getReg())
5823 .addReg(LaneValue->getOperand(0).getReg())
5824 .setOperandDead(3); // Dead scc
5825 break;
5826 }
5827 case AMDGPU::V_CMP_GT_I64_e64:
5828 case AMDGPU::V_CMP_GT_U64_e64:
5829 case AMDGPU::V_CMP_LT_I64_e64:
5830 case AMDGPU::V_CMP_LT_U64_e64: {
5831 Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
5832 Register ComparisonResultReg =
5833 MRI.createVirtualRegister(WaveMaskRegClass);
5834 const TargetRegisterClass *VregClass = TRI->getVGPR64Class();
5835 const TargetRegisterClass *VSubRegClass =
5836 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
5837 Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
5838 MachineOperand SrcReg0Sub0 =
5839 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5840 VregClass, AMDGPU::sub0, VSubRegClass);
5841 MachineOperand SrcReg0Sub1 =
5842 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5843 VregClass, AMDGPU::sub1, VSubRegClass);
5844 BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
5845 AccumulatorVReg)
5846 .add(SrcReg0Sub0)
5847 .addImm(AMDGPU::sub0)
5848 .add(SrcReg0Sub1)
5849 .addImm(AMDGPU::sub1);
5850 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
5851 .addReg(LaneValue->getOperand(0).getReg())
5852 .addReg(AccumulatorVReg);
5853
5854 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5855 BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
5856 .addReg(LaneMaskReg)
5857 .addReg(ActiveBitsReg);
5858
5859 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5860 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
5861 .addReg(LaneValue->getOperand(0).getReg())
5862 .addReg(Accumulator->getOperand(0).getReg());
5863 break;
5864 }
5865 case AMDGPU::S_ADD_U64_PSEUDO:
5866 case AMDGPU::S_SUB_U64_PSEUDO: {
5867 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5868 .addReg(Accumulator->getOperand(0).getReg())
5869 .addReg(LaneValue->getOperand(0).getReg());
5870 ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
5871 break;
5872 }
5873 }
5874 }
5875 // Manipulate the iterator to get the next active lane
5876 unsigned BITSETOpc =
5877 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5878 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5879 .addReg(FF1Reg)
5880 .addReg(ActiveBitsReg);
5881
5882 // Add phi nodes
5883 Accumulator.addReg(DstReg).addMBB(ComputeLoop);
5884 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
5885
5886 // Create the branch back to the top of the loop.
5887 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5888 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5889 .addReg(NewActiveBitsReg)
5890 .addImm(0);
5891 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5892 .addMBB(ComputeLoop);
5893
5894 RetBB = ComputeEnd;
5895 }
5896 MI.eraseFromParent();
5897 return RetBB;
5898}
5899
5902 MachineBasicBlock *BB) const {
5903
5905 MachineFunction *MF = BB->getParent();
5907
5908 switch (MI.getOpcode()) {
5909 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5910 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5911 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
5912 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
5913 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5914 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
5915 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
5916 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
5917 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5918 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5919 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
5920 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
5921 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5922 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
5923 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
5924 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
5925 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5926 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5927 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
5928 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
5929 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5930 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5931 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
5932 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
5933 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5934 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
5935 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
5936 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
5937 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5938 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
5939 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
5940 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
5941 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5942 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
5943 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
5944 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
5945 case AMDGPU::S_UADDO_PSEUDO:
5946 case AMDGPU::S_USUBO_PSEUDO: {
5947 const DebugLoc &DL = MI.getDebugLoc();
5948 MachineOperand &Dest0 = MI.getOperand(0);
5949 MachineOperand &Dest1 = MI.getOperand(1);
5950 MachineOperand &Src0 = MI.getOperand(2);
5951 MachineOperand &Src1 = MI.getOperand(3);
5952
5953 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5954 ? AMDGPU::S_ADD_I32
5955 : AMDGPU::S_SUB_I32;
5956 // clang-format off
5957 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
5958 .add(Src0)
5959 .add(Src1);
5960 // clang-format on
5961
5962 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
5963 .addImm(1)
5964 .addImm(0);
5965
5966 MI.eraseFromParent();
5967 return BB;
5968 }
5969 case AMDGPU::S_ADD_U64_PSEUDO:
5970 case AMDGPU::S_SUB_U64_PSEUDO: {
5971 return Expand64BitScalarArithmetic(MI, BB);
5972 }
5973 case AMDGPU::V_ADD_U64_PSEUDO:
5974 case AMDGPU::V_SUB_U64_PSEUDO: {
5976 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5977 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5978 const DebugLoc &DL = MI.getDebugLoc();
5979
5980 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5981
5982 MachineOperand &Dest = MI.getOperand(0);
5983 MachineOperand &Src0 = MI.getOperand(1);
5984 MachineOperand &Src1 = MI.getOperand(2);
5985
5986 if (ST.hasAddSubU64Insts()) {
5987 auto I = BuildMI(*BB, MI, DL,
5988 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
5989 : AMDGPU::V_SUB_U64_e64),
5990 Dest.getReg())
5991 .add(Src0)
5992 .add(Src1)
5993 .addImm(0); // clamp
5994 TII->legalizeOperands(*I);
5995 MI.eraseFromParent();
5996 return BB;
5997 }
5998
5999 if (IsAdd && ST.hasLshlAddU64Inst()) {
6000 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
6001 Dest.getReg())
6002 .add(Src0)
6003 .addImm(0)
6004 .add(Src1);
6005 TII->legalizeOperands(*Add);
6006 MI.eraseFromParent();
6007 return BB;
6008 }
6009
6010 const auto *CarryRC = TRI->getWaveMaskRegClass();
6011
6012 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6013 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6014
6015 Register CarryReg = MRI.createVirtualRegister(CarryRC);
6016 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6017
6018 const TargetRegisterClass *Src0RC = Src0.isReg()
6019 ? MRI.getRegClass(Src0.getReg())
6020 : &AMDGPU::VReg_64RegClass;
6021 const TargetRegisterClass *Src1RC = Src1.isReg()
6022 ? MRI.getRegClass(Src1.getReg())
6023 : &AMDGPU::VReg_64RegClass;
6024
6025 const TargetRegisterClass *Src0SubRC =
6026 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6027 const TargetRegisterClass *Src1SubRC =
6028 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6029
6030 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
6031 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6032 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
6033 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6034
6035 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
6036 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6037 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
6038 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6039
6040 unsigned LoOpc =
6041 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6042 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
6043 .addReg(CarryReg, RegState::Define)
6044 .add(SrcReg0Sub0)
6045 .add(SrcReg1Sub0)
6046 .addImm(0); // clamp bit
6047
6048 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6049 MachineInstr *HiHalf =
6050 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
6051 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
6052 .add(SrcReg0Sub1)
6053 .add(SrcReg1Sub1)
6054 .addReg(CarryReg, RegState::Kill)
6055 .addImm(0); // clamp bit
6056
6057 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
6058 .addReg(DestSub0)
6059 .addImm(AMDGPU::sub0)
6060 .addReg(DestSub1)
6061 .addImm(AMDGPU::sub1);
6062 TII->legalizeOperands(*LoHalf);
6063 TII->legalizeOperands(*HiHalf);
6064 MI.eraseFromParent();
6065 return BB;
6066 }
6067 case AMDGPU::S_ADD_CO_PSEUDO:
6068 case AMDGPU::S_SUB_CO_PSEUDO: {
6069 // This pseudo can only be selected
6070 // from a uniform add/subcarry node, so all the VGPR operands
6071 // are assumed to be splat vectors.
6073 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
6074 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6076 const DebugLoc &DL = MI.getDebugLoc();
6077 MachineOperand &Dest = MI.getOperand(0);
6078 MachineOperand &CarryDest = MI.getOperand(1);
6079 MachineOperand &Src0 = MI.getOperand(2);
6080 MachineOperand &Src1 = MI.getOperand(3);
6081 MachineOperand &Src2 = MI.getOperand(4);
6082 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
6083 ? AMDGPU::S_ADDC_U32
6084 : AMDGPU::S_SUBB_U32;
6085 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
6086 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6087 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6088 .addReg(Src0.getReg());
6089 Src0.setReg(RegOp0);
6090 }
6091 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
6092 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6093 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6094 .addReg(Src1.getReg());
6095 Src1.setReg(RegOp1);
6096 }
6097 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6098 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
6099 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6100 .addReg(Src2.getReg());
6101 Src2.setReg(RegOp2);
6102 }
6103
6104 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
6105 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
6106 assert(WaveSize == 64 || WaveSize == 32);
6107
6108 if (WaveSize == 64) {
6109 if (ST.hasScalarCompareEq64()) {
6110 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
6111 .addReg(Src2.getReg())
6112 .addImm(0);
6113 } else {
6114 const TargetRegisterClass *SubRC =
6115 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6116 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
6117 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6118 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
6119 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6120 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6121
6122 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
6123 .add(Src2Sub0)
6124 .add(Src2Sub1);
6125
6126 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6127 .addReg(Src2_32, RegState::Kill)
6128 .addImm(0);
6129 }
6130 } else {
6131 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6132 .addReg(Src2.getReg())
6133 .addImm(0);
6134 }
6135
6136 // clang-format off
6137 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg())
6138 .add(Src0)
6139 .add(Src1);
6140 // clang-format on
6141
6142 unsigned SelOpc =
6143 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6144
6145 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
6146 .addImm(-1)
6147 .addImm(0);
6148
6149 MI.eraseFromParent();
6150 return BB;
6151 }
6152 case AMDGPU::SI_INIT_M0: {
6153 MachineOperand &M0Init = MI.getOperand(0);
6154 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6155 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6156 AMDGPU::M0)
6157 .add(M0Init);
6158 MI.eraseFromParent();
6159 return BB;
6160 }
6161 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6162 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6163 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6164 TII->get(AMDGPU::S_CMP_EQ_U32))
6165 .addImm(0)
6166 .addImm(0);
6167 return BB;
6168 }
6169 case AMDGPU::GET_GROUPSTATICSIZE: {
6170 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
6171 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
6172 DebugLoc DL = MI.getDebugLoc();
6173 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
6174 .add(MI.getOperand(0))
6175 .addImm(MFI->getLDSSize());
6176 MI.eraseFromParent();
6177 return BB;
6178 }
6179 case AMDGPU::GET_SHADERCYCLESHILO: {
6182 const DebugLoc &DL = MI.getDebugLoc();
6183 // The algorithm is:
6184 //
6185 // hi1 = getreg(SHADER_CYCLES_HI)
6186 // lo1 = getreg(SHADER_CYCLES_LO)
6187 // hi2 = getreg(SHADER_CYCLES_HI)
6188 //
6189 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
6190 // Otherwise there was overflow and the result is hi2:0. In both cases the
6191 // result should represent the actual time at some point during the sequence
6192 // of three getregs.
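 // (If the two high reads differ, the low counter wrapped somewhere in
 // between; hi2:0 is the time of that wrap, which lies within the sequence.)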
6193 using namespace AMDGPU::Hwreg;
6194 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6195 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
6196 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6197 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6198 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
6199 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6200 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6201 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
6202 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6203 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
6204 .addReg(RegHi1)
6205 .addReg(RegHi2);
6206 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6207 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
6208 .addReg(RegLo1)
6209 .addImm(0);
6210 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
6211 .add(MI.getOperand(0))
6212 .addReg(RegLo)
6213 .addImm(AMDGPU::sub0)
6214 .addReg(RegHi2)
6215 .addImm(AMDGPU::sub1);
6216 MI.eraseFromParent();
6217 return BB;
6218 }
6219 case AMDGPU::SI_INDIRECT_SRC_V1:
6220 case AMDGPU::SI_INDIRECT_SRC_V2:
6221 case AMDGPU::SI_INDIRECT_SRC_V4:
6222 case AMDGPU::SI_INDIRECT_SRC_V8:
6223 case AMDGPU::SI_INDIRECT_SRC_V9:
6224 case AMDGPU::SI_INDIRECT_SRC_V10:
6225 case AMDGPU::SI_INDIRECT_SRC_V11:
6226 case AMDGPU::SI_INDIRECT_SRC_V12:
6227 case AMDGPU::SI_INDIRECT_SRC_V16:
6228 case AMDGPU::SI_INDIRECT_SRC_V32:
6229 return emitIndirectSrc(MI, *BB, *getSubtarget());
6230 case AMDGPU::SI_INDIRECT_DST_V1:
6231 case AMDGPU::SI_INDIRECT_DST_V2:
6232 case AMDGPU::SI_INDIRECT_DST_V4:
6233 case AMDGPU::SI_INDIRECT_DST_V8:
6234 case AMDGPU::SI_INDIRECT_DST_V9:
6235 case AMDGPU::SI_INDIRECT_DST_V10:
6236 case AMDGPU::SI_INDIRECT_DST_V11:
6237 case AMDGPU::SI_INDIRECT_DST_V12:
6238 case AMDGPU::SI_INDIRECT_DST_V16:
6239 case AMDGPU::SI_INDIRECT_DST_V32:
6240 return emitIndirectDst(MI, *BB, *getSubtarget());
6241 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6242 case AMDGPU::SI_KILL_I1_PSEUDO:
6243 return splitKillBlock(MI, BB);
6244 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6246 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
6247 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6248
6249 Register Dst = MI.getOperand(0).getReg();
6250 const MachineOperand &Src0 = MI.getOperand(1);
6251 const MachineOperand &Src1 = MI.getOperand(2);
6252 const DebugLoc &DL = MI.getDebugLoc();
6253 Register SrcCond = MI.getOperand(3).getReg();
6254
6255 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6256 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6257 const auto *CondRC = TRI->getWaveMaskRegClass();
6258 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
6259
6260 const TargetRegisterClass *Src0RC = Src0.isReg()
6261 ? MRI.getRegClass(Src0.getReg())
6262 : &AMDGPU::VReg_64RegClass;
6263 const TargetRegisterClass *Src1RC = Src1.isReg()
6264 ? MRI.getRegClass(Src1.getReg())
6265 : &AMDGPU::VReg_64RegClass;
6266
6267 const TargetRegisterClass *Src0SubRC =
6268 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6269 const TargetRegisterClass *Src1SubRC =
6270 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6271
6272 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
6273 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6274 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
6275 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6276
6277 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
6278 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6279 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
6280 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6281
6282 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
6283 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
6284 .addImm(0)
6285 .add(Src0Sub0)
6286 .addImm(0)
6287 .add(Src1Sub0)
6288 .addReg(SrcCondCopy);
6289 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
6290 .addImm(0)
6291 .add(Src0Sub1)
6292 .addImm(0)
6293 .add(Src1Sub1)
6294 .addReg(SrcCondCopy);
6295
6296 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
6297 .addReg(DstLo)
6298 .addImm(AMDGPU::sub0)
6299 .addReg(DstHi)
6300 .addImm(AMDGPU::sub1);
6301 MI.eraseFromParent();
6302 return BB;
6303 }
6304 case AMDGPU::SI_BR_UNDEF: {
6306 const DebugLoc &DL = MI.getDebugLoc();
6307 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
6308 .add(MI.getOperand(0));
6309 Br->getOperand(1).setIsUndef(); // read undef SCC
6310 MI.eraseFromParent();
6311 return BB;
6312 }
6313 case AMDGPU::ADJCALLSTACKUP:
6314 case AMDGPU::ADJCALLSTACKDOWN: {
6316 MachineInstrBuilder MIB(*MF, &MI);
6317 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
6318 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
6319 return BB;
6320 }
6321 case AMDGPU::SI_CALL_ISEL: {
6323 const DebugLoc &DL = MI.getDebugLoc();
6324
6325 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
6326
6328 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6329
6330 for (const MachineOperand &MO : MI.operands())
6331 MIB.add(MO);
6332
6333 MIB.cloneMemRefs(MI);
6334 MI.eraseFromParent();
6335 return BB;
6336 }
6337 case AMDGPU::V_ADD_CO_U32_e32:
6338 case AMDGPU::V_SUB_CO_U32_e32:
6339 case AMDGPU::V_SUBREV_CO_U32_e32: {
6340 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
6341 const DebugLoc &DL = MI.getDebugLoc();
6342 unsigned Opc = MI.getOpcode();
6343
6344 bool NeedClampOperand = false;
6345 if (TII->pseudoToMCOpcode(Opc) == -1) {
6347 NeedClampOperand = true;
6348 }
6349
6350 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
6351 if (TII->isVOP3(*I)) {
6352 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
6353 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6354 I.addReg(TRI->getVCC(), RegState::Define);
6355 }
6356 I.add(MI.getOperand(1)).add(MI.getOperand(2));
6357 if (NeedClampOperand)
6358 I.addImm(0); // clamp bit for e64 encoding
6359
6360 TII->legalizeOperands(*I);
6361
6362 MI.eraseFromParent();
6363 return BB;
6364 }
6365 case AMDGPU::V_ADDC_U32_e32:
6366 case AMDGPU::V_SUBB_U32_e32:
6367 case AMDGPU::V_SUBBREV_U32_e32:
6368 // These instructions have an implicit use of vcc which counts towards the
6369 // constant bus limit.
6370 TII->legalizeOperands(MI);
6371 return BB;
6372 case AMDGPU::DS_GWS_INIT:
6373 case AMDGPU::DS_GWS_SEMA_BR:
6374 case AMDGPU::DS_GWS_BARRIER:
6375 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
6376 [[fallthrough]];
6377 case AMDGPU::DS_GWS_SEMA_V:
6378 case AMDGPU::DS_GWS_SEMA_P:
6379 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6380 // An s_waitcnt 0 is required to be the instruction immediately following.
6381 if (getSubtarget()->hasGWSAutoReplay()) {
6383 return BB;
6384 }
6385
6386 return emitGWSMemViolTestLoop(MI, BB);
6387 case AMDGPU::S_SETREG_B32: {
6388 // Try to optimize cases that only set the denormal mode or rounding mode.
6389 //
6390 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
6391 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
6392 // instead.
6393 //
6394 // FIXME: This could be predicates on the immediate, but tablegen doesn't
6395 // allow you to have a no side effect instruction in the output of a
6396 // sideeffecting pattern.
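 //
 // For example, writing a known constant to only MODE[3:0] (the FP round
 // bits) becomes a single s_round_mode, writing only MODE[7:4] (the FP denorm
 // bits) becomes s_denorm_mode, and writing MODE[7:0] becomes one of each.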
6397 auto [ID, Offset, Width] =
6398 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
6400 return BB;
6401
6402 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
6403 const unsigned SetMask = WidthMask << Offset;
6404
6405 if (getSubtarget()->hasDenormModeInst()) {
6406 unsigned SetDenormOp = 0;
6407 unsigned SetRoundOp = 0;
6408
6409 // The dedicated instructions can only set the whole denorm or round mode
6410 // at once, not a subset of bits in either.
6411 if (SetMask ==
6413 // If this fully sets both the round and denorm mode, emit the two
6414 // dedicated instructions for these.
6415 SetRoundOp = AMDGPU::S_ROUND_MODE;
6416 SetDenormOp = AMDGPU::S_DENORM_MODE;
6417 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
6418 SetRoundOp = AMDGPU::S_ROUND_MODE;
6419 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
6420 SetDenormOp = AMDGPU::S_DENORM_MODE;
6421 }
6422
6423 if (SetRoundOp || SetDenormOp) {
6425 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
6426 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6427 unsigned ImmVal = Def->getOperand(1).getImm();
6428 if (SetRoundOp) {
6429 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
6430 .addImm(ImmVal & 0xf);
6431
6432 // If we also have the denorm mode, get just the denorm mode bits.
6433 ImmVal >>= 4;
6434 }
6435
6436 if (SetDenormOp) {
6437 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
6438 .addImm(ImmVal & 0xf);
6439 }
6440
6441 MI.eraseFromParent();
6442 return BB;
6443 }
6444 }
6445 }
6446
6447 // If only FP bits are touched, use the no-side-effects pseudo.
6448 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
6449 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
6450 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
6451
6452 return BB;
6453 }
6454 case AMDGPU::S_INVERSE_BALLOT_U32:
6455 case AMDGPU::S_INVERSE_BALLOT_U64:
6456 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
6457 // necessary. After that they are equivalent to a COPY.
6458 MI.setDesc(TII->get(AMDGPU::COPY));
6459 return BB;
6460 case AMDGPU::ENDPGM_TRAP: {
6461 const DebugLoc &DL = MI.getDebugLoc();
6462 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
6463 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
6464 MI.addOperand(MachineOperand::CreateImm(0));
6465 return BB;
6466 }
6467
6468 // We need a block split to make the real endpgm a terminator. We also don't
6469 // want to break phis in successor blocks, so we can't just delete to the
6470 // end of the block.
6471
6472 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
6474 MF->push_back(TrapBB);
6475 // clang-format off
6476 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
6477 .addImm(0);
6478 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
6479 .addMBB(TrapBB);
6480 // clang-format on
6481
6482 BB->addSuccessor(TrapBB);
6483 MI.eraseFromParent();
6484 return SplitBB;
6485 }
6486 case AMDGPU::SIMULATED_TRAP: {
6487 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6489 MachineBasicBlock *SplitBB =
6490 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6491 MI.eraseFromParent();
6492 return SplitBB;
6493 }
6494 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6495 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6497
6498 // During ISel, it's difficult to propagate the original EXEC mask to use as
6499 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
6500 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
6501 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6502 Register OriginalExec = Setup->getOperand(0).getReg();
6503 MF->getRegInfo().clearKillFlags(OriginalExec);
6504 MI.getOperand(0).setReg(OriginalExec);
6505 return BB;
6506 }
6507 default:
6508 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
6509 if (!MI.mayStore())
6511 return BB;
6512 }
6514 }
6515}
6516
6518 // This currently forces unfolding various combinations of fsub into fma with
6519 // free fneg'd operands. As long as we have fast FMA (controlled by
6520 // isFMAFasterThanFMulAndFAdd), we should perform these.
6521
6522 // When fma is quarter rate, for f64 where add / sub are at best half rate,
6523 // most of these combines appear to be cycle neutral but save on instruction
6524 // count / code size.
6525 return true;
6526}
6527
6529
6531 EVT VT) const {
6532 if (!VT.isVector()) {
6533 return MVT::i1;
6534 }
6535 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
6536}
6537
6539 // TODO: Should i16 be used always if legal? For now it would force VALU
6540 // shifts.
6541 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6542}
6543
6545 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6546 ? Ty.changeElementSize(16)
6547 : Ty.changeElementSize(32);
6548}
6549
6550// Answering this is somewhat tricky and depends on the specific device, since
6551// devices have different rates for fma or for all f64 operations.
6552//
6553// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
6554// regardless of which device (although the number of cycles differs between
6555// devices), so it is always profitable for f64.
6556//
6557// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
6558// only on full rate devices. Normally, we should prefer selecting v_mad_f32
6559// which we can always do even without fused FP ops since it returns the same
6560// result as the separate operations and since it is always full
6561// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
6562// however does not support denormals, so we do report fma as faster if we have
6563// a fast fma device and require denormals.
6564//
6566 EVT VT) const {
6567 VT = VT.getScalarType();
6568
6569 switch (VT.getSimpleVT().SimpleTy) {
6570 case MVT::f32: {
6571 // If mad is not available this depends only on if f32 fma is full rate.
6572 if (!Subtarget->hasMadMacF32Insts())
6573 return Subtarget->hasFastFMAF32();
6574
6575 // Otherwise f32 mad is always full rate and returns the same result as
6576 // the separate operations, so it should be preferred over fma.
6577 // However, it does not support denormals.
6579 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6580
6581 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
6582 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6583 }
6584 case MVT::f64:
6585 return true;
6586 case MVT::f16:
6587 case MVT::bf16:
6588 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
6589 default:
6590 break;
6591 }
6592
6593 return false;
6594}
6595
6597 LLT Ty) const {
6598 switch (Ty.getScalarSizeInBits()) {
6599 case 16:
6600 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
6601 case 32:
6602 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
6603 case 64:
6604 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
6605 default:
6606 break;
6607 }
6608
6609 return false;
6610}
6611
6613 if (!Ty.isScalar())
6614 return false;
6615
6616 if (Ty.getScalarSizeInBits() == 16)
6617 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
6618 if (Ty.getScalarSizeInBits() == 32)
6619 return Subtarget->hasMadMacF32Insts() &&
6620 denormalModeIsFlushAllF32(*MI.getMF());
6621
6622 return false;
6623}
6624
6626 const SDNode *N) const {
6627 // TODO: Check future ftz flag
6628 // v_mad_f32/v_mac_f32 do not support denormals.
6629 EVT VT = N->getValueType(0);
6630 if (VT == MVT::f32)
6631 return Subtarget->hasMadMacF32Insts() &&
6633 if (VT == MVT::f16) {
6634 return Subtarget->hasMadF16() &&
6636 }
6637
6638 return false;
6639}
6640
6641//===----------------------------------------------------------------------===//
6642// Custom DAG Lowering Operations
6643//===----------------------------------------------------------------------===//
6644
6645// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6646// wider vector type is legal.
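// For example, (fneg v4f16:x) is lowered here to
// (concat_vectors (fneg x.lo), (fneg x.hi)) with v2f16 halves instead of
// being scalarized into four f16 operations.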
6648 SelectionDAG &DAG) const {
6649 unsigned Opc = Op.getOpcode();
6650 EVT VT = Op.getValueType();
6651 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
6652 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
6653 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6654 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
6655
6656 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
6657
6658 SDLoc SL(Op);
6659 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
6660 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
6661
6662 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6663}
6664
6665// Enable lowering of ROTR for vxi32 types. This is a workaround for a
6666// regression whereby extra unnecessary instructions were added to codegen
6667// for rotr operations, caused by legalizing v2i32 or. This resulted in extra
6668// instructions to extract the result from the vector.
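// For example, (rotr v2i32:x, v2i32:y) is unrolled here into two scalar i32
// rotr nodes whose results are recombined with build_vector, rather than
// going through the vector legalization path that introduced the extra
// extract instructions.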
6670 [[maybe_unused]] EVT VT = Op.getValueType();
6671
6672 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6673 VT == MVT::v16i32) &&
6674 "Unexpected ValueType.");
6675
6676 return DAG.UnrollVectorOp(Op.getNode());
6677}
6678
6679// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6680// wider vector type is legal.
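// For example, (fadd v8f16:a, v8f16:b) is lowered here to
// (concat_vectors (fadd a.lo, b.lo), (fadd a.hi, b.hi)) with v4f16 halves.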
6682 SelectionDAG &DAG) const {
6683 unsigned Opc = Op.getOpcode();
6684 EVT VT = Op.getValueType();
6685 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6686 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6687 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6688 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6689 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6690 VT == MVT::v32bf16);
6691
6692 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
6693 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6694
6695 SDLoc SL(Op);
6696
6697 SDValue OpLo =
6698 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6699 SDValue OpHi =
6700 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6701
6702 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6703}
6704
6706 SelectionDAG &DAG) const {
6707 unsigned Opc = Op.getOpcode();
6708 EVT VT = Op.getValueType();
6709 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6710 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6711 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6712 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6713 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6714 VT == MVT::v32bf16);
6715
6716 SDValue Op0 = Op.getOperand(0);
6717 auto [Lo0, Hi0] = Op0.getValueType().isVector()
6718 ? DAG.SplitVectorOperand(Op.getNode(), 0)
6719 : std::pair(Op0, Op0);
6720
6721 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6722 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
6723
6724 SDLoc SL(Op);
6725 auto ResVT = DAG.GetSplitDestVTs(VT);
6726
6727 SDValue OpLo =
6728 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
6729 SDValue OpHi =
6730 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
6731
6732 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6733}
6734
6736 switch (Op.getOpcode()) {
6737 default:
6738 return AMDGPUTargetLowering::LowerOperation(Op, DAG);
6739 case ISD::BRCOND:
6740 return LowerBRCOND(Op, DAG);
6741 case ISD::RETURNADDR:
6742 return LowerRETURNADDR(Op, DAG);
6743 case ISD::LOAD: {
6744 SDValue Result = LowerLOAD(Op, DAG);
6745 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6746 "Load should return a value and a chain");
6747 return Result;
6748 }
6749 case ISD::FSQRT: {
6750 EVT VT = Op.getValueType();
6751 if (VT == MVT::f32)
6752 return lowerFSQRTF32(Op, DAG);
6753 if (VT == MVT::f64)
6754 return lowerFSQRTF64(Op, DAG);
6755 return SDValue();
6756 }
6757 case ISD::FSIN:
6758 case ISD::FCOS:
6759 return LowerTrig(Op, DAG);
6760 case ISD::SELECT:
6761 return LowerSELECT(Op, DAG);
6762 case ISD::FDIV:
6763 return LowerFDIV(Op, DAG);
6764 case ISD::FFREXP:
6765 return LowerFFREXP(Op, DAG);
6766 case ISD::ATOMIC_CMP_SWAP:
6767 return LowerATOMIC_CMP_SWAP(Op, DAG);
6768 case ISD::STORE:
6769 return LowerSTORE(Op, DAG);
6770 case ISD::GlobalAddress: {
6773 return LowerGlobalAddress(MFI, Op, DAG);
6774 }
6776 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6778 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6780 return LowerINTRINSIC_VOID(Op, DAG);
6781 case ISD::ADDRSPACECAST:
6782 return lowerADDRSPACECAST(Op, DAG);
6784 return lowerINSERT_SUBVECTOR(Op, DAG);
6786 return lowerINSERT_VECTOR_ELT(Op, DAG);
6788 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
6790 return lowerVECTOR_SHUFFLE(Op, DAG);
6792 return lowerSCALAR_TO_VECTOR(Op, DAG);
6793 case ISD::BUILD_VECTOR:
6794 return lowerBUILD_VECTOR(Op, DAG);
6795 case ISD::FP_ROUND:
6797 return lowerFP_ROUND(Op, DAG);
6798 case ISD::TRAP:
6799 return lowerTRAP(Op, DAG);
6800 case ISD::DEBUGTRAP:
6801 return lowerDEBUGTRAP(Op, DAG);
6802 case ISD::ABS:
6803 case ISD::FABS:
6804 case ISD::FNEG:
6805 case ISD::FCANONICALIZE:
6806 case ISD::BSWAP:
6807 return splitUnaryVectorOp(Op, DAG);
6808 case ISD::FMINNUM:
6809 case ISD::FMAXNUM:
6810 return lowerFMINNUM_FMAXNUM(Op, DAG);
6811 case ISD::FMINIMUMNUM:
6812 case ISD::FMAXIMUMNUM:
6813 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
6814 case ISD::FMINIMUM:
6815 case ISD::FMAXIMUM:
6816 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
6817 case ISD::FLDEXP:
6818 case ISD::STRICT_FLDEXP:
6819 return lowerFLDEXP(Op, DAG);
6820 case ISD::FMA:
6821 return splitTernaryVectorOp(Op, DAG);
6822 case ISD::FP_TO_SINT:
6823 case ISD::FP_TO_UINT:
6824 return LowerFP_TO_INT(Op, DAG);
6825 case ISD::SHL:
6826 case ISD::SRA:
6827 case ISD::SRL:
6828 case ISD::ADD:
6829 case ISD::SUB:
6830 case ISD::SMIN:
6831 case ISD::SMAX:
6832 case ISD::UMIN:
6833 case ISD::UMAX:
6834 case ISD::FADD:
6835 case ISD::FMUL:
6836 case ISD::FMINNUM_IEEE:
6837 case ISD::FMAXNUM_IEEE:
6838 case ISD::UADDSAT:
6839 case ISD::USUBSAT:
6840 case ISD::SADDSAT:
6841 case ISD::SSUBSAT:
6842 return splitBinaryVectorOp(Op, DAG);
6843 case ISD::FCOPYSIGN:
6844 return lowerFCOPYSIGN(Op, DAG);
6845 case ISD::MUL:
6846 return lowerMUL(Op, DAG);
6847 case ISD::SMULO:
6848 case ISD::UMULO:
6849 return lowerXMULO(Op, DAG);
6850 case ISD::SMUL_LOHI:
6851 case ISD::UMUL_LOHI:
6852 return lowerXMUL_LOHI(Op, DAG);
6853 case ISD::DYNAMIC_STACKALLOC:
6854 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6855 case ISD::STACKSAVE:
6856 return LowerSTACKSAVE(Op, DAG);
6857 case ISD::GET_ROUNDING:
6858 return lowerGET_ROUNDING(Op, DAG);
6859 case ISD::SET_ROUNDING:
6860 return lowerSET_ROUNDING(Op, DAG);
6861 case ISD::PREFETCH:
6862 return lowerPREFETCH(Op, DAG);
6863 case ISD::FP_EXTEND:
6865 return lowerFP_EXTEND(Op, DAG);
6866 case ISD::GET_FPENV:
6867 return lowerGET_FPENV(Op, DAG);
6868 case ISD::SET_FPENV:
6869 return lowerSET_FPENV(Op, DAG);
6870 case ISD::ROTR:
6871 return lowerROTR(Op, DAG);
6872 }
6873 return SDValue();
6874}
6875
6876// Used for D16: Casts the result of an instruction into the right vector,
6877// packs values if loads return unpacked values.
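// For example, on subtargets with unpacked D16 memory instructions a v4f16
// load result arrives as v4i32; each element is truncated to i16, padded to a
// legal element count if needed, rebuilt as v4i16 and bitcast back to v4f16.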
6879 const SDLoc &DL, SelectionDAG &DAG,
6880 bool Unpacked) {
6881 if (!LoadVT.isVector())
6882 return Result;
6883
6884 // Cast back to the original packed type or to a larger type that is a
6885 // multiple of 32 bits for D16. Widening the return type is required for
6886 // legalization.
6887 EVT FittingLoadVT = LoadVT;
6888 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6889 FittingLoadVT =
6891 LoadVT.getVectorNumElements() + 1);
6892 }
6893
6894 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6895 // Truncate to v2i16/v4i16.
6896 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6897
6898 // Workaround legalizer not scalarizing truncate after vector op
6899 // legalization but not creating intermediate vector trunc.
6901 DAG.ExtractVectorElements(Result, Elts);
6902 for (SDValue &Elt : Elts)
6903 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6904
6905 // Pad illegal v1i16/v3f16 to v4i16
6906 if ((LoadVT.getVectorNumElements() % 2) == 1)
6907 Elts.push_back(DAG.getPOISON(MVT::i16));
6908
6909 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6910
6911 // Bitcast to original type (v2f16/v4f16).
6912 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6913 }
6914
6915 // Cast back to the original packed type.
6916 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6917}
6918
6919SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6920 SelectionDAG &DAG,
6922 bool IsIntrinsic) const {
6923 SDLoc DL(M);
6924
6925 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6926 EVT LoadVT = M->getValueType(0);
6927
6928 EVT EquivLoadVT = LoadVT;
6929 if (LoadVT.isVector()) {
6930 if (Unpacked) {
6931 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6932 LoadVT.getVectorNumElements());
6933 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
6934 // Widen v3f16 to legal type
6935 EquivLoadVT =
6937 LoadVT.getVectorNumElements() + 1);
6938 }
6939 }
6940
6941 // Change from v4f16/v2f16 to EquivLoadVT.
6942 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
6943
6945 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
6946 M->getMemoryVT(), M->getMemOperand());
6947
6948 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
6949
6950 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
6951}
6952
6953SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
6954 SelectionDAG &DAG,
6955 ArrayRef<SDValue> Ops) const {
6956 SDLoc DL(M);
6957 EVT LoadVT = M->getValueType(0);
6958 EVT EltType = LoadVT.getScalarType();
6959 EVT IntVT = LoadVT.changeTypeToInteger();
6960
6961 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
6962
6963 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6964 bool IsTFE = M->getNumValues() == 3;
6965
6966 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
6968 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
6969 : AMDGPUISD::BUFFER_LOAD;
6970
6971 if (IsD16) {
6972 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
6973 }
6974
6975 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6976 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
6977 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6978 IsTFE);
6979
6980 if (isTypeLegal(LoadVT)) {
6981 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6982 M->getMemOperand(), DAG);
6983 }
6984
6985 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
6986 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6987 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
6988 M->getMemOperand(), DAG);
6989 return DAG.getMergeValues(
6990 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6991 DL);
6992}
6993
6995 SelectionDAG &DAG) {
6996 EVT VT = N->getValueType(0);
6997 unsigned CondCode = N->getConstantOperandVal(3);
6998 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6999 return DAG.getPOISON(VT);
7000
7001 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
7002
7003 SDValue LHS = N->getOperand(1);
7004 SDValue RHS = N->getOperand(2);
7005
7006 SDLoc DL(N);
7007
7008 EVT CmpVT = LHS.getValueType();
7009 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
7010 unsigned PromoteOp =
7012 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
7013 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
7014 }
7015
7016 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
7017
7018 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7019 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7020
7021 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
7022 DAG.getCondCode(CCOpcode));
7023 if (VT.bitsEq(CCVT))
7024 return SetCC;
7025 return DAG.getZExtOrTrunc(SetCC, DL, VT);
7026}
7027
7029 SelectionDAG &DAG) {
7030 EVT VT = N->getValueType(0);
7031
7032 unsigned CondCode = N->getConstantOperandVal(3);
7033 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
7034 return DAG.getPOISON(VT);
7035
7036 SDValue Src0 = N->getOperand(1);
7037 SDValue Src1 = N->getOperand(2);
7038 EVT CmpVT = Src0.getValueType();
7039 SDLoc SL(N);
7040
7041 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
7042 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7043 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7044 }
7045
7046 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
7047 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
7048 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7049 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7050 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7051 DAG.getCondCode(CCOpcode));
7052 if (VT.bitsEq(CCVT))
7053 return SetCC;
7054 return DAG.getZExtOrTrunc(SetCC, SL, VT);
7055}
7056
7058 SelectionDAG &DAG) {
7059 EVT VT = N->getValueType(0);
7060 SDValue Src = N->getOperand(1);
7061 SDLoc SL(N);
7062
7063 if (Src.getOpcode() == ISD::SETCC) {
7064 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
7065 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
7066 Src.getOperand(1), Src.getOperand(2));
7067 }
7068 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
7069 // (ballot 0) -> 0
7070 if (Arg->isZero())
7071 return DAG.getConstant(0, SL, VT);
7072
7073 // (ballot 1) -> EXEC/EXEC_LO
7074 if (Arg->isOne()) {
7075 Register Exec;
7076 if (VT.getScalarSizeInBits() == 32)
7077 Exec = AMDGPU::EXEC_LO;
7078 else if (VT.getScalarSizeInBits() == 64)
7079 Exec = AMDGPU::EXEC;
7080 else
7081 return SDValue();
7082
7083 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
7084 }
7085 }
7086
7087 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
7088 // ISD::SETNE)
7089 return DAG.getNode(
7090 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
7091 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
7092}
7093
7095 SelectionDAG &DAG) {
7096 EVT VT = N->getValueType(0);
7097 unsigned ValSize = VT.getSizeInBits();
7098 unsigned IID = N->getConstantOperandVal(0);
7099 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7100 IID == Intrinsic::amdgcn_permlanex16;
7101 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7102 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7103 SDLoc SL(N);
7104 MVT IntVT = MVT::getIntegerVT(ValSize);
7105 const GCNSubtarget *ST = TLI.getSubtarget();
7106 unsigned SplitSize = 32;
7107 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7108 ST->hasDPALU_DPP() &&
7109 AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
7110 SplitSize = 64;
7111
7112 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
7113 SDValue Src2, MVT ValT) -> SDValue {
7115 switch (IID) {
7116 case Intrinsic::amdgcn_permlane16:
7117 case Intrinsic::amdgcn_permlanex16:
7118 case Intrinsic::amdgcn_update_dpp:
7119 Operands.push_back(N->getOperand(6));
7120 Operands.push_back(N->getOperand(5));
7121 Operands.push_back(N->getOperand(4));
7122 [[fallthrough]];
7123 case Intrinsic::amdgcn_writelane:
7124 Operands.push_back(Src2);
7125 [[fallthrough]];
7126 case Intrinsic::amdgcn_readlane:
7127 case Intrinsic::amdgcn_set_inactive:
7128 case Intrinsic::amdgcn_set_inactive_chain_arg:
7129 case Intrinsic::amdgcn_mov_dpp8:
7130 Operands.push_back(Src1);
7131 [[fallthrough]];
7132 case Intrinsic::amdgcn_readfirstlane:
7133 case Intrinsic::amdgcn_permlane64:
7134 Operands.push_back(Src0);
7135 break;
7136 default:
7137 llvm_unreachable("unhandled lane op");
7138 }
7139
7140 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
7141 std::reverse(Operands.begin(), Operands.end());
7142
7143 if (SDNode *GL = N->getGluedNode()) {
7144 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7145 GL = GL->getOperand(0).getNode();
7146 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7147 SDValue(GL, 0)));
7148 }
7149
7150 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
7151 };
7152
7153 SDValue Src0 = N->getOperand(1);
7154 SDValue Src1, Src2;
7155 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7156 IID == Intrinsic::amdgcn_mov_dpp8 ||
7157 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7158 Src1 = N->getOperand(2);
7159 if (IID == Intrinsic::amdgcn_writelane ||
7160 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7161 Src2 = N->getOperand(3);
7162 }
7163
7164 if (ValSize == SplitSize) {
7165 // Already legal
7166 return SDValue();
7167 }
7168
7169 if (ValSize < 32) {
7170 bool IsFloat = VT.isFloatingPoint();
7171 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
7172 SL, MVT::i32);
7173
7174 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7175 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
7176 SL, MVT::i32);
7177 }
7178
7179 if (IID == Intrinsic::amdgcn_writelane) {
7180 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
7181 SL, MVT::i32);
7182 }
7183
7184 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7185 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
7186 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
7187 }
7188
7189 if (ValSize % SplitSize != 0)
7190 return SDValue();
7191
7192 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
7193 EVT VT = N->getValueType(0);
7194 unsigned NE = VT.getVectorNumElements();
7195 EVT EltVT = VT.getVectorElementType();
7197 unsigned NumOperands = N->getNumOperands();
7198 SmallVector<SDValue, 4> Operands(NumOperands);
7199 SDNode *GL = N->getGluedNode();
7200
7201 // only handle convergencectrl_glue
7202 assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7203
7204 for (unsigned i = 0; i != NE; ++i) {
7205 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7206 ++j) {
7207 SDValue Operand = N->getOperand(j);
7208 EVT OperandVT = Operand.getValueType();
7209 if (OperandVT.isVector()) {
7210 // A vector operand; extract a single element.
7211 EVT OperandEltVT = OperandVT.getVectorElementType();
7212 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
7213 Operand, DAG.getVectorIdxConstant(i, SL));
7214 } else {
7215 // A scalar operand; just use it as is.
7216 Operands[j] = Operand;
7217 }
7218 }
7219
7220 if (GL)
7221 Operands[NumOperands - 1] =
7222 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7223 SDValue(GL->getOperand(0).getNode(), 0));
7224
7225 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
7226 }
7227
7228 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
7229 return DAG.getBuildVector(VecVT, SL, Scalars);
7230 };
7231
7232 if (VT.isVector()) {
7233 switch (MVT::SimpleValueType EltTy =
7235 case MVT::i32:
7236 case MVT::f32:
7237 if (SplitSize == 32) {
7238 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
7239 return unrollLaneOp(LaneOp.getNode());
7240 }
7241 [[fallthrough]];
7242 case MVT::i16:
7243 case MVT::f16:
7244 case MVT::bf16: {
7245 unsigned SubVecNumElt =
7246 SplitSize / VT.getVectorElementType().getSizeInBits();
7247 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
7249 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7250 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7251 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
7252 DAG.getConstant(EltIdx, SL, MVT::i32));
7253
7254 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7255 IsPermLane16)
7256 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
7257 DAG.getConstant(EltIdx, SL, MVT::i32));
7258
7259 if (IID == Intrinsic::amdgcn_writelane)
7260 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
7261 DAG.getConstant(EltIdx, SL, MVT::i32));
7262
7263 Pieces.push_back(
7264 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7265 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7266 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7267 EltIdx += SubVecNumElt;
7268 }
7269 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
7270 }
7271 default:
7272 // Handle all other cases by bitcasting to i32 vectors
7273 break;
7274 }
7275 }
7276
7277 MVT VecVT =
7278 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
7279 Src0 = DAG.getBitcast(VecVT, Src0);
7280
7281 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7282 Src1 = DAG.getBitcast(VecVT, Src1);
7283
7284 if (IID == Intrinsic::amdgcn_writelane)
7285 Src2 = DAG.getBitcast(VecVT, Src2);
7286
7287 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7288 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
7289 return DAG.getBitcast(VT, UnrolledLaneOp);
7290}
7291
7294 SelectionDAG &DAG) const {
7295 switch (N->getOpcode()) {
7297 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
7298 Results.push_back(Res);
7299 return;
7300 }
7302 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
7303 Results.push_back(Res);
7304 return;
7305 }
7307 unsigned IID = N->getConstantOperandVal(0);
7308 switch (IID) {
7309 case Intrinsic::amdgcn_make_buffer_rsrc:
7310 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
7311 return;
7312 case Intrinsic::amdgcn_cvt_pkrtz: {
7313 SDValue Src0 = N->getOperand(1);
7314 SDValue Src1 = N->getOperand(2);
7315 SDLoc SL(N);
7316 SDValue Cvt =
7317 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
7318 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
7319 return;
7320 }
7321 case Intrinsic::amdgcn_cvt_pknorm_i16:
7322 case Intrinsic::amdgcn_cvt_pknorm_u16:
7323 case Intrinsic::amdgcn_cvt_pk_i16:
7324 case Intrinsic::amdgcn_cvt_pk_u16: {
7325 SDValue Src0 = N->getOperand(1);
7326 SDValue Src1 = N->getOperand(2);
7327 SDLoc SL(N);
7328 unsigned Opcode;
7329
7330 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7332 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7334 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7336 else
7338
7339 EVT VT = N->getValueType(0);
7340 if (isTypeLegal(VT))
7341 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
7342 else {
7343 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
7344 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
7345 }
7346 return;
7347 }
7348 case Intrinsic::amdgcn_s_buffer_load: {
7349 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
7350 // s_buffer_load_u8 for signed and unsigned load instructions. Next, the DAG
7351 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
7352 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
7353 // s_buffer_load_i8.
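// For example, (sext (llvm.amdgcn.s.buffer.load.i8 rsrc, offset)) is roughly
// emitted here as an s_buffer_load_u8 whose result is truncated to i8; the
// later sign-extension combine then turns it into s_buffer_load_i8.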
7354 if (!Subtarget->hasScalarSubwordLoads())
7355 return;
7356 SDValue Op = SDValue(N, 0);
7357 SDValue Rsrc = Op.getOperand(1);
7358 SDValue Offset = Op.getOperand(2);
7359 SDValue CachePolicy = Op.getOperand(3);
7360 EVT VT = Op.getValueType();
7361 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
7362 SDLoc DL(Op);
7364 const DataLayout &DataLayout = DAG.getDataLayout();
7365 Align Alignment =
7371 VT.getStoreSize(), Alignment);
7372 SDValue LoadVal;
7373 if (!Offset->isDivergent()) {
7374 SDValue Ops[] = {Rsrc, // source register
7375 Offset, CachePolicy};
7376 SDValue BufferLoad =
7378 DAG.getVTList(MVT::i32), Ops, VT, MMO);
7379 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
7380 } else {
7381 SDValue Ops[] = {
7382 DAG.getEntryNode(), // Chain
7383 Rsrc, // rsrc
7384 DAG.getConstant(0, DL, MVT::i32), // vindex
7385 {}, // voffset
7386 {}, // soffset
7387 {}, // offset
7388 CachePolicy, // cachepolicy
7389 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7390 };
7391 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
7392 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
7393 }
7394 Results.push_back(LoadVal);
7395 return;
7396 }
7397 case Intrinsic::amdgcn_dead: {
7398 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
7399 Results.push_back(DAG.getPOISON(N->getValueType(I)));
7400 return;
7401 }
7402 }
7403 break;
7404 }
7406 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
7407 if (Res.getOpcode() == ISD::MERGE_VALUES) {
7408 // FIXME: Hacky
7409 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
7410 Results.push_back(Res.getOperand(I));
7411 }
7412 } else {
7413 Results.push_back(Res);
7414 Results.push_back(Res.getValue(1));
7415 }
7416 return;
7417 }
7418
7419 break;
7420 }
7421 case ISD::SELECT: {
7422 SDLoc SL(N);
7423 EVT VT = N->getValueType(0);
7424 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
7425 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
7426 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
7427
7428 EVT SelectVT = NewVT;
7429 if (NewVT.bitsLT(MVT::i32)) {
7430 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
7431 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
7432 SelectVT = MVT::i32;
7433 }
7434
7435 SDValue NewSelect =
7436 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
7437
7438 if (NewVT != SelectVT)
7439 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
7440 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
7441 return;
7442 }
7443 case ISD::FNEG: {
7444 if (N->getValueType(0) != MVT::v2f16)
7445 break;
7446
7447 SDLoc SL(N);
7448 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7449
7450 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
7451 DAG.getConstant(0x80008000, SL, MVT::i32));
7452 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7453 return;
7454 }
7455 case ISD::FABS: {
7456 if (N->getValueType(0) != MVT::v2f16)
7457 break;
7458
7459 SDLoc SL(N);
7460 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7461
7462 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
7463 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
7464 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7465 return;
7466 }
7467 case ISD::FSQRT: {
7468 if (N->getValueType(0) != MVT::f16)
7469 break;
7470 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
7471 break;
7472 }
7473 default:
7474 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
7475 break;
7476 }
7477}
7478
7479/// Helper function for LowerBRCOND
7480static SDNode *findUser(SDValue Value, unsigned Opcode) {
7481
7482 for (SDUse &U : Value->uses()) {
7483 if (U.get() != Value)
7484 continue;
7485
7486 if (U.getUser()->getOpcode() == Opcode)
7487 return U.getUser();
7488 }
7489 return nullptr;
7490}
7491
7492unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7493 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
7494 switch (Intr->getConstantOperandVal(1)) {
7495 case Intrinsic::amdgcn_if:
7496 return AMDGPUISD::IF;
7497 case Intrinsic::amdgcn_else:
7498 return AMDGPUISD::ELSE;
7499 case Intrinsic::amdgcn_loop:
7500 return AMDGPUISD::LOOP;
7501 case Intrinsic::amdgcn_end_cf:
7502 llvm_unreachable("should not occur");
7503 default:
7504 return 0;
7505 }
7506 }
7507
7508 // break, if_break, else_break are all only used as inputs to loop, not
7509 // directly as branch conditions.
7510 return 0;
7511}
7512
7519
7521 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7522 return false;
7523
7524 // FIXME: Either avoid relying on address space here or change the default
7525 // address space for functions to avoid the explicit check.
7526 return (GV->getValueType()->isFunctionTy() ||
7529}
7530
7532 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
7533}
7534
7536 if (!GV->hasExternalLinkage())
7537 return true;
7538
7539 const auto OS = getTargetMachine().getTargetTriple().getOS();
7540 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
7541}
7542
7543/// This transforms the control flow intrinsics to get the branch destination as
7544/// the last parameter, and also switches the branch target with BR if the need arises.
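/// For example, a BRCOND whose condition comes from llvm.amdgcn.if is rewritten
/// into an AMDGPUISD::IF node that carries the branch destination as its last
/// operand, and the chain produced by that node replaces the original one.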
7545SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
7546 SDLoc DL(BRCOND);
7547
7548 SDNode *Intr = BRCOND.getOperand(1).getNode();
7549 SDValue Target = BRCOND.getOperand(2);
7550 SDNode *BR = nullptr;
7551 SDNode *SetCC = nullptr;
7552
7553 if (Intr->getOpcode() == ISD::SETCC) {
7554 // As long as we negate the condition everything is fine
7555 SetCC = Intr;
7556 Intr = SetCC->getOperand(0).getNode();
7557
7558 } else {
7559 // Get the target from BR if we don't negate the condition
7560 BR = findUser(BRCOND, ISD::BR);
7561 assert(BR && "brcond missing unconditional branch user");
7562 Target = BR->getOperand(1);
7563 }
7564
7565 unsigned CFNode = isCFIntrinsic(Intr);
7566 if (CFNode == 0) {
7567 // This is a uniform branch so we don't need to legalize.
7568 return BRCOND;
7569 }
7570
7571 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
7573
7574 assert(!SetCC ||
7575 (SetCC->getConstantOperandVal(1) == 1 &&
7576 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
7577 ISD::SETNE));
7578
7579 // operands of the new intrinsic call
7581 if (HaveChain)
7582 Ops.push_back(BRCOND.getOperand(0));
7583
7584 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
7585 Ops.push_back(Target);
7586
7587 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
7588
7589 // build the new intrinsic call
7590 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
7591
7592 if (!HaveChain) {
7593 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
7594
7596 }
7597
7598 if (BR) {
7599 // Give the branch instruction our target
7600 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
7601 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
7602 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
7603 }
7604
7605 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
7606
7607 // Copy the intrinsic results to registers
7608 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
7609 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
7610 if (!CopyToReg)
7611 continue;
7612
7613 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
7614 SDValue(Result, i - 1), SDValue());
7615
7616 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
7617 }
7618
7619 // Remove the old intrinsic from the chain
7620 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
7621 Intr->getOperand(0));
7622
7623 return Chain;
7624}
7625
7626SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
7627 MVT VT = Op.getSimpleValueType();
7628 SDLoc DL(Op);
7629 // Checking the depth
7630 if (Op.getConstantOperandVal(0) != 0)
7631 return DAG.getConstant(0, DL, VT);
7632
7633 MachineFunction &MF = DAG.getMachineFunction();
7634 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7635 // Check for kernel and shader functions
7636 if (Info->isEntryFunction())
7637 return DAG.getConstant(0, DL, VT);
7638
7639 MachineFrameInfo &MFI = MF.getFrameInfo();
7640 // There is a call to @llvm.returnaddress in this function
7641 MFI.setReturnAddressIsTaken(true);
7642
7643 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
7644 // Get the return address reg and mark it as an implicit live-in
7645 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
7646 getRegClassFor(VT, Op.getNode()->isDivergent()));
7647
7648 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
7649}
7650
7651SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
7652 const SDLoc &DL, EVT VT) const {
7653 return Op.getValueType().bitsLE(VT)
7654 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
7655 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
7656 DAG.getTargetConstant(0, DL, MVT::i32));
7657}
7658
7659SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
7660 SelectionDAG &DAG) const {
7661 EVT DstVT = Op.getValueType();
7662 unsigned NumElts = DstVT.getVectorNumElements();
7663 assert(NumElts > 2 && isPowerOf2_32(NumElts));
7664
7665 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
7666
7667 SDLoc DL(Op);
7668 unsigned Opc = Op.getOpcode();
7669 SDValue Flags = Op.getOperand(1);
7670 EVT HalfDstVT =
7671 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
7672 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
7673 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
7674
7675 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
7676}
7677
7678SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
7679 SDValue Src = Op.getOperand(0);
7680 EVT SrcVT = Src.getValueType();
7681 EVT DstVT = Op.getValueType();
7682
7683 if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
7684 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
7685 if (SrcVT.getScalarType() != MVT::f32)
7686 return SDValue();
7687 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
7688 }
7689
7690 if (SrcVT.getScalarType() != MVT::f64)
7691 return Op;
7692
7693 SDLoc DL(Op);
7694 if (DstVT == MVT::f16) {
7695 // TODO: Handle strictfp
7696 if (Op.getOpcode() != ISD::FP_ROUND)
7697 return Op;
7698
7699 if (!Subtarget->has16BitInsts()) {
7700 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
7701 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7702 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7703 }
7704 if (Op->getFlags().hasApproximateFuncs()) {
7705 SDValue Flags = Op.getOperand(1);
7706 SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
7707 return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
7708 }
7709 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
7710 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7711 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7712 }
7713
7714 assert(DstVT.getScalarType() == MVT::bf16 &&
7715 "custom lower FP_ROUND for f16 or bf16");
7716 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
7717
7718 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
7719 // hardware f32 -> bf16 instruction.
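// Rounding the intermediate f64 -> f32 step to odd prevents the value from
// being rounded in the same direction twice, so the final f32 -> bf16 rounding
// still yields the correctly rounded bf16 result.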
7720 EVT F32VT = SrcVT.isVector() ? SrcVT.changeVectorElementType(MVT::f32) :
7721 MVT::f32;
7722 SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
7723 return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
7724 DAG.getTargetConstant(0, DL, MVT::i32));
7725}
7726
7727SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
7728 SelectionDAG &DAG) const {
7729 EVT VT = Op.getValueType();
7730 const MachineFunction &MF = DAG.getMachineFunction();
7731 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7732 bool IsIEEEMode = Info->getMode().IEEE;
7733
7734 // FIXME: Assert during selection that this is only selected for
7735 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
7736 // mode functions, but this happens to be OK since it's only done in cases
7737 // where it is known that there are no sNaNs.
7738 if (IsIEEEMode)
7739 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
7740
7741 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7742 VT == MVT::v16bf16)
7743 return splitBinaryVectorOp(Op, DAG);
7744 return Op;
7745}
7746
7747SDValue
7748SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
7749 SelectionDAG &DAG) const {
7750 EVT VT = Op.getValueType();
7751 const MachineFunction &MF = DAG.getMachineFunction();
7752 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7753 bool IsIEEEMode = Info->getMode().IEEE;
7754
7755 if (IsIEEEMode)
7756 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
7757
7758 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7759 VT == MVT::v16bf16)
7760 return splitBinaryVectorOp(Op, DAG);
7761 return Op;
7762}
7763
7764SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
7765 SelectionDAG &DAG) const {
7766 EVT VT = Op.getValueType();
7767 if (VT.isVector())
7768 return splitBinaryVectorOp(Op, DAG);
7769
7770 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7771 !Subtarget->hasMinimum3Maximum3F16() &&
7772 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7773 "should not need to widen f16 minimum/maximum to v2f16");
7774
7775 // Widen f16 operation to v2f16
7776
7777 // fminimum f16:x, f16:y ->
7778 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
7779 // (v2f16 (scalar_to_vector y))), 0
7780 SDLoc SL(Op);
7781 SDValue WideSrc0 =
7782 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
7783 SDValue WideSrc1 =
7784 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
7785
7786 SDValue Widened =
7787 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7788
7789 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
7790 DAG.getConstant(0, SL, MVT::i32));
7791}
7792
7793SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
7794 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
7795 EVT VT = Op.getValueType();
7796 assert(VT == MVT::f16);
7797
7798 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
7799 EVT ExpVT = Exp.getValueType();
7800 if (ExpVT == MVT::i16)
7801 return Op;
7802
7803 SDLoc DL(Op);
7804
7805 // Correct the exponent type for f16 to i16.
7806 // Clamp the range of the exponent to the instruction's range.
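// For example, (fldexp f16:x, i32:exp) becomes
// (fldexp x, (trunc (smin (smax exp, -32768), 32767))) so that the exponent
// fits in the instruction's 16-bit operand.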
7807
7808 // TODO: This should be a generic narrowing legalization, and can easily be
7809 // done for GlobalISel.
7810
7811 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
7812 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
7813
7814 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
7815 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
7816
7817 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
7818
7819 if (IsStrict) {
7820 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
7821 {Op.getOperand(0), Op.getOperand(1), TruncExp});
7822 }
7823
7824 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
7825}
7826
7828 switch (Op->getOpcode()) {
7829 case ISD::SRA:
7830 case ISD::SMIN:
7831 case ISD::SMAX:
7832 return ISD::SIGN_EXTEND;
7833 case ISD::SRL:
7834 case ISD::UMIN:
7835 case ISD::UMAX:
7836 return ISD::ZERO_EXTEND;
7837 case ISD::ADD:
7838 case ISD::SUB:
7839 case ISD::AND:
7840 case ISD::OR:
7841 case ISD::XOR:
7842 case ISD::SHL:
7843 case ISD::SELECT:
7844 case ISD::MUL:
7845 // operation result won't be influenced by garbage high bits.
7846 // TODO: are all of those cases correct, and are there more?
7847 return ISD::ANY_EXTEND;
7848 case ISD::SETCC: {
7849 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7851 }
7852 default:
7853 llvm_unreachable("unexpected opcode!");
7854 }
7855}
7856
7857SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
7858 DAGCombinerInfo &DCI) const {
7859 const unsigned Opc = Op.getOpcode();
7860 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
7861 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
7862 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
7863 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
7864 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
7865
7866 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
7867 : Op->getOperand(0).getValueType();
7868 auto ExtTy = OpTy.changeElementType(MVT::i32);
7869
7870 if (DCI.isBeforeLegalizeOps() ||
7871 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
7872 return SDValue();
7873
7874 auto &DAG = DCI.DAG;
7875
7876 SDLoc DL(Op);
7877 SDValue LHS;
7878 SDValue RHS;
7879 if (Opc == ISD::SELECT) {
7880 LHS = Op->getOperand(1);
7881 RHS = Op->getOperand(2);
7882 } else {
7883 LHS = Op->getOperand(0);
7884 RHS = Op->getOperand(1);
7885 }
7886
7887 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
7888 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
7889
7890 // Special case: for shifts, the RHS always needs a zext.
7891 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
7892 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
7893 else
7894 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
7895
7896 // setcc always returns i1/i1 vec so there is no need to truncate after.
7897 if (Opc == ISD::SETCC) {
7898 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7899 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
7900 }
7901
7902 // For other ops, we extend the operation's return type as well so we need to
7903 // truncate back to the original type.
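// For example, a uniform (add i16 a, b) is promoted to
// (trunc i16 (add i32 (any_extend a), (any_extend b))) so that it can be
// selected to a 32-bit scalar instruction.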
7904 SDValue NewVal;
7905 if (Opc == ISD::SELECT)
7906 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
7907 else
7908 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
7909
7910 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
7911}
7912
7913SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
7914 SDValue Mag = Op.getOperand(0);
7915 EVT MagVT = Mag.getValueType();
7916
7917 if (MagVT.getVectorNumElements() > 2)
7918 return splitBinaryVectorOp(Op, DAG);
7919
7920 SDValue Sign = Op.getOperand(1);
7921 EVT SignVT = Sign.getValueType();
7922
7923 if (MagVT == SignVT)
7924 return Op;
7925
7926 // fcopysign v2f16:mag, v2f32:sign ->
7927 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
7928
7929 SDLoc SL(Op);
7930 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
7931 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
7932
7933 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
7934
7935 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
7936}
7937
7938// Custom lowering for vector multiplications and s_mul_u64.
7939SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
7940 EVT VT = Op.getValueType();
7941
7942 // Split vector operands.
7943 if (VT.isVector())
7944 return splitBinaryVectorOp(Op, DAG);
7945
7946 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
7947
7948 // There are four ways to lower s_mul_u64:
7949 //
7950 // 1. If all the operands are uniform, then we lower it as it is.
7951 //
7952 // 2. If the operands are divergent, then we have to split s_mul_u64 into
7953 // 32-bit multiplications because there is no vector equivalent of s_mul_u64.
7954 //
7955 // 3. If the cost model decides that it is more efficient to use vector
7956 // registers, then we have to split s_mul_u64 into 32-bit multiplications.
7957 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp.
7958 //
7959 // 4. If the cost model decides to use vector registers and both of the
7960 // operands are zero-extended/sign-extended from 32 bits, then we split the
7961 // s_mul_u64 into two 32-bit multiplications. The problem is that it is not
7962 // possible to check if the operands are zero-extended or sign-extended in
7963 // SIInstrInfo.cpp. For this reason, here we replace s_mul_u64 with
7964 // s_mul_u64_u32_pseudo if both operands are zero-extended, and we replace
7965 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
7966 // If the cost model decides that we have to use vector registers, then
7967 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32/
7968 // s_mul_i64_i32_pseudo into two vector multiplications. If the cost model
7969 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
7970 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
7971 // SIInstrInfo.cpp.
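//
// For example, a uniform (mul i64 (zext i32 a), (zext i32 b)) with both
// operands known zero-extended falls under case 4 and is selected below as
// S_MUL_U64_U32_PSEUDO, leaving the final scalar-vs-vector split to
// SIInstrInfo.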
7972
7973 if (Op->isDivergent())
7974 return SDValue();
7975
7976 SDValue Op0 = Op.getOperand(0);
7977 SDValue Op1 = Op.getOperand(1);
7978 // If all the operands are zero-extended to 32 bits, then we replace s_mul_u64
7979 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
7980 // 32 bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
7981 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
7982 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
7983 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
7984 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
7985 SDLoc SL(Op);
7986 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7987 return SDValue(
7988 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
7989 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
7990 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
7991 if (Op0SignBits >= 33 && Op1SignBits >= 33)
7992 return SDValue(
7993 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
7994 // If all the operands are uniform, then we lower s_mul_u64 as it is.
7995 return Op;
7996}
7997
7998SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
7999 EVT VT = Op.getValueType();
8000 SDLoc SL(Op);
8001 SDValue LHS = Op.getOperand(0);
8002 SDValue RHS = Op.getOperand(1);
8003 bool isSigned = Op.getOpcode() == ISD::SMULO;
8004
8005 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
8006 const APInt &C = RHSC->getAPIntValue();
8007 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
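// For instance, umulo(x, 8) becomes { x << 3, ((x << 3) >> 3) != x } using a
// logical shift right, while smulo uses an arithmetic shift instead (except
// for signed_min, which behaves like the unsigned case).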
8008 if (C.isPowerOf2()) {
8009 // smulo(x, signed_min) is the same as umulo(x, signed_min).
8010 bool UseArithShift = isSigned && !C.isMinSignedValue();
8011 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
8012 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
8013 SDValue Overflow =
8014 DAG.getSetCC(SL, MVT::i1,
8015 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
8016 Result, ShiftAmt),
8017 LHS, ISD::SETNE);
8018 return DAG.getMergeValues({Result, Overflow}, SL);
8019 }
8020 }
8021
8022 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
8023 SDValue Top =
8024 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
8025
8026 SDValue Sign = isSigned
8027 ? DAG.getNode(ISD::SRA, SL, VT, Result,
8028 DAG.getConstant(VT.getScalarSizeInBits() - 1,
8029 SL, MVT::i32))
8030 : DAG.getConstant(0, SL, VT);
8031 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
8032
8033 return DAG.getMergeValues({Result, Overflow}, SL);
8034}
8035
8036SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
8037 if (Op->isDivergent()) {
8038 // Select to V_MAD_[IU]64_[IU]32.
8039 return Op;
8040 }
8041 if (Subtarget->hasSMulHi()) {
8042 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
8043 return SDValue();
8044 }
8045 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
8046 // calculate the high part, so we might as well do the whole thing with
8047 // V_MAD_[IU]64_[IU]32.
8048 return Op;
8049}
8050
8051SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
8052 if (!Subtarget->isTrapHandlerEnabled() ||
8053 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
8054 return lowerTrapEndpgm(Op, DAG);
8055
8056 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
8057 : lowerTrapHsaQueuePtr(Op, DAG);
8058}
8059
8060SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
8061 SDLoc SL(Op);
8062 SDValue Chain = Op.getOperand(0);
8063 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8064}
8065
8066SDValue
8067SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
8068 const SDLoc &DL, Align Alignment,
8069 ImplicitParameter Param) const {
8070 MachineFunction &MF = DAG.getMachineFunction();
8071 uint64_t Offset = getImplicitParameterOffset(MF, Param);
8072 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
8073 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
8074 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
8077}
8078
8079SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
8080 SelectionDAG &DAG) const {
8081 SDLoc SL(Op);
8082 SDValue Chain = Op.getOperand(0);
8083
8084 SDValue QueuePtr;
8085 // For code object version 5, QueuePtr is passed through implicit kernarg.
8086 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8088 QueuePtr =
8089 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
8090 } else {
8091 MachineFunction &MF = DAG.getMachineFunction();
8092 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8093 Register UserSGPR = Info->getQueuePtrUserSGPR();
8094
8095 if (UserSGPR == AMDGPU::NoRegister) {
8096 // We probably are in a function incorrectly marked with
8097 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
8098 // trap, so just use a null pointer.
8099 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
8100 } else {
8101 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
8102 MVT::i64);
8103 }
8104 }
8105
8106 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
8107 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
8108
8109 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8110 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
8111 ToReg.getValue(1)};
8112 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8113}
8114
8115SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
8116 SDLoc SL(Op);
8117 SDValue Chain = Op.getOperand(0);
8118
8119 // We need to simulate the 's_trap 2' instruction on targets that run in
8120 // PRIV=1 (where it is treated as a nop).
8121 if (Subtarget->hasPrivEnabledTrap2NopBug())
8122 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
8123
8124 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8125 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8126 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8127}
8128
8129SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
8130 SDLoc SL(Op);
8131 SDValue Chain = Op.getOperand(0);
8132 MachineFunction &MF = DAG.getMachineFunction();
8133
8134 if (!Subtarget->isTrapHandlerEnabled() ||
8135 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
8136 LLVMContext &Ctx = MF.getFunction().getContext();
8137 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
8138 "debugtrap handler not supported",
8139 Op.getDebugLoc(), DS_Warning));
8140 return Chain;
8141 }
8142
8143 uint64_t TrapID =
8144 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
8145 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8146 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8147}
8148
8149SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
8150 SelectionDAG &DAG) const {
8151 if (Subtarget->hasApertureRegs()) {
8152 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
8153 ? AMDGPU::SRC_SHARED_BASE
8154 : AMDGPU::SRC_PRIVATE_BASE;
8155 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8156 !Subtarget->hasGloballyAddressableScratch()) &&
8157 "Cannot use src_private_base with globally addressable scratch!");
8158 // Note: this feature (register) is broken. When used as a 32-bit operand,
8159 // it returns a wrong value (all zeroes?). The real value is in the upper 32
8160 // bits.
8161 //
8162 // To work around the issue, directly emit a 64-bit mov from this register
8163 // and then extract the high bits. Note that this shouldn't even result in a
8164 // shift being emitted and should simply become a pair of registers (e.g.):
8165 // s_mov_b64 s[6:7], src_shared_base
8166 // v_mov_b32_e32 v1, s7
8167 //
8168 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
8169 // coalescing would kick in and it would think it's okay to use the "HI"
8170 // subregister directly (instead of extracting the HI 32 bits) which is an
8171 // artificial (unusable) register.
8172 // Register TableGen definitions would need an overhaul to get rid of the
8173 // artificial "HI" aperture registers and prevent this kind of issue from
8174 // happening.
8175 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
8176 DAG.getRegister(ApertureRegNo, MVT::i64));
8177 return DAG.getNode(
8178 ISD::TRUNCATE, DL, MVT::i32,
8179 DAG.getNode(ISD::SRL, DL, MVT::i64,
8180 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
8181 }
8182
8183 // For code object version 5, private_base and shared_base are passed through
8184 // implicit kernargs.
8185 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8189 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
8190 }
8191
8192 MachineFunction &MF = DAG.getMachineFunction();
8193 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8194 Register UserSGPR = Info->getQueuePtrUserSGPR();
8195 if (UserSGPR == AMDGPU::NoRegister) {
8196 // We probably are in a function incorrectly marked with
8197 // amdgpu-no-queue-ptr. This is undefined.
8198 return DAG.getPOISON(MVT::i32);
8199 }
8200
8201 SDValue QueuePtr =
8202 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
8203
8204 // Offset into amd_queue_t for group_segment_aperture_base_hi /
8205 // private_segment_aperture_base_hi.
8206 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
8207
8208 SDValue Ptr =
8209 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
8210
8211 // TODO: Use custom target PseudoSourceValue.
8212 // TODO: We should use the value from the IR intrinsic call, but it might not
8213 // be available and how do we get it?
8214 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
8215 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
8216 commonAlignment(Align(64), StructOffset),
8219}
8220
8221/// Return true if the value is a known valid address, such that a null check is
8222/// not necessary.
8224 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
8226 return true;
8227
8228 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
8229 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
8230
8231 // TODO: Search through arithmetic, handle arguments and loads
8232 // marked nonnull.
8233 return false;
8234}
8235
8236SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
8237 SelectionDAG &DAG) const {
8238 SDLoc SL(Op);
8239
8240 const AMDGPUTargetMachine &TM =
8241 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
8242
8243 unsigned DestAS, SrcAS;
8244 SDValue Src;
8245 bool IsNonNull = false;
8246 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
8247 SrcAS = ASC->getSrcAddressSpace();
8248 Src = ASC->getOperand(0);
8249 DestAS = ASC->getDestAddressSpace();
8250 } else {
8251 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
8252 Op.getConstantOperandVal(0) ==
8253 Intrinsic::amdgcn_addrspacecast_nonnull);
8254 Src = Op->getOperand(1);
8255 SrcAS = Op->getConstantOperandVal(2);
8256 DestAS = Op->getConstantOperandVal(3);
8257 IsNonNull = true;
8258 }
8259
8260 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
8261
8262 // flat -> local/private
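// For example, (addrspacecast flat %p to local) becomes
// (select (setne %p, flat_null), (trunc %p to i32), local_null), where the
// segment null value is taken from TM.getNullPointerValue().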
8263 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
8264 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
8265 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
8266 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8267
8268 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
8269 Subtarget->hasGloballyAddressableScratch()) {
8270 // flat -> private with globally addressable scratch: subtract
8271 // src_flat_scratch_base_lo.
8272 SDValue FlatScratchBaseLo(
8273 DAG.getMachineNode(
8274 AMDGPU::S_MOV_B32, SL, MVT::i32,
8275 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8276 0);
8277 Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
8278 }
8279
8280 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8281 return Ptr;
8282
8283 unsigned NullVal = TM.getNullPointerValue(DestAS);
8284 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8285 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
8286
8287 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
8288 SegmentNullPtr);
8289 }
8290 }
8291
8292 // local/private -> flat
8293 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
8294 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
8295 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
8296 SDValue CvtPtr;
8297 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
8298 Subtarget->hasGloballyAddressableScratch()) {
8299 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
8300 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
8301 SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
8302 SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
8303 ThreadID = DAG.getNode(
8304 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8305 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
8306 AllOnes, ThreadID);
8307 if (Subtarget->isWave64())
8308 ThreadID = DAG.getNode(
8309 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8310 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
8311 AllOnes, ThreadID);
8312 SDValue ShAmt = DAG.getShiftAmountConstant(
8313 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8314 SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
8315 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
8316 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8317 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
8318 // 64-bit hi:lo value.
8319 SDValue FlatScratchBase = {
8320 DAG.getMachineNode(
8321 AMDGPU::S_MOV_B64, SL, MVT::i64,
8322 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8323 0};
8324 CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8325 } else {
8326 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8327 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
8328 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8329 }
8330
8331 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8332 return CvtPtr;
8333
8334 unsigned NullVal = TM.getNullPointerValue(SrcAS);
8335 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8336
8337 SDValue NonNull =
8338 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
8339
8340 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
8341 FlatNullPtr);
8342 }
8343 }
8344
8345 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8346 Op.getValueType() == MVT::i64) {
8347 const SIMachineFunctionInfo *Info =
8348 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
8349 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
8350 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
8351 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
8352 }
8353
8354 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8355 Src.getValueType() == MVT::i64)
8356 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8357
8358 // global <-> flat are no-ops and never emitted.
8359
8360 // Invalid casts are poison.
8361 return DAG.getPOISON(Op->getValueType(0));
8362}
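// Illustrative example (not from the source): a flat -> local cast of %flat
// lowers roughly to
//   %p   = trunc i64 %flat to i32
//   %ok  = icmp ne i64 %flat, 0
//   %res = select i1 %ok, i32 %p, i32 -1
// since the flat null pointer is 0 while the local/private null pointer value
// is all ones; the amdgcn.addrspacecast.nonnull form skips the compare/select.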
8363
8364// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
8365// the small vector and inserting them into the big vector. That is better than
8366// the default expansion of doing it via a stack slot. Even though the use of
8367// the stack slot would be optimized away afterwards, the stack slot itself
8368// remains.
8369SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
8370 SelectionDAG &DAG) const {
8371 SDValue Vec = Op.getOperand(0);
8372 SDValue Ins = Op.getOperand(1);
8373 SDValue Idx = Op.getOperand(2);
8374 EVT VecVT = Vec.getValueType();
8375 EVT InsVT = Ins.getValueType();
8376 EVT EltVT = VecVT.getVectorElementType();
8377 unsigned InsNumElts = InsVT.getVectorNumElements();
8378 unsigned IdxVal = Idx->getAsZExtVal();
8379 SDLoc SL(Op);
8380
8381 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
8382 // Insert 32-bit registers at a time.
8383 assert(InsNumElts % 2 == 0 && "expect legal vector types");
8384
8385 unsigned VecNumElts = VecVT.getVectorNumElements();
8386 EVT NewVecVT =
8387 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
8388 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8389 : EVT::getVectorVT(*DAG.getContext(),
8390 MVT::i32, InsNumElts / 2);
8391
8392 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
8393 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
8394
8395 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
8396 SDValue Elt;
8397 if (InsNumElts == 2) {
8398 Elt = Ins;
8399 } else {
8400 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
8401 DAG.getConstant(I, SL, MVT::i32));
8402 }
8403 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
8404 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
8405 }
8406
8407 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
8408 }
8409
8410 for (unsigned I = 0; I != InsNumElts; ++I) {
8411 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
8412 DAG.getConstant(I, SL, MVT::i32));
8413 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
8414 DAG.getConstant(IdxVal + I, SL, MVT::i32));
8415 }
8416 return Vec;
8417}
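// Example (illustrative): inserting a v2i16 subvector into v4i16 at index 2
// takes the 16-bit path above: both vectors are bitcast to i32 vectors and the
// operation becomes a single 32-bit INSERT_VECTOR_ELT at index 1, so no stack
// slot is ever created.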
8418
8419SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
8420 SelectionDAG &DAG) const {
8421 SDValue Vec = Op.getOperand(0);
8422 SDValue InsVal = Op.getOperand(1);
8423 SDValue Idx = Op.getOperand(2);
8424 EVT VecVT = Vec.getValueType();
8425 EVT EltVT = VecVT.getVectorElementType();
8426 unsigned VecSize = VecVT.getSizeInBits();
8427 unsigned EltSize = EltVT.getSizeInBits();
8428 SDLoc SL(Op);
8429
8430 // Specially handle the case of v4i16 with static indexing.
8431 unsigned NumElts = VecVT.getVectorNumElements();
8432 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
8433 if (NumElts == 4 && EltSize == 16 && KIdx) {
8434 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
8435
8436 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8437 DAG.getConstant(0, SL, MVT::i32));
8438 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8439 DAG.getConstant(1, SL, MVT::i32));
8440
8441 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
8442 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
8443
8444 unsigned Idx = KIdx->getZExtValue();
8445 bool InsertLo = Idx < 2;
8446 SDValue InsHalf = DAG.getNode(
8447 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
8448 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
8449 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8450
8451 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
8452
8453 SDValue Concat =
8454 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
8455 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8456
8457 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
8458 }
8459
8460 // Static indexing does not lower to stack access, and hence there is no need
8461 // for special custom lowering to avoid stack access.
8462 if (isa<ConstantSDNode>(Idx))
8463 return SDValue();
8464
8465 // Avoid stack access for dynamic indexing by custom lowering to
8466 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
8467
8468 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
8469
8470 MVT IntVT = MVT::getIntegerVT(VecSize);
8471
8472 // Convert vector index to bit-index and get the required bit mask.
8473 assert(isPowerOf2_32(EltSize));
8474 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
8475 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8476 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8477 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
8478 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
8479
8480 // 1. Create a congruent vector with the target value in each element.
8481 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
8482 DAG.getSplatBuildVector(VecVT, SL, InsVal));
8483
8484 // 2. Mask off all other indices except the required index within (1).
8485 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
8486
8487 // 3. Mask off the required index within the target vector.
8488 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8489 SDValue RHS =
8490 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
8491
8492 // 4. Get (2) and (3) ORed into the target vector.
8493 SDValue BFI =
8494 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
8495
8496 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
8497}
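// Worked example (illustrative) of the dynamic-index path for a v4i16 vector
// V, scalar X and index i:
//   bitidx = i << 4                        // log2 of the 16-bit element size
//   mask   = 0xffff << bitidx              // the BFM value
//   splat  = X broadcast to all four lanes, bitcast to i64
//   result = (mask & splat) | (~mask & V)  // selected as v_bfi_b32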
8498
8499SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
8500 SelectionDAG &DAG) const {
8501 SDLoc SL(Op);
8502
8503 EVT ResultVT = Op.getValueType();
8504 SDValue Vec = Op.getOperand(0);
8505 SDValue Idx = Op.getOperand(1);
8506 EVT VecVT = Vec.getValueType();
8507 unsigned VecSize = VecVT.getSizeInBits();
8508 EVT EltVT = VecVT.getVectorElementType();
8509
8510 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
8511
8512 // Make sure we do any optimizations that will make it easier to fold
8513 // source modifiers before obscuring it with bit operations.
8514
8515 // XXX - Why doesn't this get called when vector_shuffle is expanded?
8516 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
8517 return Combined;
8518
8519 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8520 SDValue Lo, Hi;
8521 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
8522
8523 if (VecSize == 128) {
8524 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
8525 Lo = DAG.getBitcast(LoVT,
8526 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8527 DAG.getConstant(0, SL, MVT::i32)));
8528 Hi = DAG.getBitcast(HiVT,
8529 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8530 DAG.getConstant(1, SL, MVT::i32)));
8531 } else if (VecSize == 256) {
8532 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
8533 SDValue Parts[4];
8534 for (unsigned P = 0; P < 4; ++P) {
8535 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8536 DAG.getConstant(P, SL, MVT::i32));
8537 }
8538
8539 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8540 Parts[0], Parts[1]));
8541 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8542 Parts[2], Parts[3]));
8543 } else {
8544 assert(VecSize == 512);
8545
8546 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
8547 SDValue Parts[8];
8548 for (unsigned P = 0; P < 8; ++P) {
8549 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8550 DAG.getConstant(P, SL, MVT::i32));
8551 }
8552
8553 Lo = DAG.getBitcast(LoVT,
8554 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8555 Parts[0], Parts[1], Parts[2], Parts[3]));
8556 Hi = DAG.getBitcast(HiVT,
8557 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8558 Parts[4], Parts[5], Parts[6], Parts[7]));
8559 }
8560
8561 EVT IdxVT = Idx.getValueType();
8562 unsigned NElem = VecVT.getVectorNumElements();
8563 assert(isPowerOf2_32(NElem));
8564 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
8565 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
8566 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
8567 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
8568 }
8569
8570 assert(VecSize <= 64);
8571
8572 MVT IntVT = MVT::getIntegerVT(VecSize);
8573
8574 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
8575 SDValue VecBC = peekThroughBitcasts(Vec);
8576 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8577 SDValue Src = VecBC.getOperand(0);
8578 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8579 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
8580 }
8581
8582 unsigned EltSize = EltVT.getSizeInBits();
8583 assert(isPowerOf2_32(EltSize));
8584
8585 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8586
8587 // Convert vector index to bit-index (* EltSize)
8588 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8589
8590 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8591 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
8592
8593 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8594 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
8595 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
8596 }
8597
8598 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
8599}
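// Example (illustrative): a dynamic extract of element i from a v4i16 vector V
// on the <= 64-bit path above becomes (trunc (srl (bitcast V to i64), i * 16)),
// i.e. a variable shift plus truncate instead of a stack round trip.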
8600
8601static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
8602 assert(Elt % 2 == 0);
8603 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8604}
8605
8606static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
8607 assert(Elt % 2 == 0);
8608 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8609 !(Mask[Elt + 1] & 1);
8610}
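// Example: for the mask <0,1,6,7> both pairs are contiguous (each reads two
// consecutive, even-aligned source elements), while for <3,2,7,6> each pair is
// odd-to-even (an odd source index followed by the preceding even one); the
// shuffle lowering below handles the two shapes differently.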
8611
8612SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
8613 SelectionDAG &DAG) const {
8614 SDLoc SL(Op);
8615 EVT ResultVT = Op.getValueType();
8616 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
8617 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
8618 const int NewSrcNumElts = 2;
8619 MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
8620 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
8621
8622 // Break up the shuffle into register-sized pieces.
8623 //
8624 // We're trying to form sub-shuffles that the register allocation pipeline
8625 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
8626 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
8627 // pair of copies into a consecutive register copy, so use the ordinary
8628 // extract_vector_elt lowering unless we can use the shuffle.
8629 //
8630 // TODO: This is a bit of hack, and we should probably always use
8631 // extract_subvector for the largest possible subvector we can (or at least
8632 // use it for PackVT aligned pieces). However, we have worse support for
8633 // combines on them and don't directly treat extract_subvector /
8634 // insert_subvector as legal. The DAG scheduler also ends up doing a worse
8635 // job with the extract_subvectors.
8636 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
8637
8638 // vector_shuffle <0,1,6,7> lhs, rhs
8639 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
8640 //
8641 // vector_shuffle <6,7,2,3> lhs, rhs
8642 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
8643 //
8644 // vector_shuffle <6,7,0,1> lhs, rhs
8645 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
8646
8647 // Avoid scalarizing when both halves are reading from consecutive elements.
8648
8649 // If we're treating 2 element shuffles as legal, also create odd-to-even
8650 // shuffles of neighboring pairs.
8651 //
8652 // vector_shuffle <3,2,7,6> lhs, rhs
8653 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
8654 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
8655
8657 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
8658 if (ShouldUseConsecutiveExtract &&
8659 elementPairIsContiguous(SVN->getMask(), I)) {
8660 const int Idx = SVN->getMaskElt(I);
8661 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8662 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8663 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
8664 SVN->getOperand(VecIdx),
8665 DAG.getConstant(EltIdx, SL, MVT::i32));
8666 Pieces.push_back(SubVec);
8667 } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
8668 isOperationLegal(ISD::VECTOR_SHUFFLE, PackVT)) {
8669 int Idx0 = SVN->getMaskElt(I);
8670 int Idx1 = SVN->getMaskElt(I + 1);
8671
8672 SDValue SrcOp0 = SVN->getOperand(0);
8673 SDValue SrcOp1 = SrcOp0;
8674 if (Idx0 >= SrcNumElts) {
8675 SrcOp0 = SVN->getOperand(1);
8676 Idx0 -= SrcNumElts;
8677 }
8678
8679 if (Idx1 >= SrcNumElts) {
8680 SrcOp1 = SVN->getOperand(1);
8681 Idx1 -= SrcNumElts;
8682 }
8683
8684 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8685 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8686
8687 // Extract nearest even aligned piece.
8688 SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
8689 DAG.getConstant(AlignedIdx0, SL, MVT::i32));
8690 SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
8691 DAG.getConstant(AlignedIdx1, SL, MVT::i32));
8692
8693 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8694 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8695
8696 SDValue Result0 = SubVec0;
8697 SDValue Result1 = SubVec0;
8698
8699 if (SubVec0 != SubVec1) {
8700 NewMaskIdx1 += NewSrcNumElts;
8701 Result1 = SubVec1;
8702 } else {
8703 Result1 = DAG.getPOISON(PackVT);
8704 }
8705
8706 SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
8707 {NewMaskIdx0, NewMaskIdx1});
8708 Pieces.push_back(Shuf);
8709 } else {
8710 const int Idx0 = SVN->getMaskElt(I);
8711 const int Idx1 = SVN->getMaskElt(I + 1);
8712 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8713 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8714 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8715 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8716
8717 SDValue Vec0 = SVN->getOperand(VecIdx0);
8718 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
8719 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
8720
8721 SDValue Vec1 = SVN->getOperand(VecIdx1);
8722 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
8723 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
8724 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
8725 }
8726 }
8727
8728 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
8729}
8730
8731SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
8732 SelectionDAG &DAG) const {
8733 SDValue SVal = Op.getOperand(0);
8734 EVT ResultVT = Op.getValueType();
8735 EVT SValVT = SVal.getValueType();
8736 SDValue UndefVal = DAG.getPOISON(SValVT);
8737 SDLoc SL(Op);
8738
8740 VElts.push_back(SVal);
8741 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
8742 VElts.push_back(UndefVal);
8743
8744 return DAG.getBuildVector(ResultVT, SL, VElts);
8745}
8746
8747SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
8748 SelectionDAG &DAG) const {
8749 SDLoc SL(Op);
8750 EVT VT = Op.getValueType();
8751
8752 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8753 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
8754
8755 SDValue Lo = Op.getOperand(0);
8756 SDValue Hi = Op.getOperand(1);
8757
8758 // Avoid adding defined bits with the zero_extend.
8759 if (Hi.isUndef()) {
8760 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8761 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
8762 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
8763 }
8764
8765 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
8766 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
8767
8768 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
8769 DAG.getConstant(16, SL, MVT::i32));
8770 if (Lo.isUndef())
8771 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
8772
8773 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8774 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
8775
8776 SDValue Or =
8777 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
8778 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
8779 }
8780
8781 // Split into 2-element chunks.
8782 const unsigned NumParts = VT.getVectorNumElements() / 2;
8783 EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
8784 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
8785
8787 for (unsigned P = 0; P < NumParts; ++P) {
8788 SDValue Vec = DAG.getBuildVector(
8789 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
8790 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
8791 }
8792
8793 SDValue Blend =
8794 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
8795 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
8796}
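// Example (illustrative): without VOP3P, build_vector (v2f16 Lo, Hi) becomes
//   bitcast (or (zext (bitcast Lo to i16) to i32),
//               (shl (zext (bitcast Hi to i16) to i32), 16))
// and a wider vector such as v8f16 is assembled from four such 32-bit chunks.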
8797
8798 bool SITargetLowering::isOffsetFoldingLegal(
8799 const GlobalAddressSDNode *GA) const {
8800 // OSes that use ELF REL relocations (instead of RELA) can only store a
8801 // 32-bit addend in the instruction, so it is not safe to allow offset folding
8802 // which can create arbitrary 64-bit addends. (This is only a problem for
8803 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
8804 // the high 32 bits of the addend.)
8805 //
8806 // This should be kept in sync with how HasRelocationAddend is initialized in
8807 // the constructor of ELFAMDGPUAsmBackend.
8808 if (!Subtarget->isAmdHsaOS())
8809 return false;
8810
8811 // We can fold offsets for anything that doesn't require a GOT relocation.
8812 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
8813 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
8814 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
8815 !shouldEmitGOTReloc(GA->getGlobal());
8816}
8817
8818 static SDValue
8819 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
8820 const SDLoc &DL, int64_t Offset, EVT PtrVT,
8821 unsigned GAFlags = SIInstrInfo::MO_NONE) {
8822 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
8823 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
8824 // lowered to the following code sequence:
8825 //
8826 // For constant address space:
8827 // s_getpc_b64 s[0:1]
8828 // s_add_u32 s0, s0, $symbol
8829 // s_addc_u32 s1, s1, 0
8830 //
8831 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8832 // a fixup or relocation is emitted to replace $symbol with a literal
8833 // constant, which is a pc-relative offset from the encoding of the $symbol
8834 // operand to the global variable.
8835 //
8836 // For global address space:
8837 // s_getpc_b64 s[0:1]
8838 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
8839 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
8840 //
8841 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8842 // fixups or relocations are emitted to replace $symbol@*@lo and
8843 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
8844 // which is a 64-bit pc-relative offset from the encoding of the $symbol
8845 // operand to the global variable.
8846 if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
8847 assert(GAFlags != SIInstrInfo::MO_NONE);
8848
8849 SDValue Ptr =
8850 DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
8851 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
8852 }
8853
8854 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
8855 SDValue PtrHi;
8856 if (GAFlags == SIInstrInfo::MO_NONE)
8857 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
8858 else
8859 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
8860 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
8861}
8862
8863SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
8864 SDValue Op,
8865 SelectionDAG &DAG) const {
8866 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
8867 SDLoc DL(GSD);
8868 EVT PtrVT = Op.getValueType();
8869
8870 const GlobalValue *GV = GSD->getGlobal();
8876 GV->hasExternalLinkage()) {
8877 Type *Ty = GV->getValueType();
8878 // HIP uses an unsized array `extern __shared__ T s[]` or similar
8879 // zero-sized type in other languages to declare the dynamic shared
8880 // memory whose size is not known at compile time. They will be
8881 // allocated by the runtime and placed directly after the static
8882 // allocated ones. They all share the same offset.
8883 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
8884 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
8885 // Adjust alignment for that dynamic shared memory array.
8888 MFI->setUsesDynamicLDS(true);
8889 return SDValue(
8890 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
8891 }
8892 }
8894 }
8895
8897 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
8898 SIInstrInfo::MO_ABS32_LO);
8899 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
8900 }
8901
8902 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
8903 if (Subtarget->has64BitLiterals()) {
8904 SDValue Addr = DAG.getTargetGlobalAddress(
8905 GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
8906 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
8907 0);
8908 }
8909
8910 SDValue AddrLo = DAG.getTargetGlobalAddress(
8911 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
8912 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
8913
8914 SDValue AddrHi = DAG.getTargetGlobalAddress(
8915 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
8916 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
8917
8918 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
8919 }
8920
8921 if (shouldEmitFixup(GV))
8922 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
8923
8924 if (shouldEmitPCReloc(GV))
8925 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
8926 SIInstrInfo::MO_REL32);
8927
8928 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
8929 SIInstrInfo::MO_GOTPCREL32);
8930 PointerType *PtrTy =
8931 PointerType::get(*DAG.getContext(), AMDGPUAS::CONSTANT_ADDRESS);
8932 const DataLayout &DataLayout = DAG.getDataLayout();
8933 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
8934 MachinePointerInfo PtrInfo =
8935 MachinePointerInfo::getGOT(DAG.getMachineFunction());
8936
8937 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
8938 MachineMemOperand::MODereferenceable |
8939 MachineMemOperand::MOInvariant);
8940}
8941
8942 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
8943 const SDLoc &DL, SDValue V) const {
8944 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
8945 // the destination register.
8946 //
8947 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
8948 // so we will end up with redundant moves to m0.
8949 //
8950 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
8951
8952 // A Null SDValue creates a glue result.
8953 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
8954 V, Chain);
8955 return SDValue(M0, 0);
8956}
8957
8958SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
8959 MVT VT,
8960 unsigned Offset) const {
8961 SDLoc SL(Op);
8962 SDValue Param = lowerKernargMemParameter(
8963 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
8964 // The local size values will have the hi 16-bits as zero.
8965 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
8966 DAG.getValueType(VT));
8967}
8968
8969 static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8970 EVT VT) {
8971 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8972 DAG.getMachineFunction().getFunction(),
8973 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
8974 return DAG.getPOISON(VT);
8975}
8976
8977 static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8978 EVT VT) {
8979 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8980 DAG.getMachineFunction().getFunction(),
8981 "intrinsic not supported on subtarget", DL.getDebugLoc()));
8982 return DAG.getPOISON(VT);
8983}
8984
8985 static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
8986 ArrayRef<SDValue> Elts) {
8987 assert(!Elts.empty());
8988 MVT Type;
8989 unsigned NumElts = Elts.size();
8990
8991 if (NumElts <= 12) {
8992 Type = MVT::getVectorVT(MVT::f32, NumElts);
8993 } else {
8994 assert(Elts.size() <= 16);
8995 Type = MVT::v16f32;
8996 NumElts = 16;
8997 }
8998
8999 SmallVector<SDValue, 16> VecElts(NumElts);
9000 for (unsigned i = 0; i < Elts.size(); ++i) {
9001 SDValue Elt = Elts[i];
9002 if (Elt.getValueType() != MVT::f32)
9003 Elt = DAG.getBitcast(MVT::f32, Elt);
9004 VecElts[i] = Elt;
9005 }
9006 for (unsigned i = Elts.size(); i < NumElts; ++i)
9007 VecElts[i] = DAG.getPOISON(MVT::f32);
9008
9009 if (NumElts == 1)
9010 return VecElts[0];
9011 return DAG.getBuildVector(Type, DL, VecElts);
9012}
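// Example (illustrative): 13 dword operands do not form a supported vector
// type here, so they are padded with three poison f32 values and returned as a
// single v16f32 build_vector.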
9013
9014static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
9015 SDValue Src, int ExtraElts) {
9016 EVT SrcVT = Src.getValueType();
9017
9019
9020 if (SrcVT.isVector())
9021 DAG.ExtractVectorElements(Src, Elts);
9022 else
9023 Elts.push_back(Src);
9024
9025 SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
9026 while (ExtraElts--)
9027 Elts.push_back(Undef);
9028
9029 return DAG.getBuildVector(CastVT, DL, Elts);
9030}
9031
9032 // Re-construct the required return value for an image load intrinsic.
9033 // This is more complicated due to the optional use of TexFailCtrl, which
9034 // means the required return type is an aggregate.
9035 static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
9036 ArrayRef<EVT> ResultTypes, bool IsTexFail,
9037 bool Unpacked, bool IsD16, int DMaskPop,
9038 int NumVDataDwords, bool IsAtomicPacked16Bit,
9039 const SDLoc &DL) {
9040 // Determine the required return type. This is the same regardless of
9041 // IsTexFail flag
9042 EVT ReqRetVT = ResultTypes[0];
9043 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
9044 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9045 ? (ReqRetNumElts + 1) / 2
9046 : ReqRetNumElts;
9047
9048 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9049
9050 MVT DataDwordVT =
9051 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
9052
9053 MVT MaskPopVT =
9054 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
9055
9056 SDValue Data(Result, 0);
9057 SDValue TexFail;
9058
9059 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
9060 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
9061 if (MaskPopVT.isVector()) {
9062 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
9063 SDValue(Result, 0), ZeroIdx);
9064 } else {
9065 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
9066 SDValue(Result, 0), ZeroIdx);
9067 }
9068 }
9069
9070 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
9071 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
9072 NumDataDwords - MaskPopDwords);
9073
9074 if (IsD16)
9075 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
9076
9077 EVT LegalReqRetVT = ReqRetVT;
9078 if (!ReqRetVT.isVector()) {
9079 if (!Data.getValueType().isInteger())
9080 Data = DAG.getNode(ISD::BITCAST, DL,
9081 Data.getValueType().changeTypeToInteger(), Data);
9082 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
9083 } else {
9084 // We need to widen the return vector to a legal type
9085 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
9086 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
9087 LegalReqRetVT =
9088 EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
9089 ReqRetVT.getVectorNumElements() + 1);
9090 }
9091 }
9092 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
9093
9094 if (IsTexFail) {
9095 TexFail =
9096 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
9097 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
9098
9099 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
9100 }
9101
9102 if (Result->getNumValues() == 1)
9103 return Data;
9104
9105 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
9106}
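// Example (illustrative, packed-d16 target): an image load with dmask 0xf, a
// v4f16 result and TFE enabled returns three dwords from the instruction; the
// first two are re-expanded to the requested v4f16 and the third is surfaced
// as the separate i32 texfail status result.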
9107
9108static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
9109 SDValue *LWE, bool &IsTexFail) {
9110 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
9111
9112 uint64_t Value = TexFailCtrlConst->getZExtValue();
9113 if (Value) {
9114 IsTexFail = true;
9115 }
9116
9117 SDLoc DL(TexFailCtrlConst);
9118 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
9119 Value &= ~(uint64_t)0x1;
9120 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
9121 Value &= ~(uint64_t)0x2;
9122
9123 return Value == 0;
9124}
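// Example: a texfailctrl immediate of 3 sets both TFE (bit 0) and LWE (bit 1)
// and marks the intrinsic as needing the extra status dword; any other bits
// make parseTexFail return false, and the caller gives up on custom lowering.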
9125
9126 static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
9127 MVT PackVectorVT,
9128 SmallVectorImpl<SDValue> &PackedAddrs,
9129 unsigned DimIdx, unsigned EndIdx,
9130 unsigned NumGradients) {
9131 SDLoc DL(Op);
9132 for (unsigned I = DimIdx; I < EndIdx; I++) {
9133 SDValue Addr = Op.getOperand(I);
9134
9135 // Gradients are packed with undef for each coordinate.
9136 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
9137 // 1D: undef,dx/dh; undef,dx/dv
9138 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
9139 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
9140 if (((I + 1) >= EndIdx) ||
9141 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
9142 I == DimIdx + NumGradients - 1))) {
9143 if (Addr.getValueType() != MVT::i16)
9144 Addr = DAG.getBitcast(MVT::i16, Addr);
9145 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
9146 } else {
9147 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
9148 I++;
9149 }
9150 Addr = DAG.getBitcast(MVT::f32, Addr);
9151 PackedAddrs.push_back(Addr);
9152 }
9153}
9154
9155 SDValue SITargetLowering::lowerImage(SDValue Op,
9156 const AMDGPU::ImageDimIntrinsicInfo *Intr,
9157 SelectionDAG &DAG, bool WithChain) const {
9158 SDLoc DL(Op);
9159 MachineFunction &MF = DAG.getMachineFunction();
9160 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
9161 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9162 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
9163 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
9164 unsigned IntrOpcode = Intr->BaseOpcode;
9165 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
9166 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9167 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9168
9169 SmallVector<EVT, 3> ResultTypes(Op->values());
9170 SmallVector<EVT, 3> OrigResultTypes(Op->values());
9171 bool IsD16 = false;
9172 bool IsG16 = false;
9173 bool IsA16 = false;
9174 SDValue VData;
9175 int NumVDataDwords = 0;
9176 bool AdjustRetType = false;
9177 bool IsAtomicPacked16Bit = false;
9178
9179 // Offset of intrinsic arguments
9180 const unsigned ArgOffset = WithChain ? 2 : 1;
9181
9182 unsigned DMask;
9183 unsigned DMaskLanes = 0;
9184
9185 if (BaseOpcode->Atomic) {
9186 VData = Op.getOperand(2);
9187
9188 IsAtomicPacked16Bit =
9189 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9190 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
9191
9192 bool Is64Bit = VData.getValueSizeInBits() == 64;
9193 if (BaseOpcode->AtomicX2) {
9194 SDValue VData2 = Op.getOperand(3);
9195 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
9196 {VData, VData2});
9197 if (Is64Bit)
9198 VData = DAG.getBitcast(MVT::v4i32, VData);
9199
9200 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9201 DMask = Is64Bit ? 0xf : 0x3;
9202 NumVDataDwords = Is64Bit ? 4 : 2;
9203 } else {
9204 DMask = Is64Bit ? 0x3 : 0x1;
9205 NumVDataDwords = Is64Bit ? 2 : 1;
9206 }
9207 } else {
9208 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
9209 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
9210
9211 if (BaseOpcode->Store) {
9212 VData = Op.getOperand(2);
9213
9214 MVT StoreVT = VData.getSimpleValueType();
9215 if (StoreVT.getScalarType() == MVT::f16) {
9216 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9217 return Op; // D16 is unsupported for this instruction
9218
9219 IsD16 = true;
9220 VData = handleD16VData(VData, DAG, true);
9221 }
9222
9223 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
9224 } else if (!BaseOpcode->NoReturn) {
9225 // Work out the num dwords based on the dmask popcount and underlying type
9226 // and whether packing is supported.
9227 MVT LoadVT = ResultTypes[0].getSimpleVT();
9228 if (LoadVT.getScalarType() == MVT::f16) {
9229 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9230 return Op; // D16 is unsupported for this instruction
9231
9232 IsD16 = true;
9233 }
9234
9235 // Confirm that the return type is large enough for the dmask specified
9236 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
9237 (!LoadVT.isVector() && DMaskLanes > 1))
9238 return Op;
9239
9240 // The sq block of gfx8 and gfx9 do not estimate register use correctly
9241 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
9242 // instructions.
9243 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9244 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9245 NumVDataDwords = (DMaskLanes + 1) / 2;
9246 else
9247 NumVDataDwords = DMaskLanes;
9248
9249 AdjustRetType = true;
9250 }
9251 }
9252
9253 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
9255
9256 // Check for 16 bit addresses or derivatives and pack if true.
9257 MVT VAddrVT =
9258 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
9259 MVT VAddrScalarVT = VAddrVT.getScalarType();
9260 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9261 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9262
9263 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
9264 VAddrScalarVT = VAddrVT.getScalarType();
9265 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9266 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9267
9268 // Push back extra arguments.
9269 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
9270 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
9271 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
9272 // Special handling of bias when A16 is on. Bias is of type half but
9273 // occupies full 32-bit.
9274 SDValue Bias = DAG.getBuildVector(
9275 MVT::v2f16, DL,
9276 {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
9277 VAddrs.push_back(Bias);
9278 } else {
9279 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
9280 "Bias needs to be converted to 16 bit in A16 mode");
9281 VAddrs.push_back(Op.getOperand(ArgOffset + I));
9282 }
9283 }
9284
9285 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
9286 // 16 bit gradients are supported, but are tied to the A16 control
9287 // so both gradients and addresses must be 16 bit
9288 LLVM_DEBUG(
9289 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
9290 "require 16 bit args for both gradients and addresses");
9291 return Op;
9292 }
9293
9294 if (IsA16) {
9295 if (!ST->hasA16()) {
9296 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
9297 "support 16 bit addresses\n");
9298 return Op;
9299 }
9300 }
9301
9302 // We've dealt with incorrect input so we know that if IsA16, IsG16
9303 // are set then we have to compress/pack operands (either address,
9304 // gradient or both)
9305 // In the case where a16 and gradients are tied (no G16 support) then we
9306 // have already verified that both IsA16 and IsG16 are true
9307 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
9308 // Activate g16
9309 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9310 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
9311 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
9312 }
9313
9314 // Add gradients (packed or unpacked)
9315 if (IsG16) {
9316 // Pack the gradients
9317 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
9318 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
9319 ArgOffset + Intr->GradientStart,
9320 ArgOffset + Intr->CoordStart, Intr->NumGradients);
9321 } else {
9322 for (unsigned I = ArgOffset + Intr->GradientStart;
9323 I < ArgOffset + Intr->CoordStart; I++)
9324 VAddrs.push_back(Op.getOperand(I));
9325 }
9326
9327 // Add addresses (packed or unpacked)
9328 if (IsA16) {
9329 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
9330 ArgOffset + Intr->CoordStart, VAddrEnd,
9331 0 /* No gradients */);
9332 } else {
9333 // Add uncompressed address
9334 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
9335 VAddrs.push_back(Op.getOperand(I));
9336 }
9337
9338 // If the register allocator cannot place the address registers contiguously
9339 // without introducing moves, then using the non-sequential address encoding
9340 // is always preferable, since it saves VALU instructions and is usually a
9341 // wash in terms of code size or even better.
9342 //
9343 // However, we currently have no way of hinting to the register allocator that
9344 // MIMG addresses should be placed contiguously when it is possible to do so,
9345 // so force non-NSA for the common 2-address case as a heuristic.
9346 //
9347 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
9348 // allocation when possible.
9349 //
9350 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
9351 // set of the remaining addresses.
9352 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
9353 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
9354 const bool UseNSA = ST->hasNSAEncoding() &&
9355 VAddrs.size() >= ST->getNSAThreshold(MF) &&
9356 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
9357 const bool UsePartialNSA =
9358 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
9359
9360 SDValue VAddr;
9361 if (UsePartialNSA) {
9362 VAddr = getBuildDwordsVector(DAG, DL,
9363 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9364 } else if (!UseNSA) {
9365 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
9366 }
9367
9368 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
9369 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
9370 SDValue Unorm;
9371 if (!BaseOpcode->Sampler) {
9372 Unorm = True;
9373 } else {
9374 uint64_t UnormConst =
9375 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
9376
9377 Unorm = UnormConst ? True : False;
9378 }
9379
9380 SDValue TFE;
9381 SDValue LWE;
9382 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
9383 bool IsTexFail = false;
9384 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9385 return Op;
9386
9387 if (IsTexFail) {
9388 if (!DMaskLanes) {
9389 // Expecting to get an error flag since TFC is on - and dmask is 0
9390 // Force dmask to be at least 1 otherwise the instruction will fail
9391 DMask = 0x1;
9392 DMaskLanes = 1;
9393 NumVDataDwords = 1;
9394 }
9395 NumVDataDwords += 1;
9396 AdjustRetType = true;
9397 }
9398
9399 // Something earlier tagged the return type as needing adjustment. This
9400 // happens if the instruction is a load or has the TexFailCtrl flags set.
9401 if (AdjustRetType) {
9402 // NumVDataDwords reflects the true number of dwords required in the return
9403 // type
9404 if (DMaskLanes == 0 && !BaseOpcode->Store) {
9405 // This is a no-op load. This can be eliminated
9406 SDValue Undef = DAG.getPOISON(Op.getValueType());
9407 if (isa<MemSDNode>(Op))
9408 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
9409 return Undef;
9410 }
9411
9412 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
9413 MVT::i32, NumVDataDwords)
9414 : MVT::i32;
9415
9416 ResultTypes[0] = NewVT;
9417 if (ResultTypes.size() == 3) {
9418 // Original result was aggregate type used for TexFailCtrl results
9419 // The actual instruction returns as a vector type which has now been
9420 // created. Remove the aggregate result.
9421 ResultTypes.erase(&ResultTypes[1]);
9422 }
9423 }
9424
9425 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
9426 if (BaseOpcode->Atomic)
9427 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
9428 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
9429 AMDGPU::CPol::VOLATILE))
9430 return Op;
9431
9433 if (BaseOpcode->Store || BaseOpcode->Atomic)
9434 Ops.push_back(VData); // vdata
9435 if (UsePartialNSA) {
9436 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
9437 Ops.push_back(VAddr);
9438 } else if (UseNSA)
9439 append_range(Ops, VAddrs);
9440 else
9441 Ops.push_back(VAddr);
9442 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
9443 EVT RsrcVT = Rsrc.getValueType();
9444 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9445 return Op;
9446 Ops.push_back(Rsrc);
9447 if (BaseOpcode->Sampler) {
9448 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
9449 if (Samp.getValueType() != MVT::v4i32)
9450 return Op;
9451 Ops.push_back(Samp);
9452 }
9453 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
9454 if (IsGFX10Plus)
9455 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
9456 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9457 Ops.push_back(Unorm);
9458 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
9459 Ops.push_back(IsA16 && // r128, a16 for gfx9
9460 ST->hasFeature(AMDGPU::FeatureR128A16)
9461 ? True
9462 : False);
9463 if (IsGFX10Plus)
9464 Ops.push_back(IsA16 ? True : False);
9465
9466 if (!Subtarget->hasGFX90AInsts())
9467 Ops.push_back(TFE); // tfe
9468 else if (TFE->getAsZExtVal()) {
9469 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9471 "TFE is not supported on this GPU", DL.getDebugLoc()));
9472 }
9473
9474 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9475 Ops.push_back(LWE); // lwe
9476 if (!IsGFX10Plus)
9477 Ops.push_back(DimInfo->DA ? True : False);
9478 if (BaseOpcode->HasD16)
9479 Ops.push_back(IsD16 ? True : False);
9480 if (isa<MemSDNode>(Op))
9481 Ops.push_back(Op.getOperand(0)); // chain
9482
9483 int NumVAddrDwords =
9484 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
9485 int Opcode = -1;
9486
9487 if (IsGFX12Plus) {
9488 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
9489 NumVDataDwords, NumVAddrDwords);
9490 } else if (IsGFX11Plus) {
9491 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9492 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9493 : AMDGPU::MIMGEncGfx11Default,
9494 NumVDataDwords, NumVAddrDwords);
9495 } else if (IsGFX10Plus) {
9496 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9497 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9498 : AMDGPU::MIMGEncGfx10Default,
9499 NumVDataDwords, NumVAddrDwords);
9500 } else {
9501 if (Subtarget->hasGFX90AInsts()) {
9502 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
9503 NumVDataDwords, NumVAddrDwords);
9504 if (Opcode == -1) {
9505 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9507 "requested image instruction is not supported on this GPU",
9508 DL.getDebugLoc()));
9509
9510 unsigned Idx = 0;
9511 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
9512 for (EVT VT : OrigResultTypes) {
9513 if (VT == MVT::Other)
9514 RetValues[Idx++] = Op.getOperand(0); // Chain
9515 else
9516 RetValues[Idx++] = DAG.getPOISON(VT);
9517 }
9518
9519 return DAG.getMergeValues(RetValues, DL);
9520 }
9521 }
9522 if (Opcode == -1 &&
9523 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9524 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
9525 NumVDataDwords, NumVAddrDwords);
9526 if (Opcode == -1)
9527 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
9528 NumVDataDwords, NumVAddrDwords);
9529 }
9530 if (Opcode == -1)
9531 return Op;
9532
9533 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
9534 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
9535 MachineMemOperand *MemRef = MemOp->getMemOperand();
9536 DAG.setNodeMemRefs(NewNode, {MemRef});
9537 }
9538
9539 if (BaseOpcode->AtomicX2) {
9541 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
9542 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
9543 }
9544 if (BaseOpcode->NoReturn)
9545 return SDValue(NewNode, 0);
9546 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
9547 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9548 NumVDataDwords, IsAtomicPacked16Bit, DL);
9549}
9550
9551SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
9552 SDValue Offset, SDValue CachePolicy,
9553 SelectionDAG &DAG) const {
9554 MachineFunction &MF = DAG.getMachineFunction();
9555
9556 const DataLayout &DataLayout = DAG.getDataLayout();
9557 Align Alignment =
9558 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
9559
9560 MachineMemOperand *MMO = MF.getMachineMemOperand(
9561 MachinePointerInfo(),
9562 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
9563 MachineMemOperand::MOInvariant,
9564 VT.getStoreSize(), Alignment);
9565
9566 if (!Offset->isDivergent()) {
9567 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
9568
9569 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
9570 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
9571 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
9572 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
9573 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9574 SDValue BufferLoad =
9575 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
9576 DAG.getVTList(MVT::i32), Ops, VT, MMO);
9577 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
9578 }
9579
9580 // Widen vec3 load to vec4.
9581 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
9582 !Subtarget->hasScalarDwordx3Loads()) {
9583 EVT WidenedVT =
9584 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
9585 auto WidenedOp = DAG.getMemIntrinsicNode(
9586 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
9587 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
9588 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
9589 DAG.getVectorIdxConstant(0, DL));
9590 return Subvector;
9591 }
9592
9593 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
9594 DAG.getVTList(VT), Ops, VT, MMO);
9595 }
9596
9597 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
9598 // assume that the buffer is unswizzled.
9599 SDValue Ops[] = {
9600 DAG.getEntryNode(), // Chain
9601 Rsrc, // rsrc
9602 DAG.getConstant(0, DL, MVT::i32), // vindex
9603 {}, // voffset
9604 {}, // soffset
9605 {}, // offset
9606 CachePolicy, // cachepolicy
9607 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9608 };
9609 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9610 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
9611 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
9612 }
9613
9615 unsigned NumLoads = 1;
9616 MVT LoadVT = VT.getSimpleVT();
9617 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
9618 assert((LoadVT.getScalarType() == MVT::i32 ||
9619 LoadVT.getScalarType() == MVT::f32));
9620
9621 if (NumElts == 8 || NumElts == 16) {
9622 NumLoads = NumElts / 4;
9623 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
9624 }
9625
9626 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
9627
9628 // Use the alignment to ensure that the required offsets will fit into the
9629 // immediate offsets.
9630 setBufferOffsets(Offset, DAG, &Ops[3],
9631 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
9632
9633 uint64_t InstOffset = Ops[5]->getAsZExtVal();
9634 for (unsigned i = 0; i < NumLoads; ++i) {
9635 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
9636 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
9637 LoadVT, MMO, DAG));
9638 }
9639
9640 if (NumElts == 8 || NumElts == 16)
9641 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
9642
9643 return Loads[0];
9644}
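// Example (illustrative): an s.buffer.load of v8f32 with a divergent offset is
// split into two v4f32 buffer loads at immediate offsets +0 and +16 and the
// results are reassembled with CONCAT_VECTORS.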
9645
9646SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
9647 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
9648 if (!Subtarget->hasArchitectedSGPRs())
9649 return {};
9650 SDLoc SL(Op);
9651 MVT VT = MVT::i32;
9652 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
9653 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
9654 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
9655}
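// Example: with architected SGPRs the wave ID within the workgroup is simply
// (TTMP8 >> 25) & 0x1f, which is what the BFE_U32 above (offset 25, width 5)
// computes.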
9656
9657SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
9658 AMDGPU::Hwreg::Id HwReg,
9659 unsigned LowBit,
9660 unsigned Width) const {
9661 SDLoc SL(Op);
9662 using namespace AMDGPU::Hwreg;
9663 return {DAG.getMachineNode(
9664 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
9665 DAG.getTargetConstant(HwregEncoding::encode(HwReg, LowBit, Width),
9666 SL, MVT::i32)),
9667 0};
9668}
9669
9670SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
9671 unsigned Dim,
9672 const ArgDescriptor &Arg) const {
9673 SDLoc SL(Op);
9674 MachineFunction &MF = DAG.getMachineFunction();
9675 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
9676 if (MaxID == 0)
9677 return DAG.getConstant(0, SL, MVT::i32);
9678
9679 // It's undefined behavior if a function marked with the amdgpu-no-*
9680 // attributes uses the corresponding intrinsic.
9681 if (!Arg)
9682 return DAG.getPOISON(Op->getValueType(0));
9683
9684 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
9685 SDLoc(DAG.getEntryNode()), Arg);
9686
9687 // Don't bother inserting AssertZext for packed IDs since we're emitting the
9688 // masking operations anyway.
9689 //
9690 // TODO: We could assert the top bit is 0 for the source copy.
9691 if (Arg.isMasked())
9692 return Val;
9693
9694 // Preserve the known bits after expansion to a copy.
9695 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
9696 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
9697 DAG.getValueType(SmallVT));
9698}
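// Example (illustrative): if the maximum workitem ID for the requested
// dimension is 255, the unmasked copy is wrapped in AssertZext with an i8
// value type, so later combines know only the low 8 bits can be nonzero.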
9699
9700SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
9701 SelectionDAG &DAG) const {
9702 MachineFunction &MF = DAG.getMachineFunction();
9703 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
9704
9705 EVT VT = Op.getValueType();
9706 SDLoc DL(Op);
9707 unsigned IntrinsicID = Op.getConstantOperandVal(0);
9708
9709 // TODO: Should this propagate fast-math-flags?
9710
9711 switch (IntrinsicID) {
9712 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9713 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
9714 return emitNonHSAIntrinsicError(DAG, DL, VT);
9715 return getPreloadedValue(DAG, *MFI, VT,
9716 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
9717 }
9718 case Intrinsic::amdgcn_dispatch_ptr:
9719 case Intrinsic::amdgcn_queue_ptr: {
9720 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
9721 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9722 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
9723 DL.getDebugLoc()));
9724 return DAG.getPOISON(VT);
9725 }
9726
9727 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9728 ? AMDGPUFunctionArgInfo::DISPATCH_PTR
9729 : AMDGPUFunctionArgInfo::QUEUE_PTR;
9730 return getPreloadedValue(DAG, *MFI, VT, RegID);
9731 }
9732 case Intrinsic::amdgcn_implicitarg_ptr: {
9733 if (MFI->isEntryFunction())
9734 return getImplicitArgPtr(DAG, DL);
9735 return getPreloadedValue(DAG, *MFI, VT,
9736 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
9737 }
9738 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9739 if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
9740 // This only makes sense to call in a kernel, so just lower to null.
9741 return DAG.getConstant(0, DL, VT);
9742 }
9743
9744 return getPreloadedValue(DAG, *MFI, VT,
9745 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
9746 }
9747 case Intrinsic::amdgcn_dispatch_id: {
9748 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
9749 }
9750 case Intrinsic::amdgcn_rcp:
9751 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
9752 case Intrinsic::amdgcn_rsq:
9753 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9754 case Intrinsic::amdgcn_rsq_legacy:
9755 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9756 return emitRemovedIntrinsicError(DAG, DL, VT);
9757 return SDValue();
9758 case Intrinsic::amdgcn_rcp_legacy:
9759 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9760 return emitRemovedIntrinsicError(DAG, DL, VT);
9761 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
9762 case Intrinsic::amdgcn_rsq_clamp: {
9763 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9764 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
9765
9766 Type *Type = VT.getTypeForEVT(*DAG.getContext());
9767 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
9768 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
9769
9770 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9771 SDValue Tmp =
9772 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
9773 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
9774 DAG.getConstantFP(Min, DL, VT));
9775 }
9776 case Intrinsic::r600_read_ngroups_x:
9777 if (Subtarget->isAmdHsaOS())
9778 return emitNonHSAIntrinsicError(DAG, DL, VT);
9779
9780 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9782 false);
9783 case Intrinsic::r600_read_ngroups_y:
9784 if (Subtarget->isAmdHsaOS())
9785 return emitNonHSAIntrinsicError(DAG, DL, VT);
9786
9787 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9789 false);
9790 case Intrinsic::r600_read_ngroups_z:
9791 if (Subtarget->isAmdHsaOS())
9792 return emitNonHSAIntrinsicError(DAG, DL, VT);
9793
9794 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9796 false);
9797 case Intrinsic::r600_read_local_size_x:
9798 if (Subtarget->isAmdHsaOS())
9799 return emitNonHSAIntrinsicError(DAG, DL, VT);
9800
9801 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9803 case Intrinsic::r600_read_local_size_y:
9804 if (Subtarget->isAmdHsaOS())
9805 return emitNonHSAIntrinsicError(DAG, DL, VT);
9806
9807 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9809 case Intrinsic::r600_read_local_size_z:
9810 if (Subtarget->isAmdHsaOS())
9811 return emitNonHSAIntrinsicError(DAG, DL, VT);
9812
9813 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9815 case Intrinsic::amdgcn_workgroup_id_x:
9816 return lowerWorkGroupId(DAG, *MFI, VT,
9820 case Intrinsic::amdgcn_workgroup_id_y:
9821 return lowerWorkGroupId(DAG, *MFI, VT,
9825 case Intrinsic::amdgcn_workgroup_id_z:
9826 return lowerWorkGroupId(DAG, *MFI, VT,
9830 case Intrinsic::amdgcn_cluster_id_x:
9831 return Subtarget->hasClusters()
9832 ? getPreloadedValue(DAG, *MFI, VT,
9834 : DAG.getPOISON(VT);
9835 case Intrinsic::amdgcn_cluster_id_y:
9836 return Subtarget->hasClusters()
9837 ? getPreloadedValue(DAG, *MFI, VT,
9839 : DAG.getPOISON(VT);
9840 case Intrinsic::amdgcn_cluster_id_z:
9841 return Subtarget->hasClusters()
9842 ? getPreloadedValue(DAG, *MFI, VT,
9844 : DAG.getPOISON(VT);
9845 case Intrinsic::amdgcn_cluster_workgroup_id_x:
9846 return Subtarget->hasClusters()
9847 ? getPreloadedValue(
9848 DAG, *MFI, VT,
9850 : DAG.getPOISON(VT);
9851 case Intrinsic::amdgcn_cluster_workgroup_id_y:
9852 return Subtarget->hasClusters()
9853 ? getPreloadedValue(
9854 DAG, *MFI, VT,
9856 : DAG.getPOISON(VT);
9857 case Intrinsic::amdgcn_cluster_workgroup_id_z:
9858 return Subtarget->hasClusters()
9859 ? getPreloadedValue(
9860 DAG, *MFI, VT,
9862 : DAG.getPOISON(VT);
9863 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
9864 return Subtarget->hasClusters()
9865 ? lowerConstHwRegRead(DAG, Op, AMDGPU::Hwreg::ID_IB_STS2, 21, 4)
9866 : SDValue();
9867 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
9868 return Subtarget->hasClusters()
9869 ? getPreloadedValue(
9870 DAG, *MFI, VT,
9872 : DAG.getPOISON(VT);
9873 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
9874 return Subtarget->hasClusters()
9875 ? getPreloadedValue(
9876 DAG, *MFI, VT,
9878 : DAG.getPOISON(VT);
9879 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
9880 return Subtarget->hasClusters()
9881 ? getPreloadedValue(
9882 DAG, *MFI, VT,
9884 : DAG.getPOISON(VT);
9885 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
9886 return Subtarget->hasClusters()
9887 ? getPreloadedValue(
9888 DAG, *MFI, VT,
9890 : DAG.getPOISON(VT);
9891 case Intrinsic::amdgcn_wave_id:
9892 return lowerWaveID(DAG, Op);
9893 case Intrinsic::amdgcn_lds_kernel_id: {
9894 if (MFI->isEntryFunction())
9895 return getLDSKernelId(DAG, DL);
9896 return getPreloadedValue(DAG, *MFI, VT,
9897 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
9898 }
9899 case Intrinsic::amdgcn_workitem_id_x:
9900 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
9901 case Intrinsic::amdgcn_workitem_id_y:
9902 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
9903 case Intrinsic::amdgcn_workitem_id_z:
9904 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
9905 case Intrinsic::amdgcn_wavefrontsize:
9906 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
9907 SDLoc(Op), MVT::i32);
9908 case Intrinsic::amdgcn_s_buffer_load: {
9909 unsigned CPol = Op.getConstantOperandVal(3);
9910 // s_buffer_load, because of how it's optimized, can't be volatile
9911 // so reject ones with the volatile bit set.
9912 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
9913 ? AMDGPU::CPol::ALL
9914 : AMDGPU::CPol::ALL_pregfx12))
9915 return Op;
9916 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
9917 Op.getOperand(3), DAG);
9918 }
9919 case Intrinsic::amdgcn_fdiv_fast:
9920 return lowerFDIV_FAST(Op, DAG);
9921 case Intrinsic::amdgcn_sin:
9922 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
9923
9924 case Intrinsic::amdgcn_cos:
9925 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
9926
9927 case Intrinsic::amdgcn_mul_u24:
9928 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
9929 Op.getOperand(2));
9930 case Intrinsic::amdgcn_mul_i24:
9931 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
9932 Op.getOperand(2));
9933
9934 case Intrinsic::amdgcn_log_clamp: {
9935 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9936 return SDValue();
9937
9938 return emitRemovedIntrinsicError(DAG, DL, VT);
9939 }
9940 case Intrinsic::amdgcn_fract:
9941 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
9942
9943 case Intrinsic::amdgcn_class:
9944 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
9945 Op.getOperand(2));
9946 case Intrinsic::amdgcn_div_fmas:
9947 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
9948 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9949
9950 case Intrinsic::amdgcn_div_fixup:
9951 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
9952 Op.getOperand(2), Op.getOperand(3));
9953
9954 case Intrinsic::amdgcn_div_scale: {
9955 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
9956
9957 // Translate to the operands expected by the machine instruction. The
9958 // first operand must be the same as the first operand of the instruction.
9959 SDValue Numerator = Op.getOperand(1);
9960 SDValue Denominator = Op.getOperand(2);
9961
9962 // Note this order is the opposite of the machine instruction's operand order,
9963 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
9964 // intrinsic has the numerator as the first operand to match a normal
9965 // division operation.
9966
9967 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
9968
9969 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
9970 Denominator, Numerator);
9971 }
9972 case Intrinsic::amdgcn_icmp: {
9973 // There is a Pat that handles this variant, so return it as-is.
9974 if (Op.getOperand(1).getValueType() == MVT::i1 &&
9975 Op.getConstantOperandVal(2) == 0 &&
9976 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
9977 return Op;
9978 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
9979 }
9980 case Intrinsic::amdgcn_fcmp: {
9981 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
9982 }
9983 case Intrinsic::amdgcn_ballot:
9984 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
9985 case Intrinsic::amdgcn_fmed3:
9986 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
9987 Op.getOperand(2), Op.getOperand(3));
9988 case Intrinsic::amdgcn_fdot2:
9989 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
9990 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9991 case Intrinsic::amdgcn_fmul_legacy:
9992 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
9993 Op.getOperand(2));
9994 case Intrinsic::amdgcn_sffbh:
9995 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
9996 case Intrinsic::amdgcn_sbfe:
9997 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
9998 Op.getOperand(2), Op.getOperand(3));
9999 case Intrinsic::amdgcn_ubfe:
10000 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
10001 Op.getOperand(2), Op.getOperand(3));
10002 case Intrinsic::amdgcn_cvt_pkrtz:
10003 case Intrinsic::amdgcn_cvt_pknorm_i16:
10004 case Intrinsic::amdgcn_cvt_pknorm_u16:
10005 case Intrinsic::amdgcn_cvt_pk_i16:
10006 case Intrinsic::amdgcn_cvt_pk_u16: {
10007 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
10008 EVT VT = Op.getValueType();
10009 unsigned Opcode;
10010
10011 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10012 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
10013 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10014 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
10015 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10016 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
10017 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10018 Opcode = AMDGPUISD::CVT_PK_I16_I32;
10019 else
10020 Opcode = AMDGPUISD::CVT_PK_U16_U32;
10021
10022 if (isTypeLegal(VT))
10023 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
10024
10025 SDValue Node =
10026 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
10027 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
10028 }
10029 case Intrinsic::amdgcn_fmad_ftz:
10030 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
10031 Op.getOperand(2), Op.getOperand(3));
10032
10033 case Intrinsic::amdgcn_if_break:
10034 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
10035 Op->getOperand(1), Op->getOperand(2)),
10036 0);
10037
10038 case Intrinsic::amdgcn_groupstaticsize: {
10039 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
10040 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
10041 return Op;
10042
10043 const Module *M = MF.getFunction().getParent();
10044 const GlobalValue *GV =
10045 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
10046 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
10047 SIInstrInfo::MO_ABS32_LO);
10048 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10049 }
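// amdgcn_is_shared / amdgcn_is_private: test the high 32 bits of the pointer
// against the corresponding aperture base (or against the flat scratch base
// on subtargets with globally addressable scratch).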
10050 case Intrinsic::amdgcn_is_shared:
10051 case Intrinsic::amdgcn_is_private: {
10052 SDLoc SL(Op);
10053 SDValue SrcVec =
10054 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10055 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
10056 DAG.getConstant(1, SL, MVT::i32));
10057
10058 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10059 ? AMDGPUAS::LOCAL_ADDRESS
10060 : AMDGPUAS::PRIVATE_ADDRESS;
10061 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
10062 Subtarget->hasGloballyAddressableScratch()) {
10063 SDValue FlatScratchBaseHi(
10064 DAG.getMachineNode(
10065 AMDGPU::S_MOV_B32, DL, MVT::i32,
10066 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10067 0);
10068 // Test bits 63..58 against the aperture address.
10069 return DAG.getSetCC(
10070 SL, MVT::i1,
10071 DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
10072 DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
10073 }
10074
10075 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10076 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
10077 }
10078 case Intrinsic::amdgcn_perm:
10079 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
10080 Op.getOperand(2), Op.getOperand(3));
10081 case Intrinsic::amdgcn_reloc_constant: {
10082 Module *M = MF.getFunction().getParent();
10083 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
10084 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
10085 auto *RelocSymbol = cast<GlobalVariable>(
10086 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
10087 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
10088 SIInstrInfo::MO_ABS32_LO);
10089 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10090 }
10091 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10092 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10093 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10094 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10095 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10096 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10097 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10098 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10099 if (Op.getOperand(4).getValueType() == MVT::i32)
10100 return SDValue();
10101
10102 SDLoc SL(Op);
10103 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
10104 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10105 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10106 Op.getOperand(3), IndexKeyi32);
10107 }
10108 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10109 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10110 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10111 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10112 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10113 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10114 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10115 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10116 if (Op.getOperand(4).getValueType() == MVT::i64)
10117 return SDValue();
10118
10119 SDLoc SL(Op);
10120 auto IndexKeyi64 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
10121 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10122 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10123 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10124 Op.getOperand(6)});
10125 }
10126 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10127 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10128 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10129 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10130 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10131 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10132 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10133 ? MVT::i64
10134 : MVT::i32;
10135 if (Op.getOperand(6).getValueType() == IndexKeyTy)
10136 return SDValue();
10137
10138 SDLoc SL(Op);
10139 auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
10140 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10141 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10142 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10143 IndexKey, Op.getOperand(7),
10144 Op.getOperand(8)}); // No clamp operand
10145 }
10146 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10147 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10148 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10149 if (Op.getOperand(6).getValueType() == MVT::i32)
10150 return SDValue();
10151
10152 SDLoc SL(Op);
10153 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
10154 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10155 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10156 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10157 IndexKeyi32, Op.getOperand(7)});
10158 }
10159 case Intrinsic::amdgcn_addrspacecast_nonnull:
10160 return lowerADDRSPACECAST(Op, DAG);
10161 case Intrinsic::amdgcn_readlane:
10162 case Intrinsic::amdgcn_readfirstlane:
10163 case Intrinsic::amdgcn_writelane:
10164 case Intrinsic::amdgcn_permlane16:
10165 case Intrinsic::amdgcn_permlanex16:
10166 case Intrinsic::amdgcn_permlane64:
10167 case Intrinsic::amdgcn_set_inactive:
10168 case Intrinsic::amdgcn_set_inactive_chain_arg:
10169 case Intrinsic::amdgcn_mov_dpp8:
10170 case Intrinsic::amdgcn_update_dpp:
10171 return lowerLaneOp(*this, Op.getNode(), DAG);
10172 case Intrinsic::amdgcn_dead: {
10174 for (const EVT ValTy : Op.getNode()->values())
10175 Poisons.push_back(DAG.getPOISON(ValTy));
10176 return DAG.getMergeValues(Poisons, SDLoc(Op));
10177 }
10178 default:
10179 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10180 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
10181 return lowerImage(Op, ImageDimIntr, DAG, false);
10182
10183 return Op;
10184 }
10185}
10186
10187 // On targets that do not support a constant in the soffset field, turn a zero
10188 // soffset into SGPR_NULL to avoid generating an extra s_mov with zero.
10189 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
10190 const GCNSubtarget *Subtarget) {
10191 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
10192 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10193 return SOffset;
10194}
10195
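// Lower a raw buffer atomic intrinsic to the corresponding AMDGPUISD buffer
// atomic node: the combined offset operand is split into voffset/soffset and
// an immediate offset, and vindex is forced to zero (idxen = 0).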
10196SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
10197 SelectionDAG &DAG,
10198 unsigned NewOpcode) const {
10199 SDLoc DL(Op);
10200
10201 SDValue VData = Op.getOperand(2);
10202 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10203 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10204 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10205 SDValue Ops[] = {
10206 Op.getOperand(0), // Chain
10207 VData, // vdata
10208 Rsrc, // rsrc
10209 DAG.getConstant(0, DL, MVT::i32), // vindex
10210 VOffset, // voffset
10211 SOffset, // soffset
10212 Offset, // offset
10213 Op.getOperand(6), // cachepolicy
10214 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10215 };
10216
10217 auto *M = cast<MemSDNode>(Op);
10218
10219 EVT MemVT = VData.getValueType();
10220 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10221 M->getMemOperand());
10222}
10223
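// Lower a struct buffer atomic intrinsic to the corresponding AMDGPUISD
// buffer atomic node. Unlike the raw form, the explicit vindex operand is
// preserved and idxen is set to 1.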
10224SDValue
10225SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
10226 unsigned NewOpcode) const {
10227 SDLoc DL(Op);
10228
10229 SDValue VData = Op.getOperand(2);
10230 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10231 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10232 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10233 SDValue Ops[] = {
10234 Op.getOperand(0), // Chain
10235 VData, // vdata
10236 Rsrc, // rsrc
10237 Op.getOperand(4), // vindex
10238 VOffset, // voffset
10239 SOffset, // soffset
10240 Offset, // offset
10241 Op.getOperand(7), // cachepolicy
10242 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10243 };
10244
10245 auto *M = cast<MemSDNode>(Op);
10246
10247 EVT MemVT = VData.getValueType();
10248 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10249 M->getMemOperand());
10250}
10251
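// Lower chain-carrying target intrinsics (buffer/tbuffer loads,
// ds_ordered_count, BVH intersect-ray, atomics, ...) that read memory and
// produce a value.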
10252SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
10253 SelectionDAG &DAG) const {
10254 unsigned IntrID = Op.getConstantOperandVal(1);
10255 SDLoc DL(Op);
10256
10257 switch (IntrID) {
10258 case Intrinsic::amdgcn_ds_ordered_add:
10259 case Intrinsic::amdgcn_ds_ordered_swap: {
10260 MemSDNode *M = cast<MemSDNode>(Op);
10261 SDValue Chain = M->getOperand(0);
10262 SDValue M0 = M->getOperand(2);
10263 SDValue Value = M->getOperand(3);
10264 unsigned IndexOperand = M->getConstantOperandVal(7);
10265 unsigned WaveRelease = M->getConstantOperandVal(8);
10266 unsigned WaveDone = M->getConstantOperandVal(9);
10267
10268 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10269 IndexOperand &= ~0x3f;
10270 unsigned CountDw = 0;
10271
10272 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
10273 CountDw = (IndexOperand >> 24) & 0xf;
10274 IndexOperand &= ~(0xf << 24);
10275
10276 if (CountDw < 1 || CountDw > 4) {
10277 const Function &Fn = DAG.getMachineFunction().getFunction();
10278 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10279 Fn, "ds_ordered_count: dword count must be between 1 and 4",
10280 DL.getDebugLoc()));
10281 CountDw = 1;
10282 }
10283 }
10284
10285 if (IndexOperand) {
10286 const Function &Fn = DAG.getMachineFunction().getFunction();
10287 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10288 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
10289 }
10290
10291 if (WaveDone && !WaveRelease) {
10292 // TODO: Move this to IR verifier
10293 const Function &Fn = DAG.getMachineFunction().getFunction();
10294 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10295 Fn, "ds_ordered_count: wave_done requires wave_release",
10296 DL.getDebugLoc()));
10297 }
10298
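// Pack the ds_ordered_count immediate: the ordered-count index goes into
// offset0, while wave_release, wave_done, the instruction kind, the dword
// count and (pre-GFX11) the shader type are packed into offset1 below.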
10299 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10300 unsigned ShaderType =
10301 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
10302 unsigned Offset0 = OrderedCountIndex << 2;
10303 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10304
10305 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
10306 Offset1 |= (CountDw - 1) << 6;
10307
10308 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
10309 Offset1 |= ShaderType << 2;
10310
10311 unsigned Offset = Offset0 | (Offset1 << 8);
10312
10313 SDValue Ops[] = {
10314 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
10315 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
10316 };
10317 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
10318 M->getVTList(), Ops, M->getMemoryVT(),
10319 M->getMemOperand());
10320 }
10321 case Intrinsic::amdgcn_raw_buffer_load:
10322 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10323 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10324 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10325 case Intrinsic::amdgcn_raw_buffer_load_format:
10326 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10327 const bool IsFormat =
10328 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10329 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10330
10331 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10332 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10333 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10334 SDValue Ops[] = {
10335 Op.getOperand(0), // Chain
10336 Rsrc, // rsrc
10337 DAG.getConstant(0, DL, MVT::i32), // vindex
10338 VOffset, // voffset
10339 SOffset, // soffset
10340 Offset, // offset
10341 Op.getOperand(5), // cachepolicy, swizzled buffer
10342 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10343 };
10344
10345 auto *M = cast<MemSDNode>(Op);
10346 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
10347 }
10348 case Intrinsic::amdgcn_struct_buffer_load:
10349 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10350 case Intrinsic::amdgcn_struct_buffer_load_format:
10351 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10352 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10353 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10354 const bool IsFormat =
10355 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10356 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10357
10358 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10359 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10360 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10361 SDValue Ops[] = {
10362 Op.getOperand(0), // Chain
10363 Rsrc, // rsrc
10364 Op.getOperand(3), // vindex
10365 VOffset, // voffset
10366 SOffset, // soffset
10367 Offset, // offset
10368 Op.getOperand(6), // cachepolicy, swizzled buffer
10369 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10370 };
10371
10372 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
10373 }
10374 case Intrinsic::amdgcn_raw_tbuffer_load:
10375 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10376 MemSDNode *M = cast<MemSDNode>(Op);
10377 EVT LoadVT = Op.getValueType();
10378 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10379 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10380 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10381
10382 SDValue Ops[] = {
10383 Op.getOperand(0), // Chain
10384 Rsrc, // rsrc
10385 DAG.getConstant(0, DL, MVT::i32), // vindex
10386 VOffset, // voffset
10387 SOffset, // soffset
10388 Offset, // offset
10389 Op.getOperand(5), // format
10390 Op.getOperand(6), // cachepolicy, swizzled buffer
10391 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10392 };
10393
10394 if (LoadVT.getScalarType() == MVT::f16)
10395 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10396 Ops);
10397 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10398 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10399 DAG);
10400 }
10401 case Intrinsic::amdgcn_struct_tbuffer_load:
10402 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10403 MemSDNode *M = cast<MemSDNode>(Op);
10404 EVT LoadVT = Op.getValueType();
10405 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10406 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10407 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10408
10409 SDValue Ops[] = {
10410 Op.getOperand(0), // Chain
10411 Rsrc, // rsrc
10412 Op.getOperand(3), // vindex
10413 VOffset, // voffset
10414 SOffset, // soffset
10415 Offset, // offset
10416 Op.getOperand(6), // format
10417 Op.getOperand(7), // cachepolicy, swizzled buffer
10418 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10419 };
10420
10421 if (LoadVT.getScalarType() == MVT::f16)
10422 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10423 Ops);
10424 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10425 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10426 DAG);
10427 }
10428 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10429 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10430 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
10431 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10432 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10433 return lowerStructBufferAtomicIntrin(Op, DAG,
10434 AMDGPUISD::BUFFER_ATOMIC_FADD);
10435 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10436 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10437 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
10438 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10439 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10440 return lowerStructBufferAtomicIntrin(Op, DAG,
10441 AMDGPUISD::BUFFER_ATOMIC_FMIN);
10442 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10443 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10444 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
10445 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10446 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10447 return lowerStructBufferAtomicIntrin(Op, DAG,
10448 AMDGPUISD::BUFFER_ATOMIC_FMAX);
10449 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10450 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10451 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
10452 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10453 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10454 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10455 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10456 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10457 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10458 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10459 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10460 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
10461 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10462 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10463 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
10464 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10465 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10466 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
10467 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10468 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10469 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
10470 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10471 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10472 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10473 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10474 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10475 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10476 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10477 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10478 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10479 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10480 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10481 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10482 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10483 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10484 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10485 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10486 return lowerRawBufferAtomicIntrin(Op, DAG,
10487 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10488 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10489 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10490 return lowerStructBufferAtomicIntrin(Op, DAG,
10491 AMDGPUISD::BUFFER_ATOMIC_SWAP);
10492 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10493 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10494 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10495 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10496 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10497 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10498 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10499 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10500 return lowerStructBufferAtomicIntrin(Op, DAG,
10501 AMDGPUISD::BUFFER_ATOMIC_SMIN);
10502 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10503 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10504 return lowerStructBufferAtomicIntrin(Op, DAG,
10505 AMDGPUISD::BUFFER_ATOMIC_UMIN);
10506 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10507 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10508 return lowerStructBufferAtomicIntrin(Op, DAG,
10509 AMDGPUISD::BUFFER_ATOMIC_SMAX);
10510 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10511 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10512 return lowerStructBufferAtomicIntrin(Op, DAG,
10513 AMDGPUISD::BUFFER_ATOMIC_UMAX);
10514 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10515 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10516 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10517 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10518 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10519 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10520 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10521 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10522 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10523 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10524 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10525 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10526 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10527 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10528 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10529 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10530 return lowerStructBufferAtomicIntrin(Op, DAG,
10531 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10532
10533 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10534 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10535 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
10536 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10537 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10538 SDValue Ops[] = {
10539 Op.getOperand(0), // Chain
10540 Op.getOperand(2), // src
10541 Op.getOperand(3), // cmp
10542 Rsrc, // rsrc
10543 DAG.getConstant(0, DL, MVT::i32), // vindex
10544 VOffset, // voffset
10545 SOffset, // soffset
10546 Offset, // offset
10547 Op.getOperand(7), // cachepolicy
10548 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10549 };
10550 EVT VT = Op.getValueType();
10551 auto *M = cast<MemSDNode>(Op);
10552
10553 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10554 Op->getVTList(), Ops, VT,
10555 M->getMemOperand());
10556 }
10557 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10558 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10559 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
10560 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
10561 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
10562 SDValue Ops[] = {
10563 Op.getOperand(0), // Chain
10564 Op.getOperand(2), // src
10565 Op.getOperand(3), // cmp
10566 Rsrc, // rsrc
10567 Op.getOperand(5), // vindex
10568 VOffset, // voffset
10569 SOffset, // soffset
10570 Offset, // offset
10571 Op.getOperand(8), // cachepolicy
10572 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10573 };
10574 EVT VT = Op.getValueType();
10575 auto *M = cast<MemSDNode>(Op);
10576
10577 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10578 Op->getVTList(), Ops, VT,
10579 M->getMemOperand());
10580 }
10581 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10582 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10583 MemSDNode *M = cast<MemSDNode>(Op);
10584 SDValue NodePtr = M->getOperand(2);
10585 SDValue RayExtent = M->getOperand(3);
10586 SDValue InstanceMask = M->getOperand(4);
10587 SDValue RayOrigin = M->getOperand(5);
10588 SDValue RayDir = M->getOperand(6);
10589 SDValue Offsets = M->getOperand(7);
10590 SDValue TDescr = M->getOperand(8);
10591
10592 assert(NodePtr.getValueType() == MVT::i64);
10593 assert(RayDir.getValueType() == MVT::v3f32);
10594
10595 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10596 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10597 return SDValue();
10598 }
10599
10600 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10601 const unsigned NumVDataDwords = 10;
10602 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10603 int Opcode = AMDGPU::getMIMGOpcode(
10604 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10605 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10606 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10607 assert(Opcode != -1);
10608
10610 Ops.push_back(NodePtr);
10611 Ops.push_back(DAG.getBuildVector(
10612 MVT::v2i32, DL,
10613 {DAG.getBitcast(MVT::i32, RayExtent),
10614 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10615 Ops.push_back(RayOrigin);
10616 Ops.push_back(RayDir);
10617 Ops.push_back(Offsets);
10618 Ops.push_back(TDescr);
10619 Ops.push_back(M->getChain());
10620
10621 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10622 MachineMemOperand *MemRef = M->getMemOperand();
10623 DAG.setNodeMemRefs(NewNode, {MemRef});
10624 return SDValue(NewNode, 0);
10625 }
10626 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10627 MemSDNode *M = cast<MemSDNode>(Op);
10628 SDValue NodePtr = M->getOperand(2);
10629 SDValue RayExtent = M->getOperand(3);
10630 SDValue RayOrigin = M->getOperand(4);
10631 SDValue RayDir = M->getOperand(5);
10632 SDValue RayInvDir = M->getOperand(6);
10633 SDValue TDescr = M->getOperand(7);
10634
10635 assert(NodePtr.getValueType() == MVT::i32 ||
10636 NodePtr.getValueType() == MVT::i64);
10637 assert(RayDir.getValueType() == MVT::v3f16 ||
10638 RayDir.getValueType() == MVT::v3f32);
10639
10640 if (!Subtarget->hasGFX10_AEncoding()) {
10641 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10642 return SDValue();
10643 }
10644
10645 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
10646 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
10647 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10648 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
10649 const bool Is64 = NodePtr.getValueType() == MVT::i64;
10650 const unsigned NumVDataDwords = 4;
10651 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10652 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10653 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10654 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
10655 IsGFX12Plus;
10656 const unsigned BaseOpcodes[2][2] = {
10657 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10658 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10659 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10660 int Opcode;
10661 if (UseNSA) {
10662 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10663 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10664 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10665 : AMDGPU::MIMGEncGfx10NSA,
10666 NumVDataDwords, NumVAddrDwords);
10667 } else {
10668 assert(!IsGFX12Plus);
10669 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10670 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10671 : AMDGPU::MIMGEncGfx10Default,
10672 NumVDataDwords, NumVAddrDwords);
10673 }
10674 assert(Opcode != -1);
10675
10676 SmallVector<SDValue, 16> Ops;
10677
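// packLanes pushes the three lanes of a vector operand as VGPR dwords: f32
// lanes are pushed directly, while f16 lanes are paired into packed v2f16
// dwords, optionally sharing a dword with the previously pushed element when
// the operand is not dword-aligned.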
10678 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
10679 SmallVector<SDValue, 3> Lanes;
10680 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
10681 if (Lanes[0].getValueSizeInBits() == 32) {
10682 for (unsigned I = 0; I < 3; ++I)
10683 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
10684 } else {
10685 if (IsAligned) {
10686 Ops.push_back(DAG.getBitcast(
10687 MVT::i32,
10688 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
10689 Ops.push_back(Lanes[2]);
10690 } else {
10691 SDValue Elt0 = Ops.pop_back_val();
10692 Ops.push_back(DAG.getBitcast(
10693 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
10694 Ops.push_back(DAG.getBitcast(
10695 MVT::i32,
10696 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
10697 }
10698 }
10699 };
10700
10701 if (UseNSA && IsGFX11Plus) {
10702 Ops.push_back(NodePtr);
10703 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10704 Ops.push_back(RayOrigin);
10705 if (IsA16) {
10706 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
10707 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
10708 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
10709 for (unsigned I = 0; I < 3; ++I) {
10710 MergedLanes.push_back(DAG.getBitcast(
10711 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
10712 {DirLanes[I], InvDirLanes[I]})));
10713 }
10714 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
10715 } else {
10716 Ops.push_back(RayDir);
10717 Ops.push_back(RayInvDir);
10718 }
10719 } else {
10720 if (Is64)
10721 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
10722 2);
10723 else
10724 Ops.push_back(NodePtr);
10725
10726 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10727 packLanes(RayOrigin, true);
10728 packLanes(RayDir, true);
10729 packLanes(RayInvDir, false);
10730 }
10731
10732 if (!UseNSA) {
10733 // Build a single vector containing all the operands so far prepared.
10734 if (NumVAddrDwords > 12) {
10735 SDValue Undef = DAG.getPOISON(MVT::i32);
10736 Ops.append(16 - Ops.size(), Undef);
10737 }
10738 assert(Ops.size() >= 8 && Ops.size() <= 12);
10739 SDValue MergedOps =
10740 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
10741 Ops.clear();
10742 Ops.push_back(MergedOps);
10743 }
10744
10745 Ops.push_back(TDescr);
10746 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
10747 Ops.push_back(M->getChain());
10748
10749 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10750 MachineMemOperand *MemRef = M->getMemOperand();
10751 DAG.setNodeMemRefs(NewNode, {MemRef});
10752 return SDValue(NewNode, 0);
10753 }
10754 case Intrinsic::amdgcn_global_atomic_fmin_num:
10755 case Intrinsic::amdgcn_global_atomic_fmax_num:
10756 case Intrinsic::amdgcn_flat_atomic_fmin_num:
10757 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10758 MemSDNode *M = cast<MemSDNode>(Op);
10759 SDValue Ops[] = {
10760 M->getOperand(0), // Chain
10761 M->getOperand(2), // Ptr
10762 M->getOperand(3) // Value
10763 };
10764 unsigned Opcode = 0;
10765 switch (IntrID) {
10766 case Intrinsic::amdgcn_global_atomic_fmin_num:
10767 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10768 Opcode = ISD::ATOMIC_LOAD_FMIN;
10769 break;
10770 }
10771 case Intrinsic::amdgcn_global_atomic_fmax_num:
10772 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10773 Opcode = ISD::ATOMIC_LOAD_FMAX;
10774 break;
10775 }
10776 default:
10777 llvm_unreachable("unhandled atomic opcode");
10778 }
10779 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
10780 Ops, M->getMemOperand());
10781 }
10782 case Intrinsic::amdgcn_s_get_barrier_state:
10783 case Intrinsic::amdgcn_s_get_named_barrier_state: {
10784 SDValue Chain = Op->getOperand(0);
10786 unsigned Opc;
10787
10788 if (isa<ConstantSDNode>(Op->getOperand(2))) {
10789 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
10790 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10791 BarID = (BarID >> 4) & 0x3F;
10792 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10793 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10794 Ops.push_back(K);
10795 Ops.push_back(Chain);
10796 } else {
10797 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10798 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
10799 SDValue M0Val;
10800 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
10801 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10802 M0Val = SDValue(
10803 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10804 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10805 0);
10806 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10807 } else
10808 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
10809 }
10810
10811 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10812 return SDValue(NewMI, 0);
10813 }
10814 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
10815 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
10816 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
10817 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
10818 SDValue Chain = Op->getOperand(0);
10819 SDValue Ptr = Op->getOperand(2);
10820 EVT VT = Op->getValueType(0);
10821 return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
10822 Chain, Ptr, MII->getMemOperand());
10823 }
10824 default:
10825
10826 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10827 AMDGPU::getImageDimIntrinsicInfo(IntrID))
10828 return lowerImage(Op, ImageDimIntr, DAG, true);
10829
10830 return SDValue();
10831 }
10832}
10833
10834// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
10835// dwordx4 if on SI and handle TFE loads.
10836SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
10837 SDVTList VTList,
10838 ArrayRef<SDValue> Ops, EVT MemVT,
10839 MachineMemOperand *MMO,
10840 SelectionDAG &DAG) const {
10841 LLVMContext &C = *DAG.getContext();
10842 MachineFunction &MF = DAG.getMachineFunction();
10843 EVT VT = VTList.VTs[0];
10844
10845 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
10846 bool IsTFE = VTList.NumVTs == 3;
10847 if (IsTFE) {
10848 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
10849 unsigned NumOpDWords = NumValueDWords + 1;
10850 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
10851 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
10852 MachineMemOperand *OpDWordsMMO =
10853 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
10854 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
10855 OpDWordsVT, OpDWordsMMO, DAG);
10856 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10857 DAG.getVectorIdxConstant(NumValueDWords, DL));
10858 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
10859 SDValue ValueDWords =
10860 NumValueDWords == 1
10861 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
10862 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
10863 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
10864 ZeroIdx);
10865 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
10866 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10867 }
10868
10869 if (!Subtarget->hasDwordx3LoadStores() &&
10870 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10871 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
10872 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
10873 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
10874 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
10875 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
10876 WidenedMemVT, WidenedMMO);
10877 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
10878 DAG.getVectorIdxConstant(0, DL));
10879 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
10880 }
10881
10882 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
10883}
10884
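// Convert D16 vector data into the layout expected by the selected store:
// unpack to one element per dword on subtargets with unpacked D16 VMEM, repack
// 16-bit element pairs into dwords to work around the gfx8.1 d16 image-store
// bug, and widen 3-element vectors to 4 elements.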
10885SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
10886 bool ImageStore) const {
10887 EVT StoreVT = VData.getValueType();
10888
10889 // No change for f16 and legal vector D16 types.
10890 if (!StoreVT.isVector())
10891 return VData;
10892
10893 SDLoc DL(VData);
10894 unsigned NumElements = StoreVT.getVectorNumElements();
10895
10896 if (Subtarget->hasUnpackedD16VMem()) {
10897 // We need to unpack the packed data to store.
10898 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10899 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10900
10901 EVT EquivStoreVT =
10902 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
10903 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
10904 return DAG.UnrollVectorOp(ZExt.getNode());
10905 }
10906
10907 // The sq block of gfx8.1 does not estimate register use correctly for d16
10908 // image store instructions. The data operand is computed as if it were not a
10909 // d16 image instruction.
10910 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
10911 // Bitcast to i16
10912 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10913 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10914
10915 // Decompose into scalars
10916 SmallVector<SDValue, 4> Elts;
10917 DAG.ExtractVectorElements(IntVData, Elts);
10918
10919 // Group pairs of i16 into v2i16 and bitcast to i32
10920 SmallVector<SDValue, 4> PackedElts;
10921 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
10922 SDValue Pair =
10923 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
10924 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10925 PackedElts.push_back(IntPair);
10926 }
10927 if ((NumElements % 2) == 1) {
10928 // Handle v3i16
10929 unsigned I = Elts.size() / 2;
10930 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
10931 {Elts[I * 2], DAG.getPOISON(MVT::i16)});
10932 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10933 PackedElts.push_back(IntPair);
10934 }
10935
10936 // Pad with poison values.
10937 PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));
10938
10939 // Build final vector
10940 EVT VecVT =
10941 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
10942 return DAG.getBuildVector(VecVT, DL, PackedElts);
10943 }
10944
10945 if (NumElements == 3) {
10946 EVT IntStoreVT =
10947 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
10948 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10949
10950 EVT WidenedStoreVT = EVT::getVectorVT(
10951 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
10952 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
10953 WidenedStoreVT.getStoreSizeInBits());
10954 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
10955 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
10956 }
10957
10958 assert(isTypeLegal(StoreVT));
10959 return VData;
10960}
10961
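// Lower side-effecting target intrinsics that do not produce a value
// (exports, buffer/tbuffer stores, LDS DMA loads, barriers, ...).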
10962SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
10963 SelectionDAG &DAG) const {
10964 SDLoc DL(Op);
10965 SDValue Chain = Op.getOperand(0);
10966 unsigned IntrinsicID = Op.getConstantOperandVal(1);
10967 MachineFunction &MF = DAG.getMachineFunction();
10968
10969 switch (IntrinsicID) {
10970 case Intrinsic::amdgcn_exp_compr: {
10971 if (!Subtarget->hasCompressedExport()) {
10972 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10973 DAG.getMachineFunction().getFunction(),
10974 "intrinsic not supported on subtarget", DL.getDebugLoc()));
10975 }
10976 SDValue Src0 = Op.getOperand(4);
10977 SDValue Src1 = Op.getOperand(5);
10978 // Hack around illegal type on SI by directly selecting it.
10979 if (isTypeLegal(Src0.getValueType()))
10980 return SDValue();
10981
10982 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
10983 SDValue Undef = DAG.getPOISON(MVT::f32);
10984 const SDValue Ops[] = {
10985 Op.getOperand(2), // tgt
10986 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
10987 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
10988 Undef, // src2
10989 Undef, // src3
10990 Op.getOperand(7), // vm
10991 DAG.getTargetConstant(1, DL, MVT::i1), // compr
10992 Op.getOperand(3), // en
10993 Op.getOperand(0) // Chain
10994 };
10995
10996 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
10997 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
10998 }
10999
11000 case Intrinsic::amdgcn_struct_tbuffer_store:
11001 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
11002 SDValue VData = Op.getOperand(2);
11003 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11004 if (IsD16)
11005 VData = handleD16VData(VData, DAG);
11006 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11007 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11008 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11009 SDValue Ops[] = {
11010 Chain,
11011 VData, // vdata
11012 Rsrc, // rsrc
11013 Op.getOperand(4), // vindex
11014 VOffset, // voffset
11015 SOffset, // soffset
11016 Offset, // offset
11017 Op.getOperand(7), // format
11018 Op.getOperand(8), // cachepolicy, swizzled buffer
11019 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11020 };
11021 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11022 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11023 MemSDNode *M = cast<MemSDNode>(Op);
11024 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11025 M->getMemoryVT(), M->getMemOperand());
11026 }
11027
11028 case Intrinsic::amdgcn_raw_tbuffer_store:
11029 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11030 SDValue VData = Op.getOperand(2);
11031 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11032 if (IsD16)
11033 VData = handleD16VData(VData, DAG);
11034 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11035 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11036 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11037 SDValue Ops[] = {
11038 Chain,
11039 VData, // vdata
11040 Rsrc, // rsrc
11041 DAG.getConstant(0, DL, MVT::i32), // vindex
11042 VOffset, // voffset
11043 SOffset, // soffset
11044 Offset, // offset
11045 Op.getOperand(6), // format
11046 Op.getOperand(7), // cachepolicy, swizzled buffer
11047 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11048 };
11049 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11050 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11051 MemSDNode *M = cast<MemSDNode>(Op);
11052 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11053 M->getMemoryVT(), M->getMemOperand());
11054 }
11055
11056 case Intrinsic::amdgcn_raw_buffer_store:
11057 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11058 case Intrinsic::amdgcn_raw_buffer_store_format:
11059 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11060 const bool IsFormat =
11061 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11062 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11063
11064 SDValue VData = Op.getOperand(2);
11065 EVT VDataVT = VData.getValueType();
11066 EVT EltType = VDataVT.getScalarType();
11067 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11068 if (IsD16) {
11069 VData = handleD16VData(VData, DAG);
11070 VDataVT = VData.getValueType();
11071 }
11072
11073 if (!isTypeLegal(VDataVT)) {
11074 VData =
11075 DAG.getNode(ISD::BITCAST, DL,
11076 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11077 }
11078
11079 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11080 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11081 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11082 SDValue Ops[] = {
11083 Chain,
11084 VData,
11085 Rsrc,
11086 DAG.getConstant(0, DL, MVT::i32), // vindex
11087 VOffset, // voffset
11088 SOffset, // soffset
11089 Offset, // offset
11090 Op.getOperand(6), // cachepolicy, swizzled buffer
11091 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11092 };
11093 unsigned Opc =
11094 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT
11095 : AMDGPUISD::BUFFER_STORE;
11096 MemSDNode *M = cast<MemSDNode>(Op);
11097
11098 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11099 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11100 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
11101
11102 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11103 M->getMemoryVT(), M->getMemOperand());
11104 }
11105
11106 case Intrinsic::amdgcn_struct_buffer_store:
11107 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11108 case Intrinsic::amdgcn_struct_buffer_store_format:
11109 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11110 const bool IsFormat =
11111 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11112 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11113
11114 SDValue VData = Op.getOperand(2);
11115 EVT VDataVT = VData.getValueType();
11116 EVT EltType = VDataVT.getScalarType();
11117 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11118
11119 if (IsD16) {
11120 VData = handleD16VData(VData, DAG);
11121 VDataVT = VData.getValueType();
11122 }
11123
11124 if (!isTypeLegal(VDataVT)) {
11125 VData =
11126 DAG.getNode(ISD::BITCAST, DL,
11127 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11128 }
11129
11130 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11131 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11132 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11133 SDValue Ops[] = {
11134 Chain,
11135 VData,
11136 Rsrc,
11137 Op.getOperand(4), // vindex
11138 VOffset, // voffset
11139 SOffset, // soffset
11140 Offset, // offset
11141 Op.getOperand(7), // cachepolicy, swizzled buffer
11142 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11143 };
11144 unsigned Opc =
11145 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT
11146 : AMDGPUISD::BUFFER_STORE;
11147 MemSDNode *M = cast<MemSDNode>(Op);
11148
11149 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11150 EVT VDataType = VData.getValueType().getScalarType();
11151 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11152 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
11153
11154 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11155 M->getMemoryVT(), M->getMemOperand());
11156 }
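// Buffer-to-LDS loads are selected directly to BUFFER_LOAD_*_LDS_* machine
// instructions: the LDS destination address is passed through M0, and both a
// load and a store memory operand are attached to the resulting node.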
11157 case Intrinsic::amdgcn_raw_buffer_load_lds:
11158 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11159 case Intrinsic::amdgcn_struct_buffer_load_lds:
11160 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
11161 if (!Subtarget->hasVMemToLDSLoad())
11162 return SDValue();
11163 unsigned Opc;
11164 bool HasVIndex =
11165 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11166 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
11167 unsigned OpOffset = HasVIndex ? 1 : 0;
11168 SDValue VOffset = Op.getOperand(5 + OpOffset);
11169 bool HasVOffset = !isNullConstant(VOffset);
11170 unsigned Size = Op->getConstantOperandVal(4);
11171
11172 switch (Size) {
11173 default:
11174 return SDValue();
11175 case 1:
11176 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11177 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11178 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11179 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11180 break;
11181 case 2:
11182 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11183 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11184 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11185 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11186 break;
11187 case 4:
11188 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11189 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11190 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11191 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11192 break;
11193 case 12:
11194 if (!Subtarget->hasLDSLoadB96_B128())
11195 return SDValue();
11196 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11197 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11198 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11199 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11200 break;
11201 case 16:
11202 if (!Subtarget->hasLDSLoadB96_B128())
11203 return SDValue();
11204 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11205 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11206 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11207 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
11208 break;
11209 }
11210
11211 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11212
11214
11215 if (HasVIndex && HasVOffset)
11216 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
11217 {Op.getOperand(5), // VIndex
11218 VOffset}));
11219 else if (HasVIndex)
11220 Ops.push_back(Op.getOperand(5));
11221 else if (HasVOffset)
11222 Ops.push_back(VOffset);
11223
11224 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11225 Ops.push_back(Rsrc);
11226 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
11227 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
11228 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
11229 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
11230 Ops.push_back(DAG.getTargetConstant(
11231 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
11232 DL, MVT::i8)); // cpol
11233 Ops.push_back(DAG.getTargetConstant(
11234 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
11235 ? 1
11236 : 0,
11237 DL, MVT::i8)); // swz
11238 Ops.push_back(M0Val.getValue(0)); // Chain
11239 Ops.push_back(M0Val.getValue(1)); // Glue
11240
11241 auto *M = cast<MemSDNode>(Op);
11242 MachineMemOperand *LoadMMO = M->getMemOperand();
11243 // Don't set the offset value here because the pointer points to the base of
11244 // the buffer.
11245 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11246
11247 MachinePointerInfo StorePtrI = LoadPtrI;
11248 LoadPtrI.V = PoisonValue::get(
11249 PointerType::get(*DAG.getContext(), AMDGPUAS::BUFFER_RESOURCE));
11250 LoadPtrI.Offset = Op->getConstantOperandVal(7 + OpOffset);
11251 StorePtrI.V = nullptr;
11252 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
11253 auto F = LoadMMO->getFlags() &
11254 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
11255 LoadMMO =
11256 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
11257 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11258
11259 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11260 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
11261 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11262
11263 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
11264 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11265
11266 return SDValue(Load, 0);
11267 }
11268 // Buffers are handled by LowerBufferFatPointers, and we're going to go
11269 // for "trust me" that the remaining cases are global pointers until
11270 // such time as we can put two mem operands on an intrinsic.
11271 case Intrinsic::amdgcn_load_to_lds:
11272 case Intrinsic::amdgcn_global_load_lds: {
11273 if (!Subtarget->hasVMemToLDSLoad())
11274 return SDValue();
11275
11276 unsigned Opc;
11277 unsigned Size = Op->getConstantOperandVal(4);
11278 switch (Size) {
11279 default:
11280 return SDValue();
11281 case 1:
11282 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11283 break;
11284 case 2:
11285 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11286 break;
11287 case 4:
11288 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11289 break;
11290 case 12:
11291 if (!Subtarget->hasLDSLoadB96_B128())
11292 return SDValue();
11293 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11294 break;
11295 case 16:
11296 if (!Subtarget->hasLDSLoadB96_B128())
11297 return SDValue();
11298 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11299 break;
11300 }
11301
11302 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11303
11305
11306 SDValue Addr = Op.getOperand(2); // Global ptr
11307 SDValue VOffset;
11308 // Try to split SAddr and VOffset. Global and LDS pointers share the same
11309 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
11310 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
11311 SDValue LHS = Addr.getOperand(0);
11312 SDValue RHS = Addr.getOperand(1);
11313
11314 if (LHS->isDivergent())
11315 std::swap(LHS, RHS);
11316
11317 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
11318 RHS.getOperand(0).getValueType() == MVT::i32) {
11319 // add (i64 sgpr), (zero_extend (i32 vgpr))
11320 Addr = LHS;
11321 VOffset = RHS.getOperand(0);
11322 }
11323 }
11324
11325 Ops.push_back(Addr);
11326 if (!Addr->isDivergent()) {
11328 if (!VOffset)
11329 VOffset =
11330 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
11331 DAG.getTargetConstant(0, DL, MVT::i32)),
11332 0);
11333 Ops.push_back(VOffset);
11334 }
11335
11336 Ops.push_back(Op.getOperand(5)); // Offset
11337 Ops.push_back(Op.getOperand(6)); // CPol
11338 Ops.push_back(M0Val.getValue(0)); // Chain
11339 Ops.push_back(M0Val.getValue(1)); // Glue
11340
11341 auto *M = cast<MemSDNode>(Op);
11342 MachineMemOperand *LoadMMO = M->getMemOperand();
11343 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11344 LoadPtrI.Offset = Op->getConstantOperandVal(5);
11345 MachinePointerInfo StorePtrI = LoadPtrI;
11346 LoadPtrI.V = PoisonValue::get(
11350 auto F = LoadMMO->getFlags() &
11352 LoadMMO =
11354 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11355 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11356 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
11357 LoadMMO->getAAInfo());
11358
11359 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11360 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11361
11362 return SDValue(Load, 0);
11363 }
11364 case Intrinsic::amdgcn_end_cf:
11365 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
11366 Op->getOperand(2), Chain),
11367 0);
11368 case Intrinsic::amdgcn_s_barrier_init:
11369 case Intrinsic::amdgcn_s_barrier_signal_var: {
11370 // These two intrinsics have two operands: the barrier pointer and the member count
11371 SDValue Chain = Op->getOperand(0);
11373 SDValue BarOp = Op->getOperand(2);
11374 SDValue CntOp = Op->getOperand(3);
11375 SDValue M0Val;
11376 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11377 ? AMDGPU::S_BARRIER_INIT_M0
11378 : AMDGPU::S_BARRIER_SIGNAL_M0;
11379 // extract the BarrierID from bits 4-9 of BarOp
11380 SDValue BarID;
11381 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11382 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11383 BarID =
11384 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
11385 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11386 0);
11387 // Member count should be put into M0[ShAmt:+6]
11388 // Barrier ID should be put into M0[5:0]
11389 M0Val =
11390 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
11391 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11392 0);
11393 constexpr unsigned ShAmt = 16;
11394 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, M0Val,
11395 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
11396
11397 M0Val = SDValue(
11398 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
11399
11400 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11401
11402 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11403 return SDValue(NewMI, 0);
11404 }
11405 case Intrinsic::amdgcn_s_barrier_join: {
11406 // This intrinsic has one operand: the barrier pointer
11407 SDValue Chain = Op->getOperand(0);
11409 SDValue BarOp = Op->getOperand(2);
11410 unsigned Opc;
11411
11412 if (isa<ConstantSDNode>(BarOp)) {
11413 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
11414 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11415
11416 // extract the BarrierID from bits 4-9 of the immediate
11417 unsigned BarID = (BarVal >> 4) & 0x3F;
11418 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
11419 Ops.push_back(K);
11420 Ops.push_back(Chain);
11421 } else {
11422 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11423
11424 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
11425 SDValue M0Val;
11426 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11427 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11428 M0Val =
11429 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
11430 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11431 0);
11432 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11433 }
11434
11435 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11436 return SDValue(NewMI, 0);
11437 }
11438 case Intrinsic::amdgcn_s_prefetch_data: {
11439 // For a non-global address space, preserve the chain and remove the call.
11441 return Op.getOperand(0);
11442 return Op;
11443 }
11444 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11445 SDValue Ops[] = {
11446 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
11447 Op.getOperand(3), // offset
11448 Op.getOperand(4), // length
11449 };
11450
11451 MemSDNode *M = cast<MemSDNode>(Op);
11453 Op->getVTList(), Ops, M->getMemoryVT(),
11454 M->getMemOperand());
11455 }
11456 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11457 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11458 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11459 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11460 SDValue Chain = Op->getOperand(0);
11461 SDValue Ptr = Op->getOperand(2);
11462 SDValue Val = Op->getOperand(3);
11463 return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
11464 Ptr, MII->getMemOperand());
11465 }
11466 default: {
11467 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11469 return lowerImage(Op, ImageDimIntr, DAG, true);
11470
11471 return Op;
11472 }
11473 }
11474}
11475
11476// Return whether the operation has NoUnsignedWrap property.
11477static bool isNoUnsignedWrap(SDValue Addr) {
11478 return (Addr.getOpcode() == ISD::ADD &&
11479 Addr->getFlags().hasNoUnsignedWrap()) ||
11480 Addr->getOpcode() == ISD::OR;
11481}
11482
11484 EVT PtrVT) const {
11485 return UseSelectionDAGPTRADD && PtrVT == MVT::i64;
11486}
11487
11488// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
11489// offset (the offset that is included in bounds checking and swizzling, to be
11490// split between the instruction's voffset and immoffset fields) and soffset
11491// (the offset that is excluded from bounds checking and swizzling, to go in
11492// the instruction's soffset field). This function takes the first kind of
11493// offset and figures out how to split it between voffset and immoffset.
11494std::pair<SDValue, SDValue>
11495SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
11496 SDLoc DL(Offset);
11497 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
11498 SDValue N0 = Offset;
11499 ConstantSDNode *C1 = nullptr;
11500
11501 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
11502 N0 = SDValue();
11503 else if (DAG.isBaseWithConstantOffset(N0)) {
11504 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
11505 // being added, so we can only safely match a 32-bit addition with no
11506 // unsigned overflow.
11507 bool CheckNUW = AMDGPU::isGFX1250(*Subtarget);
11508 if (!CheckNUW || isNoUnsignedWrap(N0)) {
11509 C1 = cast<ConstantSDNode>(N0.getOperand(1));
11510 N0 = N0.getOperand(0);
11511 }
11512 }
11513
11514 if (C1) {
11515 unsigned ImmOffset = C1->getZExtValue();
11516 // If the immediate value is too big for the immoffset field, put only bits
11517 // that would normally fit in the immoffset field. The remaining value that
11518 // is copied/added for the voffset field is a large power of 2, and it
11519 // stands more chance of being CSEd with the copy/add for another similar
11520 // load/store.
11521 // However, do not round down if the remaining overflow value is negative as
11522 // a signed 32-bit number, as it appears to be illegal to have a negative
11523 // offset in the vgpr, even if adding the immediate offset makes it positive.
11524 unsigned Overflow = ImmOffset & ~MaxImm;
11525 ImmOffset -= Overflow;
11526 if ((int32_t)Overflow < 0) {
11527 Overflow += ImmOffset;
11528 ImmOffset = 0;
11529 }
11530 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
11531 if (Overflow) {
11532 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
11533 if (!N0)
11534 N0 = OverflowVal;
11535 else {
11536 SDValue Ops[] = {N0, OverflowVal};
11537 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
11538 }
11539 }
11540 }
11541 if (!N0)
11542 N0 = DAG.getConstant(0, DL, MVT::i32);
11543 if (!C1)
11544 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
11545 return {N0, SDValue(C1, 0)};
11546}
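// Worked example of the split above (illustrative only; assumes MaxImm is
// 4095 on the target in question): a combined offset of 4100 yields
//   Overflow  = 4100 & ~4095 = 4096  -> added into the voffset operand
//   ImmOffset = 4100 - 4096  = 4     -> placed in the instruction's immoffset
// If Overflow were negative as a signed 32-bit value, the entire amount would
// instead go to voffset and the immoffset would be 0.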
11547
11548// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
11549// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
11550// pointed to by Offsets.
11551void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
11552 SelectionDAG &DAG, SDValue *Offsets,
11553 Align Alignment) const {
11554 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11555 SDLoc DL(CombinedOffset);
11556 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
11557 uint32_t Imm = C->getZExtValue();
11558 uint32_t SOffset, ImmOffset;
11559 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11560 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
11561 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11562 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11563 return;
11564 }
11565 }
11566 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
11567 SDValue N0 = CombinedOffset.getOperand(0);
11568 SDValue N1 = CombinedOffset.getOperand(1);
11569 uint32_t SOffset, ImmOffset;
11570 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
11571 if (Offset >= 0 &&
11572 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
11573 Offsets[0] = N0;
11574 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11575 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11576 return;
11577 }
11578 }
11579
11580 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11581 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
11582 : DAG.getConstant(0, DL, MVT::i32);
11583
11584 Offsets[0] = CombinedOffset;
11585 Offsets[1] = SOffsetZero;
11586 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
11587}
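// Summary of the three cases above: a plain constant combined offset becomes
// {0, SOffset, ImmOffset}; a base plus a non-negative constant becomes
// {base, SOffset, ImmOffset} when splitMUBUFOffset succeeds; otherwise the
// whole value goes in voffset with a zero (or SGPR_NULL) soffset and a zero
// instoffset.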
11588
11589SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11590 SelectionDAG &DAG) const {
11591 if (!MaybePointer.getValueType().isScalarInteger())
11592 return MaybePointer;
11593
11594 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
11595 return Rsrc;
11596}
11597
11598 // Wrap a global or flat pointer into a buffer resource (rsrc) using the
11599 // flags specified in the intrinsic.
11600SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
11601 SelectionDAG &DAG) const {
11602 SDLoc Loc(Op);
11603
11604 SDValue Pointer = Op->getOperand(1);
11605 SDValue Stride = Op->getOperand(2);
11606 SDValue NumRecords = Op->getOperand(3);
11607 SDValue Flags = Op->getOperand(4);
11608
11609 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11610 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
11611 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
11612 std::optional<uint32_t> ConstStride = std::nullopt;
11613 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
11614 ConstStride = ConstNode->getZExtValue();
11615
11616 SDValue NewHighHalf = Masked;
11617 if (!ConstStride || *ConstStride != 0) {
11618 SDValue ShiftedStride;
11619 if (ConstStride) {
11620 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
11621 } else {
11622 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
11623 ShiftedStride =
11624 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11625 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
11626 }
11627 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
11628 }
11629
11630 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
11631 NewHighHalf, NumRecords, Flags);
11632 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
11633 return RsrcPtr;
11634}
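// Sketch of the v4i32 rsrc words as assembled above (only the fields the code
// writes are shown):
//   word0 = pointer[31:0]
//   word1 = pointer[47:32] | (stride << 16)  (the OR is skipped for a known
//                                             zero stride)
//   word2 = NumRecords
//   word3 = Flags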
11635
11636 // Handle 8-bit and 16-bit buffer loads
11637SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
11638 EVT LoadVT, SDLoc DL,
11640 MachineMemOperand *MMO,
11641 bool IsTFE) const {
11642 EVT IntVT = LoadVT.changeTypeToInteger();
11643
11644 if (IsTFE) {
11645 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
11648 MachineFunction &MF = DAG.getMachineFunction();
11649 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
11650 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
11651 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
11652 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11653 DAG.getConstant(1, DL, MVT::i32));
11654 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11655 DAG.getConstant(0, DL, MVT::i32));
11656 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
11657 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
11658 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
11659 }
11660
11661 unsigned Opc = LoadVT.getScalarType() == MVT::i8
11664
11665 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
11666 SDValue BufferLoad =
11667 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
11668 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
11669 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
11670
11671 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
11672}
11673
11674 // Handle 8-bit and 16-bit buffer stores
11675SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
11676 EVT VDataType, SDLoc DL,
11677 SDValue Ops[],
11678 MemSDNode *M) const {
11679 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11680 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
11681
11682 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
11683 Ops[1] = BufferStoreExt;
11684 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
11685 : AMDGPUISD::BUFFER_STORE_SHORT;
11686 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
11687 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
11688 M->getMemOperand());
11689}
11690
11692 SDValue Op, const SDLoc &SL, EVT VT) {
11693 if (VT.bitsLT(Op.getValueType()))
11694 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
11695
11696 switch (ExtType) {
11697 case ISD::SEXTLOAD:
11698 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
11699 case ISD::ZEXTLOAD:
11700 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
11701 case ISD::EXTLOAD:
11702 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
11703 case ISD::NON_EXTLOAD:
11704 return Op;
11705 }
11706
11707 llvm_unreachable("invalid ext type");
11708}
11709
11710 // Try to turn 8- and 16-bit scalar loads into SMEM-eligible 32-bit loads.
11711// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
11712SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
11713 DAGCombinerInfo &DCI) const {
11714 SelectionDAG &DAG = DCI.DAG;
11715 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
11716 return SDValue();
11717
11718 // FIXME: Constant loads should all be marked invariant.
11719 unsigned AS = Ld->getAddressSpace();
11720 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
11722 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
11723 return SDValue();
11724
11725 // Don't do this early, since it may interfere with adjacent load merging for
11726 // illegal types. We can avoid losing alignment information for exotic types
11727 // pre-legalize.
11728 EVT MemVT = Ld->getMemoryVT();
11729 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
11730 MemVT.getSizeInBits() >= 32)
11731 return SDValue();
11732
11733 SDLoc SL(Ld);
11734
11735 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
11736 "unexpected vector extload");
11737
11738 // TODO: Drop only high part of range.
11739 SDValue Ptr = Ld->getBasePtr();
11740 SDValue NewLoad = DAG.getLoad(
11741 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
11742 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
11743 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
11744 nullptr); // Drop ranges
11745
11746 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
11747 if (MemVT.isFloatingPoint()) {
11749 "unexpected fp extload");
11750 TruncVT = MemVT.changeTypeToInteger();
11751 }
11752
11753 SDValue Cvt = NewLoad;
11754 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
11755 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
11756 DAG.getValueType(TruncVT));
11757 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
11759 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
11760 } else {
11762 }
11763
11764 EVT VT = Ld->getValueType(0);
11765 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
11766
11767 DCI.AddToWorklist(Cvt.getNode());
11768
11769 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
11770 // the appropriate extension from the 32-bit load.
11771 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
11772 DCI.AddToWorklist(Cvt.getNode());
11773
11774 // Handle conversion back to floating point if necessary.
11775 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
11776
11777 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
11778}
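// Example of the transform above (sketch): a uniform, 4-byte-aligned
//   (zextload i8, constant-address-space ptr)
// becomes a 32-bit load followed by getZeroExtendInReg (conceptually an AND
// with 0xff), with the original extension re-applied afterwards so exotic
// cases such as i16 -> i64 extloads still produce the right value type.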
11779
11781 const SIMachineFunctionInfo &Info) {
11782 // TODO: Should check if the address can definitely not access stack.
11783 if (Info.isEntryFunction())
11784 return Info.getUserSGPRInfo().hasFlatScratchInit();
11785 return true;
11786}
11787
11788SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
11789 SDLoc DL(Op);
11790 LoadSDNode *Load = cast<LoadSDNode>(Op);
11791 ISD::LoadExtType ExtType = Load->getExtensionType();
11792 EVT MemVT = Load->getMemoryVT();
11793 MachineMemOperand *MMO = Load->getMemOperand();
11794
11795 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
11796 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
11797 return SDValue();
11798
11799 // FIXME: Copied from PPC
11800 // First, load into 32 bits, then truncate to 1 bit.
11801
11802 SDValue Chain = Load->getChain();
11803 SDValue BasePtr = Load->getBasePtr();
11804
11805 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
11806
11807 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
11808 RealMemVT, MMO);
11809
11810 if (!MemVT.isVector()) {
11811 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
11812 NewLD.getValue(1)};
11813
11814 return DAG.getMergeValues(Ops, DL);
11815 }
11816
11818 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
11819 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
11820 DAG.getConstant(I, DL, MVT::i32));
11821
11822 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
11823 }
11824
11825 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
11826
11827 return DAG.getMergeValues(Ops, DL);
11828 }
11829
11830 if (!MemVT.isVector())
11831 return SDValue();
11832
11833 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
11834 "Custom lowering for non-i32 vectors hasn't been implemented.");
11835
11836 Align Alignment = Load->getAlign();
11837 unsigned AS = Load->getAddressSpace();
11838 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
11839 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
11840 return SplitVectorLoad(Op, DAG);
11841 }
11842
11843 MachineFunction &MF = DAG.getMachineFunction();
11844 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11845 // If there is a possibility that flat instructions access scratch memory,
11846 // then we need to use the same legalization rules we use for private.
11847 if (AS == AMDGPUAS::FLAT_ADDRESS &&
11848 !Subtarget->hasMultiDwordFlatScratchAddressing())
11849 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
11852
11853 unsigned NumElements = MemVT.getVectorNumElements();
11854
11855 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11857 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
11858 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
11860 if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
11861 Alignment >= Align(4) && NumElements < 32) {
11862 if (MemVT.isPow2VectorType() ||
11863 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
11864 return SDValue();
11865 return WidenOrSplitVectorLoad(Op, DAG);
11866 }
11867 // Non-uniform loads will be selected to MUBUF instructions, so they
11868 // have the same legalization requirements as global and private
11869 // loads.
11870 //
11871 }
11872 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11875 if (NumElements > 4)
11876 return SplitVectorLoad(Op, DAG);
11877 // v3 loads not supported on SI.
11878 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11879 return WidenOrSplitVectorLoad(Op, DAG);
11880
11881 // v3 and v4 loads are supported for private and global memory.
11882 return SDValue();
11883 }
11884 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11885 // Depending on the setting of the private_element_size field in the
11886 // resource descriptor, we can only make private accesses up to a certain
11887 // size.
11888 switch (Subtarget->getMaxPrivateElementSize()) {
11889 case 4: {
11890 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
11891 return DAG.getMergeValues({Op0, Op1}, DL);
11892 }
11893 case 8:
11894 if (NumElements > 2)
11895 return SplitVectorLoad(Op, DAG);
11896 return SDValue();
11897 case 16:
11898 // Same as global/flat
11899 if (NumElements > 4)
11900 return SplitVectorLoad(Op, DAG);
11901 // v3 loads not supported on SI.
11902 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11903 return WidenOrSplitVectorLoad(Op, DAG);
11904
11905 return SDValue();
11906 default:
11907 llvm_unreachable("unsupported private_element_size");
11908 }
11909 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11910 unsigned Fast = 0;
11911 auto Flags = Load->getMemOperand()->getFlags();
11913 Load->getAlign(), Flags, &Fast) &&
11914 Fast > 1)
11915 return SDValue();
11916
11917 if (MemVT.isVector())
11918 return SplitVectorLoad(Op, DAG);
11919 }
11920
11922 MemVT, *Load->getMemOperand())) {
11923 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
11924 return DAG.getMergeValues({Op0, Op1}, DL);
11925 }
11926
11927 return SDValue();
11928}
11929
11930SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
11931 EVT VT = Op.getValueType();
11932 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
11933 VT.getSizeInBits() == 512)
11934 return splitTernaryVectorOp(Op, DAG);
11935
11936 assert(VT.getSizeInBits() == 64);
11937
11938 SDLoc DL(Op);
11939 SDValue Cond = DAG.getFreeze(Op.getOperand(0));
11940
11941 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
11942 SDValue One = DAG.getConstant(1, DL, MVT::i32);
11943
11944 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
11945 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
11946
11947 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
11948 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
11949
11950 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
11951
11952 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
11953 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
11954
11955 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
11956
11957 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
11958 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
11959}
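// Sketch of the 64-bit path above: the i64/f64 select is bitcast to v2i32 and
// performed as two independent i32 selects on the low and high halves; the
// halves are then rebuilt into a v2i32 and bitcast back to the original type.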
11960
11961// Catch division cases where we can use shortcuts with rcp and rsq
11962// instructions.
11963SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
11964 SelectionDAG &DAG) const {
11965 SDLoc SL(Op);
11966 SDValue LHS = Op.getOperand(0);
11967 SDValue RHS = Op.getOperand(1);
11968 EVT VT = Op.getValueType();
11969 const SDNodeFlags Flags = Op->getFlags();
11970
11971 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
11972
11973 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
11974 // Without !fpmath accuracy information, we can't do more because we don't
11975 // know exactly whether rcp is accurate enough to meet the !fpmath requirement.
11976 // f16 is always accurate enough
11977 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
11978 return SDValue();
11979
11980 if (CLHS->isExactlyValue(1.0)) {
11981 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
11982 // the CI documentation they have a worst-case error of 1 ulp.
11983 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
11984 // use it as long as we aren't trying to use denormals.
11985 //
11986 // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
11987
11988 // 1.0 / sqrt(x) -> rsq(x)
11989
11990 // XXX - Is afn sufficient to do this for f64? The maximum ULP
11991 // error seems really high at 2^29 ULP.
11992 // 1.0 / x -> rcp(x)
11993 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
11994 }
11995
11996 // Same as for 1.0, but expand the sign out of the constant.
11997 if (CLHS->isExactlyValue(-1.0)) {
11998 // -1.0 / x -> rcp (fneg x)
11999 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
12000 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
12001 }
12002 }
12003
12004 // For f16 and bf16 require afn or arcp.
12005 // For f32 require afn.
12006 if (!AllowInaccurateRcp &&
12007 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
12008 return SDValue();
12009
12010 // Turn into multiply by the reciprocal.
12011 // x / y -> x * (1.0 / y)
12012 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12013 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
12014}
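// Summary of the shortcuts above:
//    1.0 / x -> rcp(x)        (for f16/bf16 even without afn)
//   -1.0 / x -> rcp(-x)       (likewise)
//      a / b -> a * rcp(b)    (requires afn, or arcp for f16/bf16)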
12015
12016SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
12017 SelectionDAG &DAG) const {
12018 SDLoc SL(Op);
12019 SDValue X = Op.getOperand(0);
12020 SDValue Y = Op.getOperand(1);
12021 EVT VT = Op.getValueType();
12022 const SDNodeFlags Flags = Op->getFlags();
12023
12024 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
12025 if (!AllowInaccurateDiv)
12026 return SDValue();
12027
12028 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
12029 SDValue One = DAG.getConstantFP(1.0, SL, VT);
12030
12031 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
12032 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12033
12034 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
12035 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12036 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
12037 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
12038 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
12039 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
12040}
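// The sequence above is a Newton-Raphson refinement of rcp(y):
//   r1 = r0 + r0 * (1 - y*r0),  r2 = r1 + r1 * (1 - y*r1)
// followed by one residual correction of the quotient:
//   q = x * r2;  result = q + r2 * (x - y*q)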
12041
12042static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12043 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
12044 SDNodeFlags Flags) {
12045 if (GlueChain->getNumValues() <= 1) {
12046 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
12047 }
12048
12049 assert(GlueChain->getNumValues() == 3);
12050
12051 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12052 switch (Opcode) {
12053 default:
12054 llvm_unreachable("no chain equivalent for opcode");
12055 case ISD::FMUL:
12056 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12057 break;
12058 }
12059
12060 return DAG.getNode(Opcode, SL, VTList,
12061 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
12062 Flags);
12063}
12064
12065static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12066 EVT VT, SDValue A, SDValue B, SDValue C,
12067 SDValue GlueChain, SDNodeFlags Flags) {
12068 if (GlueChain->getNumValues() <= 1) {
12069 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
12070 }
12071
12072 assert(GlueChain->getNumValues() == 3);
12073
12074 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12075 switch (Opcode) {
12076 default:
12077 llvm_unreachable("no chain equivalent for opcode");
12078 case ISD::FMA:
12079 Opcode = AMDGPUISD::FMA_W_CHAIN;
12080 break;
12081 }
12082
12083 return DAG.getNode(Opcode, SL, VTList,
12084 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
12085 Flags);
12086}
12087
12088SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
12089 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12090 return FastLowered;
12091
12092 SDLoc SL(Op);
12093 EVT VT = Op.getValueType();
12094 SDValue LHS = Op.getOperand(0);
12095 SDValue RHS = Op.getOperand(1);
12096
12097 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
12098 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
12099
12100 if (VT == MVT::bf16) {
12101 SDValue ExtDiv =
12102 DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
12103 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
12104 DAG.getTargetConstant(0, SL, MVT::i32));
12105 }
12106
12107 assert(VT == MVT::f16);
12108
12109 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
12110 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
12111 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
12112 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
12113 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12114 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
12115 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12116 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
12117 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
12118 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
12119 // q16.u = opx(V_CVT_F16_F32, q32.u);
12120 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
12121
12122 // We will use ISD::FMA on targets that don't support ISD::FMAD.
12123 unsigned FMADOpCode =
12125 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
12126 SDValue Rcp =
12127 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
12128 SDValue Quot =
12129 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
12130 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12131 Op->getFlags());
12132 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
12133 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12134 Op->getFlags());
12135 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
12136 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
12137 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
12138 DAG.getConstant(0xff800000, SL, MVT::i32));
12139 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
12140 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
12141 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
12142 DAG.getTargetConstant(0, SL, MVT::i32));
12143 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
12144 Op->getFlags());
12145}
12146
12147// Faster 2.5 ULP division that does not support denormals.
12148SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
12149 SDNodeFlags Flags = Op->getFlags();
12150 SDLoc SL(Op);
12151 SDValue LHS = Op.getOperand(1);
12152 SDValue RHS = Op.getOperand(2);
12153
12154 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
12155
12156 const APFloat K0Val(0x1p+96f);
12157 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
12158
12159 const APFloat K1Val(0x1p-32f);
12160 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
12161
12162 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12163
12164 EVT SetCCVT =
12165 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
12166
12167 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
12168
12169 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
12170
12171 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
12172
12173 // rcp does not support denormals.
12174 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
12175
12176 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
12177
12178 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
12179}
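// The scaling above handles very large denominators: when |rhs| > 2^96 the
// denominator is pre-scaled by 2^-32 before the rcp (keeping rcp's result
// well away from the denormal range it does not support), and the quotient is
// multiplied by the same 2^-32 afterwards, so the overall result is still
// lhs/rhs.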
12180
12181// Returns immediate value for setting the F32 denorm mode when using the
12182// S_DENORM_MODE instruction.
12185 const GCNSubtarget *ST) {
12186 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
12187 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
12188 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
12189 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
12190}
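// The immediate built above places the requested f32 denorm setting in the
// low two bits and preserves the function's current f64/f16 setting in the
// next two bits (hence the << 2).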
12191
12192SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
12193 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12194 return FastLowered;
12195
12196 // The selection matcher assumes anything with a chain selects to a
12197 // mayRaiseFPException machine instruction. Since we're introducing a chain
12198 // here, we need to explicitly report nofpexcept for the regular fdiv
12199 // lowering.
12200 SDNodeFlags Flags = Op->getFlags();
12201 Flags.setNoFPExcept(true);
12202
12203 SDLoc SL(Op);
12204 SDValue LHS = Op.getOperand(0);
12205 SDValue RHS = Op.getOperand(1);
12206
12207 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12208
12209 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
12210
12211 SDValue DenominatorScaled =
12212 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
12213 SDValue NumeratorScaled =
12214 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
12215
12216 // Denominator is scaled to not be denormal, so using rcp is ok.
12217 SDValue ApproxRcp =
12218 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
12219 SDValue NegDivScale0 =
12220 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
12221
12222 using namespace AMDGPU::Hwreg;
12223 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
12224 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
12225
12226 const MachineFunction &MF = DAG.getMachineFunction();
12227 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12228 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
12229
12230 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
12231 const bool HasDynamicDenormals =
12232 (DenormMode.Input == DenormalMode::Dynamic) ||
12233 (DenormMode.Output == DenormalMode::Dynamic);
12234
12235 SDValue SavedDenormMode;
12236
12237 if (!PreservesDenormals) {
12238 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
12239 // lowering. The chain dependence is insufficient, and we need glue. We do
12240 // not need the glue variants in a strictfp function.
12241
12242 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12243
12244 SDValue Glue = DAG.getEntryNode();
12245 if (HasDynamicDenormals) {
12246 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
12247 DAG.getVTList(MVT::i32, MVT::Glue),
12248 {BitField, Glue});
12249 SavedDenormMode = SDValue(GetReg, 0);
12250
12251 Glue = DAG.getMergeValues(
12252 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
12253 }
12254
12255 SDNode *EnableDenorm;
12256 if (Subtarget->hasDenormModeInst()) {
12257 const SDValue EnableDenormValue =
12259
12260 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
12261 EnableDenormValue)
12262 .getNode();
12263 } else {
12264 const SDValue EnableDenormValue =
12265 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
12266 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12267 {EnableDenormValue, BitField, Glue});
12268 }
12269
12270 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
12271 SDValue(EnableDenorm, 1)};
12272
12273 NegDivScale0 = DAG.getMergeValues(Ops, SL);
12274 }
12275
12276 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
12277 ApproxRcp, One, NegDivScale0, Flags);
12278
12279 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
12280 ApproxRcp, Fma0, Flags);
12281
12282 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
12283 Fma1, Flags);
12284
12285 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
12286 NumeratorScaled, Mul, Flags);
12287
12288 SDValue Fma3 =
12289 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
12290
12291 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
12292 NumeratorScaled, Fma3, Flags);
12293
12294 if (!PreservesDenormals) {
12295 SDNode *DisableDenorm;
12296 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12297 const SDValue DisableDenormValue = getSPDenormModeValue(
12298 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
12299
12300 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12301 DisableDenorm =
12302 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
12303 Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
12304 .getNode();
12305 } else {
12306 assert(HasDynamicDenormals == (bool)SavedDenormMode);
12307 const SDValue DisableDenormValue =
12308 HasDynamicDenormals
12309 ? SavedDenormMode
12310 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
12311
12312 DisableDenorm = DAG.getMachineNode(
12313 AMDGPU::S_SETREG_B32, SL, MVT::Other,
12314 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
12315 }
12316
12317 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
12318 SDValue(DisableDenorm, 0), DAG.getRoot());
12319 DAG.setRoot(OutputChain);
12320 }
12321
12322 SDValue Scale = NumeratorScaled.getValue(1);
12323 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
12324 {Fma4, Fma1, Fma3, Scale}, Flags);
12325
12326 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
12327}
12328
12329SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
12330 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
12331 return FastLowered;
12332
12333 SDLoc SL(Op);
12334 SDValue X = Op.getOperand(0);
12335 SDValue Y = Op.getOperand(1);
12336
12337 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
12338
12339 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
12340
12341 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
12342
12343 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
12344
12345 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
12346
12347 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
12348
12349 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
12350
12351 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
12352
12353 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
12354
12355 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
12356 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
12357
12358 SDValue Fma4 =
12359 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
12360
12361 SDValue Scale;
12362
12363 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12364 // Work around a hardware bug on SI where the condition output from div_scale
12365 // is not usable.
12366
12367 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
12368
12369 // Figure out which scale to use for div_fmas.
12370 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
12371 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
12372 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
12373 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
12374
12375 SDValue NumHi =
12376 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
12377 SDValue DenHi =
12378 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
12379
12380 SDValue Scale0Hi =
12381 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
12382 SDValue Scale1Hi =
12383 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
12384
12385 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
12386 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
12387 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
12388 } else {
12389 Scale = DivScale1.getValue(1);
12390 }
12391
12392 SDValue Fmas =
12393 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
12394
12395 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
12396}
12397
12398SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
12399 EVT VT = Op.getValueType();
12400
12401 if (VT == MVT::f32)
12402 return LowerFDIV32(Op, DAG);
12403
12404 if (VT == MVT::f64)
12405 return LowerFDIV64(Op, DAG);
12406
12407 if (VT == MVT::f16 || VT == MVT::bf16)
12408 return LowerFDIV16(Op, DAG);
12409
12410 llvm_unreachable("Unexpected type for fdiv");
12411}
12412
12413SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
12414 SDLoc dl(Op);
12415 SDValue Val = Op.getOperand(0);
12416 EVT VT = Val.getValueType();
12417 EVT ResultExpVT = Op->getValueType(1);
12418 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12419
12420 SDValue Mant = DAG.getNode(
12422 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
12423
12424 SDValue Exp = DAG.getNode(
12425 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
12426 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
12427
12428 if (Subtarget->hasFractBug()) {
12429 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
12430 SDValue Inf =
12432
12433 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
12434 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
12435 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
12436 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
12437 }
12438
12439 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
12440 return DAG.getMergeValues({Mant, CastExp}, dl);
12441}
12442
12443SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
12444 SDLoc DL(Op);
12445 StoreSDNode *Store = cast<StoreSDNode>(Op);
12446 EVT VT = Store->getMemoryVT();
12447
12448 if (VT == MVT::i1) {
12449 return DAG.getTruncStore(
12450 Store->getChain(), DL,
12451 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
12452 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
12453 }
12454
12455 assert(VT.isVector() &&
12456 Store->getValue().getValueType().getScalarType() == MVT::i32);
12457
12458 unsigned AS = Store->getAddressSpace();
12459 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
12460 Store->getAlign().value() < VT.getStoreSize() &&
12461 VT.getSizeInBits() > 32) {
12462 return SplitVectorStore(Op, DAG);
12463 }
12464
12465 MachineFunction &MF = DAG.getMachineFunction();
12466 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12467 // If there is a possibility that flat instructions access scratch memory,
12468 // then we need to use the same legalization rules we use for private.
12469 if (AS == AMDGPUAS::FLAT_ADDRESS &&
12470 !Subtarget->hasMultiDwordFlatScratchAddressing())
12471 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
12474
12475 unsigned NumElements = VT.getVectorNumElements();
12477 if (NumElements > 4)
12478 return SplitVectorStore(Op, DAG);
12479 // v3 stores not supported on SI.
12480 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12481 return SplitVectorStore(Op, DAG);
12482
12484 VT, *Store->getMemOperand()))
12485 return expandUnalignedStore(Store, DAG);
12486
12487 return SDValue();
12488 }
12489 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
12490 switch (Subtarget->getMaxPrivateElementSize()) {
12491 case 4:
12492 return scalarizeVectorStore(Store, DAG);
12493 case 8:
12494 if (NumElements > 2)
12495 return SplitVectorStore(Op, DAG);
12496 return SDValue();
12497 case 16:
12498 if (NumElements > 4 ||
12499 (NumElements == 3 && !Subtarget->enableFlatScratch()))
12500 return SplitVectorStore(Op, DAG);
12501 return SDValue();
12502 default:
12503 llvm_unreachable("unsupported private_element_size");
12504 }
12505 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
12506 unsigned Fast = 0;
12507 auto Flags = Store->getMemOperand()->getFlags();
12509 Store->getAlign(), Flags, &Fast) &&
12510 Fast > 1)
12511 return SDValue();
12512
12513 if (VT.isVector())
12514 return SplitVectorStore(Op, DAG);
12515
12516 return expandUnalignedStore(Store, DAG);
12517 }
12518
12519 // Probably an invalid store. If so, we'll end up emitting a selection error.
12520 return SDValue();
12521}
12522
12523// Avoid the full correct expansion for f32 sqrt when promoting from f16.
12524SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
12525 SDLoc SL(Op);
12526 assert(!Subtarget->has16BitInsts());
12527 SDNodeFlags Flags = Op->getFlags();
12528 SDValue Ext =
12529 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
12530
12531 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
12532 SDValue Sqrt =
12533 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
12534
12535 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
12536 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
12537}
12538
12539SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
12540 SDLoc DL(Op);
12541 SDNodeFlags Flags = Op->getFlags();
12542 MVT VT = Op.getValueType().getSimpleVT();
12543 const SDValue X = Op.getOperand(0);
12544
12545 if (allowApproxFunc(DAG, Flags)) {
12546 // The instruction is accurate to 1 ulp but ignores denormals.
12547 return DAG.getNode(
12549 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
12550 }
12551
12552 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
12553 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
12554
12555 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
12556
12557 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
12558
12559 SDValue SqrtX =
12560 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
12561
12562 SDValue SqrtS;
12563 if (needsDenormHandlingF32(DAG, X, Flags)) {
12564 SDValue SqrtID =
12565 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
12566 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
12567
12568 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
12569 SDValue SqrtSNextDownInt =
12570 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12571 DAG.getAllOnesConstant(DL, MVT::i32));
12572 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
12573
12574 SDValue NegSqrtSNextDown =
12575 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
12576
12577 SDValue SqrtVP =
12578 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
12579
12580 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12581 DAG.getConstant(1, DL, MVT::i32));
12582 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
12583
12584 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
12585 SDValue SqrtVS =
12586 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
12587
12588 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
12589 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
12590
12591 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
12592 Flags);
12593
12594 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
12595 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
12596 Flags);
12597 } else {
12598 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
12599
12600 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
12601
12602 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
12603 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
12604 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
12605
12606 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
12607 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
12608 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
12609
12610 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
12611 SDValue SqrtD =
12612 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
12613 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
12614 }
12615
12616 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
12617
12618 SDValue ScaledDown =
12619 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
12620
12621 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
12622 SDValue IsZeroOrInf =
12623 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12624 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12625
12626 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
12627}
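// Note on the scaling above: inputs below 2^-96 are multiplied by 2^32, so
// the computed square root is 2^16 times too large and is rescaled by the
// 0x1.0p-16 factor before the final zero/inf fixup.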
12628
12629SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
12630 // For the double type, the SQRT and RSQ instructions don't have the required
12631 // precision, so we apply Goldschmidt's algorithm to improve the result:
12632 //
12633 // y0 = rsq(x)
12634 // g0 = x * y0
12635 // h0 = 0.5 * y0
12636 //
12637 // r0 = 0.5 - h0 * g0
12638 // g1 = g0 * r0 + g0
12639 // h1 = h0 * r0 + h0
12640 //
12641 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
12642 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
12643 // h2 = h1 * r1 + h1
12644 //
12645 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
12646 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
12647 //
12648 // sqrt(x) = g3
12649
12650 SDNodeFlags Flags = Op->getFlags();
12651
12652 SDLoc DL(Op);
12653
12654 SDValue X = Op.getOperand(0);
12655 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
12656
12657 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
12658
12659 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
12660
12661 // Scale up input if it is too small.
12662 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
12663 SDValue ScaleUp =
12664 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
12665 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
12666
12667 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
12668
12669 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
12670
12671 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
12672 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
12673
12674 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
12675 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
12676
12677 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
12678
12679 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
12680
12681 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
12682 SDValue SqrtD0 =
12683 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
12684
12685 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
12686
12687 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
12688 SDValue SqrtD1 =
12689 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
12690
12691 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
12692
12693 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
12694 SDValue ScaleDown =
12695 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
12696 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
12697
12698 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
12699 // with finite only or nsz because rsq(+/-0) = +/-inf
12700
12701 // TODO: Check for DAZ and expand to subnormals
12702 SDValue IsZeroOrInf =
12703 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12704 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12705
12706 // If x is +INF, +0, or -0, use its original value
12707 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
12708 Flags);
12709}
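// Note on the scaling above: inputs below 0x1.0p-767 are scaled by 2^256 via
// FLDEXP, so the Goldschmidt result is 2^128 times too large and is brought
// back with an exponent adjustment of -128 before the zero/inf fixup.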
12710
12711SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
12712 SDLoc DL(Op);
12713 EVT VT = Op.getValueType();
12714 SDValue Arg = Op.getOperand(0);
12715 SDValue TrigVal;
12716
12717 // Propagate fast-math flags so that the multiply we introduce can be folded
12718 // if Arg is already the result of a multiply by a constant.
12719 auto Flags = Op->getFlags();
12720
12721 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
12722
12723 if (Subtarget->hasTrigReducedRange()) {
12724 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12725 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
12726 } else {
12727 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12728 }
12729
12730 switch (Op.getOpcode()) {
12731 case ISD::FCOS:
12732 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
12733 case ISD::FSIN:
12734 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
12735 default:
12736 llvm_unreachable("Wrong trig opcode");
12737 }
12738}
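// Sketch of the lowering above: the SIN_HW/COS_HW nodes expect an argument
// pre-multiplied by 1/(2*pi); on subtargets reporting hasTrigReducedRange()
// the product is additionally wrapped into [0, 1) with FRACT before use.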
12739
12740SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
12741 SelectionDAG &DAG) const {
12742 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
12743 assert(AtomicNode->isCompareAndSwap());
12744 unsigned AS = AtomicNode->getAddressSpace();
12745
12746 // No custom lowering required for local address space
12748 return Op;
12749
12750 // A non-local address space requires custom lowering for atomic compare and
12751 // swap; the new and old values are packed into a v2i32 (v2i64 for _X2).
12752 SDLoc DL(Op);
12753 SDValue ChainIn = Op.getOperand(0);
12754 SDValue Addr = Op.getOperand(1);
12755 SDValue Old = Op.getOperand(2);
12756 SDValue New = Op.getOperand(3);
12757 EVT VT = Op.getValueType();
12758 MVT SimpleVT = VT.getSimpleVT();
12759 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
12760
12761 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
12762 SDValue Ops[] = {ChainIn, Addr, NewOld};
12763
12765 Op->getVTList(), Ops, VT,
12766 AtomicNode->getMemOperand());
12767}
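// As lowered above, the data operand handed to the memory node is the packed
// two-element vector {New, Old}; for a 64-bit cmpxchg this is a v2i64,
// matching the _X2 form mentioned in the comment at the top of the function.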
12768
12769//===----------------------------------------------------------------------===//
12770// Custom DAG optimizations
12771//===----------------------------------------------------------------------===//
12772
12773SDValue
12774SITargetLowering::performUCharToFloatCombine(SDNode *N,
12775 DAGCombinerInfo &DCI) const {
12776 EVT VT = N->getValueType(0);
12777 EVT ScalarVT = VT.getScalarType();
12778 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
12779 return SDValue();
12780
12781 SelectionDAG &DAG = DCI.DAG;
12782 SDLoc DL(N);
12783
12784 SDValue Src = N->getOperand(0);
12785 EVT SrcVT = Src.getValueType();
12786
12787 // TODO: We could try to match extracting the higher bytes, which would be
12788 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
12789 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
12790 // about in practice.
12791 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
12792 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
12793 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
12794 DCI.AddToWorklist(Cvt.getNode());
12795
12796 // For the f16 case, fold to a cast to f32 and then cast back to f16.
12797 if (ScalarVT != MVT::f32) {
12798 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
12799 DAG.getTargetConstant(0, DL, MVT::i32));
12800 }
12801 return Cvt;
12802 }
12803 }
12804
12805 return SDValue();
12806}
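// Annotation (illustrative, not part of the upstream source): when the i32
// source of the conversion is known to have its upper 24 bits clear (e.g. it
// was masked with 0xff), the convert is rewritten to CVT_F32_UBYTE0, i.e. a
// single v_cvt_f32_ubyte0; for f16 results the f32 value is then rounded back
// down as shown above.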
12807
12808SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
12809 DAGCombinerInfo &DCI) const {
12810 SDValue MagnitudeOp = N->getOperand(0);
12811 SDValue SignOp = N->getOperand(1);
12812
12813 // The generic combine for fcopysign + fp cast is too conservative with
12814 // vectors, and also gets confused by the splitting we will perform here, so
12815 // peek through FP casts.
12816 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
12817 SignOp.getOpcode() == ISD::FP_ROUND)
12818 SignOp = SignOp.getOperand(0);
12819
12820 SelectionDAG &DAG = DCI.DAG;
12821 SDLoc DL(N);
12822 EVT SignVT = SignOp.getValueType();
12823
12824 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
12825 // lower half with a copy.
12826 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
12827 EVT MagVT = MagnitudeOp.getValueType();
12828
12829 unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
12830
12831 if (MagVT.getScalarType() == MVT::f64) {
12832 EVT F32VT = MagVT.isVector()
12833 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12834 : MVT::v2f32;
12835
12836 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
12837
12838 SmallVector<SDValue, 8> NewElts;
12839 for (unsigned I = 0; I != NumElts; ++I) {
12840 SDValue MagLo =
12841 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
12842 DAG.getConstant(2 * I, DL, MVT::i32));
12843 SDValue MagHi =
12844 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
12845 DAG.getConstant(2 * I + 1, DL, MVT::i32));
12846
12847 SDValue SignOpElt =
12848 MagVT.isVector()
12849 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(),
12850 SignOp, DAG.getConstant(I, DL, MVT::i32))
12851 : SignOp;
12852
12853 SDValue HiOp =
12854 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
12855
12856 SDValue Vector =
12857 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
12858
12859 SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
12860 NewElts.push_back(NewElt);
12861 }
12862
12863 if (NewElts.size() == 1)
12864 return NewElts[0];
12865
12866 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
12867 }
12868
12869 if (SignVT.getScalarType() != MVT::f64)
12870 return SDValue();
12871
12872 // Reduce the width of the sign operand; we only need the highest bit.
12873 //
12874 // fcopysign f64:x, f64:y ->
12875 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
12876 // TODO: In some cases it might make sense to go all the way to f16.
12877
12878 EVT F32VT = MagVT.isVector()
12879 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12880 : MVT::v2f32;
12881
12882 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
12883
12884 SmallVector<SDValue, 8> F32Signs;
12885 for (unsigned I = 0; I != NumElts; ++I) {
12886 // Take sign from odd elements of cast vector
12887 SDValue SignAsF32 =
12888 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
12889 DAG.getConstant(2 * I + 1, DL, MVT::i32));
12890 F32Signs.push_back(SignAsF32);
12891 }
12892
12893 SDValue NewSign =
12894 NumElts == 1
12895 ? F32Signs.back()
12896 : DAG.getNode(ISD::BUILD_VECTOR, DL,
12897 EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
12898 F32Signs);
12899
12900 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
12901 NewSign);
12902}
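// Annotation (illustrative, not part of the upstream source): for a scalar
// f64 magnitude this combine rebuilds fcopysign(x, y) as
// bitcast(build_vector(x.lo32, fcopysign(f32 x.hi32, y))), touching only the
// word that actually holds the sign bit; the second half of the function
// narrows an f64 sign operand down to the f32 that holds its high word.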
12903
12904// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
12905// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
12906// bits
12907
12908// This is a variant of
12909// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
12910//
12911 // The normal DAG combiner will do this, but only if the add has one use,
12912 // since otherwise it would increase the number of instructions.
12913//
12914// This prevents us from seeing a constant offset that can be folded into a
12915// memory instruction's addressing mode. If we know the resulting add offset of
12916// a pointer can be folded into an addressing offset, we can replace the pointer
12917 // operand with the add of the new constant offset. This eliminates one of the uses,
12918// and may allow the remaining use to also be simplified.
12919//
12920SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
12921 EVT MemVT,
12922 DAGCombinerInfo &DCI) const {
12923 SDValue N0 = N->getOperand(0);
12924 SDValue N1 = N->getOperand(1);
12925
12926 // We only do this to handle cases where it's profitable, i.e. when there are
12927 // multiple uses of the add; otherwise defer to the standard combine.
12928 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
12929 N0->hasOneUse())
12930 return SDValue();
12931
12932 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
12933 if (!CN1)
12934 return SDValue();
12935
12936 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
12937 if (!CAdd)
12938 return SDValue();
12939
12940 SelectionDAG &DAG = DCI.DAG;
12941
12942 if (N0->getOpcode() == ISD::OR &&
12943 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
12944 return SDValue();
12945
12946 // If the resulting offset is too large, we can't fold it into the
12947 // addressing mode offset.
12948 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
12949 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
12950
12951 AddrMode AM;
12952 AM.HasBaseReg = true;
12953 AM.BaseOffs = Offset.getSExtValue();
12954 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
12955 return SDValue();
12956
12957 SDLoc SL(N);
12958 EVT VT = N->getValueType(0);
12959
12960 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
12961 SDValue COffset = DAG.getConstant(Offset, SL, VT);
12962
12963 SDNodeFlags Flags;
12964 Flags.setNoUnsignedWrap(
12965 N->getFlags().hasNoUnsignedWrap() &&
12966 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
12967
12968 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
12969}
12970
12971 /// MemSDNode::getBasePtr() does not work for intrinsics, whose pointer operand
12972 /// is offset by the chain and intrinsic ID. Theoretically we would also need to check the
12973/// specific intrinsic, but they all place the pointer operand first.
12974static unsigned getBasePtrIndex(const MemSDNode *N) {
12975 switch (N->getOpcode()) {
12976 case ISD::STORE:
12977 case ISD::INTRINSIC_W_CHAIN:
12978 case ISD::INTRINSIC_VOID:
12979 return 2;
12980 default:
12981 return 1;
12982 }
12983}
12984
12985SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
12986 DAGCombinerInfo &DCI) const {
12987 SelectionDAG &DAG = DCI.DAG;
12988
12989 unsigned PtrIdx = getBasePtrIndex(N);
12990 SDValue Ptr = N->getOperand(PtrIdx);
12991
12992 // TODO: We could also do this for multiplies.
12993 if (Ptr.getOpcode() == ISD::SHL) {
12994 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
12995 N->getMemoryVT(), DCI);
12996 if (NewPtr) {
12997 SmallVector<SDValue, 8> NewOps(N->ops());
12998
12999 NewOps[PtrIdx] = NewPtr;
13000 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
13001 }
13002 }
13003
13004 return SDValue();
13005}
13006
13007static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
13008 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13009 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13010 (Opc == ISD::XOR && Val == 0);
13011}
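// Annotation (illustrative, not part of the upstream source): for
// (and i64:x, 0xffffffff00000000) the low half constant is 0 and the high half
// is all ones, both reducible for AND, so splitBinaryBitConstantOp below splits
// the operation into two 32-bit ANDs that fold to 0 and to hi_32(x).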
13012
13013 // Break up a 64-bit bit operation with a constant into two 32-bit and/or/xor ops.
13014 // This will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
13015// integer combine opportunities since most 64-bit operations are decomposed
13016// this way. TODO: We won't want this for SALU especially if it is an inline
13017// immediate.
13018SDValue SITargetLowering::splitBinaryBitConstantOp(
13019 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
13020 const ConstantSDNode *CRHS) const {
13021 uint64_t Val = CRHS->getZExtValue();
13022 uint32_t ValLo = Lo_32(Val);
13023 uint32_t ValHi = Hi_32(Val);
13024 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13025
13026 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
13027 bitOpWithConstantIsReducible(Opc, ValHi)) ||
13028 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
13029 // We have 64-bit scalar and/or/xor, but do not have vector forms.
13030 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
13031 !CRHS->user_begin()->isDivergent())
13032 return SDValue();
13033
13034 // If we need to materialize a 64-bit immediate, it will be split up later
13035 // anyway. Avoid creating the harder to understand 64-bit immediate
13036 // materialization.
13037 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
13038 }
13039
13040 return SDValue();
13041}
13042
13043 bool llvm::isBoolSGPR(SDValue V) {
13044 if (V.getValueType() != MVT::i1)
13045 return false;
13046 switch (V.getOpcode()) {
13047 default:
13048 break;
13049 case ISD::SETCC:
13050 case ISD::IS_FPCLASS:
13051 case AMDGPUISD::FP_CLASS:
13052 return true;
13053 case ISD::AND:
13054 case ISD::OR:
13055 case ISD::XOR:
13056 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
13057 case ISD::SADDO:
13058 case ISD::UADDO:
13059 case ISD::SSUBO:
13060 case ISD::USUBO:
13061 case ISD::SMULO:
13062 case ISD::UMULO:
13063 return V.getResNo() == 1;
13064 case ISD::INTRINSIC_WO_CHAIN: {
13065 unsigned IntrinsicID = V.getConstantOperandVal(0);
13066 switch (IntrinsicID) {
13067 case Intrinsic::amdgcn_is_shared:
13068 case Intrinsic::amdgcn_is_private:
13069 return true;
13070 default:
13071 return false;
13072 }
13073
13074 return false;
13075 }
13076 }
13077 return false;
13078}
13079
13080// If a constant has all zeroes or all ones within each byte return it.
13081// Otherwise return 0.
13082 static uint32_t getConstantPermuteMask(uint32_t C) {
13083 // 0xff for any zero byte in the mask
13084 uint32_t ZeroByteMask = 0;
13085 if (!(C & 0x000000ff))
13086 ZeroByteMask |= 0x000000ff;
13087 if (!(C & 0x0000ff00))
13088 ZeroByteMask |= 0x0000ff00;
13089 if (!(C & 0x00ff0000))
13090 ZeroByteMask |= 0x00ff0000;
13091 if (!(C & 0xff000000))
13092 ZeroByteMask |= 0xff000000;
13093 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
13094 if ((NonZeroByteMask & C) != NonZeroByteMask)
13095 return 0; // Partial bytes selected.
13096 return C;
13097}
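// Annotation (illustrative, not part of the upstream source), worked examples:
//   getConstantPermuteMask(0x00ff00ff) == 0x00ff00ff  (only whole bytes set)
//   getConstantPermuteMask(0xff000000) == 0xff000000
//   getConstantPermuteMask(0x00ff00f0) == 0           (byte 0 partially set)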
13098
13099// Check if a node selects whole bytes from its operand 0 starting at a byte
13100 // boundary while masking the rest. Returns the select mask as used by v_perm_b32,
13101 // or ~0 if it does not match.
13102// Note byte select encoding:
13103// value 0-3 selects corresponding source byte;
13104// value 0xc selects zero;
13105// value 0xff selects 0xff.
13106 static uint32_t getPermuteMask(SDValue V) {
13107 assert(V.getValueSizeInBits() == 32);
13108
13109 if (V.getNumOperands() != 2)
13110 return ~0;
13111
13112 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
13113 if (!N1)
13114 return ~0;
13115
13116 uint32_t C = N1->getZExtValue();
13117
13118 switch (V.getOpcode()) {
13119 default:
13120 break;
13121 case ISD::AND:
13122 if (uint32_t ConstMask = getConstantPermuteMask(C))
13123 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13124 break;
13125
13126 case ISD::OR:
13127 if (uint32_t ConstMask = getConstantPermuteMask(C))
13128 return (0x03020100 & ~ConstMask) | ConstMask;
13129 break;
13130
13131 case ISD::SHL:
13132 if (C % 8)
13133 return ~0;
13134
13135 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
13136
13137 case ISD::SRL:
13138 if (C % 8)
13139 return ~0;
13140
13141 return uint32_t(0x0c0c0c0c03020100ull >> C);
13142 }
13143
13144 return ~0;
13145}
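// Annotation (illustrative, not part of the upstream source), worked examples
// using the byte-select encoding above (0-3 = source byte, 0x0c = zero):
//   and x, 0x0000ffff  ->  mask 0x0c0c0100  (keep bytes 1:0, zero bytes 3:2)
//   shl x, 16          ->  mask 0x01000c0c  (bytes 1:0 move up to bytes 3:2)
//   srl x, 24          ->  mask 0x0c0c0c03  (byte 3 moves down to byte 0)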
13146
13147SDValue SITargetLowering::performAndCombine(SDNode *N,
13148 DAGCombinerInfo &DCI) const {
13149 if (DCI.isBeforeLegalize())
13150 return SDValue();
13151
13152 SelectionDAG &DAG = DCI.DAG;
13153 EVT VT = N->getValueType(0);
13154 SDValue LHS = N->getOperand(0);
13155 SDValue RHS = N->getOperand(1);
13156
13157 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
13158 if (VT == MVT::i64 && CRHS) {
13159 if (SDValue Split =
13160 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
13161 return Split;
13162 }
13163
13164 if (CRHS && VT == MVT::i32) {
13165 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
13166 // nb = number of trailing zeroes in mask
13167 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
13168 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
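// Annotation (illustrative, not part of the upstream source): for
// (and (srl x, 8), 0xff00) the mask has 8 set bits and 8 trailing zeroes, so
// Offset = 8 + 8 = 16 and the node becomes (shl (bfe_u32 x, 16, 8), 8), i.e.
// bits 23:16 of x repositioned into bits 15:8.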
13169 uint64_t Mask = CRHS->getZExtValue();
13170 unsigned Bits = llvm::popcount(Mask);
13171 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
13172 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
13173 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
13174 unsigned Shift = CShift->getZExtValue();
13175 unsigned NB = CRHS->getAPIntValue().countr_zero();
13176 unsigned Offset = NB + Shift;
13177 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
13178 SDLoc SL(N);
13179 SDValue BFE =
13180 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
13181 DAG.getConstant(Offset, SL, MVT::i32),
13182 DAG.getConstant(Bits, SL, MVT::i32));
13183 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
13184 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
13185 DAG.getValueType(NarrowVT));
13186 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
13187 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
13188 return Shl;
13189 }
13190 }
13191 }
13192
13193 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13194 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
13195 isa<ConstantSDNode>(LHS.getOperand(2))) {
13196 uint32_t Sel = getConstantPermuteMask(Mask);
13197 if (!Sel)
13198 return SDValue();
13199
13200 // Select 0xc for all zero bytes
13201 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13202 SDLoc DL(N);
13203 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13204 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13205 }
13206 }
13207
13208 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
13209 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
13210 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
13211 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13212 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
13213
13214 SDValue X = LHS.getOperand(0);
13215 SDValue Y = RHS.getOperand(0);
13216 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
13217 !isTypeLegal(X.getValueType()))
13218 return SDValue();
13219
13220 if (LCC == ISD::SETO) {
13221 if (X != LHS.getOperand(1))
13222 return SDValue();
13223
13224 if (RCC == ISD::SETUNE) {
13225 const ConstantFPSDNode *C1 =
13226 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
13227 if (!C1 || !C1->isInfinity() || C1->isNegative())
13228 return SDValue();
13229
13230 const uint32_t Mask = SIInstrFlags::N_NORMAL |
13231 SIInstrFlags::S_NORMAL | SIInstrFlags::P_NORMAL |
13232 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO |
13233 SIInstrFlags::N_SUBNORMAL | SIInstrFlags::P_SUBNORMAL;
13234
13235 static_assert(
13236 ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
13237 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
13238 0x3ff) == Mask,
13239 "mask not equal");
13240
13241 SDLoc DL(N);
13242 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
13243 DAG.getConstant(Mask, DL, MVT::i32));
13244 }
13245 }
13246 }
13247
13248 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
13249 std::swap(LHS, RHS);
13250
13251 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13252 RHS.hasOneUse()) {
13253 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13254 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
13255 // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
13256 // | n_nan)
13257 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13258 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
13259 (RHS.getOperand(0) == LHS.getOperand(0) &&
13260 LHS.getOperand(0) == LHS.getOperand(1))) {
13261 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
13262 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
13263 : Mask->getZExtValue() & OrdMask;
13264
13265 SDLoc DL(N);
13266 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
13267 DAG.getConstant(NewMask, DL, MVT::i32));
13268 }
13269 }
13270
13271 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
13272 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
13273 // and x, (sext cc from i1) => select cc, x, 0
13274 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
13275 std::swap(LHS, RHS);
13276 if (isBoolSGPR(RHS.getOperand(0)))
13277 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
13278 DAG.getConstant(0, SDLoc(N), MVT::i32));
13279 }
13280
13281 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13282 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13283 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13284 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13285 uint32_t LHSMask = getPermuteMask(LHS);
13286 uint32_t RHSMask = getPermuteMask(RHS);
13287 if (LHSMask != ~0u && RHSMask != ~0u) {
13288 // Canonicalize the expression in an attempt to have fewer unique masks
13289 // and therefore fewer registers used to hold the masks.
13290 if (LHSMask > RHSMask) {
13291 std::swap(LHSMask, RHSMask);
13292 std::swap(LHS, RHS);
13293 }
13294
13295 // Select 0xc for each lane used from source operand. Zero has 0xc mask
13296 // set, 0xff has 0xff in the mask; actual lanes are in the 0-3 range.
13297 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13298 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13299
13300 // Check if we need to combine values from two sources within a byte.
13301 if (!(LHSUsedLanes & RHSUsedLanes) &&
13302 // If we select high and lower word keep it for SDWA.
13303 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
13304 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13305 // Each byte in each mask is either a selector value 0-3, or has higher
13306 // bits set in either of the masks: 0xff for 0xff and 0x0c for zero.
13307 // If 0x0c appears in either mask, the result byte shall be 0x0c.
13308 // Otherwise the mask which is not 0xff wins. ANDing both masks gives a
13309 // correct result, except that 0x0c must then be fixed up to exactly 0x0c.
13310 uint32_t Mask = LHSMask & RHSMask;
13311 for (unsigned I = 0; I < 32; I += 8) {
13312 uint32_t ByteSel = 0xff << I;
13313 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13314 Mask &= (0x0c << I) & 0xffffffff;
13315 }
13316
13317 // Add 4 to each active LHS lane. It will not affect any existing 0xff
13318 // or 0x0c.
13319 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
13320 SDLoc DL(N);
13321
13322 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13323 RHS.getOperand(0),
13324 DAG.getConstant(Sel, DL, MVT::i32));
13325 }
13326 }
13327 }
13328
13329 return SDValue();
13330}
13331
13332 // A key component of v_perm is a mapping between the byte positions of the src
13333 // operands and the byte positions of the dest. To provide such a mapping, we
13334 // need: 1. the node that provides byte x of the dest of the OR, and 2. the byte
13335 // of that node used to provide byte x. calculateByteProvider finds which node
13336 // provides a certain byte of the dest of the OR, and calculateSrcByte takes
13337 // that node and finds the ultimate src and byte position. For example, the
13338 // supported LoadCombine pattern for vector loads is as follows:
13339// t1
13340// or
13341// / \
13342// t2 t3
13343// zext shl
13344// | | \
13345// t4 t5 16
13346// or anyext
13347// / \ |
13348// t6 t7 t8
13349// srl shl or
13350// / | / \ / \
13351// t9 t10 t11 t12 t13 t14
13352// trunc* 8 trunc* 8 and and
13353// | | / | | \
13354// t15 t16 t17 t18 t19 t20
13355// trunc* 255 srl -256
13356// | / \
13357// t15 t15 16
13358//
13359// *In this example, the truncs are from i32->i16
13360//
13361// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
13362// respectively. calculateSrcByte would find (given node) -> ultimate src &
13363// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
13364// After finding the mapping, we can combine the tree into vperm t15, t16,
13365// 0x05000407
13366
13367// Find the source and byte position from a node.
13368// \p DestByte is the byte position of the dest of the or that the src
13369 // ultimately provides. \p SrcIndex is the byte of the src that maps to that
13370 // byte of the dest of the or. \p Depth tracks how many recursive iterations we have
13371// performed.
13372static const std::optional<ByteProvider<SDValue>>
13373calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
13374 unsigned Depth = 0) {
13375 // We may need to recursively traverse a series of SRLs
13376 if (Depth >= 6)
13377 return std::nullopt;
13378
13379 if (Op.getValueSizeInBits() < 8)
13380 return std::nullopt;
13381
13382 if (Op.getValueType().isVector())
13383 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13384
13385 switch (Op->getOpcode()) {
13386 case ISD::TRUNCATE: {
13387 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13388 }
13389
13390 case ISD::SIGN_EXTEND:
13391 case ISD::ZERO_EXTEND:
13392 case ISD::SIGN_EXTEND_INREG: {
13393 SDValue NarrowOp = Op->getOperand(0);
13394 auto NarrowVT = NarrowOp.getValueType();
13395 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
13396 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13397 NarrowVT = VTSign->getVT();
13398 }
13399 if (!NarrowVT.isByteSized())
13400 return std::nullopt;
13401 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
13402
13403 if (SrcIndex >= NarrowByteWidth)
13404 return std::nullopt;
13405 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13406 }
13407
13408 case ISD::SRA:
13409 case ISD::SRL: {
13410 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13411 if (!ShiftOp)
13412 return std::nullopt;
13413
13414 uint64_t BitShift = ShiftOp->getZExtValue();
13415
13416 if (BitShift % 8 != 0)
13417 return std::nullopt;
13418
13419 SrcIndex += BitShift / 8;
13420
13421 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13422 }
13423
13424 default: {
13425 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13426 }
13427 }
13428 llvm_unreachable("fully handled switch");
13429}
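// Annotation (illustrative, not part of the upstream source): calling
// calculateSrcByte on (srl t, 16) with SrcIndex 0 walks through the shift,
// bumps SrcIndex by 16 / 8 = 2, and reports t with byte offset 2 as the
// ultimate provider of the requested destination byte.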
13430
13431// For a byte position in the result of an Or, traverse the tree and find the
13432// node (and the byte of the node) which ultimately provides this {Or,
13433// BytePosition}. \p Op is the operand we are currently examining. \p Index is
13434// the byte position of the Op that corresponds with the originally requested
13435// byte of the Or \p Depth tracks how many recursive iterations we have
13436// performed. \p StartingIndex is the originally requested byte of the Or
13437static const std::optional<ByteProvider<SDValue>>
13438calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
13439 unsigned StartingIndex = 0) {
13441 // Finding the Src tree of the RHS of the or typically requires at least 1
13442 // additional level of depth.
13442 if (Depth > 6)
13443 return std::nullopt;
13444
13445 unsigned BitWidth = Op.getScalarValueSizeInBits();
13446 if (BitWidth % 8 != 0)
13447 return std::nullopt;
13448 if (Index > BitWidth / 8 - 1)
13449 return std::nullopt;
13450
13451 bool IsVec = Op.getValueType().isVector();
13452 switch (Op.getOpcode()) {
13453 case ISD::OR: {
13454 if (IsVec)
13455 return std::nullopt;
13456
13457 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
13458 StartingIndex);
13459 if (!RHS)
13460 return std::nullopt;
13461 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13462 StartingIndex);
13463 if (!LHS)
13464 return std::nullopt;
13465 // A well formed Or will have two ByteProviders for each byte, one of which
13466 // is constant zero
13467 if (!LHS->isConstantZero() && !RHS->isConstantZero())
13468 return std::nullopt;
13469 if (!LHS || LHS->isConstantZero())
13470 return RHS;
13471 if (!RHS || RHS->isConstantZero())
13472 return LHS;
13473 return std::nullopt;
13474 }
13475
13476 case ISD::AND: {
13477 if (IsVec)
13478 return std::nullopt;
13479
13480 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13481 if (!BitMaskOp)
13482 return std::nullopt;
13483
13484 uint32_t BitMask = BitMaskOp->getZExtValue();
13485 // Bits we expect for our StartingIndex
13486 uint32_t IndexMask = 0xFF << (Index * 8);
13487
13488 if ((IndexMask & BitMask) != IndexMask) {
13489 // If the result of the and partially provides the byte, then it
13490 // is not well formatted
13491 if (IndexMask & BitMask)
13492 return std::nullopt;
13493 return ByteProvider<SDValue>::getConstantZero();
13494 }
13495
13496 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
13497 }
13498
13499 case ISD::FSHR: {
13500 if (IsVec)
13501 return std::nullopt;
13502
13503 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
13504 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13505 if (!ShiftOp || Op.getValueType().isVector())
13506 return std::nullopt;
13507
13508 uint64_t BitsProvided = Op.getValueSizeInBits();
13509 if (BitsProvided % 8 != 0)
13510 return std::nullopt;
13511
13512 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13513 if (BitShift % 8)
13514 return std::nullopt;
13515
13516 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13517 uint64_t ByteShift = BitShift / 8;
13518
13519 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13520 uint64_t BytesProvided = BitsProvided / 8;
13521 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13522 NewIndex %= BytesProvided;
13523 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
13524 }
13525
13526 case ISD::SRA:
13527 case ISD::SRL: {
13528 if (IsVec)
13529 return std::nullopt;
13530
13531 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13532 if (!ShiftOp)
13533 return std::nullopt;
13534
13535 uint64_t BitShift = ShiftOp->getZExtValue();
13536 if (BitShift % 8)
13537 return std::nullopt;
13538
13539 auto BitsProvided = Op.getScalarValueSizeInBits();
13540 if (BitsProvided % 8 != 0)
13541 return std::nullopt;
13542
13543 uint64_t BytesProvided = BitsProvided / 8;
13544 uint64_t ByteShift = BitShift / 8;
13545 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
13546 // If the byte we are trying to provide (as tracked by index) falls in this
13547 // range, then the SRL provides the byte. The byte of interest of the src of
13548 // the SRL is Index + ByteShift
13549 return BytesProvided - ByteShift > Index
13550 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
13551 Index + ByteShift)
13552 : ByteProvider<SDValue>::getConstantZero();
13553 }
13554
13555 case ISD::SHL: {
13556 if (IsVec)
13557 return std::nullopt;
13558
13559 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13560 if (!ShiftOp)
13561 return std::nullopt;
13562
13563 uint64_t BitShift = ShiftOp->getZExtValue();
13564 if (BitShift % 8 != 0)
13565 return std::nullopt;
13566 uint64_t ByteShift = BitShift / 8;
13567
13568 // If we are shifting by an amount greater than (or equal to)
13569 // the index we are trying to provide, then it provides 0s. If not,
13570 // then these bytes are not definitively 0s, and the corresponding byte
13571 // of interest is Index - ByteShift of the src
13572 return Index < ByteShift
13573 ? ByteProvider<SDValue>::getConstantZero()
13574 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
13575 Depth + 1, StartingIndex);
13576 }
13577 case ISD::ANY_EXTEND:
13578 case ISD::SIGN_EXTEND:
13579 case ISD::ZERO_EXTEND:
13580 case ISD::SIGN_EXTEND_INREG:
13581 case ISD::AssertZext:
13582 case ISD::AssertSext: {
13583 if (IsVec)
13584 return std::nullopt;
13585
13586 SDValue NarrowOp = Op->getOperand(0);
13587 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
13588 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
13589 Op->getOpcode() == ISD::AssertZext ||
13590 Op->getOpcode() == ISD::AssertSext) {
13591 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13592 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13593 }
13594 if (NarrowBitWidth % 8 != 0)
13595 return std::nullopt;
13596 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13597
13598 if (Index >= NarrowByteWidth)
13599 return Op.getOpcode() == ISD::ZERO_EXTEND
13600 ? std::optional<ByteProvider<SDValue>>(
13601 ByteProvider<SDValue>::getConstantZero())
13602 : std::nullopt;
13603 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
13604 }
13605
13606 case ISD::TRUNCATE: {
13607 if (IsVec)
13608 return std::nullopt;
13609
13610 uint64_t NarrowByteWidth = BitWidth / 8;
13611
13612 if (NarrowByteWidth >= Index) {
13613 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13614 StartingIndex);
13615 }
13616
13617 return std::nullopt;
13618 }
13619
13620 case ISD::CopyFromReg: {
13621 if (BitWidth / 8 > Index)
13622 return calculateSrcByte(Op, StartingIndex, Index);
13623
13624 return std::nullopt;
13625 }
13626
13627 case ISD::LOAD: {
13628 auto *L = cast<LoadSDNode>(Op.getNode());
13629
13630 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13631 if (NarrowBitWidth % 8 != 0)
13632 return std::nullopt;
13633 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13634
13635 // If the width of the load does not reach the byte we are trying to provide for
13636 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
13637 // question
13638 if (Index >= NarrowByteWidth) {
13639 return L->getExtensionType() == ISD::ZEXTLOAD
13640 ? std::optional<ByteProvider<SDValue>>(
13641 ByteProvider<SDValue>::getConstantZero())
13642 : std::nullopt;
13643 }
13644
13645 if (NarrowByteWidth > Index) {
13646 return calculateSrcByte(Op, StartingIndex, Index);
13647 }
13648
13649 return std::nullopt;
13650 }
13651
13652 case ISD::BSWAP: {
13653 if (IsVec)
13654 return std::nullopt;
13655
13656 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
13657 Depth + 1, StartingIndex);
13658 }
13659
13660 case ISD::EXTRACT_VECTOR_ELT: {
13661 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13662 if (!IdxOp)
13663 return std::nullopt;
13664 auto VecIdx = IdxOp->getZExtValue();
13665 auto ScalarSize = Op.getScalarValueSizeInBits();
13666 if (ScalarSize < 32)
13667 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13668 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
13669 StartingIndex, Index);
13670 }
13671
13672 case AMDGPUISD::PERM: {
13673 if (IsVec)
13674 return std::nullopt;
13675
13676 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13677 if (!PermMask)
13678 return std::nullopt;
13679
13680 auto IdxMask =
13681 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13682 if (IdxMask > 0x07 && IdxMask != 0x0c)
13683 return std::nullopt;
13684
13685 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13686 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13687
13688 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
13689 : ByteProvider<SDValue>(
13690 ByteProvider<SDValue>::getConstantZero());
13691 }
13692
13693 default: {
13694 return std::nullopt;
13695 }
13696 }
13697
13698 llvm_unreachable("fully handled switch");
13699}
13700
13701 // Returns true if the Operand is a scalar extended from a 16-bit value.
13702static bool isExtendedFrom16Bits(SDValue &Operand) {
13703
13704 switch (Operand.getOpcode()) {
13705 case ISD::ANY_EXTEND:
13706 case ISD::SIGN_EXTEND:
13707 case ISD::ZERO_EXTEND: {
13708 auto OpVT = Operand.getOperand(0).getValueType();
13709 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
13710 }
13711 case ISD::LOAD: {
13712 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
13713 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
13714 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
13715 ExtType == ISD::EXTLOAD) {
13716 auto MemVT = L->getMemoryVT();
13717 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
13718 }
13719 return L->getMemoryVT().getSizeInBits() == 16;
13720 }
13721 default:
13722 return false;
13723 }
13724}
13725
13726 // Returns true if the mask matches consecutive bytes and the first byte
13727 // begins at an even (16-bit aligned) byte offset from the 0th byte.
13728static bool addresses16Bits(int Mask) {
13729 int Low8 = Mask & 0xff;
13730 int Hi8 = (Mask & 0xff00) >> 8;
13731
13732 assert(Low8 < 8 && Hi8 < 8);
13733 // Are the bytes contiguous in the order of increasing addresses.
13734 bool IsConsecutive = (Hi8 - Low8 == 1);
13735 // Is the first byte at a location that is aligned for 16 bit instructions.
13736 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
13737 // In this case, we still need code to extract the 16 bit operand, so it
13738 // is better to use i8 v_perm
13739 bool Is16Aligned = !(Low8 % 2);
13740
13741 return IsConsecutive && Is16Aligned;
13742}
13743
13744// Do not lower into v_perm if the operands are actually 16 bit
13745// and the selected bits (based on PermMask) correspond with two
13746// easily addressable 16 bit operands.
13747 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
13748 SDValue &OtherOp) {
13749 int Low16 = PermMask & 0xffff;
13750 int Hi16 = (PermMask & 0xffff0000) >> 16;
13751
13752 auto TempOp = peekThroughBitcasts(Op);
13753 auto TempOtherOp = peekThroughBitcasts(OtherOp);
13754
13755 auto OpIs16Bit =
13756 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
13757 if (!OpIs16Bit)
13758 return true;
13759
13760 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13761 isExtendedFrom16Bits(TempOtherOp);
13762 if (!OtherOpIs16Bit)
13763 return true;
13764
13765 // Do we cleanly address both
13766 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
13767}
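// Annotation (illustrative, not part of the upstream source): for PermMask
// 0x07060100 the low half 0x0100 addresses bytes 0-1 and the high half 0x0706
// addresses bytes 2-3, both consecutive and 16-bit aligned, so if both
// operands really are 16-bit values, hasNon16BitAccesses() returns false and
// the or is left for the 16-bit path instead of being lowered to v_perm.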
13768
13769 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
13770 unsigned DWordOffset) {
13771 SDValue Ret;
13772
13773 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
13774 // ByteProvider must be at least 8 bits
13775 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
13776
13777 if (TypeSize <= 32)
13778 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
13779
13780 if (Src.getValueType().isVector()) {
13781 auto ScalarTySize = Src.getScalarValueSizeInBits();
13782 auto ScalarTy = Src.getValueType().getScalarType();
13783 if (ScalarTySize == 32) {
13784 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
13785 DAG.getConstant(DWordOffset, SL, MVT::i32));
13786 }
13787 if (ScalarTySize > 32) {
13788 Ret = DAG.getNode(
13789 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
13790 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
13791 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
13792 if (ShiftVal)
13793 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
13794 DAG.getConstant(ShiftVal, SL, MVT::i32));
13795 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13796 }
13797
13798 assert(ScalarTySize < 32);
13799 auto NumElements = TypeSize / ScalarTySize;
13800 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
13801 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
13802 auto NumElementsIn32 = 32 / ScalarTySize;
13803 auto NumAvailElements = DWordOffset < Trunc32Elements
13804 ? NumElementsIn32
13805 : NumElements - NormalizedTrunc;
13806
13807 SmallVector<SDValue, 4> VecSrcs;
13808 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
13809 NumAvailElements);
13810
13811 Ret = DAG.getBuildVector(
13812 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
13813 VecSrcs);
13814 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13815 }
13816
13817 /// Scalar Type
13818 auto ShiftVal = 32 * DWordOffset;
13819 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
13820 DAG.getConstant(ShiftVal, SL, MVT::i32));
13821 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13822}
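// Annotation (illustrative, not part of the upstream source): for a v4i32
// source and DWordOffset 2 this reduces to extract_vector_elt(Src, 2); for an
// i64 scalar source and DWordOffset 1 the scalar path shifts right by 32 and
// truncates, returning the high dword as an i32.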
13823
13824 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
13825 SelectionDAG &DAG = DCI.DAG;
13826 [[maybe_unused]] EVT VT = N->getValueType(0);
13827 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
13828
13829 // VT is known to be MVT::i32, so we need to provide 4 bytes.
13830 assert(VT == MVT::i32);
13831 for (int i = 0; i < 4; i++) {
13832 // Find the ByteProvider that provides the ith byte of the result of OR
13833 std::optional<ByteProvider<SDValue>> P =
13834 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
13835 // TODO support constantZero
13836 if (!P || P->isConstantZero())
13837 return SDValue();
13838
13839 PermNodes.push_back(*P);
13840 }
13841 if (PermNodes.size() != 4)
13842 return SDValue();
13843
13844 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
13845 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
13846 uint64_t PermMask = 0x00000000;
13847 for (size_t i = 0; i < PermNodes.size(); i++) {
13848 auto PermOp = PermNodes[i];
13849 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
13850 // by sizeof(Src2) = 4
13851 int SrcByteAdjust = 4;
13852
13853 // If the Src uses a byte from a different DWORD, then it corresponds
13854 // with a different source
13855 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
13856 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
13857 if (SecondSrc)
13858 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
13859 ((PermOp.SrcOffset / 4) != SecondSrc->second))
13860 return SDValue();
13861
13862 // Set the index of the second distinct Src node
13863 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
13864 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
13865 SrcByteAdjust = 0;
13866 }
13867 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
13868 assert(!DAG.getDataLayout().isBigEndian());
13869 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
13870 }
13871 SDLoc DL(N);
13872 SDValue Op = *PermNodes[FirstSrc.first].Src;
13873 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
13874 assert(Op.getValueSizeInBits() == 32);
13875
13876 // Check that we are not just extracting the bytes in order from an op
13877 if (!SecondSrc) {
13878 int Low16 = PermMask & 0xffff;
13879 int Hi16 = (PermMask & 0xffff0000) >> 16;
13880
13881 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
13882 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
13883
13884 // The perm op would really just produce Op. So combine into Op
13885 if (WellFormedLow && WellFormedHi)
13886 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
13887 }
13888
13889 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
13890
13891 if (SecondSrc) {
13892 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
13893 assert(OtherOp.getValueSizeInBits() == 32);
13894 }
13895
13896 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
13897
13898 assert(Op.getValueType().isByteSized() &&
13899 OtherOp.getValueType().isByteSized());
13900
13901 // If the ultimate src is less than 32 bits, then we will only be
13902 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
13903 // CalculateByteProvider would not have returned Op as source if we
13904 // used a byte that is outside its ValueType. Thus, we are free to
13905 // ANY_EXTEND as the extended bits are don't-cares.
13906 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
13907 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
13908
13909 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
13910 DAG.getConstant(PermMask, DL, MVT::i32));
13911 }
13912 return SDValue();
13913}
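// Annotation (illustrative, not part of the upstream source): if bytes 0-1 of
// the or come from bytes 2-3 of the first distinct source and bytes 2-3 come
// from bytes 0-1 of a second source, the loop above builds PermMask 0x01000706
// (bytes taken from the first source are biased by 4), and the or is replaced
// by a single AMDGPUISD::PERM of the two dwords, provided the 16-bit fast-path
// check does not apply.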
13914
13915SDValue SITargetLowering::performOrCombine(SDNode *N,
13916 DAGCombinerInfo &DCI) const {
13917 SelectionDAG &DAG = DCI.DAG;
13918 SDValue LHS = N->getOperand(0);
13919 SDValue RHS = N->getOperand(1);
13920
13921 EVT VT = N->getValueType(0);
13922 if (VT == MVT::i1) {
13923 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
13924 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13925 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
13926 SDValue Src = LHS.getOperand(0);
13927 if (Src != RHS.getOperand(0))
13928 return SDValue();
13929
13930 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
13931 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13932 if (!CLHS || !CRHS)
13933 return SDValue();
13934
13935 // Only 10 bits are used.
13936 static const uint32_t MaxMask = 0x3ff;
13937
13938 uint32_t NewMask =
13939 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
13940 SDLoc DL(N);
13941 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
13942 DAG.getConstant(NewMask, DL, MVT::i32));
13943 }
13944
13945 return SDValue();
13946 }
13947
13948 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13949 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
13950 LHS.getOpcode() == AMDGPUISD::PERM &&
13951 isa<ConstantSDNode>(LHS.getOperand(2))) {
13952 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
13953 if (!Sel)
13954 return SDValue();
13955
13956 Sel |= LHS.getConstantOperandVal(2);
13957 SDLoc DL(N);
13958 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13959 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13960 }
13961
13962 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13963 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13964 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13965 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13966
13967 // If all the uses of an or need to extract the individual elements, do not
13968 // attempt to lower into v_perm
13969 auto usesCombinedOperand = [](SDNode *OrUse) {
13970 // If we have any non-vectorized use, then it is a candidate for v_perm
13971 if (OrUse->getOpcode() != ISD::BITCAST ||
13972 !OrUse->getValueType(0).isVector())
13973 return true;
13974
13975 // If we have any non-vectorized use, then it is a candidate for v_perm
13976 for (auto *VUser : OrUse->users()) {
13977 if (!VUser->getValueType(0).isVector())
13978 return true;
13979
13980 // If the use of a vector is a store, then combining via a v_perm
13981 // is beneficial.
13982 // TODO -- whitelist more uses
13983 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
13984 if (VUser->getOpcode() == VectorwiseOp)
13985 return true;
13986 }
13987 return false;
13988 };
13989
13990 if (!any_of(N->users(), usesCombinedOperand))
13991 return SDValue();
13992
13993 uint32_t LHSMask = getPermuteMask(LHS);
13994 uint32_t RHSMask = getPermuteMask(RHS);
13995
13996 if (LHSMask != ~0u && RHSMask != ~0u) {
13997 // Canonicalize the expression in an attempt to have fewer unique masks
13998 // and therefore fewer registers used to hold the masks.
13999 if (LHSMask > RHSMask) {
14000 std::swap(LHSMask, RHSMask);
14001 std::swap(LHS, RHS);
14002 }
14003
14004 // Select 0xc for each lane used from source operand. Zero has 0xc mask
14005 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
14006 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14007 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14008
14009 // Check if we need to combine values from two sources within a byte.
14010 if (!(LHSUsedLanes & RHSUsedLanes) &&
14011 // If we select high and lower word keep it for SDWA.
14012 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
14013 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14014 // Kill zero bytes selected by other mask. Zero value is 0xc.
14015 LHSMask &= ~RHSUsedLanes;
14016 RHSMask &= ~LHSUsedLanes;
14017 // Add 4 to each active LHS lane
14018 LHSMask |= LHSUsedLanes & 0x04040404;
14019 // Combine masks
14020 uint32_t Sel = LHSMask | RHSMask;
14021 SDLoc DL(N);
14022
14023 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14024 RHS.getOperand(0),
14025 DAG.getConstant(Sel, DL, MVT::i32));
14026 }
14027 }
14028 if (LHSMask == ~0u || RHSMask == ~0u) {
14029 if (SDValue Perm = matchPERM(N, DCI))
14030 return Perm;
14031 }
14032 }
14033
14034 // Detect identity v2i32 OR and replace with identity source node.
14035 // Specifically an Or that has operands constructed from the same source node
14036 // via extract_vector_elt and build_vector. I.E.
14037 // v2i32 or(
14038 // v2i32 build_vector(
14039 // i32 extract_elt(%IdentitySrc, 0),
14040 // i32 0
14041 // ),
14042 // v2i32 build_vector(
14043 // i32 0,
14044 // i32 extract_elt(%IdentitySrc, 1)
14045 // ) )
14046 // =>
14047 // v2i32 %IdentitySrc
14048
14049 if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
14050 RHS->getOpcode() == ISD::BUILD_VECTOR) {
14051
14052 ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
14053 ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
14054
14055 // Test for and normalise build vectors.
14056 if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
14057
14058 // Get the extract_vector_element operands.
14059 SDValue LEVE = LHS->getOperand(0);
14060 SDValue REVE = RHS->getOperand(1);
14061
14062 if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
14063 REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
14064 // Check that different elements from the same vector are
14065 // extracted.
14066 if (LEVE->getOperand(0) == REVE->getOperand(0) &&
14067 LEVE->getOperand(1) != REVE->getOperand(1)) {
14068 SDValue IdentitySrc = LEVE.getOperand(0);
14069 return IdentitySrc;
14070 }
14071 }
14072 }
14073 }
14074
14075 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14076 return SDValue();
14077
14078 // TODO: This could be a generic combine with a predicate for extracting the
14079 // high half of an integer being free.
14080
14081 // (or i64:x, (zero_extend i32:y)) ->
14082 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
14083 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
14084 RHS.getOpcode() != ISD::ZERO_EXTEND)
14085 std::swap(LHS, RHS);
14086
14087 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
14088 SDValue ExtSrc = RHS.getOperand(0);
14089 EVT SrcVT = ExtSrc.getValueType();
14090 if (SrcVT == MVT::i32) {
14091 SDLoc SL(N);
14092 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
14093 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
14094
14095 DCI.AddToWorklist(LowOr.getNode());
14096 DCI.AddToWorklist(HiBits.getNode());
14097
14098 SDValue Vec =
14099 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
14100 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
14101 }
14102 }
14103
14104 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
14105 if (CRHS) {
14106 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
14107 N->getOperand(0), CRHS))
14108 return Split;
14109 }
14110
14111 return SDValue();
14112}
14113
14114SDValue SITargetLowering::performXorCombine(SDNode *N,
14115 DAGCombinerInfo &DCI) const {
14116 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
14117 return RV;
14118
14119 SDValue LHS = N->getOperand(0);
14120 SDValue RHS = N->getOperand(1);
14121
14122 const ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
14123 SelectionDAG &DAG = DCI.DAG;
14124
14125 EVT VT = N->getValueType(0);
14126 if (CRHS && VT == MVT::i64) {
14127 if (SDValue Split =
14128 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
14129 return Split;
14130 }
14131
14132 // v2i32 (xor (vselect cc, x, y), K) ->
14133 // (v2i32 vselect cc, (xor x, K), (xor y, K)). This enables the xor to be
14134 // replaced with source modifiers when the select is lowered to CNDMASK.
14135 unsigned Opc = LHS.getOpcode();
14136 if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
14137 (Opc == ISD::SELECT && VT == MVT::i64)) &&
14138 CRHS && CRHS->getAPIntValue().isSignMask()) {
14139 SDValue CC = LHS->getOperand(0);
14140 SDValue TRUE = LHS->getOperand(1);
14141 SDValue FALSE = LHS->getOperand(2);
14142 SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS);
14143 SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS);
14144 SDValue XSelect =
14145 DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
14146 return XSelect;
14147 }
14148
14149 // Make sure to apply the 64-bit constant splitting fold before trying to fold
14150 // fneg-like xors into 64-bit select.
14151 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
14152 // This looks like an fneg, try to fold as a source modifier.
14153 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
14154 shouldFoldFNegIntoSrc(N, LHS)) {
14155 // xor (select c, a, b), 0x80000000 ->
14156 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
14157 SDLoc DL(N);
14158 SDValue CastLHS =
14159 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
14160 SDValue CastRHS =
14161 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
14162 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
14163 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
14164 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
14165 LHS->getOperand(0), FNegLHS, FNegRHS);
14166 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
14167 }
14168 }
14169
14170 return SDValue();
14171}
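// Annotation (illustrative, not part of the upstream source): xoring a value
// with the sign mask flips only its sign bit, which is an fneg on the bit
// pattern; pushing the xor into both arms of the select lets the fnegs be
// absorbed as source modifiers of the v_cndmask that the select becomes.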
14172
14173SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
14174 DAGCombinerInfo &DCI) const {
14175 if (!Subtarget->has16BitInsts() ||
14176 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14177 return SDValue();
14178
14179 EVT VT = N->getValueType(0);
14180 if (VT != MVT::i32)
14181 return SDValue();
14182
14183 SDValue Src = N->getOperand(0);
14184 if (Src.getValueType() != MVT::i16)
14185 return SDValue();
14186
14187 return SDValue();
14188}
14189
14190SDValue
14191SITargetLowering::performSignExtendInRegCombine(SDNode *N,
14192 DAGCombinerInfo &DCI) const {
14193 SDValue Src = N->getOperand(0);
14194 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
14195
14196 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
14197 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
14198 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
14199 VTSign->getVT() == MVT::i8) ||
14200 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
14201 VTSign->getVT() == MVT::i16))) {
14202 assert(Subtarget->hasScalarSubwordLoads() &&
14203 "s_buffer_load_{u8, i8} are supported "
14204 "in GFX12 (or newer) architectures.");
14205 EVT VT = Src.getValueType();
14206 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
14207 ? AMDGPUISD::SBUFFER_LOAD_BYTE
14208 : AMDGPUISD::SBUFFER_LOAD_SHORT;
14209 SDLoc DL(N);
14210 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
14211 SDValue Ops[] = {
14212 Src.getOperand(0), // source register
14213 Src.getOperand(1), // offset
14214 Src.getOperand(2) // cachePolicy
14215 };
14216 auto *M = cast<MemSDNode>(Src);
14217 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
14218 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14219 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
14220 return LoadVal;
14221 }
14222 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
14223 VTSign->getVT() == MVT::i8) ||
14224 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
14225 VTSign->getVT() == MVT::i16)) &&
14226 Src.hasOneUse()) {
14227 auto *M = cast<MemSDNode>(Src);
14228 SDValue Ops[] = {Src.getOperand(0), // Chain
14229 Src.getOperand(1), // rsrc
14230 Src.getOperand(2), // vindex
14231 Src.getOperand(3), // voffset
14232 Src.getOperand(4), // soffset
14233 Src.getOperand(5), // offset
14234 Src.getOperand(6), Src.getOperand(7)};
14235 // replace with BUFFER_LOAD_BYTE/SHORT
14236 SDVTList ResList =
14237 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
14238 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
14239 ? AMDGPUISD::BUFFER_LOAD_BYTE
14240 : AMDGPUISD::BUFFER_LOAD_SHORT;
14241 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
14242 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14243 return DCI.DAG.getMergeValues(
14244 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
14245 }
14246 return SDValue();
14247}
14248
14249SDValue SITargetLowering::performClassCombine(SDNode *N,
14250 DAGCombinerInfo &DCI) const {
14251 SelectionDAG &DAG = DCI.DAG;
14252 SDValue Mask = N->getOperand(1);
14253
14254 // fp_class x, 0 -> false
14255 if (isNullConstant(Mask))
14256 return DAG.getConstant(0, SDLoc(N), MVT::i1);
14257
14258 if (N->getOperand(0).isUndef())
14259 return DAG.getUNDEF(MVT::i1);
14260
14261 return SDValue();
14262}
14263
14264SDValue SITargetLowering::performRcpCombine(SDNode *N,
14265 DAGCombinerInfo &DCI) const {
14266 EVT VT = N->getValueType(0);
14267 SDValue N0 = N->getOperand(0);
14268
14269 if (N0.isUndef()) {
14270 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
14271 SDLoc(N), VT);
14272 }
14273
14274 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
14275 N0.getOpcode() == ISD::SINT_TO_FP)) {
14276 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
14277 N->getFlags());
14278 }
14279
14280 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
14281 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
14282 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
14283 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
14284 N->getFlags());
14285 }
14286
14288}
14289
14290 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
14291 unsigned MaxDepth) const {
14292 unsigned Opcode = Op.getOpcode();
14293 if (Opcode == ISD::FCANONICALIZE)
14294 return true;
14295
14296 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14297 const auto &F = CFP->getValueAPF();
14298 if (F.isNaN() && F.isSignaling())
14299 return false;
14300 if (!F.isDenormal())
14301 return true;
14302
14303 DenormalMode Mode =
14304 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
14305 return Mode == DenormalMode::getIEEE();
14306 }
14307
14308 // If source is a result of another standard FP operation it is already in
14309 // canonical form.
14310 if (MaxDepth == 0)
14311 return false;
14312
14313 switch (Opcode) {
14314 // These will flush denorms if required.
14315 case ISD::FADD:
14316 case ISD::FSUB:
14317 case ISD::FMUL:
14318 case ISD::FCEIL:
14319 case ISD::FFLOOR:
14320 case ISD::FMA:
14321 case ISD::FMAD:
14322 case ISD::FSQRT:
14323 case ISD::FDIV:
14324 case ISD::FREM:
14325 case ISD::FP_ROUND:
14326 case ISD::FP_EXTEND:
14327 case ISD::FP16_TO_FP:
14328 case ISD::FP_TO_FP16:
14329 case ISD::BF16_TO_FP:
14330 case ISD::FP_TO_BF16:
14331 case ISD::FLDEXP:
14332 case AMDGPUISD::FMUL_LEGACY:
14333 case AMDGPUISD::FMAD_FTZ:
14334 case AMDGPUISD::RCP:
14335 case AMDGPUISD::RSQ:
14336 case AMDGPUISD::RSQ_CLAMP:
14337 case AMDGPUISD::RCP_LEGACY:
14338 case AMDGPUISD::RCP_IFLAG:
14339 case AMDGPUISD::LOG:
14340 case AMDGPUISD::EXP:
14341 case AMDGPUISD::DIV_SCALE:
14342 case AMDGPUISD::DIV_FMAS:
14343 case AMDGPUISD::DIV_FIXUP:
14344 case AMDGPUISD::FRACT:
14345 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14346 case AMDGPUISD::CVT_F32_UBYTE0:
14347 case AMDGPUISD::CVT_F32_UBYTE1:
14348 case AMDGPUISD::CVT_F32_UBYTE2:
14349 case AMDGPUISD::CVT_F32_UBYTE3:
14350 case AMDGPUISD::FP_TO_FP16:
14351 case AMDGPUISD::SIN_HW:
14352 case AMDGPUISD::COS_HW:
14353 return true;
14354
14355 // It can/will be lowered or combined as a bit operation.
14356 // Need to check their input recursively to handle.
14357 case ISD::FNEG:
14358 case ISD::FABS:
14359 case ISD::FCOPYSIGN:
14360 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14361
14362 case ISD::AND:
14363 if (Op.getValueType() == MVT::i32) {
14364 // Be careful as we only know it is a bitcast floating point type. It
14365 // could be f32 or v2f16; we have no way of knowing. Luckily the constant
14366 // value that we optimize for, which comes up in fp32 to bf16 conversions,
14367 // is valid to optimize for all types.
14368 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
14369 if (RHS->getZExtValue() == 0xffff0000) {
14370 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14371 }
14372 }
14373 }
14374 break;
14375
14376 case ISD::FSIN:
14377 case ISD::FCOS:
14378 case ISD::FSINCOS:
14379 return Op.getValueType().getScalarType() != MVT::f16;
14380
14381 case ISD::FMINNUM:
14382 case ISD::FMAXNUM:
14383 case ISD::FMINNUM_IEEE:
14384 case ISD::FMAXNUM_IEEE:
14385 case ISD::FMINIMUM:
14386 case ISD::FMAXIMUM:
14387 case ISD::FMINIMUMNUM:
14388 case ISD::FMAXIMUMNUM:
14389 case AMDGPUISD::CLAMP:
14390 case AMDGPUISD::FMED3:
14391 case AMDGPUISD::FMAX3:
14392 case AMDGPUISD::FMIN3:
14393 case AMDGPUISD::FMAXIMUM3:
14394 case AMDGPUISD::FMINIMUM3: {
14395 // FIXME: Shouldn't treat the generic operations differently based on these.
14396 // However, we aren't really required to flush the result from
14397 // minnum/maxnum..
14398
14399 // snans will be quieted, so we only need to worry about denormals.
14400 if (Subtarget->supportsMinMaxDenormModes() ||
14401 // FIXME: denormalsEnabledForType is broken for dynamic
14402 denormalsEnabledForType(DAG, Op.getValueType()))
14403 return true;
14404
14405 // Flushing may be required.
14406 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
14407 // targets we need to check their input recursively.
14408
14409 // FIXME: Does this apply with clamp? It's implemented with max.
14410 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
14411 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
14412 return false;
14413 }
14414
14415 return true;
14416 }
14417 case ISD::SELECT: {
14418 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
14419 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
14420 }
14421 case ISD::BUILD_VECTOR: {
14422 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
14423 SDValue SrcOp = Op.getOperand(i);
14424 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
14425 return false;
14426 }
14427
14428 return true;
14429 }
14430 case ISD::EXTRACT_VECTOR_ELT:
14431 case ISD::EXTRACT_SUBVECTOR: {
14432 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14433 }
14434 case ISD::INSERT_VECTOR_ELT: {
14435 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
14436 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
14437 }
14438 case ISD::UNDEF:
14439 // Could be anything.
14440 return false;
14441
14442 case ISD::BITCAST:
14443 // TODO: This is incorrect as it loses track of the operand's type. We may
14444 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
14445 // same bits that are canonicalized in one type need not be in the other.
14446 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14447 case ISD::TRUNCATE: {
14448 // Hack around the mess we make when legalizing extract_vector_elt.
14449 if (Op.getValueType() == MVT::i16) {
14450 SDValue TruncSrc = Op.getOperand(0);
14451 if (TruncSrc.getValueType() == MVT::i32 &&
14452 TruncSrc.getOpcode() == ISD::BITCAST &&
14453 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
14454 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
14455 }
14456 }
14457 return false;
14458 }
14460 unsigned IntrinsicID = Op.getConstantOperandVal(0);
14461 // TODO: Handle more intrinsics
14462 switch (IntrinsicID) {
14463 case Intrinsic::amdgcn_cvt_pkrtz:
14464 case Intrinsic::amdgcn_cubeid:
14465 case Intrinsic::amdgcn_frexp_mant:
14466 case Intrinsic::amdgcn_fdot2:
14467 case Intrinsic::amdgcn_rcp:
14468 case Intrinsic::amdgcn_rsq:
14469 case Intrinsic::amdgcn_rsq_clamp:
14470 case Intrinsic::amdgcn_rcp_legacy:
14471 case Intrinsic::amdgcn_rsq_legacy:
14472 case Intrinsic::amdgcn_trig_preop:
14473 case Intrinsic::amdgcn_tanh:
14474 case Intrinsic::amdgcn_log:
14475 case Intrinsic::amdgcn_exp2:
14476 case Intrinsic::amdgcn_sqrt:
14477 return true;
14478 default:
14479 break;
14480 }
14481
14482 break;
14483 }
14484 default:
14485 break;
14486 }
14487
14488 // FIXME: denormalsEnabledForType is broken for dynamic
14489 return denormalsEnabledForType(DAG, Op.getValueType()) &&
14490 DAG.isKnownNeverSNaN(Op);
14491}
14492
14493 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
14494 unsigned MaxDepth) const {
14495 const MachineRegisterInfo &MRI = MF.getRegInfo();
14496 MachineInstr *MI = MRI.getVRegDef(Reg);
14497 unsigned Opcode = MI->getOpcode();
14498
14499 if (Opcode == AMDGPU::G_FCANONICALIZE)
14500 return true;
14501
14502 std::optional<FPValueAndVReg> FCR;
14503 // Constant splat (can be padded with undef) or scalar constant.
14505 if (FCR->Value.isSignaling())
14506 return false;
14507 if (!FCR->Value.isDenormal())
14508 return true;
14509
14510 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
14511 return Mode == DenormalMode::getIEEE();
14512 }
14513
14514 if (MaxDepth == 0)
14515 return false;
14516
14517 switch (Opcode) {
14518 case AMDGPU::G_FADD:
14519 case AMDGPU::G_FSUB:
14520 case AMDGPU::G_FMUL:
14521 case AMDGPU::G_FCEIL:
14522 case AMDGPU::G_FFLOOR:
14523 case AMDGPU::G_FRINT:
14524 case AMDGPU::G_FNEARBYINT:
14525 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14526 case AMDGPU::G_INTRINSIC_TRUNC:
14527 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14528 case AMDGPU::G_FMA:
14529 case AMDGPU::G_FMAD:
14530 case AMDGPU::G_FSQRT:
14531 case AMDGPU::G_FDIV:
14532 case AMDGPU::G_FREM:
14533 case AMDGPU::G_FPOW:
14534 case AMDGPU::G_FPEXT:
14535 case AMDGPU::G_FLOG:
14536 case AMDGPU::G_FLOG2:
14537 case AMDGPU::G_FLOG10:
14538 case AMDGPU::G_FPTRUNC:
14539 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14540 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14541 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14542 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14543 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14544 return true;
14545 case AMDGPU::G_FNEG:
14546 case AMDGPU::G_FABS:
14547 case AMDGPU::G_FCOPYSIGN:
14548 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
14549 case AMDGPU::G_FMINNUM:
14550 case AMDGPU::G_FMAXNUM:
14551 case AMDGPU::G_FMINNUM_IEEE:
14552 case AMDGPU::G_FMAXNUM_IEEE:
14553 case AMDGPU::G_FMINIMUM:
14554 case AMDGPU::G_FMAXIMUM:
14555 case AMDGPU::G_FMINIMUMNUM:
14556 case AMDGPU::G_FMAXIMUMNUM: {
14557 if (Subtarget->supportsMinMaxDenormModes() ||
14558 // FIXME: denormalsEnabledForType is broken for dynamic
14559 denormalsEnabledForType(MRI.getType(Reg), MF))
14560 return true;
14561
14562 [[fallthrough]];
14563 }
14564 case AMDGPU::G_BUILD_VECTOR:
14565 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
14566 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
14567 return false;
14568 return true;
14569 case AMDGPU::G_INTRINSIC:
14570 case AMDGPU::G_INTRINSIC_CONVERGENT:
14571 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
14572 case Intrinsic::amdgcn_fmul_legacy:
14573 case Intrinsic::amdgcn_fmad_ftz:
14574 case Intrinsic::amdgcn_sqrt:
14575 case Intrinsic::amdgcn_fmed3:
14576 case Intrinsic::amdgcn_sin:
14577 case Intrinsic::amdgcn_cos:
14578 case Intrinsic::amdgcn_log:
14579 case Intrinsic::amdgcn_exp2:
14580 case Intrinsic::amdgcn_log_clamp:
14581 case Intrinsic::amdgcn_rcp:
14582 case Intrinsic::amdgcn_rcp_legacy:
14583 case Intrinsic::amdgcn_rsq:
14584 case Intrinsic::amdgcn_rsq_clamp:
14585 case Intrinsic::amdgcn_rsq_legacy:
14586 case Intrinsic::amdgcn_div_scale:
14587 case Intrinsic::amdgcn_div_fmas:
14588 case Intrinsic::amdgcn_div_fixup:
14589 case Intrinsic::amdgcn_fract:
14590 case Intrinsic::amdgcn_cvt_pkrtz:
14591 case Intrinsic::amdgcn_cubeid:
14592 case Intrinsic::amdgcn_cubema:
14593 case Intrinsic::amdgcn_cubesc:
14594 case Intrinsic::amdgcn_cubetc:
14595 case Intrinsic::amdgcn_frexp_mant:
14596 case Intrinsic::amdgcn_fdot2:
14597 case Intrinsic::amdgcn_trig_preop:
14598 case Intrinsic::amdgcn_tanh:
14599 return true;
14600 default:
14601 break;
14602 }
14603
14604 [[fallthrough]];
14605 default:
14606 return false;
14607 }
14608
14609 llvm_unreachable("invalid operation");
14610}
14611
14612// Constant fold canonicalize.
14613SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
14614 const SDLoc &SL, EVT VT,
14615 const APFloat &C) const {
14616 // Flush denormals to 0 if not enabled.
14617 if (C.isDenormal()) {
14618 DenormalMode Mode =
14619 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
14620 if (Mode == DenormalMode::getPreserveSign()) {
14621 return DAG.getConstantFP(
14622 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
14623 }
14624
14625 if (Mode != DenormalMode::getIEEE())
14626 return SDValue();
14627 }
14628
14629 if (C.isNaN()) {
14630 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
14631 if (C.isSignaling()) {
14632 // Quiet a signaling NaN.
14633 // FIXME: Is this supposed to preserve payload bits?
14634 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14635 }
14636
14637 // Make sure it is the canonical NaN bitpattern.
14638 //
14639 // TODO: Can we use -1 as the canonical NaN value since it's an inline
14640 // immediate?
14641 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
14642 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14643 }
14644
14645 // Already canonical.
14646 return DAG.getConstantFP(C, SL, VT);
14647}
14648
14650 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
14651}
14652
14653SDValue
14654SITargetLowering::performFCanonicalizeCombine(SDNode *N,
14655 DAGCombinerInfo &DCI) const {
14656 SelectionDAG &DAG = DCI.DAG;
14657 SDValue N0 = N->getOperand(0);
14658 EVT VT = N->getValueType(0);
14659
14660 // fcanonicalize undef -> qnan
14661 if (N0.isUndef()) {
14663 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
14664 }
14665
14666 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
14667 EVT VT = N->getValueType(0);
14668 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
14669 }
14670
14671 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
14672 // (fcanonicalize k)
14673 //
14674 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
14675
14676 // TODO: This could be better with wider vectors that will be split to v2f16,
14677 // and to consider uses since there aren't that many packed operations.
14678 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
14679 isTypeLegal(MVT::v2f16)) {
14680 SDLoc SL(N);
14681 SDValue NewElts[2];
14682 SDValue Lo = N0.getOperand(0);
14683 SDValue Hi = N0.getOperand(1);
14684 EVT EltVT = Lo.getValueType();
14685
14687 for (unsigned I = 0; I != 2; ++I) {
14688 SDValue Op = N0.getOperand(I);
14689 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14690 NewElts[I] =
14691 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14692 } else if (Op.isUndef()) {
14693 // Handled below based on what the other operand is.
14694 NewElts[I] = Op;
14695 } else {
14696 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
14697 }
14698 }
14699
14700 // If one half is undef, and one is constant, prefer a splat vector rather
14701 // than the normal qNaN. If it's a register, prefer 0.0 since that's
14702 // cheaper to use and may be free with a packed operation.
14703 if (NewElts[0].isUndef()) {
14705 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
14706 ? NewElts[1]
14707 : DAG.getConstantFP(0.0f, SL, EltVT);
14708 }
14709
14710 if (NewElts[1].isUndef()) {
14711 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
14712 ? NewElts[0]
14713 : DAG.getConstantFP(0.0f, SL, EltVT);
14714 }
14715
14716 return DAG.getBuildVector(VT, SL, NewElts);
14717 }
14718 }
14719
14720 return SDValue();
14721}
14722
14723static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
14724 switch (Opc) {
14725 case ISD::FMAXNUM:
14726 case ISD::FMAXNUM_IEEE:
14727 case ISD::FMAXIMUMNUM:
14728 return AMDGPUISD::FMAX3;
14729 case ISD::FMAXIMUM:
14730 return AMDGPUISD::FMAXIMUM3;
14731 case ISD::SMAX:
14732 return AMDGPUISD::SMAX3;
14733 case ISD::UMAX:
14734 return AMDGPUISD::UMAX3;
14735 case ISD::FMINNUM:
14736 case ISD::FMINNUM_IEEE:
14737 case ISD::FMINIMUMNUM:
14738 return AMDGPUISD::FMIN3;
14739 case ISD::FMINIMUM:
14740 return AMDGPUISD::FMINIMUM3;
14741 case ISD::SMIN:
14742 return AMDGPUISD::SMIN3;
14743 case ISD::UMIN:
14744 return AMDGPUISD::UMIN3;
14745 default:
14746 llvm_unreachable("Not a min/max opcode");
14747 }
14748}
14749
14750SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
14751 const SDLoc &SL, SDValue Src,
14752 SDValue MinVal,
14753 SDValue MaxVal,
14754 bool Signed) const {
14755
14756 // med3 comes from
14757 // min(max(x, K0), K1), K0 < K1
14758 // max(min(x, K0), K1), K1 < K0
14759 //
14760 // "MinVal" and "MaxVal" respectively refer to the rhs of the
14761 // min/max op.
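// Illustrative example (values chosen for exposition): smin(smax(x, -4), 7)
// has MaxVal = -4 and MinVal = 7; since -4 < 7 it becomes smed3(x, -4, 7),
// clamping x to [-4, 7].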
14762 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
14763 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
14764
14765 if (!MinK || !MaxK)
14766 return SDValue();
14767
14768 if (Signed) {
14769 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
14770 return SDValue();
14771 } else {
14772 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
14773 return SDValue();
14774 }
14775
14776 EVT VT = MinK->getValueType(0);
14777 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
14778 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
14779 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
14780
14781 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
14782 // not available, but this is unlikely to be profitable as constants
14783 // will often need to be materialized & extended, especially on
14784 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
14785 return SDValue();
14786}
14787
14790 return C;
14791
14793 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
14794 return C;
14795 }
14796
14797 return nullptr;
14798}
14799
14800SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
14801 const SDLoc &SL, SDValue Op0,
14802 SDValue Op1) const {
14803 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
14804 if (!K1)
14805 return SDValue();
14806
14807 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
14808 if (!K0)
14809 return SDValue();
14810
14811 // Ordered >= (although NaN inputs should have folded away by now).
14812 if (K0->getValueAPF() > K1->getValueAPF())
14813 return SDValue();
14814
14815 // med3 with a nan input acts like
14816 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
14817 //
14818 // So with a signaling nan input, the result depends on whether the IEEE
14819 // mode bit is enabled.
14820 // ieee=1
14821 // s0 snan: yields s2
14822 // s1 snan: yields s2
14823 // s2 snan: qnan
14824
14825 // s0 qnan: min(s1, s2)
14826 // s1 qnan: min(s0, s2)
14827 // s2 qnan: min(s0, s1)
14828
14829 // ieee=0
14830 // s0 snan: min(s1, s2)
14831 // s1 snan: min(s0, s2)
14832 // s2 snan: qnan
14833
14834 // s0 qnan: min(s1, s2)
14835 // s1 qnan: min(s0, s2)
14836 // s2 qnan: min(s0, s1)
14837 const MachineFunction &MF = DAG.getMachineFunction();
14838 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
14839
14840 // TODO: Check whether the IEEE bit is enabled. We can form fmed3 with IEEE=0
14841 // regardless of whether the input is a signaling nan if op0 is fmaximum or
14842 // fmaximumnum. If op0 is fmaxnum_ieee, we can only form it when IEEE=1.
14843 EVT VT = Op0.getValueType();
14844 if (Info->getMode().DX10Clamp) {
14845 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
14846 // hardware fmed3 behavior converting to a min.
14847 // FIXME: Should this be allowing -0.0?
14848 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
14849 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
14850 }
14851
14852 // med3 for f16 is only available on gfx9+, and not available for v2f16.
14853 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
14854 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
14855 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
14856 // then give the other result, which is different from med3 with a NaN
14857 // input.
14858 SDValue Var = Op0.getOperand(0);
14859 if (!DAG.isKnownNeverSNaN(Var))
14860 return SDValue();
14861
14862 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14863
14864 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
14865 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
14866 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
14867 SDValue(K0, 0), SDValue(K1, 0));
14868 }
14869 }
14870
14871 return SDValue();
14872}
14873
14874/// \return true if the subtarget supports minimum3 and maximum3 with the given
14875/// base min/max opcode \p Opc for type \p VT.
14876static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
14877 EVT VT) {
14878 switch (Opc) {
14879 case ISD::FMINNUM:
14880 case ISD::FMAXNUM:
14881 case ISD::FMINNUM_IEEE:
14882 case ISD::FMAXNUM_IEEE:
14883 case ISD::FMINIMUMNUM:
14884 case ISD::FMAXIMUMNUM:
14887 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
14888 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
14889 case ISD::FMINIMUM:
14890 case ISD::FMAXIMUM:
14891 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
14892 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
14893 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
14894 case ISD::SMAX:
14895 case ISD::SMIN:
14896 case ISD::UMAX:
14897 case ISD::UMIN:
14898 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
14899 default:
14900 return false;
14901 }
14902
14903 llvm_unreachable("not a min/max opcode");
14904}
14905
14906SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
14907 DAGCombinerInfo &DCI) const {
14908 SelectionDAG &DAG = DCI.DAG;
14909
14910 EVT VT = N->getValueType(0);
14911 unsigned Opc = N->getOpcode();
14912 SDValue Op0 = N->getOperand(0);
14913 SDValue Op1 = N->getOperand(1);
14914
14915 // Only do this if the inner op has one use since this will just increase
14916 // register pressure for no benefit.
14917
14918 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
14919 // max(max(a, b), c) -> max3(a, b, c)
14920 // min(min(a, b), c) -> min3(a, b, c)
14921 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
14922 SDLoc DL(N);
14923 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14924 Op0.getOperand(0), Op0.getOperand(1), Op1);
14925 }
14926
14927 // Try commuted.
14928 // max(a, max(b, c)) -> max3(a, b, c)
14929 // min(a, min(b, c)) -> min3(a, b, c)
14930 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
14931 SDLoc DL(N);
14932 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14933 Op0, Op1.getOperand(0), Op1.getOperand(1));
14934 }
14935 }
14936
14937 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
14938 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
14939 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
14940 if (SDValue Med3 = performIntMed3ImmCombine(
14941 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
14942 return Med3;
14943 }
14944 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
14945 if (SDValue Med3 = performIntMed3ImmCombine(
14946 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
14947 return Med3;
14948 }
14949
14950 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
14951 if (SDValue Med3 = performIntMed3ImmCombine(
14952 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
14953 return Med3;
14954 }
14955 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
14956 if (SDValue Med3 = performIntMed3ImmCombine(
14957 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
14958 return Med3;
14959 }
14960
14961 // if !is_snan(x):
14962 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14963 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14964 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14965 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
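// Illustrative example: with dx10_clamp enabled, fminnum(fmaxnum(x, 0.0), 1.0)
// folds to clamp(x); otherwise, when x is known not to be a signaling NaN, it
// folds to fmed3(x, 0.0, 1.0).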
14966 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
14967 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
14968 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
14970 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
14971 (VT == MVT::f32 || VT == MVT::f64 ||
14972 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
14973 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
14974 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
14975 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
14976 Op0.hasOneUse()) {
14977 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
14978 return Res;
14979 }
14980
14981 // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
14982 // for some types, but at a higher cost since they're implemented with a
14983 // 3-operand form.
14984 const SDNodeFlags Flags = N->getFlags();
14985 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
14986 !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
14987 unsigned NewOpc =
14988 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
14989 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
14990 }
14991
14992 return SDValue();
14993}
14994
14998 // FIXME: Should this be allowing -0.0?
14999 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
15000 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
15001 }
15002 }
15003
15004 return false;
15005}
15006
15007// FIXME: Should only worry about snans for version with chain.
15008SDValue SITargetLowering::performFMed3Combine(SDNode *N,
15009 DAGCombinerInfo &DCI) const {
15010 EVT VT = N->getValueType(0);
15011 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
15012 // NaNs. With a NaN input, the order of the operands may change the result.
15013
15014 SelectionDAG &DAG = DCI.DAG;
15015 SDLoc SL(N);
15016
15017 SDValue Src0 = N->getOperand(0);
15018 SDValue Src1 = N->getOperand(1);
15019 SDValue Src2 = N->getOperand(2);
15020
15021 if (isClampZeroToOne(Src0, Src1)) {
15022 // const_a, const_b, x -> clamp is safe in all cases including signaling
15023 // nans.
15024 // FIXME: Should this be allowing -0.0?
15025 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
15026 }
15027
15028 const MachineFunction &MF = DAG.getMachineFunction();
15029 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15030
15031 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
15032 // handling no dx10-clamp?
15033 if (Info->getMode().DX10Clamp) {
15034 // If NaNs are clamped to 0, we are free to reorder the inputs.
15035
15036 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15037 std::swap(Src0, Src1);
15038
15039 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
15040 std::swap(Src1, Src2);
15041
15042 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15043 std::swap(Src0, Src1);
15044
15045 if (isClampZeroToOne(Src1, Src2))
15046 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
15047 }
15048
15049 return SDValue();
15050}
15051
15052SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
15053 DAGCombinerInfo &DCI) const {
15054 SDValue Src0 = N->getOperand(0);
15055 SDValue Src1 = N->getOperand(1);
15056 if (Src0.isUndef() && Src1.isUndef())
15057 return DCI.DAG.getUNDEF(N->getValueType(0));
15058 return SDValue();
15059}
15060
15061// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
15062// expanded into a set of cmp/select instructions.
15064 unsigned NumElem,
15065 bool IsDivergentIdx,
15066 const GCNSubtarget *Subtarget) {
15068 return false;
15069
15070 unsigned VecSize = EltSize * NumElem;
15071
15072 // Sub-dword vectors of 2 dwords or less have a better implementation.
15073 if (VecSize <= 64 && EltSize < 32)
15074 return false;
15075
15076 // Always expand the remaining sub-dword vectors, otherwise they will be
15077 // lowered via memory.
15078 if (EltSize < 32)
15079 return true;
15080
15081 // Always do this if var-idx is divergent, otherwise it will become a loop.
15082 if (IsDivergentIdx)
15083 return true;
15084
15085 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
15086 unsigned NumInsts = NumElem /* Number of compares */ +
15087 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
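// Illustrative cost estimate: a uniform-index extract from v8i64 (NumElem = 8,
// EltSize = 64) needs 8 compares + 16 cndmasks = 24 instructions, which
// exceeds both limits below, so it is not expanded when VGPR indexing or
// movrel is available.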
15088
15089 // On some architectures (GFX9) movrel is not available and it's better
15090 // to expand.
15091 if (Subtarget->useVGPRIndexMode())
15092 return NumInsts <= 16;
15093
15094 // If movrel is available, use it instead of expanding for vectors of 8
15095 // elements.
15096 if (Subtarget->hasMovrel())
15097 return NumInsts <= 15;
15098
15099 return true;
15100}
15101
15103 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
15104 if (isa<ConstantSDNode>(Idx))
15105 return false;
15106
15107 SDValue Vec = N->getOperand(0);
15108 EVT VecVT = Vec.getValueType();
15109 EVT EltVT = VecVT.getVectorElementType();
15110 unsigned EltSize = EltVT.getSizeInBits();
15111 unsigned NumElem = VecVT.getVectorNumElements();
15112
15114 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
15115}
15116
15117SDValue
15118SITargetLowering::performExtractVectorEltCombine(SDNode *N,
15119 DAGCombinerInfo &DCI) const {
15120 SDValue Vec = N->getOperand(0);
15121 SelectionDAG &DAG = DCI.DAG;
15122
15123 EVT VecVT = Vec.getValueType();
15124 EVT VecEltVT = VecVT.getVectorElementType();
15125 EVT ResVT = N->getValueType(0);
15126
15127 unsigned VecSize = VecVT.getSizeInBits();
15128 unsigned VecEltSize = VecEltVT.getSizeInBits();
15129
15130 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
15132 SDLoc SL(N);
15133 SDValue Idx = N->getOperand(1);
15134 SDValue Elt =
15135 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
15136 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
15137 }
15138
15139 // (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
15140 // -> (and (extract_vector_element {y0, y1}, index), 0x1f)
15141 // There are optimisations to transform 64-bit shifts into 32-bit shifts
15142 // depending on the shift operand. See e.g. performSraCombine().
15143 // This combine ensures that the optimisation is compatible with v2i32
15144 // legalised AND.
15145 if (VecVT == MVT::v2i32 && Vec->getOpcode() == ISD::AND &&
15146 Vec->getOperand(1)->getOpcode() == ISD::BUILD_VECTOR) {
15147
15149 if (!C || C->getZExtValue() != 0x1f)
15150 return SDValue();
15151
15152 SDLoc SL(N);
15153 SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
15154 SDValue EVE = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
15155 Vec->getOperand(0), N->getOperand(1));
15156 SDValue A = DAG.getNode(ISD::AND, SL, MVT::i32, EVE, AndMask);
15157 DAG.ReplaceAllUsesWith(N, A.getNode());
15158 }
15159
15160 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
15161 // =>
15162 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
15163 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
15164 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
15165 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15166 SDLoc SL(N);
15167 SDValue Idx = N->getOperand(1);
15168 unsigned Opc = Vec.getOpcode();
15169
15170 switch (Opc) {
15171 default:
15172 break;
15173 // TODO: Support other binary operations.
15174 case ISD::FADD:
15175 case ISD::FSUB:
15176 case ISD::FMUL:
15177 case ISD::ADD:
15178 case ISD::UMIN:
15179 case ISD::UMAX:
15180 case ISD::SMIN:
15181 case ISD::SMAX:
15182 case ISD::FMAXNUM:
15183 case ISD::FMINNUM:
15184 case ISD::FMAXNUM_IEEE:
15185 case ISD::FMINNUM_IEEE:
15186 case ISD::FMAXIMUM:
15187 case ISD::FMINIMUM: {
15188 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15189 Vec.getOperand(0), Idx);
15190 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15191 Vec.getOperand(1), Idx);
15192
15193 DCI.AddToWorklist(Elt0.getNode());
15194 DCI.AddToWorklist(Elt1.getNode());
15195 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
15196 }
15197 }
15198 }
15199
15200 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
15202 SDLoc SL(N);
15203 SDValue Idx = N->getOperand(1);
15204 SDValue V;
15205 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15206 SDValue IC = DAG.getVectorIdxConstant(I, SL);
15207 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
15208 if (I == 0)
15209 V = Elt;
15210 else
15211 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
15212 }
15213 return V;
15214 }
15215
15216 if (!DCI.isBeforeLegalize())
15217 return SDValue();
15218
15219 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
15220 // elements. This exposes more load reduction opportunities by replacing
15221 // multiple small extract_vector_elements with a single 32-bit extract.
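// Illustrative example: extracting element 5 of a loaded v8i8 gives
// BitIndex = 40, so EltIdx = 1 and LeftoverBitIdx = 8; we extract 32-bit
// element 1 of the bitcast vector, shift it right by 8, and truncate to i8.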
15222 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
15223 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
15224 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15225 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
15226
15227 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15228 unsigned EltIdx = BitIndex / 32;
15229 unsigned LeftoverBitIdx = BitIndex % 32;
15230 SDLoc SL(N);
15231
15232 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
15233 DCI.AddToWorklist(Cast.getNode());
15234
15235 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
15236 DAG.getConstant(EltIdx, SL, MVT::i32));
15237 DCI.AddToWorklist(Elt.getNode());
15238 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
15239 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
15240 DCI.AddToWorklist(Srl.getNode());
15241
15242 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
15243 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
15244 DCI.AddToWorklist(Trunc.getNode());
15245
15246 if (VecEltVT == ResVT) {
15247 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
15248 }
15249
15250 assert(ResVT.isScalarInteger());
15251 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
15252 }
15253
15254 return SDValue();
15255}
15256
15257SDValue
15258SITargetLowering::performInsertVectorEltCombine(SDNode *N,
15259 DAGCombinerInfo &DCI) const {
15260 SDValue Vec = N->getOperand(0);
15261 SDValue Idx = N->getOperand(2);
15262 EVT VecVT = Vec.getValueType();
15263 EVT EltVT = VecVT.getVectorElementType();
15264
15265 // INSERT_VECTOR_ELT (<n x e>, var-idx)
15266 // => BUILD_VECTOR n x select (e, const-idx)
15268 return SDValue();
15269
15270 SelectionDAG &DAG = DCI.DAG;
15271 SDLoc SL(N);
15272 SDValue Ins = N->getOperand(1);
15273 EVT IdxVT = Idx.getValueType();
15274
15276 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15277 SDValue IC = DAG.getConstant(I, SL, IdxVT);
15278 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
15279 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
15280 Ops.push_back(V);
15281 }
15282
15283 return DAG.getBuildVector(VecVT, SL, Ops);
15284}
15285
15286/// Return the source of an fp_extend from f16 to f32, or a converted FP
15287/// constant.
15289 if (Src.getOpcode() == ISD::FP_EXTEND &&
15290 Src.getOperand(0).getValueType() == MVT::f16) {
15291 return Src.getOperand(0);
15292 }
15293
15294 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
15295 APFloat Val = CFP->getValueAPF();
15296 bool LosesInfo = true;
15298 if (!LosesInfo)
15299 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
15300 }
15301
15302 return SDValue();
15303}
15304
15305SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
15306 DAGCombinerInfo &DCI) const {
15307 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15308 "combine only useful on gfx8");
15309
15310 SDValue TruncSrc = N->getOperand(0);
15311 EVT VT = N->getValueType(0);
15312 if (VT != MVT::f16)
15313 return SDValue();
15314
15315 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
15316 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
15317 return SDValue();
15318
15319 SelectionDAG &DAG = DCI.DAG;
15320 SDLoc SL(N);
15321
15322 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
15323 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
15324 // casting back.
15325
15326 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
15327 // fmin(fmax(a, b), fmax(fmin(a, b), c))
15328 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
15329 if (!A)
15330 return SDValue();
15331
15332 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
15333 if (!B)
15334 return SDValue();
15335
15336 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
15337 if (!C)
15338 return SDValue();
15339
15340 // This changes signaling nan behavior. If an input is a signaling nan, it
15341 // would have been quieted by the fpext originally. We don't care because
15342 // these are unconstrained ops. If we needed to insert quieting canonicalizes
15343 // we would be worse off than just doing the promotion.
15344 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
15345 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
15346 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
15347 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
15348}
15349
15350unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
15351 const SDNode *N0,
15352 const SDNode *N1) const {
15353 EVT VT = N0->getValueType(0);
15354
15355 // Only do this if we are not trying to support denormals. v_mad_f32 does not
15356 // support denormals ever.
15357 if (((VT == MVT::f32 &&
15359 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15362 return ISD::FMAD;
15363
15364 const TargetOptions &Options = DAG.getTarget().Options;
15365 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
15366 (N0->getFlags().hasAllowContract() &&
15367 N1->getFlags().hasAllowContract())) &&
15369 return ISD::FMA;
15370 }
15371
15372 return 0;
15373}
15374
15375// For a reassociatable opcode perform:
15376// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
15377SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
15378 SelectionDAG &DAG) const {
15379 EVT VT = N->getValueType(0);
15380 if (VT != MVT::i32 && VT != MVT::i64)
15381 return SDValue();
15382
15383 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
15384 return SDValue();
15385
15386 unsigned Opc = N->getOpcode();
15387 SDValue Op0 = N->getOperand(0);
15388 SDValue Op1 = N->getOperand(1);
15389
15390 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
15391 return SDValue();
15392
15393 if (Op0->isDivergent())
15394 std::swap(Op0, Op1);
15395
15396 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
15397 return SDValue();
15398
15399 SDValue Op2 = Op1.getOperand(1);
15400 Op1 = Op1.getOperand(0);
15401 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
15402 return SDValue();
15403
15404 if (Op1->isDivergent())
15405 std::swap(Op1, Op2);
15406
15407 SDLoc SL(N);
15408 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
15409 return DAG.getNode(Opc, SL, VT, Add1, Op2);
15410}
15411
15412static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
15413 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
15415 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
15416 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
15417 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
15418}
15419
15420// Fold
15421// y = lshr i64 x, 32
15422// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
15423// with Const.hi == -1
15424// To
15425 // res = mad_u64_u32 y.lo, Const.lo, x.lo
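// Illustrative example (constants chosen for exposition): with
// x = 0x0000000200000007 and Const = 0xFFFFFFFF00000003, y = x >> 32 = 2 and
// res = 2 * Const + x = 13 (mod 2^64), matching
// mad_u64_u32(y.lo = 2, Const.lo = 3, x.lo = 7) = 13.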
15427 SDValue MulLHS, SDValue MulRHS,
15428 SDValue AddRHS) {
15429 if (MulRHS.getOpcode() == ISD::SRL)
15430 std::swap(MulLHS, MulRHS);
15431
15432 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
15433 return SDValue();
15434
15435 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
15436 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
15437 MulLHS.getOperand(0) != AddRHS)
15438 return SDValue();
15439
15441 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
15442 return SDValue();
15443
15444 SDValue ConstMul =
15445 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
15446 return getMad64_32(DAG, SL, MVT::i64,
15447 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
15448 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
15449}
15450
15451// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
15452// multiplies, if any.
15453//
15454// Full 64-bit multiplies that feed into an addition are lowered here instead
15455// of using the generic expansion. The generic expansion ends up with
15456// a tree of ADD nodes that prevents us from using the "add" part of the
15457// MAD instruction. The expansion produced here results in a chain of ADDs
15458// instead of a tree.
15459SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
15460 DAGCombinerInfo &DCI) const {
15461 assert(N->isAnyAdd());
15462
15463 SelectionDAG &DAG = DCI.DAG;
15464 EVT VT = N->getValueType(0);
15465 SDLoc SL(N);
15466 SDValue LHS = N->getOperand(0);
15467 SDValue RHS = N->getOperand(1);
15468
15469 if (VT.isVector())
15470 return SDValue();
15471
15472 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
15473 // result in scalar registers for uniform values.
15474 if (!N->isDivergent() && Subtarget->hasSMulHi())
15475 return SDValue();
15476
15477 unsigned NumBits = VT.getScalarSizeInBits();
15478 if (NumBits <= 32 || NumBits > 64)
15479 return SDValue();
15480
15481 if (LHS.getOpcode() != ISD::MUL) {
15482 assert(RHS.getOpcode() == ISD::MUL);
15483 std::swap(LHS, RHS);
15484 }
15485
15486 // Avoid the fold if it would unduly increase the number of multiplies due to
15487 // multiple uses, except on hardware with full-rate multiply-add (which is
15488 // part of full-rate 64-bit ops).
15489 if (!Subtarget->hasFullRate64Ops()) {
15490 unsigned NumUsers = 0;
15491 for (SDNode *User : LHS->users()) {
15492 // There is a use that does not feed into addition, so the multiply can't
15493 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
15494 if (!User->isAnyAdd())
15495 return SDValue();
15496
15497 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
15498 // MUL + 3xADD + 3xADDC over 3xMAD.
15499 ++NumUsers;
15500 if (NumUsers >= 3)
15501 return SDValue();
15502 }
15503 }
15504
15505 SDValue MulLHS = LHS.getOperand(0);
15506 SDValue MulRHS = LHS.getOperand(1);
15507 SDValue AddRHS = RHS;
15508
15509 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
15510 return FoldedMAD;
15511
15512 // Always check whether operands are small unsigned values, since that
15513 // knowledge is useful in more cases. Check for small signed values only if
15514 // doing so can unlock a shorter code sequence.
15515 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
15516 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
15517
15518 bool MulSignedLo = false;
15519 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
15520 MulSignedLo =
15521 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
15522 }
15523
15524 // The operands and final result all have the same number of bits. If
15525 // operands need to be extended, they can be extended with garbage. The
15526 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
15527 // truncated away in the end.
15528 if (VT != MVT::i64) {
15529 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
15530 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
15531 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
15532 }
15533
15534 // The basic code generated is conceptually straightforward. Pseudo code:
15535 //
15536 // accum = mad_64_32 lhs.lo, rhs.lo, accum
15537 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
15538 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
15539 //
15540 // The second and third lines are optional, depending on whether the factors
15541 // are {sign,zero}-extended or not.
15542 //
15543 // The actual DAG is noisier than the pseudo code, but only due to
15544 // instructions that disassemble values into low and high parts, and
15545 // assemble the final result.
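// For example, when both factors are already known to fit in 32 unsigned bits,
// only the first mad_64_32 line is emitted and the high-part fixups below are
// skipped.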
15546 SDValue One = DAG.getConstant(1, SL, MVT::i32);
15547
15548 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
15549 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
15550 SDValue Accum =
15551 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
15552
15553 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
15554 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
15555
15556 if (!MulLHSUnsigned32) {
15557 auto MulLHSHi =
15558 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
15559 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
15560 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15561 }
15562
15563 if (!MulRHSUnsigned32) {
15564 auto MulRHSHi =
15565 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
15566 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
15567 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15568 }
15569
15570 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
15571 Accum = DAG.getBitcast(MVT::i64, Accum);
15572 }
15573
15574 if (VT != MVT::i64)
15575 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
15576 return Accum;
15577}
15578
15579SDValue
15580SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
15581 DAGCombinerInfo &DCI) const {
15582 SDValue RHS = N->getOperand(1);
15583 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
15584 if (!CRHS)
15585 return SDValue();
15586
15587 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
15588 // common.
15589 uint64_t Val = CRHS->getZExtValue();
15590 if (countr_zero(Val) >= 32) {
15591 SelectionDAG &DAG = DCI.DAG;
15592 SDLoc SL(N);
15593 SDValue LHS = N->getOperand(0);
15594
15595 // Avoid carry machinery if we know the low half of the add does not
15596 // contribute to the final result.
15597 //
15598 // add i64:x, K if computeTrailingZeros(K) >= 32
15599 // => build_pair (add x.hi, K.hi), x.lo
15600
15601 // Breaking the 64-bit add here with this strange constant is unlikely
15602 // to interfere with addressing mode patterns.
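// Illustrative example: add x, 0x0000000500000000 (K.hi = 5, K.lo = 0) becomes
// build_pair (add x.hi, 5), x.lo; the low half cannot produce a carry.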
15603
15604 SDValue Hi = getHiHalf64(LHS, DAG);
15605 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
15606 unsigned Opcode = N->getOpcode();
15607 if (Opcode == ISD::PTRADD)
15608 Opcode = ISD::ADD;
15609 SDValue AddHi =
15610 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
15611
15612 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
15613 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
15614 }
15615
15616 return SDValue();
15617}
15618
15619 // Collect the ultimate src of each of the mul node's operands, and confirm
15620 // each operand is 8 bits wide (a single byte).
15621static std::optional<ByteProvider<SDValue>>
15622handleMulOperand(const SDValue &MulOperand) {
15623 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
15624 if (!Byte0 || Byte0->isConstantZero()) {
15625 return std::nullopt;
15626 }
15627 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
15628 if (Byte1 && !Byte1->isConstantZero()) {
15629 return std::nullopt;
15630 }
15631 return Byte0;
15632}
15633
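// Merge two v_perm selector masks that use 0x0C as the "select zero" byte.
// Illustrative: addPermMasks(0x000C0C0C, 0x0C010C0C) == 0x00010C0C; a byte
// stays 0x0C only if both inputs zero it.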
15634static unsigned addPermMasks(unsigned First, unsigned Second) {
15635 unsigned FirstCs = First & 0x0c0c0c0c;
15636 unsigned SecondCs = Second & 0x0c0c0c0c;
15637 unsigned FirstNoCs = First & ~0x0c0c0c0c;
15638 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
15639
15640 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
15641 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
15642 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
15643 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
15644
15645 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
15646}
15647
15648struct DotSrc {
15650 int64_t PermMask;
15652};
15653
15657 SmallVectorImpl<DotSrc> &Src1s, int Step) {
15658
15659 assert(Src0.Src.has_value() && Src1.Src.has_value());
15660 // Src0s and Src1s are empty, just place arbitrarily.
15661 if (Step == 0) {
15662 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
15663 Src0.SrcOffset / 4});
15664 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
15665 Src1.SrcOffset / 4});
15666 return;
15667 }
15668
15669 for (int BPI = 0; BPI < 2; BPI++) {
15670 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
15671 if (BPI == 1) {
15672 BPP = {Src1, Src0};
15673 }
15674 unsigned ZeroMask = 0x0c0c0c0c;
15675 unsigned FMask = 0xFF << (8 * (3 - Step));
15676
15677 unsigned FirstMask =
15678 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15679 unsigned SecondMask =
15680 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15681 // Attempt to find the Src vector which contains our SDValue; if so, add our
15682 // perm mask to the existing one. If we are unable to find a match for the
15683 // first SDValue, attempt to find a match for the second.
15684 int FirstGroup = -1;
15685 for (int I = 0; I < 2; I++) {
15686 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
15687 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
15688 return IterElt.SrcOp == *BPP.first.Src &&
15689 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15690 };
15691
15692 auto *Match = llvm::find_if(Srcs, MatchesFirst);
15693 if (Match != Srcs.end()) {
15694 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
15695 FirstGroup = I;
15696 break;
15697 }
15698 }
15699 if (FirstGroup != -1) {
15700 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
15701 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
15702 return IterElt.SrcOp == *BPP.second.Src &&
15703 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15704 };
15705 auto *Match = llvm::find_if(Srcs, MatchesSecond);
15706 if (Match != Srcs.end()) {
15707 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
15708 } else
15709 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15710 return;
15711 }
15712 }
15713
15714 // If we have made it here, then we could not find a match in Src0s or Src1s
15715 // for either Src0 or Src1, so just place them arbitrarily.
15716
15717 unsigned ZeroMask = 0x0c0c0c0c;
15718 unsigned FMask = 0xFF << (8 * (3 - Step));
15719
15720 Src0s.push_back(
15721 {*Src0.Src,
15722 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15723 Src0.SrcOffset / 4});
15724 Src1s.push_back(
15725 {*Src1.Src,
15726 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15727 Src1.SrcOffset / 4});
15728}
15729
15731 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
15732 bool IsAny) {
15733
15734 // If we have only one source, just permute it accordingly.
15735 if (Srcs.size() == 1) {
15736 auto *Elt = Srcs.begin();
15737 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
15738
15739 // v_perm will produce the original value
15740 if (Elt->PermMask == 0x3020100)
15741 return EltOp;
15742
15743 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15744 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
15745 }
15746
15747 auto *FirstElt = Srcs.begin();
15748 auto *SecondElt = std::next(FirstElt);
15749
15751
15752 // If we have multiple sources in the chain, combine them via perms (using
15753 // calculated perm mask) and Ors.
15754 while (true) {
15755 auto FirstMask = FirstElt->PermMask;
15756 auto SecondMask = SecondElt->PermMask;
15757
15758 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15759 unsigned FirstPlusFour = FirstMask | 0x04040404;
15760 // Bias the used selector bytes by 4 (an OR with 0x04 suffices since they are
15761 // in 0-3); the final OR with FirstCs keeps any original 0x0C (zero) selectors.
15762 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
15763
15764 auto PermMask = addPermMasks(FirstMask, SecondMask);
15765 auto FirstVal =
15766 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15767 auto SecondVal =
15768 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
15769
15770 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
15771 SecondVal,
15772 DAG.getConstant(PermMask, SL, MVT::i32)));
15773
15774 FirstElt = std::next(SecondElt);
15775 if (FirstElt == Srcs.end())
15776 break;
15777
15778 SecondElt = std::next(FirstElt);
15779 // If we only have a FirstElt, then just combine that into the cumulative
15780 // source node.
15781 if (SecondElt == Srcs.end()) {
15782 auto EltOp =
15783 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15784
15785 Perms.push_back(
15786 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15787 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
15788 break;
15789 }
15790 }
15791
15792 assert(Perms.size() == 1 || Perms.size() == 2);
15793 return Perms.size() == 2
15794 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
15795 : Perms[0];
15796}
15797
15798static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
15799 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15800 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15801 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15802 EntryMask += ZeroMask;
15803 }
15804}
15805
15806static bool isMul(const SDValue Op) {
15807 auto Opcode = Op.getOpcode();
15808
15809 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
15810 Opcode == AMDGPUISD::MUL_I24);
15811}
15812
15813static std::optional<bool>
15815 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
15816 const SDValue &S1Op, const SelectionDAG &DAG) {
15817 // If both ops are i8s (pre legalize-dag), then the signedness semantics
15818 // of the dot4 are irrelevant.
15819 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
15820 return false;
15821
15822 auto Known0 = DAG.computeKnownBits(S0Op, 0);
15823 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
15824 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
15825 auto Known1 = DAG.computeKnownBits(S1Op, 0);
15826 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
15827 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
15828
15829 assert(!(S0IsUnsigned && S0IsSigned));
15830 assert(!(S1IsUnsigned && S1IsSigned));
15831
15832 // There are 9 possible permutations of
15833 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
15834
15835 // In two permutations, the sign bits are known to be the same for both Ops,
15836 // so simply return Signed / Unsigned corresponding to the MSB.
15837
15838 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
15839 return S0IsSigned;
15840
15841 // In another two permutations, the sign bits are known to be opposite. In
15842 // this case return std::nullopt to indicate a bad match.
15843
15844 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
15845 return std::nullopt;
15846
15847 // In the remaining five permutations, we don't know the value of the sign
15848 // bit for at least one Op. Since we have a valid ByteProvider, we know that
15849 // the upper bits must be extension bits. Thus, the only ways for the sign
15850 // bit to be unknown are if it was sign extended from an unknown value, or if
15851 // it was any extended. In either case, it is correct to use the signed
15852 // version of dot4's signedness semantics.
15853
15854 // In two of these permutations, we know the sign bit is set for
15855 // one op, and the other is unknown. It is okay to use the signed version of
15856 // dot4.
15857 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
15858 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
15859 return true;
15860
15861 // In one such permutation, we don't know either of the sign bits. It is okay
15862 // to use the signed version of dot4.
15863 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
15864 return true;
15865
15866 // In two of these permutations, we know the sign bit is unset for
15867 // one op, and the other is unknown. Return std::nullopt to indicate a
15868 // bad match.
15869 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
15870 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
15871 return std::nullopt;
15872
15873 llvm_unreachable("Fully covered condition");
15874}
15875
15876SDValue SITargetLowering::performAddCombine(SDNode *N,
15877 DAGCombinerInfo &DCI) const {
15878 SelectionDAG &DAG = DCI.DAG;
15879 EVT VT = N->getValueType(0);
15880 SDLoc SL(N);
15881 SDValue LHS = N->getOperand(0);
15882 SDValue RHS = N->getOperand(1);
15883
15884 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
15885 if (Subtarget->hasMad64_32()) {
15886 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
15887 return Folded;
15888 }
15889 }
15890
15891 if (SDValue V = reassociateScalarOps(N, DAG)) {
15892 return V;
15893 }
15894
15895 if (VT == MVT::i64) {
15896 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15897 return Folded;
15898 }
15899
15900 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
15901 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
15902 SDValue TempNode(N, 0);
15903 std::optional<bool> IsSigned;
15907
15908 // Match the v_dot4 tree, while collecting src nodes.
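// Roughly, the shape being matched is a chain like
//   add (mul a0, b0), (add (mul a1, b1), (add (mul a2, b2), (mul a3, b3)))
// (optionally ending in a non-mul accumulator instead), where a0..a3 and
// b0..b3 are bytes of the two i32 sources; it then maps onto
// amdgcn_sdot4/udot4(SrcA, SrcB, acc).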
15909 int ChainLength = 0;
15910 for (int I = 0; I < 4; I++) {
15911 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
15912 if (MulIdx == -1)
15913 break;
15914 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
15915 if (!Src0)
15916 break;
15917 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
15918 if (!Src1)
15919 break;
15920
15921 auto IterIsSigned = checkDot4MulSignedness(
15922 TempNode->getOperand(MulIdx), *Src0, *Src1,
15923 TempNode->getOperand(MulIdx)->getOperand(0),
15924 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
15925 if (!IterIsSigned)
15926 break;
15927 if (!IsSigned)
15928 IsSigned = *IterIsSigned;
15929 if (*IterIsSigned != *IsSigned)
15930 break;
15931 placeSources(*Src0, *Src1, Src0s, Src1s, I);
15932 auto AddIdx = 1 - MulIdx;
15933 // Allow the special case where add (add (mul24, 0), mul24) was simplified to
15934 // add (mul24, mul24).
15935 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
15936 Src2s.push_back(TempNode->getOperand(AddIdx));
15937 auto Src0 =
15938 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
15939 if (!Src0)
15940 break;
15941 auto Src1 =
15942 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
15943 if (!Src1)
15944 break;
15945 auto IterIsSigned = checkDot4MulSignedness(
15946 TempNode->getOperand(AddIdx), *Src0, *Src1,
15947 TempNode->getOperand(AddIdx)->getOperand(0),
15948 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
15949 if (!IterIsSigned)
15950 break;
15951 assert(IsSigned);
15952 if (*IterIsSigned != *IsSigned)
15953 break;
15954 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
15955 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
15956 ChainLength = I + 2;
15957 break;
15958 }
15959
15960 TempNode = TempNode->getOperand(AddIdx);
15961 Src2s.push_back(TempNode);
15962 ChainLength = I + 1;
15963 if (TempNode->getNumOperands() < 2)
15964 break;
15965 LHS = TempNode->getOperand(0);
15966 RHS = TempNode->getOperand(1);
15967 }
15968
15969 if (ChainLength < 2)
15970 return SDValue();
15971
15972 // Masks were constructed with the assumption that we would find a chain of
15973 // length 4. If not, then we need to zero out the most significant bytes (via
15974 // a perm mask of 0x0c) so they do not affect the dot calculation.
15975 if (ChainLength < 4) {
15976 fixMasks(Src0s, ChainLength);
15977 fixMasks(Src1s, ChainLength);
15978 }
15979
15980 SDValue Src0, Src1;
15981
15982 // If we are just using a single source for both, and have permuted the
15983 // bytes consistently, we can just use the sources without permuting
15984 // (commutation).
15985 bool UseOriginalSrc = false;
15986 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
15987 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
15988 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
15989 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
15990 SmallVector<unsigned, 4> SrcBytes;
15991 auto Src0Mask = Src0s.begin()->PermMask;
15992 SrcBytes.push_back(Src0Mask & 0xFF000000);
15993 bool UniqueEntries = true;
15994 for (auto I = 1; I < 4; I++) {
15995 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
15996
15997 if (is_contained(SrcBytes, NextByte)) {
15998 UniqueEntries = false;
15999 break;
16000 }
16001 SrcBytes.push_back(NextByte);
16002 }
16003
16004 if (UniqueEntries) {
16005 UseOriginalSrc = true;
16006
16007 auto *FirstElt = Src0s.begin();
16008 auto FirstEltOp =
16009 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
16010
16011 auto *SecondElt = Src1s.begin();
16012 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
16013 SecondElt->DWordOffset);
16014
16015 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
16016 MVT::getIntegerVT(32));
16017 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
16018 MVT::getIntegerVT(32));
16019 }
16020 }
16021
16022 if (!UseOriginalSrc) {
16023 Src0 = resolveSources(DAG, SL, Src0s, false, true);
16024 Src1 = resolveSources(DAG, SL, Src1s, false, true);
16025 }
16026
16027 assert(IsSigned);
16028 SDValue Src2 =
16029 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16030
16031 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
16032 : Intrinsic::amdgcn_udot4,
16033 SL, MVT::i64);
16034
16035 assert(!VT.isVector());
16036 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
16037 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
16038
16039 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
16040 }
16041
16042 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16043 return SDValue();
16044
16045 // add x, zext (setcc) => uaddo_carry x, 0, setcc
16046 // add x, sext (setcc) => usubo_carry x, 0, setcc
16047 unsigned Opc = LHS.getOpcode();
16050 std::swap(RHS, LHS);
16051
16052 Opc = RHS.getOpcode();
16053 switch (Opc) {
16054 default:
16055 break;
16056 case ISD::ZERO_EXTEND:
16057 case ISD::SIGN_EXTEND:
16058 case ISD::ANY_EXTEND: {
16059 auto Cond = RHS.getOperand(0);
16060 // If this won't be a real VOPC output, we would still need to insert an
16061 // extra instruction anyway.
16062 if (!isBoolSGPR(Cond))
16063 break;
16064 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16065 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16067 return DAG.getNode(Opc, SL, VTList, Args);
16068 }
16069 case ISD::UADDO_CARRY: {
16070 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
16071 if (!isNullConstant(RHS.getOperand(1)))
16072 break;
16073 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
16074 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
16075 }
16076 }
16077 return SDValue();
16078}
16079
16080SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
16081 DAGCombinerInfo &DCI) const {
16082 SelectionDAG &DAG = DCI.DAG;
16083 SDLoc DL(N);
16084 EVT VT = N->getValueType(0);
16085 SDValue N0 = N->getOperand(0);
16086 SDValue N1 = N->getOperand(1);
16087
16088 // The following folds transform PTRADDs into regular arithmetic in cases
16089 // where the PTRADD wouldn't be folded as an immediate offset into memory
16090 // instructions anyway. They are target-specific in that other targets might
16091 // prefer to not lose information about the pointer arithmetic.
16092
16093 // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
16094 // Adapted from DAGCombiner::visitADDLikeCommutative.
16095 SDValue V, K;
16096 if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
16097 SDNodeFlags ShlFlags = N1->getFlags();
16098 // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
16099 // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
16100 // preserved.
16101 SDNodeFlags NewShlFlags =
16102 ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
16103 ? SDNodeFlags::NoSignedWrap
16104 : SDNodeFlags();
16105 SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
16106 DCI.AddToWorklist(Inner.getNode());
16107 return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
16108 }
16109
16110 // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
16111 // performAddCombine.
16112 if (N1.getOpcode() == ISD::MUL) {
16113 if (Subtarget->hasMad64_32()) {
16114 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16115 return Folded;
16116 }
16117 }
16118
16119 // If the 32 low bits of the constant are all zero, there is nothing to fold
16120 // into an immediate offset, so it's better to eliminate the unnecessary
16121 // addition for the lower 32 bits than to preserve the PTRADD.
16122 // Analogous to a fold in performAddCombine.
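// For example, for (ptradd p, 0x100000000) the low 32 bits of the constant
// are zero, so only the high half needs an add and keeping the PTRADD buys
// no extra offset folding.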
16123 if (VT == MVT::i64) {
16124 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16125 return Folded;
16126 }
16127
16128 if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) {
16129 // Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c) v) with
16130 // global address GA and constant c, such that c can be folded into GA.
16131 SDValue GAValue = N0.getOperand(0);
16132 if (const GlobalAddressSDNode *GA =
16133 dyn_cast<GlobalAddressSDNode>(GAValue)) {
16134 if (DCI.isBeforeLegalizeOps() && isOffsetFoldingLegal(GA)) {
16135 // If both additions in the original were NUW, reassociation preserves
16136 // that.
16137 SDNodeFlags Flags =
16138 (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
16139 SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags);
16140 DCI.AddToWorklist(Inner.getNode());
16141 return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags);
16142 }
16143 }
16144 }
16145
16146 if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
16147 return SDValue();
16148
16149 // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
16150 // y is not, and (add y, z) is used only once.
16151 // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
16152 // z is not, and (add y, z) is used only once.
16153 // The goal is to move constant offsets to the outermost ptradd, to create
16154 // more opportunities to fold offsets into memory instructions.
16155 // Together with the generic combines in DAGCombiner.cpp, this also
16156 // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
16157 //
16158 // This transform is here instead of in the general DAGCombiner as it can
16159 // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
16160 // AArch64's CPA.
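// Illustrative example: (ptradd p, (add idx, 16)) becomes
// (ptradd (ptradd p, idx), 16), so the constant 16 can later be folded as an
// immediate offset of the memory access.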
16161 SDValue X = N0;
16162 SDValue Y = N1.getOperand(0);
16163 SDValue Z = N1.getOperand(1);
16164 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
16165 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
16166
16167 // If both additions in the original were NUW, reassociation preserves that.
16168 SDNodeFlags ReassocFlags =
16169 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
16170
16171 if (ZIsConstant != YIsConstant) {
16172 if (YIsConstant)
16173 std::swap(Y, Z);
16174 SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
16175 DCI.AddToWorklist(Inner.getNode());
16176 return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags);
16177 }
16178
16179 // If one of Y and Z is constant, they have been handled above. If both were
16180 // constant, the addition would have been folded in SelectionDAG::getNode
16181 // already. This ensures that the generic DAG combines won't undo the
16182 // following reassociation.
16183 assert(!YIsConstant && !ZIsConstant);
16184
16185 if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) {
16186 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
16187 // y are uniform and z isn't.
16188 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
16189 // z are uniform and y isn't.
16190 // The goal is to push uniform operands up in the computation, so that they
16191 // can be handled with scalar operations. We can't use reassociateScalarOps
16192 // for this since it requires two identical commutative operations to
16193 // reassociate.
16194 if (Y->isDivergent())
16195 std::swap(Y, Z);
16196 SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
16197 DCI.AddToWorklist(UniformInner.getNode());
16198 return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
16199 }
16200
16201 return SDValue();
16202}
16203
16204SDValue SITargetLowering::performSubCombine(SDNode *N,
16205 DAGCombinerInfo &DCI) const {
16206 SelectionDAG &DAG = DCI.DAG;
16207 EVT VT = N->getValueType(0);
16208
16209 if (VT == MVT::i64) {
16210 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16211 return Folded;
16212 }
16213
16214 if (VT != MVT::i32)
16215 return SDValue();
16216
16217 SDLoc SL(N);
16218 SDValue LHS = N->getOperand(0);
16219 SDValue RHS = N->getOperand(1);
16220
16221 // sub x, zext (setcc) => usubo_carry x, 0, setcc
16222 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
16223 unsigned Opc = RHS.getOpcode();
16224 switch (Opc) {
16225 default:
16226 break;
16227 case ISD::ZERO_EXTEND:
16228 case ISD::SIGN_EXTEND:
16229 case ISD::ANY_EXTEND: {
16230 auto Cond = RHS.getOperand(0);
16231 // If this won't be a real VOPC output, we would still need to insert an
16232 // extra instruction anyway.
16233 if (!isBoolSGPR(Cond))
16234 break;
16235 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16236 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16237 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
16238 return DAG.getNode(Opc, SL, VTList, Args);
16239 }
16240 }
16241
16242 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
16243 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
16244 if (!isNullConstant(LHS.getOperand(1)))
16245 return SDValue();
16246 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
16247 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
16248 }
16249 return SDValue();
16250}
16251
16252SDValue
16253SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
16254 DAGCombinerInfo &DCI) const {
16255
16256 if (N->getValueType(0) != MVT::i32)
16257 return SDValue();
16258
16259 if (!isNullConstant(N->getOperand(1)))
16260 return SDValue();
16261
16262 SelectionDAG &DAG = DCI.DAG;
16263 SDValue LHS = N->getOperand(0);
16264
16265 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
16266 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
16267 unsigned LHSOpc = LHS.getOpcode();
16268 unsigned Opc = N->getOpcode();
16269 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
16270 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
16271 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
16272 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
16273 }
16274 return SDValue();
16275}
16276
16277SDValue SITargetLowering::performFAddCombine(SDNode *N,
16278 DAGCombinerInfo &DCI) const {
16279 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16280 return SDValue();
16281
16282 SelectionDAG &DAG = DCI.DAG;
16283 EVT VT = N->getValueType(0);
16284
16285 SDLoc SL(N);
16286 SDValue LHS = N->getOperand(0);
16287 SDValue RHS = N->getOperand(1);
16288
16289 // These should really be instruction patterns, but writing patterns with
16290 // source modifiers is a pain.
16291
16292 // fadd (fadd (a, a), b) -> mad 2.0, a, b
16293 if (LHS.getOpcode() == ISD::FADD) {
16294 SDValue A = LHS.getOperand(0);
16295 if (A == LHS.getOperand(1)) {
16296 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16297 if (FusedOp != 0) {
16298 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16299 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
16300 }
16301 }
16302 }
16303
16304 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
16305 if (RHS.getOpcode() == ISD::FADD) {
16306 SDValue A = RHS.getOperand(0);
16307 if (A == RHS.getOperand(1)) {
16308 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16309 if (FusedOp != 0) {
16310 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16311 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
16312 }
16313 }
16314 }
16315
16316 return SDValue();
16317}
16318
16319SDValue SITargetLowering::performFSubCombine(SDNode *N,
16320 DAGCombinerInfo &DCI) const {
16321 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16322 return SDValue();
16323
16324 SelectionDAG &DAG = DCI.DAG;
16325 SDLoc SL(N);
16326 EVT VT = N->getValueType(0);
16327 assert(!VT.isVector());
16328
16329 // Try to get the fneg to fold into the source modifier. This undoes generic
16330 // DAG combines and folds them into the mad.
16331 //
16332 // Only do this if we are not trying to support denormals. v_mad_f32 does
16333 // not support denormals ever.
16334 SDValue LHS = N->getOperand(0);
16335 SDValue RHS = N->getOperand(1);
16336 if (LHS.getOpcode() == ISD::FADD) {
16337 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
16338 SDValue A = LHS.getOperand(0);
16339 if (A == LHS.getOperand(1)) {
16340 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16341 if (FusedOp != 0) {
16342 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16343 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
16344
16345 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
16346 }
16347 }
16348 }
16349
16350 if (RHS.getOpcode() == ISD::FADD) {
16351 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
16352
16353 SDValue A = RHS.getOperand(0);
16354 if (A == RHS.getOperand(1)) {
16355 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16356 if (FusedOp != 0) {
16357 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
16358 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
16359 }
16360 }
16361 }
16362
16363 return SDValue();
16364}
16365
16366SDValue SITargetLowering::performFDivCombine(SDNode *N,
16367 DAGCombinerInfo &DCI) const {
16368 SelectionDAG &DAG = DCI.DAG;
16369 SDLoc SL(N);
16370 EVT VT = N->getValueType(0);
16371 if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts())
16372 return SDValue();
16373
16374 SDValue LHS = N->getOperand(0);
16375 SDValue RHS = N->getOperand(1);
16376
16377 SDNodeFlags Flags = N->getFlags();
16378 SDNodeFlags RHSFlags = RHS->getFlags();
16379 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
16380 !RHS->hasOneUse())
16381 return SDValue();
16382
16383 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
16384 bool IsNegative = false;
16385 if (CLHS->isExactlyValue(1.0) ||
16386 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16387 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
16388 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
16389 if (RHS.getOpcode() == ISD::FSQRT) {
16390 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
16391 SDValue Rsq =
16392 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
16393 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
16394 }
16395 }
16396 }
16397
16398 return SDValue();
16399}
16400
16401SDValue SITargetLowering::performFMulCombine(SDNode *N,
16402 DAGCombinerInfo &DCI) const {
16403 SelectionDAG &DAG = DCI.DAG;
16404 EVT VT = N->getValueType(0);
16405 EVT ScalarVT = VT.getScalarType();
16406 EVT IntVT = VT.changeElementType(MVT::i32);
16407
16408 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
16409 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16410 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
16411 return SDValue();
16412 }
16413
16414 SDValue LHS = N->getOperand(0);
16415 SDValue RHS = N->getOperand(1);
16416
16417 // It is cheaper to materialize i32 inline constants than f16 or f64
16418 // (or even non-inline f32) values; this can be done via ldexp,
16419 // as shown below:
16420 //
16421 // Given : A = 2^a & B = 2^b ; where a and b are integers.
16422 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
16423 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
16424 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16425 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
16426 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
16427 if (!TrueNode)
16428 return SDValue();
16429 const ConstantFPSDNode *FalseNode =
16430 isConstOrConstSplatFP(RHS.getOperand(2));
16431 if (!FalseNode)
16432 return SDValue();
16433
16434 if (TrueNode->isNegative() != FalseNode->isNegative())
16435 return SDValue();
16436
16437 // For f32, only non-inline constants should be transformed.
16438 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16439 if (ScalarVT == MVT::f32 &&
16440 TII->isInlineConstant(TrueNode->getValueAPF()) &&
16441 TII->isInlineConstant(FalseNode->getValueAPF()))
16442 return SDValue();
16443
16444 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
16445 if (TrueNodeExpVal == INT_MIN)
16446 return SDValue();
16447 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
16448 if (FalseNodeExpVal == INT_MIN)
16449 return SDValue();
16450
16451 SDLoc SL(N);
16452 SDValue SelectNode =
16453 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
16454 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
16455 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
16456
16457 LHS = TrueNode->isNegative()
16458 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
16459 : LHS;
16460
16461 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
16462 }
16463
16464 return SDValue();
16465}
16466
16467SDValue SITargetLowering::performFMACombine(SDNode *N,
16468 DAGCombinerInfo &DCI) const {
16469 SelectionDAG &DAG = DCI.DAG;
16470 EVT VT = N->getValueType(0);
16471 SDLoc SL(N);
16472
16473 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16474 return SDValue();
16475
16476 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
16477 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
16478 SDValue Op1 = N->getOperand(0);
16479 SDValue Op2 = N->getOperand(1);
16480 SDValue FMA = N->getOperand(2);
16481
16482 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
16483 Op2.getOpcode() != ISD::FP_EXTEND)
16484 return SDValue();
16485
16486 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
16487 // regardless of the denorm mode setting. Therefore,
16488 // fp-contract is sufficient to allow generating fdot2.
16489 const TargetOptions &Options = DAG.getTarget().Options;
16490 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
16491 (N->getFlags().hasAllowContract() &&
16492 FMA->getFlags().hasAllowContract())) {
16493 Op1 = Op1.getOperand(0);
16494 Op2 = Op2.getOperand(0);
16495 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16496 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16497 return SDValue();
16498
16499 SDValue Vec1 = Op1.getOperand(0);
16500 SDValue Idx1 = Op1.getOperand(1);
16501 SDValue Vec2 = Op2.getOperand(0);
16502
16503 SDValue FMAOp1 = FMA.getOperand(0);
16504 SDValue FMAOp2 = FMA.getOperand(1);
16505 SDValue FMAAcc = FMA.getOperand(2);
16506
16507 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
16508 FMAOp2.getOpcode() != ISD::FP_EXTEND)
16509 return SDValue();
16510
16511 FMAOp1 = FMAOp1.getOperand(0);
16512 FMAOp2 = FMAOp2.getOperand(0);
16513 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16514 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16515 return SDValue();
16516
16517 SDValue Vec3 = FMAOp1.getOperand(0);
16518 SDValue Vec4 = FMAOp2.getOperand(0);
16519 SDValue Idx2 = FMAOp1.getOperand(1);
16520
16521 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
16522 // Idx1 and Idx2 cannot be the same.
16523 Idx1 == Idx2)
16524 return SDValue();
16525
16526 if (Vec1 == Vec2 || Vec3 == Vec4)
16527 return SDValue();
16528
16529 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
16530 return SDValue();
16531
16532 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16533 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
16534 DAG.getTargetConstant(0, SL, MVT::i1));
16535 }
16536 }
16537 return SDValue();
16538}
16539
16540SDValue SITargetLowering::performSetCCCombine(SDNode *N,
16541 DAGCombinerInfo &DCI) const {
16542 SelectionDAG &DAG = DCI.DAG;
16543 SDLoc SL(N);
16544
16545 SDValue LHS = N->getOperand(0);
16546 SDValue RHS = N->getOperand(1);
16547 EVT VT = LHS.getValueType();
16548 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
16549
16550 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
16551 if (!CRHS) {
16552 CRHS = dyn_cast<ConstantSDNode>(LHS);
16553 if (CRHS) {
16554 std::swap(LHS, RHS);
16555 CC = getSetCCSwappedOperands(CC);
16556 }
16557 }
16558
16559 if (CRHS) {
16560 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
16561 isBoolSGPR(LHS.getOperand(0))) {
16562 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
16563 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
16564 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
16565 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
16566 if ((CRHS->isAllOnes() &&
16567 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
16568 (CRHS->isZero() &&
16569 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
16570 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16571 DAG.getAllOnesConstant(SL, MVT::i1));
16572 if ((CRHS->isAllOnes() &&
16573 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
16574 (CRHS->isZero() &&
16575 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
16576 return LHS.getOperand(0);
16577 }
16578
16579 const APInt &CRHSVal = CRHS->getAPIntValue();
16580 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
16581 LHS.getOpcode() == ISD::SELECT &&
16582 isa<ConstantSDNode>(LHS.getOperand(1)) &&
16583 isa<ConstantSDNode>(LHS.getOperand(2)) &&
16584 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
16585 isBoolSGPR(LHS.getOperand(0))) {
16586 // Given CT != FT:
16587 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
16588 // setcc (select cc, CT, CF), CF, ne => cc
16589 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
16590 // setcc (select cc, CT, CF), CT, eq => cc
16591 const APInt &CT = LHS.getConstantOperandAPInt(1);
16592 const APInt &CF = LHS.getConstantOperandAPInt(2);
16593
16594 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
16595 (CT == CRHSVal && CC == ISD::SETNE))
16596 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16597 DAG.getAllOnesConstant(SL, MVT::i1));
16598 if ((CF == CRHSVal && CC == ISD::SETNE) ||
16599 (CT == CRHSVal && CC == ISD::SETEQ))
16600 return LHS.getOperand(0);
16601 }
16602 }
16603
16604 if (VT != MVT::f32 && VT != MVT::f64 &&
16605 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16606 return SDValue();
16607
16608 // Match isinf/isfinite pattern
16609 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
16610 // (fcmp one (fabs x), inf) -> (fp_class x,
16611 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
16612 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
16613 LHS.getOpcode() == ISD::FABS) {
16614 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
16615 if (!CRHS)
16616 return SDValue();
16617
16618 const APFloat &APF = CRHS->getValueAPF();
16619 if (APF.isInfinity() && !APF.isNegative()) {
16620 const unsigned IsInfMask =
16621 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
16622 const unsigned IsFiniteMask =
16623 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
16624 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
16625 SIInstrFlags::P_SUBNORMAL;
16626 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
16627 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
16628 DAG.getConstant(Mask, SL, MVT::i32));
16629 }
16630 }
16631
16632 return SDValue();
16633}
16634
16635SDValue
16636SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
16637 DAGCombinerInfo &DCI) const {
16638 SelectionDAG &DAG = DCI.DAG;
16639 SDLoc SL(N);
16640 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
16641
16642 SDValue Src = N->getOperand(0);
16643 SDValue Shift = N->getOperand(0);
16644
16645 // TODO: Extend type shouldn't matter (assuming legal types).
16646 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
16647 Shift = Shift.getOperand(0);
16648
16649 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
16650 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
16651 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
16652 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
16653 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
16654 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
16655 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
16656 SDValue Shifted = DAG.getZExtOrTrunc(
16657 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
16658
16659 unsigned ShiftOffset = 8 * Offset;
16660 if (Shift.getOpcode() == ISD::SHL)
16661 ShiftOffset -= C->getZExtValue();
16662 else
16663 ShiftOffset += C->getZExtValue();
16664
16665 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16666 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
16667 MVT::f32, Shifted);
16668 }
16669 }
16670 }
16671
16672 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16673 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
16674 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
16675 // We simplified Src. If this node is not dead, visit it again so it is
16676 // folded properly.
16677 if (N->getOpcode() != ISD::DELETED_NODE)
16678 DCI.AddToWorklist(N);
16679 return SDValue(N, 0);
16680 }
16681
16682 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
16683 if (SDValue DemandedSrc =
16684 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
16685 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
16686
16687 return SDValue();
16688}
16689
16690SDValue SITargetLowering::performClampCombine(SDNode *N,
16691 DAGCombinerInfo &DCI) const {
16692 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
16693 if (!CSrc)
16694 return SDValue();
16695
16696 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16697 const APFloat &F = CSrc->getValueAPF();
16698 APFloat Zero = APFloat::getZero(F.getSemantics());
16699 if (F < Zero ||
16700 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16701 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
16702 }
16703
16704 APFloat One(F.getSemantics(), "1.0");
16705 if (F > One)
16706 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
16707
16708 return SDValue(CSrc, 0);
16709}
16710
16711SDValue SITargetLowering::performSelectCombine(SDNode *N,
16712 DAGCombinerInfo &DCI) const {
16713
16714 // Try to fold CMP + SELECT patterns with shared constants (both FP and
16715 // integer).
16716 // Detect when CMP and SELECT use the same constant and fold them to avoid
16717 // loading the constant twice. Specifically handles patterns like:
16718 // %cmp = icmp eq i32 %val, 4242
16719 // %sel = select i1 %cmp, i32 4242, i32 %other
16720 // It can be optimized to reuse %val instead of 4242 in select.
16721 SDValue Cond = N->getOperand(0);
16722 SDValue TrueVal = N->getOperand(1);
16723 SDValue FalseVal = N->getOperand(2);
16724
16725 // Check if condition is a comparison.
16726 if (Cond.getOpcode() != ISD::SETCC)
16727 return SDValue();
16728
16729 SDValue LHS = Cond.getOperand(0);
16730 SDValue RHS = Cond.getOperand(1);
16731 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
16732
16733 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
16734 bool isInteger = LHS.getValueType().isInteger();
16735
16736 // Handle simple floating-point and integer types only.
16737 if (!isFloatingPoint && !isInteger)
16738 return SDValue();
16739
16740 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
16741 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
16742 if (!isEquality && !isNonEquality)
16743 return SDValue();
16744
16745 SDValue ArgVal, ConstVal;
16746 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
16747 (isInteger && isa<ConstantSDNode>(RHS))) {
16748 ConstVal = RHS;
16749 ArgVal = LHS;
16750 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
16751 (isInteger && isa<ConstantSDNode>(LHS))) {
16752 ConstVal = LHS;
16753 ArgVal = RHS;
16754 } else {
16755 return SDValue();
16756 }
16757
16758 // Skip optimization for inlinable immediates.
16759 if (isFloatingPoint) {
16760 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
16761 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
16762 return SDValue();
16763 } else {
16764 if (AMDGPU::isInlinableIntLiteral(
16765 cast<ConstantSDNode>(ConstVal)->getSExtValue()))
16766 return SDValue();
16767 }
16768
16769 // For equality and non-equality comparisons, patterns:
16770 // select (setcc x, const), const, y -> select (setcc x, const), x, y
16771 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
16772 if (!(isEquality && TrueVal == ConstVal) &&
16773 !(isNonEquality && FalseVal == ConstVal))
16774 return SDValue();
16775
16776 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
16777 SDValue SelectRHS =
16778 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
16779 return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
16780 SelectLHS, SelectRHS);
16781}
16782
16783 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
16784 DAGCombinerInfo &DCI) const {
16785 switch (N->getOpcode()) {
16786 case ISD::ADD:
16787 case ISD::SUB:
16788 case ISD::SHL:
16789 case ISD::SRL:
16790 case ISD::SRA:
16791 case ISD::AND:
16792 case ISD::OR:
16793 case ISD::XOR:
16794 case ISD::MUL:
16795 case ISD::SETCC:
16796 case ISD::SELECT:
16797 case ISD::SMIN:
16798 case ISD::SMAX:
16799 case ISD::UMIN:
16800 case ISD::UMAX:
16801 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
16802 return Res;
16803 break;
16804 default:
16805 break;
16806 }
16807
16808 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
16809 return SDValue();
16810
16811 switch (N->getOpcode()) {
16812 case ISD::ADD:
16813 return performAddCombine(N, DCI);
16814 case ISD::PTRADD:
16815 return performPtrAddCombine(N, DCI);
16816 case ISD::SUB:
16817 return performSubCombine(N, DCI);
16818 case ISD::UADDO_CARRY:
16819 case ISD::USUBO_CARRY:
16820 return performAddCarrySubCarryCombine(N, DCI);
16821 case ISD::FADD:
16822 return performFAddCombine(N, DCI);
16823 case ISD::FSUB:
16824 return performFSubCombine(N, DCI);
16825 case ISD::FDIV:
16826 return performFDivCombine(N, DCI);
16827 case ISD::FMUL:
16828 return performFMulCombine(N, DCI);
16829 case ISD::SETCC:
16830 return performSetCCCombine(N, DCI);
16831 case ISD::SELECT:
16832 if (auto Res = performSelectCombine(N, DCI))
16833 return Res;
16834 break;
16835 case ISD::FMAXNUM:
16836 case ISD::FMINNUM:
16837 case ISD::FMAXNUM_IEEE:
16838 case ISD::FMINNUM_IEEE:
16839 case ISD::FMAXIMUM:
16840 case ISD::FMINIMUM:
16841 case ISD::FMAXIMUMNUM:
16842 case ISD::FMINIMUMNUM:
16843 case ISD::SMAX:
16844 case ISD::SMIN:
16845 case ISD::UMAX:
16846 case ISD::UMIN:
16847 case AMDGPUISD::FMIN_LEGACY:
16848 case AMDGPUISD::FMAX_LEGACY:
16849 return performMinMaxCombine(N, DCI);
16850 case ISD::FMA:
16851 return performFMACombine(N, DCI);
16852 case ISD::AND:
16853 return performAndCombine(N, DCI);
16854 case ISD::OR:
16855 return performOrCombine(N, DCI);
16856 case ISD::FSHR: {
16857 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16858 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
16859 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
16860 return matchPERM(N, DCI);
16861 }
16862 break;
16863 }
16864 case ISD::XOR:
16865 return performXorCombine(N, DCI);
16866 case ISD::ZERO_EXTEND:
16867 return performZeroExtendCombine(N, DCI);
16868 case ISD::SIGN_EXTEND_INREG:
16869 return performSignExtendInRegCombine(N, DCI);
16870 case AMDGPUISD::FP_CLASS:
16871 return performClassCombine(N, DCI);
16872 case ISD::FCANONICALIZE:
16873 return performFCanonicalizeCombine(N, DCI);
16874 case AMDGPUISD::RCP:
16875 return performRcpCombine(N, DCI);
16876 case ISD::FLDEXP:
16877 case AMDGPUISD::FRACT:
16878 case AMDGPUISD::RSQ:
16879 case AMDGPUISD::RCP_LEGACY:
16880 case AMDGPUISD::RCP_IFLAG:
16881 case AMDGPUISD::RSQ_CLAMP: {
16882 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
16883 SDValue Src = N->getOperand(0);
16884 if (Src.isUndef())
16885 return Src;
16886 break;
16887 }
16888 case ISD::SINT_TO_FP:
16889 case ISD::UINT_TO_FP:
16890 return performUCharToFloatCombine(N, DCI);
16891 case ISD::FCOPYSIGN:
16892 return performFCopySignCombine(N, DCI);
16893 case AMDGPUISD::CVT_F32_UBYTE0:
16894 case AMDGPUISD::CVT_F32_UBYTE1:
16895 case AMDGPUISD::CVT_F32_UBYTE2:
16896 case AMDGPUISD::CVT_F32_UBYTE3:
16897 return performCvtF32UByteNCombine(N, DCI);
16898 case AMDGPUISD::FMED3:
16899 return performFMed3Combine(N, DCI);
16900 case AMDGPUISD::CVT_PKRTZ_F16_F32:
16901 return performCvtPkRTZCombine(N, DCI);
16902 case AMDGPUISD::CLAMP:
16903 return performClampCombine(N, DCI);
16904 case ISD::SCALAR_TO_VECTOR: {
16905 SelectionDAG &DAG = DCI.DAG;
16906 EVT VT = N->getValueType(0);
16907
16908 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
16909 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
16910 SDLoc SL(N);
16911 SDValue Src = N->getOperand(0);
16912 EVT EltVT = Src.getValueType();
16913 if (EltVT != MVT::i16)
16914 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
16915
16916 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
16917 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
16918 }
16919
16920 break;
16921 }
16922 case ISD::EXTRACT_VECTOR_ELT:
16923 return performExtractVectorEltCombine(N, DCI);
16924 case ISD::INSERT_VECTOR_ELT:
16925 return performInsertVectorEltCombine(N, DCI);
16926 case ISD::FP_ROUND:
16927 return performFPRoundCombine(N, DCI);
16928 case ISD::LOAD: {
16929 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
16930 return Widened;
16931 [[fallthrough]];
16932 }
16933 default: {
16934 if (!DCI.isBeforeLegalize()) {
16935 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
16936 return performMemSDNodeCombine(MemNode, DCI);
16937 }
16938
16939 break;
16940 }
16941 }
16942
16944}
16945
16946/// Helper function for adjustWritemask
16947static unsigned SubIdx2Lane(unsigned Idx) {
16948 switch (Idx) {
16949 default:
16950 return ~0u;
16951 case AMDGPU::sub0:
16952 return 0;
16953 case AMDGPU::sub1:
16954 return 1;
16955 case AMDGPU::sub2:
16956 return 2;
16957 case AMDGPU::sub3:
16958 return 3;
16959 case AMDGPU::sub4:
16960 return 4; // Possible with TFE/LWE
16961 }
16962}
16963
16964/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
16965SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
16966 SelectionDAG &DAG) const {
16967 unsigned Opcode = Node->getMachineOpcode();
16968
16969 // Subtract 1 because the vdata output is not a MachineSDNode operand.
16970 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
16971 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
16972 return Node; // not implemented for D16
16973
16974 SDNode *Users[5] = {nullptr};
16975 unsigned Lane = 0;
16976 unsigned DmaskIdx =
16977 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
16978 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
16979 unsigned NewDmask = 0;
16980 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
16981 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
16982 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
16983 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
16984 unsigned TFCLane = 0;
16985 bool HasChain = Node->getNumValues() > 1;
16986
16987 if (OldDmask == 0) {
16988 // These are folded out, but on the chance it happens don't assert.
16989 return Node;
16990 }
16991
16992 unsigned OldBitsSet = llvm::popcount(OldDmask);
16993 // Work out which is the TFE/LWE lane if that is enabled.
16994 if (UsesTFC) {
16995 TFCLane = OldBitsSet;
16996 }
16997
16998 // Try to figure out the used register components
16999 for (SDUse &Use : Node->uses()) {
17000
17001 // Don't look at users of the chain.
17002 if (Use.getResNo() != 0)
17003 continue;
17004
17005 SDNode *User = Use.getUser();
17006
17007 // Abort if we can't understand the usage
17008 if (!User->isMachineOpcode() ||
17009 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17010 return Node;
17011
17012 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
17013 // Note that subregs are packed, i.e. Lane==0 is the first bit set
17014 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
17015 // set, etc.
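// For example, if OldDmask is 0b1010 (components Y and W enabled), Lane 0 of
// the result corresponds to Y and Lane 1 to W.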
17016 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
17017 if (Lane == ~0u)
17018 return Node;
17019
17020 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
17021 if (UsesTFC && Lane == TFCLane) {
17022 Users[Lane] = User;
17023 } else {
17024 // Set which texture component corresponds to the lane.
17025 unsigned Comp;
17026 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17027 Comp = llvm::countr_zero(Dmask);
17028 Dmask &= ~(1 << Comp);
17029 }
17030
17031 // Abort if we have more than one user per component.
17032 if (Users[Lane])
17033 return Node;
17034
17035 Users[Lane] = User;
17036 NewDmask |= 1 << Comp;
17037 }
17038 }
17039
17040 // Don't allow 0 dmask, as hardware assumes one channel enabled.
17041 bool NoChannels = !NewDmask;
17042 if (NoChannels) {
17043 if (!UsesTFC) {
17044 // No uses of the result and not using TFC. Then do nothing.
17045 return Node;
17046 }
17047 // If the original dmask has one channel - then nothing to do
17048 if (OldBitsSet == 1)
17049 return Node;
17050 // Use an arbitrary dmask - required for the instruction to work
17051 NewDmask = 1;
17052 }
17053 // Abort if there's no change
17054 if (NewDmask == OldDmask)
17055 return Node;
17056
17057 unsigned BitsSet = llvm::popcount(NewDmask);
17058
17059 // Check for TFE or LWE - increase the number of channels by one to account
17060 // for the extra return value
17061 // This will need adjustment for D16 if this is also included in
17062 // adjustWriteMask (this function) but at present D16 are excluded.
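// For example, two used components plus TFE give NewChannels == 3, which is
// widened to a 4-element result type below.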
17063 unsigned NewChannels = BitsSet + UsesTFC;
17064
17065 int NewOpcode =
17066 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
17067 assert(NewOpcode != -1 &&
17068 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17069 "failed to find equivalent MIMG op");
17070
17071 // Adjust the writemask in the node
17072 SmallVector<SDValue, 12> Ops;
17073 llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
17074 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
17075 llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));
17076
17077 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17078
17079 MVT ResultVT = NewChannels == 1
17080 ? SVT
17081 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
17082 : NewChannels == 5 ? 8
17083 : NewChannels);
17084 SDVTList NewVTList =
17085 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
17086
17087 MachineSDNode *NewNode =
17088 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
17089
17090 if (HasChain) {
17091 // Update chain.
17092 DAG.setNodeMemRefs(NewNode, Node->memoperands());
17093 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
17094 }
17095
17096 if (NewChannels == 1) {
17097 assert(Node->hasNUsesOfValue(1, 0));
17098 SDNode *Copy =
17099 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
17100 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
17101 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
17102 return nullptr;
17103 }
17104
17105 // Update the users of the node with the new indices
17106 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17107 SDNode *User = Users[i];
17108 if (!User) {
17109 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
17110 // Users[0] is still nullptr because channel 0 doesn't really have a use.
17111 if (i || !NoChannels)
17112 continue;
17113 } else {
17114 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
17115 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
17116 if (NewUser != User) {
17117 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
17118 DAG.RemoveDeadNode(User);
17119 }
17120 }
17121
17122 switch (Idx) {
17123 default:
17124 break;
17125 case AMDGPU::sub0:
17126 Idx = AMDGPU::sub1;
17127 break;
17128 case AMDGPU::sub1:
17129 Idx = AMDGPU::sub2;
17130 break;
17131 case AMDGPU::sub2:
17132 Idx = AMDGPU::sub3;
17133 break;
17134 case AMDGPU::sub3:
17135 Idx = AMDGPU::sub4;
17136 break;
17137 }
17138 }
17139
17140 DAG.RemoveDeadNode(Node);
17141 return nullptr;
17142}
17143
17144 static bool isFrameIndexOp(SDValue Op) {
17145 if (Op.getOpcode() == ISD::AssertZext)
17146 Op = Op.getOperand(0);
17147
17148 return isa<FrameIndexSDNode>(Op);
17149}
17150
17151/// Legalize target independent instructions (e.g. INSERT_SUBREG)
17152/// with frame index operands.
17153 /// LLVM assumes that inputs to these instructions are registers.
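/// For example, a frame index operand of a REG_SEQUENCE is first
/// materialized into an SGPR with S_MOV_B32 below.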
17154SDNode *
17155 SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
17156 SelectionDAG &DAG) const {
17157 if (Node->getOpcode() == ISD::CopyToReg) {
17158 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
17159 SDValue SrcVal = Node->getOperand(2);
17160
17161 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
17162 // to try understanding copies to physical registers.
17163 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
17164 SDLoc SL(Node);
17165 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17166 SDValue VReg = DAG.getRegister(
17167 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17168
17169 SDNode *Glued = Node->getGluedNode();
17170 SDValue ToVReg = DAG.getCopyToReg(
17171 Node->getOperand(0), SL, VReg, SrcVal,
17172 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
17173 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
17174 VReg, ToVReg.getValue(1));
17175 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
17176 DAG.RemoveDeadNode(Node);
17177 return ToResultReg.getNode();
17178 }
17179 }
17180
17181 SmallVector<SDValue, 8> Ops;
17182 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17183 if (!isFrameIndexOp(Node->getOperand(i))) {
17184 Ops.push_back(Node->getOperand(i));
17185 continue;
17186 }
17187
17188 SDLoc DL(Node);
17189 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
17190 Node->getOperand(i).getValueType(),
17191 Node->getOperand(i)),
17192 0));
17193 }
17194
17195 return DAG.UpdateNodeOperands(Node, Ops);
17196}
17197
17198/// Fold the instructions after selecting them.
17199/// Returns null if users were already updated.
17200 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
17201 SelectionDAG &DAG) const {
17202 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17203 unsigned Opcode = Node->getMachineOpcode();
17204
17205 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17206 !TII->isGather4(Opcode) &&
17207 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
17208 return adjustWritemask(Node, DAG);
17209 }
17210
17211 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17212 legalizeTargetIndependentNode(Node, DAG);
17213 return Node;
17214 }
17215
17216 switch (Opcode) {
17217 case AMDGPU::V_DIV_SCALE_F32_e64:
17218 case AMDGPU::V_DIV_SCALE_F64_e64: {
17219 // Satisfy the operand register constraint when one of the inputs is
17220 // undefined. Ordinarily each undef value will have its own implicit_def of
17221 // a vreg, so force these to use a single register.
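// For example, if src0 is IMPLICIT_DEF but src1 is not, src0 is rewritten
// below to reuse src1 so both read the same register.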
17222 SDValue Src0 = Node->getOperand(1);
17223 SDValue Src1 = Node->getOperand(3);
17224 SDValue Src2 = Node->getOperand(5);
17225
17226 if ((Src0.isMachineOpcode() &&
17227 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
17228 (Src0 == Src1 || Src0 == Src2))
17229 break;
17230
17231 MVT VT = Src0.getValueType().getSimpleVT();
17232 const TargetRegisterClass *RC =
17233 getRegClassFor(VT, Src0.getNode()->isDivergent());
17234
17235 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17236 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
17237
17238 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
17239 Src0, SDValue());
17240
17241 // src0 must be the same register as src1 or src2, even if the value is
17242 // undefined, so make sure we don't violate this constraint.
17243 if (Src0.isMachineOpcode() &&
17244 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
17245 if (Src1.isMachineOpcode() &&
17246 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17247 Src0 = Src1;
17248 else if (Src2.isMachineOpcode() &&
17249 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17250 Src0 = Src2;
17251 else {
17252 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
17253 Src0 = UndefReg;
17254 Src1 = UndefReg;
17255 }
17256 } else
17257 break;
17258
17259 SmallVector<SDValue, 9> Ops(Node->ops());
17260 Ops[1] = Src0;
17261 Ops[3] = Src1;
17262 Ops[5] = Src2;
17263 Ops.push_back(ImpDef.getValue(1));
17264 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
17265 }
17266 default:
17267 break;
17268 }
17269
17270 return Node;
17271}
17272
17273// Any MIMG instructions that use tfe or lwe require an initialization of the
17274// result register that will be written in the case of a memory access failure.
17275// The required code is also added to tie this init code to the result of the
17276// img instruction.
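// For example, with TFE set, dmask 0b0111 and no packed D16, InitIdx is 4,
// so (with PRTStrictNull) the first four dwords of the result register are
// zero-initialized below.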
17277 void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
17278 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17279 const SIRegisterInfo &TRI = TII->getRegisterInfo();
17280 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
17281 MachineBasicBlock &MBB = *MI.getParent();
17282
17283 int DstIdx =
17284 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
17285 unsigned InitIdx = 0;
17286
17287 if (TII->isImage(MI)) {
17288 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
17289 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
17290 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
17291
17292 if (!TFE && !LWE) // intersect_ray
17293 return;
17294
17295 unsigned TFEVal = TFE ? TFE->getImm() : 0;
17296 unsigned LWEVal = LWE ? LWE->getImm() : 0;
17297 unsigned D16Val = D16 ? D16->getImm() : 0;
17298
17299 if (!TFEVal && !LWEVal)
17300 return;
17301
17302 // At least one of TFE or LWE is non-zero
17303 // We have to insert a suitable initialization of the result value and
17304 // tie this to the dest of the image instruction.
17305
17306 // Calculate which dword we have to initialize to 0.
17307 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
17308
17309 // check that dmask operand is found.
17310 assert(MO_Dmask && "Expected dmask operand in instruction");
17311
17312 unsigned dmask = MO_Dmask->getImm();
17313 // Determine the number of active lanes taking into account the
17314 // Gather4 special case
17315 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
17316
17317 bool Packed = !Subtarget->hasUnpackedD16VMem();
17318
17319 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
17320
17321 // Abandon attempt if the dst size isn't large enough
17322 // - this is in fact an error but this is picked up elsewhere and
17323 // reported correctly.
17324 uint32_t DstSize =
17325 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17326 if (DstSize < InitIdx)
17327 return;
17328 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
17329 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17330 } else {
17331 return;
17332 }
17333
17334 const DebugLoc &DL = MI.getDebugLoc();
17335
17336 // Create a register for the initialization value.
17337 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
17338 unsigned NewDst = 0; // Final initialized value will be in here
17339
17340 // If PRTStrictNull feature is enabled (the default) then initialize
17341 // all the result registers to 0, otherwise just the error indication
17342 // register (VGPRn+1)
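// For example, with InitIdx == 4 and PRTStrictNull disabled, only the last
// dword (index 3) is zero-initialized.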
17343 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17344 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
17345
17346 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
17347 for (; SizeLeft; SizeLeft--, CurrIdx++) {
17348 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
17349 // Initialize dword
17350 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
17351 // clang-format off
17352 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
17353 .addImm(0);
17354 // clang-format on
17355 // Insert into the super-reg
17356 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
17357 .addReg(PrevDst)
17358 .addReg(SubReg)
17359 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
17360
17361 PrevDst = NewDst;
17362 }
17363
17364 // Add as an implicit operand
17365 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
17366
17367 // Tie the just added implicit operand to the dst
17368 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
17369}
17370
17371/// Assign the register class depending on the number of
17372/// bits set in the writemask
17373 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
17374 SDNode *Node) const {
17375 const SIInstrInfo *TII = Subtarget->getInstrInfo();
17376
17377 MachineFunction *MF = MI.getParent()->getParent();
17378 MachineRegisterInfo &MRI = MF->getRegInfo();
17379 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
17380
17381 if (TII->isVOP3(MI.getOpcode())) {
17382 // Make sure constant bus requirements are respected.
17383 TII->legalizeOperandsVOP3(MRI, MI);
17384
17385 // Prefer VGPRs over AGPRs in mAI instructions where possible.
17386 // This saves a chain-copy of registers and better balances register
17387 // use between vgpr and agpr as agpr tuples tend to be big.
17388 if (!MI.getDesc().operands().empty()) {
17389 unsigned Opc = MI.getOpcode();
17390 bool HasAGPRs = Info->mayNeedAGPRs();
17391 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17392 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
17393 for (auto I :
17394 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
17395 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
17396 if (I == -1)
17397 break;
17398 if ((I == Src2Idx) && (HasAGPRs))
17399 break;
17400 MachineOperand &Op = MI.getOperand(I);
17401 if (!Op.isReg() || !Op.getReg().isVirtual())
17402 continue;
17403 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
17404 if (!TRI->hasAGPRs(RC))
17405 continue;
17406 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
17407 if (!Src || !Src->isCopy() ||
17408 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
17409 continue;
17410 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
17411 // All uses of agpr64 and agpr32 can also accept vgpr except for
17412 // v_accvgpr_read, but we do not produce agpr reads during selection,
17413 // so no use checks are needed.
17414 MRI.setRegClass(Op.getReg(), NewRC);
17415 }
17416
17417 if (TII->isMAI(MI)) {
17418 // The ordinary src0, src1, src2 were legalized above.
17419 //
17420 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
17421 // as a separate instruction.
17422 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17423 AMDGPU::OpName::scale_src0);
17424 if (Src0Idx != -1) {
17425 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17426 AMDGPU::OpName::scale_src1);
17427 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17428 TII->usesConstantBus(MRI, MI, Src1Idx))
17429 TII->legalizeOpWithMove(MI, Src1Idx);
17430 }
17431 }
17432
17433 if (!HasAGPRs)
17434 return;
17435
17436 // Resolve the rest of AV operands to AGPRs.
17437 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
17438 if (Src2->isReg() && Src2->getReg().isVirtual()) {
17439 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
17440 if (TRI->isVectorSuperClass(RC)) {
17441 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
17442 MRI.setRegClass(Src2->getReg(), NewRC);
17443 if (Src2->isTied())
17444 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
17445 }
17446 }
17447 }
17448 }
17449
17450 return;
17451 }
17452
17453 if (TII->isImage(MI))
17454 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
17455}
17456
17457 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
17458 uint64_t Val) {
17459 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
17460 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
17461}
17462
17463 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
17464 const SDLoc &DL,
17465 SDValue Ptr) const {
17466 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17467
17468 // Build the half of the subregister with the constants before building the
17469 // full 128-bit register. If we are building multiple resource descriptors,
17470 // this will allow CSEing of the 2-component register.
17471 const SDValue Ops0[] = {
17472 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
17473 buildSMovImm32(DAG, DL, 0),
17474 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17475 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
17476 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
17477
17478 SDValue SubRegHi = SDValue(
17479 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
17480
17481 // Combine the constants and the pointer.
17482 const SDValue Ops1[] = {
17483 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
17484 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
17485 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
17486
17487 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
17488}
17489
17490/// Return a resource descriptor with the 'Add TID' bit enabled
17491/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
17492/// of the resource descriptor) to create an offset, which is added to
17493/// the resource pointer.
17494 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
17495 SDValue Ptr, uint32_t RsrcDword1,
17496 uint64_t RsrcDword2And3) const {
17497 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
17498 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
17499 if (RsrcDword1) {
17500 PtrHi =
17501 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
17502 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
17503 0);
17504 }
17505
17506 SDValue DataLo =
17507 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
17508 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
17509
17510 const SDValue Ops[] = {
17511 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
17512 PtrLo,
17513 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17514 PtrHi,
17515 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
17516 DataLo,
17517 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
17518 DataHi,
17519 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
17520
17521 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
17522}
17523
17524//===----------------------------------------------------------------------===//
17525// SI Inline Assembly Support
17526//===----------------------------------------------------------------------===//
17527
17528std::pair<unsigned, const TargetRegisterClass *>
17529 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
17530 StringRef Constraint,
17531 MVT VT) const {
17532 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
17533
17534 const TargetRegisterClass *RC = nullptr;
17535 if (Constraint.size() == 1) {
17536 // Check if we cannot determine the bit size of the given value type. This
17537 // can happen, for example, in this situation where we have an empty struct
17538 // (size 0): `call void asm "", "v"({} poison)`.
17539 if (VT == MVT::Other)
17540 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17541 const unsigned BitWidth = VT.getSizeInBits();
17542 switch (Constraint[0]) {
17543 default:
17544 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17545 case 's':
17546 case 'r':
17547 switch (BitWidth) {
17548 case 16:
17549 RC = &AMDGPU::SReg_32RegClass;
17550 break;
17551 case 64:
17552 RC = &AMDGPU::SGPR_64RegClass;
17553 break;
17554 default:
17555 RC = TRI->getSGPRClassForBitWidth(BitWidth);
17556 if (!RC)
17557 return std::pair(0U, nullptr);
17558 break;
17559 }
17560 break;
17561 case 'v':
17562 switch (BitWidth) {
17563 case 16:
17564 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17565 : &AMDGPU::VGPR_32_Lo256RegClass;
17566 break;
17567 default:
17568 RC = Subtarget->has1024AddressableVGPRs()
17569 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17570 : TRI->getVGPRClassForBitWidth(BitWidth);
17571 if (!RC)
17572 return std::pair(0U, nullptr);
17573 break;
17574 }
17575 break;
17576 case 'a':
17577 if (!Subtarget->hasMAIInsts())
17578 break;
17579 switch (BitWidth) {
17580 case 16:
17581 RC = &AMDGPU::AGPR_32RegClass;
17582 break;
17583 default:
17584 RC = TRI->getAGPRClassForBitWidth(BitWidth);
17585 if (!RC)
17586 return std::pair(0U, nullptr);
17587 break;
17588 }
17589 break;
17590 }
17591 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17592 const unsigned BitWidth = VT.getSizeInBits();
17593 switch (BitWidth) {
17594 case 16:
17595 RC = &AMDGPU::AV_32RegClass;
17596 break;
17597 default:
17598 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17599 if (!RC)
17600 return std::pair(0U, nullptr);
17601 break;
17602 }
17603 }
17604
17605 // We actually support i128, i16 and f16 as inline parameters
17606 // even if they are not reported as legal
17607 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
17608 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
17609 return std::pair(0U, RC);
17610
17611 auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
17612 if (Kind != '\0') {
17613 if (Kind == 'v') {
17614 RC = &AMDGPU::VGPR_32_Lo256RegClass;
17615 } else if (Kind == 's') {
17616 RC = &AMDGPU::SGPR_32RegClass;
17617 } else if (Kind == 'a') {
17618 RC = &AMDGPU::AGPR_32RegClass;
17619 }
17620
17621 if (RC) {
17622 if (NumRegs > 1) {
17623 if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
17624 return std::pair(0U, nullptr);
17625
17626 uint32_t Width = NumRegs * 32;
17627 // Prohibit constraints for register ranges with a width that does not
17628 // match the required type.
17629 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
17630 return std::pair(0U, nullptr);
17631
17632 MCRegister Reg = RC->getRegister(Idx);
17633 if (SIRegisterInfo::isVGPRClass(RC))
17634 RC = TRI->getVGPRClassForBitWidth(Width);
17635 else if (SIRegisterInfo::isSGPRClass(RC))
17636 RC = TRI->getSGPRClassForBitWidth(Width);
17637 else if (SIRegisterInfo::isAGPRClass(RC))
17638 RC = TRI->getAGPRClassForBitWidth(Width);
17639 if (RC) {
17640 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17641 if (!Reg) {
17642 // The register class does not contain the requested register,
17643 // e.g., because it is an SGPR pair that would violate alignment
17644 // requirements.
17645 return std::pair(0U, nullptr);
17646 }
17647 return std::pair(Reg, RC);
17648 }
17649 }
17650
17651 // Check for lossy scalar/vector conversions.
17652 if (VT.isVector() && VT.getSizeInBits() != 32)
17653 return std::pair(0U, nullptr);
17654 if (Idx < RC->getNumRegs())
17655 return std::pair(RC->getRegister(Idx), RC);
17656 return std::pair(0U, nullptr);
17657 }
17658 }
17659
17660 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17661 if (Ret.first)
17662 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
17663
17664 return Ret;
17665}
17666
17667static bool isImmConstraint(StringRef Constraint) {
17668 if (Constraint.size() == 1) {
17669 switch (Constraint[0]) {
17670 default:
17671 break;
17672 case 'I':
17673 case 'J':
17674 case 'A':
17675 case 'B':
17676 case 'C':
17677 return true;
17678 }
17679 } else if (Constraint == "DA" || Constraint == "DB") {
17680 return true;
17681 }
17682 return false;
17683}
17684
17685 SITargetLowering::ConstraintType
17686 SITargetLowering::getConstraintType(StringRef Constraint) const {
17687 if (Constraint.size() == 1) {
17688 switch (Constraint[0]) {
17689 default:
17690 break;
17691 case 's':
17692 case 'v':
17693 case 'a':
17694 return C_RegisterClass;
17695 }
17696 } else if (Constraint.size() == 2) {
17697 if (Constraint == "VA")
17698 return C_RegisterClass;
17699 }
17700 if (isImmConstraint(Constraint)) {
17701 return C_Other;
17702 }
17703 return TargetLowering::getConstraintType(Constraint);
17704}
17705
17706static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
17708 Val = Val & maskTrailingOnes<uint64_t>(Size);
17709 }
17710 return Val;
17711}
17712
17713 void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
17714 StringRef Constraint,
17715 std::vector<SDValue> &Ops,
17716 SelectionDAG &DAG) const {
17717 if (isImmConstraint(Constraint)) {
17718 uint64_t Val;
17719 if (getAsmOperandConstVal(Op, Val) &&
17720 checkAsmConstraintVal(Op, Constraint, Val)) {
17721 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
17722 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
17723 }
17724 } else {
17726 }
17727}
17728
17730 unsigned Size = Op.getScalarValueSizeInBits();
17731 if (Size > 64)
17732 return false;
17733
17734 if (Size == 16 && !Subtarget->has16BitInsts())
17735 return false;
17736
17738 Val = C->getSExtValue();
17739 return true;
17740 }
17742 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17743 return true;
17744 }
17746 if (Size != 16 || Op.getNumOperands() != 2)
17747 return false;
17748 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17749 return false;
17750 if (ConstantSDNode *C = V->getConstantSplatNode()) {
17751 Val = C->getSExtValue();
17752 return true;
17753 }
17754 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
17755 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17756 return true;
17757 }
17758 }
17759
17760 return false;
17761}
17762
17764 uint64_t Val) const {
17765 if (Constraint.size() == 1) {
17766 switch (Constraint[0]) {
17767 case 'I':
17769 case 'J':
17770 return isInt<16>(Val);
17771 case 'A':
17772 return checkAsmConstraintValA(Op, Val);
17773 case 'B':
17774 return isInt<32>(Val);
17775 case 'C':
17776 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
17778 default:
17779 break;
17780 }
17781 } else if (Constraint.size() == 2) {
17782 if (Constraint == "DA") {
17783 int64_t HiBits = static_cast<int32_t>(Val >> 32);
17784 int64_t LoBits = static_cast<int32_t>(Val);
17785 return checkAsmConstraintValA(Op, HiBits, 32) &&
17786 checkAsmConstraintValA(Op, LoBits, 32);
17787 }
17788 if (Constraint == "DB") {
17789 return true;
17790 }
17791 }
17792 llvm_unreachable("Invalid asm constraint");
17793}
17794
17796 unsigned MaxSize) const {
17797 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
17798 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
17799 if (Size == 16) {
17800 MVT VT = Op.getSimpleValueType();
17801 switch (VT.SimpleTy) {
17802 default:
17803 return false;
17804 case MVT::i16:
17805 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
17806 case MVT::f16:
17807 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
17808 case MVT::bf16:
17809 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
17810 case MVT::v2i16:
17811 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
17812 case MVT::v2f16:
17813 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
17814 case MVT::v2bf16:
17815 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
17816 }
17817 }
17818 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
17819 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
17820 return true;
17821 return false;
17822}
17823
17824static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
17825 switch (UnalignedClassID) {
17826 case AMDGPU::VReg_64RegClassID:
17827 return AMDGPU::VReg_64_Align2RegClassID;
17828 case AMDGPU::VReg_96RegClassID:
17829 return AMDGPU::VReg_96_Align2RegClassID;
17830 case AMDGPU::VReg_128RegClassID:
17831 return AMDGPU::VReg_128_Align2RegClassID;
17832 case AMDGPU::VReg_160RegClassID:
17833 return AMDGPU::VReg_160_Align2RegClassID;
17834 case AMDGPU::VReg_192RegClassID:
17835 return AMDGPU::VReg_192_Align2RegClassID;
17836 case AMDGPU::VReg_224RegClassID:
17837 return AMDGPU::VReg_224_Align2RegClassID;
17838 case AMDGPU::VReg_256RegClassID:
17839 return AMDGPU::VReg_256_Align2RegClassID;
17840 case AMDGPU::VReg_288RegClassID:
17841 return AMDGPU::VReg_288_Align2RegClassID;
17842 case AMDGPU::VReg_320RegClassID:
17843 return AMDGPU::VReg_320_Align2RegClassID;
17844 case AMDGPU::VReg_352RegClassID:
17845 return AMDGPU::VReg_352_Align2RegClassID;
17846 case AMDGPU::VReg_384RegClassID:
17847 return AMDGPU::VReg_384_Align2RegClassID;
17848 case AMDGPU::VReg_512RegClassID:
17849 return AMDGPU::VReg_512_Align2RegClassID;
17850 case AMDGPU::VReg_1024RegClassID:
17851 return AMDGPU::VReg_1024_Align2RegClassID;
17852 case AMDGPU::AReg_64RegClassID:
17853 return AMDGPU::AReg_64_Align2RegClassID;
17854 case AMDGPU::AReg_96RegClassID:
17855 return AMDGPU::AReg_96_Align2RegClassID;
17856 case AMDGPU::AReg_128RegClassID:
17857 return AMDGPU::AReg_128_Align2RegClassID;
17858 case AMDGPU::AReg_160RegClassID:
17859 return AMDGPU::AReg_160_Align2RegClassID;
17860 case AMDGPU::AReg_192RegClassID:
17861 return AMDGPU::AReg_192_Align2RegClassID;
17862 case AMDGPU::AReg_256RegClassID:
17863 return AMDGPU::AReg_256_Align2RegClassID;
17864 case AMDGPU::AReg_512RegClassID:
17865 return AMDGPU::AReg_512_Align2RegClassID;
17866 case AMDGPU::AReg_1024RegClassID:
17867 return AMDGPU::AReg_1024_Align2RegClassID;
17868 default:
17869 return -1;
17870 }
17871}
17872
17873// Figure out which registers should be reserved for stack access. Only after
17874// the function is legalized do we know all of the non-spill stack objects or if
17875// calls are present.
17879 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
17880 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17881 const SIInstrInfo *TII = ST.getInstrInfo();
17882
17883 if (Info->isEntryFunction()) {
17884 // Callable functions have fixed registers used for stack access.
17886 }
17887
17888 // TODO: Move this logic to getReservedRegs()
17889 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
17890 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
17891 Register SReg = ST.isWave32()
17892 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
17893 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
17894 &AMDGPU::SGPR_64RegClass);
17895 Info->setSGPRForEXECCopy(SReg);
17896
17897 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
17898 Info->getStackPtrOffsetReg()));
17899 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
17900 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
17901
17902 // We need to worry about replacing the default register with itself in case
17903 // of MIR testcases missing the MFI.
17904 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
17905 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
17906
17907 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
17908 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
17909
17910 Info->limitOccupancy(MF);
17911
17912 if (ST.isWave32() && !MF.empty()) {
17913 for (auto &MBB : MF) {
17914 for (auto &MI : MBB) {
17915 TII->fixImplicitOperands(MI);
17916 }
17917 }
17918 }
17919
17920 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
17921 // classes if required. Ideally the register class constraints would differ
17922 // per-subtarget, but there's no easy way to achieve that right now. This is
17923 // not a problem for VGPRs because the correctly aligned VGPR class is implied
17924 // from using them as the register class for legal types.
17925 if (ST.needsAlignedVGPRs()) {
17926 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
17927 const Register Reg = Register::index2VirtReg(I);
17928 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
17929 if (!RC)
17930 continue;
17931 int NewClassID = getAlignedAGPRClassID(RC->getID());
17932 if (NewClassID != -1)
17933 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
17934 }
17935 }
17936
17938}
17939
17941 KnownBits &Known,
17942 const APInt &DemandedElts,
17943 const SelectionDAG &DAG,
17944 unsigned Depth) const {
17945 Known.resetAll();
17946 unsigned Opc = Op.getOpcode();
17947 switch (Opc) {
17949 unsigned IID = Op.getConstantOperandVal(0);
17950 switch (IID) {
17951 case Intrinsic::amdgcn_mbcnt_lo:
17952 case Intrinsic::amdgcn_mbcnt_hi: {
17953 const GCNSubtarget &ST =
17955 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
17956 // most 31 + src1.
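 // For example, on a wave64 target, mbcnt_lo with a src1 known to be zero
 // produces a value in [0, 32], so bits [31:6] of the result are known zero;
 // for mbcnt_hi under the same assumption, bits [31:5] are known zero.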
17957 Known.Zero.setBitsFrom(
17958 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
17959 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
17960 Known = KnownBits::add(Known, Known2);
17961 return;
17962 }
17963 }
17964 break;
17965 }
17966 }
17968 Op, Known, DemandedElts, DAG, Depth);
17969}
17970
17972 const int FI, KnownBits &Known, const MachineFunction &MF) const {
17974
17975 // Set the high bits to zero based on the maximum allowed scratch size per
17976 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
17977 // calculation won't overflow, so assume the sign bit is never set.
17978 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
17979}
17980
17982 GISelValueTracking &VT, KnownBits &Known,
17983 unsigned Dim) {
17984 unsigned MaxValue =
17985 ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
17986 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
17987}
17988
17990 KnownBits &Known, const APInt &DemandedElts,
17991 unsigned BFEWidth, bool SExt, unsigned Depth) {
17993 const MachineOperand &Src1 = MI.getOperand(2);
17994
17995 unsigned Src1Cst = 0;
17996 if (Src1.isImm()) {
17997 Src1Cst = Src1.getImm();
17998 } else if (Src1.isReg()) {
17999 auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
18000 if (!Cst)
18001 return;
18002 Src1Cst = Cst->Value.getZExtValue();
18003 } else {
18004 return;
18005 }
18006
18007 // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
18008 // Width is always [22:16].
18009 const unsigned Offset =
18010 Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
18011 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
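 // For example (illustrative encoding), Src1Cst = 0x00080004 on a 32-bit BFE
 // selects Offset = 4 and Width = 8, i.e. bits [11:4] of the source operand.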
18012
18013 if (Width >= BFEWidth) // Ill-formed.
18014 return;
18015
18016 VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
18017 Depth + 1);
18018
18019 Known = Known.extractBits(Width, Offset);
18020
18021 if (SExt)
18022 Known = Known.sext(BFEWidth);
18023 else
18024 Known = Known.zext(BFEWidth);
18025}
18026
18028 GISelValueTracking &VT, Register R, KnownBits &Known,
18029 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
18030 unsigned Depth) const {
18031 Known.resetAll();
18032 const MachineInstr *MI = MRI.getVRegDef(R);
18033 switch (MI->getOpcode()) {
18034 case AMDGPU::S_BFE_I32:
18035 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18036 /*SExt=*/true, Depth);
18037 case AMDGPU::S_BFE_U32:
18038 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18039 /*SExt=*/false, Depth);
18040 case AMDGPU::S_BFE_I64:
18041 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18042 /*SExt=*/true, Depth);
18043 case AMDGPU::S_BFE_U64:
18044 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18045 /*SExt=*/false, Depth);
18046 case AMDGPU::G_INTRINSIC:
18047 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18048 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
18049 switch (IID) {
18050 case Intrinsic::amdgcn_workitem_id_x:
18051 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
18052 break;
18053 case Intrinsic::amdgcn_workitem_id_y:
18054 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
18055 break;
18056 case Intrinsic::amdgcn_workitem_id_z:
18057 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
18058 break;
18059 case Intrinsic::amdgcn_mbcnt_lo:
18060 case Intrinsic::amdgcn_mbcnt_hi: {
18061 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18062 // most 31 + src1.
18063 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
18064 ? getSubtarget()->getWavefrontSizeLog2()
18065 : 5);
18066 KnownBits Known2;
18067 VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
18068 Depth + 1);
18069 Known = KnownBits::add(Known, Known2);
18070 break;
18071 }
18072 case Intrinsic::amdgcn_groupstaticsize: {
18073 // We can report everything over the maximum size as 0. We can't report
18074 // based on the actual size because we don't know if it's accurate or not
18075 // at any given point.
18076 Known.Zero.setHighBits(
18077 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
18078 break;
18079 }
18080 }
18081 break;
18082 }
18083 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18084 Known.Zero.setHighBits(24);
18085 break;
18086 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18087 Known.Zero.setHighBits(16);
18088 break;
18089 case AMDGPU::G_AMDGPU_SMED3:
18090 case AMDGPU::G_AMDGPU_UMED3: {
18091 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18092
18093 KnownBits Known2;
18094 VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
18095 if (Known2.isUnknown())
18096 break;
18097
18098 KnownBits Known1;
18099 VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
18100 if (Known1.isUnknown())
18101 break;
18102
18103 KnownBits Known0;
18104 VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
18105 if (Known0.isUnknown())
18106 break;
18107
18108 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
18109 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
18110 Known.One = Known0.One & Known1.One & Known2.One;
18111 break;
18112 }
18113 }
18114}
18115
18118 unsigned Depth) const {
18119 const MachineInstr *MI = MRI.getVRegDef(R);
18120 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
18121 // FIXME: Can this move to generic code? What about the case where the call
18122 // site specifies a lower alignment?
18123 Intrinsic::ID IID = GI->getIntrinsicID();
18125 AttributeList Attrs =
18126 Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
18127 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18128 return *RetAlign;
18129 }
18130 return Align(1);
18131}
18132
18135 const Align CacheLineAlign = Align(64);
18136
18137 // Pre-GFX10 targets did not benefit from loop alignment
18138 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
18139 getSubtarget()->hasInstFwdPrefetchBug())
18140 return PrefAlign;
18141
18142 // On GFX10 the I$ consists of 4 x 64-byte cache lines.
18143 // By default the prefetcher keeps one cache line behind and reads two ahead.
18144 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
18145 // behind and one ahead.
18146 // Therefore we can benefit from aligning loop headers if the loop fits in 192
18147 // bytes. If the loop fits in 64 bytes it always spans no more than two cache
18148 // lines and does not need alignment.
18149 // Otherwise, if the loop is at most 128 bytes we do not need to modify the
18150 // prefetch settings; if it is at most 192 bytes we need two lines behind.
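 // For example, a 100-byte loop only gets a cache-line-aligned header, while a
 // 150-byte loop is additionally bracketed with S_INST_PREFETCH: two lines
 // behind inside the loop, restored to one line behind after the exit.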
18151
18153 const MachineBasicBlock *Header = ML->getHeader();
18154 if (Header->getAlignment() != PrefAlign)
18155 return Header->getAlignment(); // Already processed.
18156
18157 unsigned LoopSize = 0;
18158 for (const MachineBasicBlock *MBB : ML->blocks()) {
18159 // If an inner loop block is aligned, assume on average half of the alignment
18160 // size is added as nops.
18161 if (MBB != Header)
18162 LoopSize += MBB->getAlignment().value() / 2;
18163
18164 for (const MachineInstr &MI : *MBB) {
18165 LoopSize += TII->getInstSizeInBytes(MI);
18166 if (LoopSize > 192)
18167 return PrefAlign;
18168 }
18169 }
18170
18171 if (LoopSize <= 64)
18172 return PrefAlign;
18173
18174 if (LoopSize <= 128)
18175 return CacheLineAlign;
18176
18177 // If any of parent loops is surrounded by prefetch instructions do not
18178 // insert new for inner loop, which would reset parent's settings.
18179 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
18180 if (MachineBasicBlock *Exit = P->getExitBlock()) {
18181 auto I = Exit->getFirstNonDebugInstr();
18182 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18183 return CacheLineAlign;
18184 }
18185 }
18186
18187 MachineBasicBlock *Pre = ML->getLoopPreheader();
18188 MachineBasicBlock *Exit = ML->getExitBlock();
18189
18190 if (Pre && Exit) {
18191 auto PreTerm = Pre->getFirstTerminator();
18192 if (PreTerm == Pre->begin() ||
18193 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18194 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18195 .addImm(1); // prefetch 2 lines behind PC
18196
18197 auto ExitHead = Exit->getFirstNonDebugInstr();
18198 if (ExitHead == Exit->end() ||
18199 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18200 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18201 .addImm(2); // prefetch 1 line behind PC
18202 }
18203
18204 return CacheLineAlign;
18205}
18206
18208static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
18209 assert(N->getOpcode() == ISD::CopyFromReg);
18210 do {
18211 // Follow the chain until we find an INLINEASM node.
18212 N = N->getOperand(0).getNode();
18213 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
18214 return true;
18215 } while (N->getOpcode() == ISD::CopyFromReg);
18216 return false;
18217}
18218
18221 UniformityInfo *UA) const {
18222 switch (N->getOpcode()) {
18223 case ISD::CopyFromReg: {
18224 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
18225 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
18226 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18227 Register Reg = R->getReg();
18228
18229 // FIXME: Why does this need to consider isLiveIn?
18230 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18231 return !TRI->isSGPRReg(MRI, Reg);
18232
18233 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
18234 return UA->isDivergent(V);
18235
18237 return !TRI->isSGPRReg(MRI, Reg);
18238 }
18239 case ISD::LOAD: {
18240 const LoadSDNode *L = cast<LoadSDNode>(N);
18241 unsigned AS = L->getAddressSpace();
18242 // A flat load may access private memory.
18244 }
18245 case ISD::CALLSEQ_END:
18246 return true;
18248 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
18250 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
18269 // Target-specific read-modify-write atomics are sources of divergence.
18270 return true;
18271 default:
18272 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
18273 // Generic read-modify-write atomics are sources of divergence.
18274 return A->readMem() && A->writeMem();
18275 }
18276 return false;
18277 }
18278}
18279
18281 EVT VT) const {
18282 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
18283 case MVT::f32:
18285 case MVT::f64:
18286 case MVT::f16:
18288 default:
18289 return false;
18290 }
18291}
18292
18294 LLT Ty, const MachineFunction &MF) const {
18295 switch (Ty.getScalarSizeInBits()) {
18296 case 32:
18297 return !denormalModeIsFlushAllF32(MF);
18298 case 64:
18299 case 16:
18300 return !denormalModeIsFlushAllF64F16(MF);
18301 default:
18302 return false;
18303 }
18304}
18305
18307 const APInt &DemandedElts,
18308 const SelectionDAG &DAG,
18309 bool SNaN,
18310 unsigned Depth) const {
18311 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
18312 const MachineFunction &MF = DAG.getMachineFunction();
18314
18315 if (Info->getMode().DX10Clamp)
18316 return true; // Clamped to 0.
18317 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
18318 }
18319
18321 DAG, SNaN, Depth);
18322}
18323
18324// On older subtargets, global FP atomic instructions have a hardcoded FP mode:
18325// they do not support FP32 denormals and only support v2f16/f64 denormals.
18327 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
18328 return true;
18329
18331 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
18332 if (DenormMode == DenormalMode::getPreserveSign())
18333 return true;
18334
18335 // TODO: Remove this.
18336 return RMW->getFunction()
18337 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
18338 .getValueAsBool();
18339}
18340
18342 LLVMContext &Ctx = RMW->getContext();
18343 StringRef MemScope =
18344 Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");
18345
18346 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
18347 << "Hardware instruction generated for atomic "
18348 << RMW->getOperationName(RMW->getOperation())
18349 << " operation at memory scope " << MemScope;
18350}
18351
18352static bool isV2F16OrV2BF16(Type *Ty) {
18353 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
18354 Type *EltTy = VT->getElementType();
18355 return VT->getNumElements() == 2 &&
18356 (EltTy->isHalfTy() || EltTy->isBFloatTy());
18357 }
18358
18359 return false;
18360}
18361
18362static bool isV2F16(Type *Ty) {
18364 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
18365}
18366
18367static bool isV2BF16(Type *Ty) {
18369 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
18370}
18371
18372/// \return true if atomicrmw integer ops work for the type.
18373static bool isAtomicRMWLegalIntTy(Type *Ty) {
18374 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
18375 unsigned BW = IT->getBitWidth();
18376 return BW == 32 || BW == 64;
18377 }
18378
18379 return false;
18380}
18381
18382/// \return true if this atomicrmw xchg type can be selected.
18383static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
18384 Type *Ty = RMW->getType();
18385 if (isAtomicRMWLegalIntTy(Ty))
18386 return true;
18387
18388 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
18389 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
18390 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18391 return BW == 32 || BW == 64;
18392 }
18393
18394 if (Ty->isFloatTy() || Ty->isDoubleTy())
18395 return true;
18396
18398 return VT->getNumElements() == 2 &&
18399 VT->getElementType()->getPrimitiveSizeInBits() == 16;
18400 }
18401
18402 return false;
18403}
18404
18405/// \returns true if it's valid to emit a native instruction for \p RMW, based
18406/// on the properties of the target memory.
18407static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
18408 const AtomicRMWInst *RMW,
18409 bool HasSystemScope) {
18410 // The remote/fine-grained access logic is different from the integer
18411 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
18412 // fine-grained access does not work, even for a device local allocation.
18413 //
18414 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
18415 // allocations work.
18416 if (HasSystemScope) {
18418 RMW->hasMetadata("amdgpu.no.remote.memory"))
18419 return true;
18420 if (Subtarget.hasEmulatedSystemScopeAtomics())
18421 return true;
18423 return true;
18424
18425 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18426}
18427
18428/// \return Action to perform on AtomicRMWInsts for integer operations.
18435
18436/// Return if a flat address space atomicrmw can access private memory.
18438 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18439 return !MD ||
18441}
18442
18450
18453 unsigned AS = RMW->getPointerAddressSpace();
18454 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
18456
18457 // 64-bit flat atomics that dynamically reside in private memory will silently
18458 // be dropped.
18459 //
18460 // Note that we will emit a new copy of the original atomic in the expansion,
18461 // which will be incrementally relegalized.
18462 const DataLayout &DL = RMW->getFunction()->getDataLayout();
18463 if (AS == AMDGPUAS::FLAT_ADDRESS &&
18464 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18467
18468 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
18470 ORE.emit([=]() {
18471 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
18472 });
18473 return Kind;
18474 };
18475
18476 auto SSID = RMW->getSyncScopeID();
18477 bool HasSystemScope =
18478 SSID == SyncScope::System ||
18479 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
18480
18481 auto Op = RMW->getOperation();
18482 switch (Op) {
18484 // PCIe supports add and xchg for system atomics.
18485 return isAtomicRMWLegalXChgTy(RMW)
18488 case AtomicRMWInst::Add:
18489 // PCIe supports add and xchg for system atomics.
18491 case AtomicRMWInst::Sub:
18492 case AtomicRMWInst::And:
18493 case AtomicRMWInst::Or:
18494 case AtomicRMWInst::Xor:
18495 case AtomicRMWInst::Max:
18496 case AtomicRMWInst::Min:
18503 if (Subtarget->hasEmulatedSystemScopeAtomics())
18505
18506 // On most subtargets, for atomicrmw operations other than add/xchg,
18507 // whether or not the instructions will behave correctly depends on where
18508 // the address physically resides and what interconnect is used in the
18509 // system configuration. On some targets the instruction will be a no-op,
18510 // and in others synchronization will only occur at degraded device scope.
18511 //
18512 // If the allocation is known local to the device, the instructions should
18513 // work correctly.
18514 if (RMW->hasMetadata("amdgpu.no.remote.memory"))
18516
18517 // If fine-grained remote memory works at device scope, we don't need to
18518 // do anything.
18519 if (!HasSystemScope &&
18520 Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18522
18523 // If we are targeting a remotely allocated address, it depends on what kind of
18524 // allocation the address belongs to.
18525 //
18526 // If the allocation is fine-grained (in host memory, or in PCIe peer
18527 // device memory), the operation will fail depending on the target.
18528 //
18529 // Note fine-grained host memory access does work on APUs or if XGMI is
18530 // used, but we do not know if we are targeting an APU or the system
18531 // configuration from the ISA version/target-cpu.
18532 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
18534
18537 // Atomic sub/or/xor do not work over PCI express, but atomic add
18538 // does. InstCombine transforms these with 0 to or, so undo that.
18539 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
18540 ConstVal && ConstVal->isNullValue())
18542 }
18543
18544 // If the allocation could be in remote, fine-grained memory, the rmw
18545 // instructions may fail. cmpxchg should work, so emit that. On some
18546 // system configurations, PCIe atomics aren't supported so cmpxchg won't
18547 // even work, so you're out of luck anyway.
18548
18549 // In summary:
18550 //
18551 // Cases that may fail:
18552 // - fine-grained pinned host memory
18553 // - fine-grained migratable host memory
18554 // - fine-grained PCIe peer device
18555 //
18556 // Cases that should work, but may be treated overly conservatively.
18557 // - fine-grained host memory on an APU
18558 // - fine-grained XGMI peer device
18560 }
18561
18563 }
18564 case AtomicRMWInst::FAdd: {
18565 Type *Ty = RMW->getType();
18566
18567 // TODO: Handle REGION_ADDRESS
18568 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18569 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
18570 // is fixed to round-to-nearest-even.
18571 //
18572 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
18573 // round-to-nearest-even.
18574 //
18575 // We ignore the rounding mode problem, even in strictfp. The C++ standard
18576 // suggests it is OK if the floating-point mode may not match the calling
18577 // thread.
18578 if (Ty->isFloatTy()) {
18579 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
18581 }
18582
18583 if (Ty->isDoubleTy()) {
18584 // Ignores denormal mode, but we don't consider flushing mandatory.
18585 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
18587 }
18588
18589 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18591
18593 }
18594
18595 // LDS atomics respect the denormal mode from the mode register.
18596 //
18597 // Traditionally f32 global/buffer memory atomics would unconditionally
18598 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
18599 // flush.
18600 //
18601 // On targets with flat atomic fadd, denormals would flush depending on
18602 // whether the target address resides in LDS or global memory. We consider
18603 // this flat-maybe-flush as will-flush.
18604 if (Ty->isFloatTy() &&
18605 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18608
18609 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
18610 // safe. The message phrasing also should be better.
18611 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18612 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18613 // gfx942, gfx12
18614 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18615 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18616 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
18617 // gfx90a, gfx942, gfx12
18618 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18619 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18620
18621 // gfx942, gfx12
18622 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18623 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18624 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
18625 // gfx90a, gfx942, gfx12
18626 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18627 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18628
18629 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
18630 // buffer. gfx12 does have the buffer version.
18631 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18632 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18633 }
18634
18635 // global and flat atomic fadd f64: gfx90a, gfx942.
18636 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18637 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18638
18639 if (AS != AMDGPUAS::FLAT_ADDRESS) {
18640 if (Ty->isFloatTy()) {
18641 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
18642 // gfx11+.
18643 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18644 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18645 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
18646 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18647 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18648 } else {
18649 // gfx908
18650 if (RMW->use_empty() &&
18651 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18652 isV2F16(Ty))
18653 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18654 }
18655 }
18656
18657 // flat atomic fadd f32: gfx942, gfx11+.
18658 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
18659 if (Subtarget->hasFlatAtomicFaddF32Inst())
18660 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18661
18662 // If it is in the flat address space and the type is float, we will try
18663 // to expand it if the target supports both global and LDS atomic fadd.
18664 // We need this because the expansion emits an address-space check: if the
18665 // address is in the global address space, we emit the global atomic fadd;
18666 // if it is in the shared address space, we emit the LDS atomic
18667 // fadd.
18668 if (Subtarget->hasLDSFPAtomicAddF32()) {
18669 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18671 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18673 }
18674 }
18675 }
18676
18678 }
18680 case AtomicRMWInst::FMax: {
18681 Type *Ty = RMW->getType();
18682
18683 // LDS float and double fmin/fmax were always supported.
18684 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18685 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
18687 }
18688
18689 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18690 // For flat and global cases:
18691 // float, double in gfx7. Manual claims denormal support.
18692 // Removed in gfx8.
18693 // float, double restored in gfx10.
18694 // double removed again in gfx11, so only f32 for gfx11/gfx12.
18695 //
18696 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
18697 // no f32.
18698 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18699 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18700 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18701 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18702 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18703 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
18705 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18706 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18707 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18708 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18709 }
18710 }
18711
18713 }
18716 default:
18718 }
18719
18720 llvm_unreachable("covered atomicrmw op switch");
18721}
18722
18729
18736
18739 unsigned AddrSpace = CmpX->getPointerAddressSpace();
18740 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
18742
18743 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
18745
18746 const DataLayout &DL = CmpX->getDataLayout();
18747
18748 Type *ValTy = CmpX->getNewValOperand()->getType();
18749
18750 // If a 64-bit flat atomic may alias private, we need to avoid using the
18751 // atomic in the private case.
18752 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
18754}
18755
18756const TargetRegisterClass *
18757SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
18759 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18760 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18761 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18762 : &AMDGPU::SReg_32RegClass;
18763 if (!TRI->isSGPRClass(RC) && !isDivergent)
18764 return TRI->getEquivalentSGPRClass(RC);
18765 if (TRI->isSGPRClass(RC) && isDivergent)
18766 return TRI->getEquivalentVGPRClass(RC);
18767
18768 return RC;
18769}
18770
18771// FIXME: This is a workaround for DivergenceAnalysis not understanding always
18772// uniform values (as produced by the mask results of control flow intrinsics)
18773// used outside of divergent blocks. The phi users need to also be treated as
18774// always uniform.
18775//
18776// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
18777static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
18778 unsigned WaveSize) {
18779 // FIXME: We assume we never cast the mask results of a control flow
18780 // intrinsic.
18781 // Early exit if the type won't be consistent as a compile time hack.
18782 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
18783 if (!IT || IT->getBitWidth() != WaveSize)
18784 return false;
18785
18786 if (!isa<Instruction>(V))
18787 return false;
18788 if (!Visited.insert(V).second)
18789 return false;
18790 bool Result = false;
18791 for (const auto *U : V->users()) {
18793 if (V == U->getOperand(1)) {
18794 switch (Intrinsic->getIntrinsicID()) {
18795 default:
18796 Result = false;
18797 break;
18798 case Intrinsic::amdgcn_if_break:
18799 case Intrinsic::amdgcn_if:
18800 case Intrinsic::amdgcn_else:
18801 Result = true;
18802 break;
18803 }
18804 }
18805 if (V == U->getOperand(0)) {
18806 switch (Intrinsic->getIntrinsicID()) {
18807 default:
18808 Result = false;
18809 break;
18810 case Intrinsic::amdgcn_end_cf:
18811 case Intrinsic::amdgcn_loop:
18812 Result = true;
18813 break;
18814 }
18815 }
18816 } else {
18817 Result = hasCFUser(U, Visited, WaveSize);
18818 }
18819 if (Result)
18820 break;
18821 }
18822 return Result;
18823}
18824
18826 const Value *V) const {
18827 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
18828 if (CI->isInlineAsm()) {
18829 // FIXME: This cannot give a correct answer. This should only trigger in
18830 // the case where inline asm returns mixed SGPR and VGPR results, used
18831 // outside the defining block. We don't have a specific result to
18832 // consider, so this assumes if any value is SGPR, the overall register
18833 // also needs to be SGPR.
18834 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
18836 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
18837 for (auto &TC : TargetConstraints) {
18838 if (TC.Type == InlineAsm::isOutput) {
18840 const TargetRegisterClass *RC =
18841 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
18842 TC.ConstraintVT)
18843 .second;
18844 if (RC && SIRI->isSGPRClass(RC))
18845 return true;
18846 }
18847 }
18848 }
18849 }
18851 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
18852}
18853
18855 for (SDUse &Use : N->uses()) {
18857 if (getBasePtrIndex(M) == Use.getOperandNo())
18858 return true;
18859 }
18860 }
18861 return false;
18862}
18863
18865 SDValue N1) const {
18866 if (!N0.hasOneUse())
18867 return false;
18868 // Take the opportunity to keep N0 uniform
18869 if (N0->isDivergent() || !N1->isDivergent())
18870 return true;
18871 // Check if we have a good chance to form the memory access pattern with the
18872 // base and offset
18873 return (DAG.isBaseWithConstantOffset(N0) &&
18875}
18876
18878 Register N0, Register N1) const {
18879 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
18880}
18881
18884 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
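 // For instance (illustrative IR), a load annotated as
 //   %v = load i32, ptr addrspace(1) %p, align 4, !amdgpu.noclobber !0
 // receives the MONoClobber flag on its MachineMemOperand.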
18886 if (I.getMetadata("amdgpu.noclobber"))
18887 Flags |= MONoClobber;
18888 if (I.getMetadata("amdgpu.last.use"))
18889 Flags |= MOLastUse;
18890 return Flags;
18891}
18892
18894 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
18895 const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const {
18896 if (User->getOpcode() != ISD::CopyToReg)
18897 return false;
18898 if (!Def->isMachineOpcode())
18899 return false;
18901 if (!MDef)
18902 return false;
18903
18904 unsigned ResNo = User->getOperand(Op).getResNo();
18905 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
18906 return false;
18907 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
18908 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
18909 PhysReg = AMDGPU::SCC;
18910 const TargetRegisterClass *RC =
18911 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
18912 Cost = RC->getCopyCost();
18913 return true;
18914 }
18915 return false;
18916}
18917
18919 Instruction *AI) const {
18920 // Given: atomicrmw fadd ptr %addr, float %val ordering
18921 //
18922 // With this expansion we produce the following code:
18923 // [...]
18924 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
18925 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
18926 //
18927 // atomicrmw.shared:
18928 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
18929 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
18930 // float %val ordering
18931 // br label %atomicrmw.phi
18932 //
18933 // atomicrmw.check.private:
18934 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
18935 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
18936 //
18937 // atomicrmw.private:
18938 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
18939 // %loaded.private = load float, ptr addrspace(5) %cast.private
18940 // %val.new = fadd float %loaded.private, %val
18941 // store float %val.new, ptr addrspace(5) %cast.private
18942 // br label %atomicrmw.phi
18943 //
18944 // atomicrmw.global:
18945 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
18946 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
18947 // float %val ordering
18948 // br label %atomicrmw.phi
18949 //
18950 // atomicrmw.phi:
18951 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
18952 // [ %loaded.private, %atomicrmw.private ],
18953 // [ %loaded.global, %atomicrmw.global ]
18954 // br label %atomicrmw.end
18955 //
18956 // atomicrmw.end:
18957 // [...]
18958 //
18959 //
18960 // For 64-bit atomics which may reside in private memory, we perform a simpler
18961 // version that only inserts the private check, and uses the flat operation.
18962
18963 IRBuilder<> Builder(AI);
18964 LLVMContext &Ctx = Builder.getContext();
18965
18966 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
18967 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
18969 Value *Addr = AI->getOperand(PtrOpIdx);
18970
18971 /// TODO: Only need to check private, then emit flat-known-not private (no
18972 /// need for shared block, or cast to global).
18974
18975 Align Alignment;
18976 if (RMW)
18977 Alignment = RMW->getAlign();
18978 else if (CX)
18979 Alignment = CX->getAlign();
18980 else
18981 llvm_unreachable("unhandled atomic operation");
18982
18983 // FullFlatEmulation is true if we need to issue the private, shared, and
18984 // global cases.
18985 //
18986 // If this is false, we are only dealing with the flat-targeting-private case,
18987 // where we only insert a check for private and still use the flat instruction
18988 // for global and shared.
18989
18990 bool FullFlatEmulation =
18991 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
18992 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
18993 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
18994 RMW->getType()->isDoubleTy()));
18995
18996 // If the return value isn't used, do not introduce a false use in the phi.
18997 bool ReturnValueIsUsed = !AI->use_empty();
18998
18999 BasicBlock *BB = Builder.GetInsertBlock();
19000 Function *F = BB->getParent();
19001 BasicBlock *ExitBB =
19002 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
19003 BasicBlock *SharedBB = nullptr;
19004
19005 BasicBlock *CheckPrivateBB = BB;
19006 if (FullFlatEmulation) {
19007 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
19008 CheckPrivateBB =
19009 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
19010 }
19011
19012 BasicBlock *PrivateBB =
19013 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
19014 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
19015 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
19016
19017 std::prev(BB->end())->eraseFromParent();
19018 Builder.SetInsertPoint(BB);
19019
19020 Value *LoadedShared = nullptr;
19021 if (FullFlatEmulation) {
19022 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
19023 {Addr}, nullptr, "is.shared");
19024 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
19025 Builder.SetInsertPoint(SharedBB);
19026 Value *CastToLocal = Builder.CreateAddrSpaceCast(
19028
19029 Instruction *Clone = AI->clone();
19030 Clone->insertInto(SharedBB, SharedBB->end());
19031 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
19032 LoadedShared = Clone;
19033
19034 Builder.CreateBr(PhiBB);
19035 Builder.SetInsertPoint(CheckPrivateBB);
19036 }
19037
19038 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19039 {Addr}, nullptr, "is.private");
19040 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19041
19042 Builder.SetInsertPoint(PrivateBB);
19043
19044 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19046
19047 Value *LoadedPrivate;
19048 if (RMW) {
19049 LoadedPrivate = Builder.CreateAlignedLoad(
19050 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
19051
19052 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
19053 LoadedPrivate, RMW->getValOperand());
19054
19055 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19056 } else {
19057 auto [ResultLoad, Equal] =
19058 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
19059 CX->getNewValOperand(), CX->getAlign());
19060
19061 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
19062 ResultLoad, 0);
19063 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19064 }
19065
19066 Builder.CreateBr(PhiBB);
19067
19068 Builder.SetInsertPoint(GlobalBB);
19069
19070 // Continue using a flat instruction if we only emitted the check for private.
19071 Instruction *LoadedGlobal = AI;
19072 if (FullFlatEmulation) {
19073 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19075 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
19076 }
19077
19078 AI->removeFromParent();
19079 AI->insertInto(GlobalBB, GlobalBB->end());
19080
19081 // The new atomicrmw may go through another round of legalization later.
19082 if (!FullFlatEmulation) {
19083 // We inserted the runtime check already, make sure we do not try to
19084 // re-expand this.
19085 // TODO: Should union with any existing metadata.
19086 MDBuilder MDB(F->getContext());
19087 MDNode *RangeNotPrivate =
19090 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19091 RangeNotPrivate);
19092 }
19093
19094 Builder.CreateBr(PhiBB);
19095
19096 Builder.SetInsertPoint(PhiBB);
19097
19098 if (ReturnValueIsUsed) {
19099 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
19100 AI->replaceAllUsesWith(Loaded);
19101 if (FullFlatEmulation)
19102 Loaded->addIncoming(LoadedShared, SharedBB);
19103 Loaded->addIncoming(LoadedPrivate, PrivateBB);
19104 Loaded->addIncoming(LoadedGlobal, GlobalBB);
19105 Loaded->takeName(AI);
19106 }
19107
19108 Builder.CreateBr(ExitBB);
19109}
19110
19112 unsigned PtrOpIdx) {
19113 Value *PtrOp = I->getOperand(PtrOpIdx);
19116
19117 Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
19118 Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
19119 I->getIterator());
19120 I->setOperand(PtrOpIdx, ASCast);
19121}
19122
19125
19128
19131 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
19132 ConstVal && ConstVal->isNullValue()) {
19133 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
19135
19136 // We may still need the private-alias-flat handling below.
19137
19138 // TODO: Skip this for cases where we cannot access remote memory.
19139 }
19140 }
19141
19142 // The non-flat expansions should only perform the de-canonicalization of
19143 // identity values.
19145 return;
19146
19148}
19149
19156
19160
19162 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19163}
19164
19166 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19167 return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
19168
19170 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19171}
19172
19173LoadInst *
19175 IRBuilder<> Builder(AI);
19176 auto Order = AI->getOrdering();
19177
19178 // The optimization removes the store aspect of the atomicrmw. Therefore, the
19179 // cache must be flushed if the atomic ordering had release semantics. This
19180 // does not necessarily require a fence; a release fence just happens to
19181 // perform that flush. Avoid replacing an atomicrmw that has release semantics.
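 // For instance (illustrative IR), an idempotent update such as
 //   %old = atomicrmw or ptr %p, i32 0 acquire
 // can be rewritten as
 //   %old = load atomic i32, ptr %p acquire, align 4
 // while a release (or stronger) ordering is left untouched below.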
19182 if (isReleaseOrStronger(Order))
19183 return nullptr;
19184
19185 LoadInst *LI = Builder.CreateAlignedLoad(
19186 AI->getType(), AI->getPointerOperand(), AI->getAlign());
19187 LI->setAtomic(Order, AI->getSyncScopeID());
19188 LI->copyMetadata(*AI);
19189 LI->takeName(AI);
19190 AI->replaceAllUsesWith(LI);
19191 AI->eraseFromParent();
19192 return LI;
19193}
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:167
#define LLVM_DEBUG(...)
Definition Debug.h:114
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known never to be any NaN; if SNaN is true, returns whether Op is known never to be a signaling NaN.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
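As a hedged illustration only (the enclosing lowering routine, the SelectionDAG DAG, and the 64-bit SDValue Op are assumed to be in scope; the binding names are made up), a caller inside AMDGPUTargetLowering could decompose a value like this:
  // Sketch: split a 64-bit SDValue into its low and high 32-bit halves.
  auto [Lo, Hi] = split64BitValue(Op, DAG); // Lo = bits [0,32), Hi = bits [32,64)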
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1120
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:6057
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1497
bool isNegative() const
Definition APFloat.h:1449
bool isNormal() const
Definition APFloat.h:1453
APInt bitcastToAPInt() const
Definition APFloat.h:1353
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1138
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1098
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1079
bool isInfinity() const
Definition APFloat.h:1446
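A minimal sketch of the APFloat factories listed above; the wrapper function name is hypothetical, and the semantics objects come from the APFloatBase accessors:
  #include "llvm/ADT/APFloat.h"
  using namespace llvm;

  void apfloatFactorySketch() {
    APFloat PosInf  = APFloat::getInf(APFloat::IEEEsingle());                     // +infinity
    APFloat NegZero = APFloat::getZero(APFloat::IEEEdouble(), /*Negative=*/true); // -0.0
    APFloat QNaN    = APFloat::getQNaN(APFloat::IEEEhalf());                      // quiet NaN
    APFloat MaxF32  = APFloat::getLargest(APFloat::IEEEsingle());                 // largest finite float
    (void)PosInf; (void)NegZero; (void)QNaN; (void)MaxF32;
  }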
Class for arbitrary precision integers.
Definition APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1385
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:466
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
Definition APInt.h:366
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1237
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1221
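A minimal sketch of the APInt bit-block helpers above, with the resulting 32-bit values shown in comments; the wrapper function name is made up:
  #include "llvm/ADT/APInt.h"
  using namespace llvm;

  void apintBitSketch() {
    APInt HighNibble = APInt::getHighBitsSet(32, 4);  // 0xF0000000: top 4 bits set
    APInt MidBlock   = APInt::getBitsSet(32, 8, 16);  // 0x0000FF00: bits [8, 16) set
    APInt V(32, 0);
    V.setBitsFrom(24);                                // 0xFF000000: bits 24..31 set
    (void)HighNibble; (void)MidBlock;
  }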
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition Function.cpp:339
const Function * getParent() const
Definition Argument.h:44
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Align getAlign() const
Return the alignment of the memory that is being accessed by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being accessed by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents a cmpxchg atomic operation, false otherwise.
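To make the BinOp semantics above concrete, here is a hedged IRBuilder sketch (Builder, Ptr, and Val are assumed to already exist; the alignment and ordering are illustrative, not a recommendation):
  // Sketch: atomically perform *Ptr = *Ptr fadd Val and obtain the old value.
  AtomicRMWInst *Old = Builder.CreateAtomicRMW(
      AtomicRMWInst::FAdd, Ptr, Val, MaybeAlign(4),
      AtomicOrdering::Monotonic, SyncScope::System);
  // Old is the value *Ptr held before the update, matching "*p = old + v" above.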
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:472
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
BitVector & set()
Definition BitVector.h:351
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
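A hedged sketch of how a custom CCAssignFn typically drives these helpers (State is the CCState being populated; ValNo, ValVT, LocVT, and LocInfo are the usual assignment-function parameters, and the SGPR chosen here is purely illustrative):
  // Prefer a specific register; fall back to a 4-byte stack slot if it is taken.
  if (MCRegister Reg = State.AllocateReg(AMDGPU::SGPR8)) {
    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
  } else {
    int64_t Offset = State.AllocateStack(4, Align(4));
    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
  }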
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ ICMP_NE
not equal
Definition InstrTypes.h:700
bool isSigned() const
Definition InstrTypes.h:932
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:772
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:778
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:214
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string and methods for querying it.
Definition DataLayout.h:63
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isBigEndian() const
Definition DataLayout.h:199
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowering::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
iterator_range< arg_iterator > args()
Definition Function.h:890
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition Function.cpp:803
Argument * getArg(unsigned i) const
Definition Function.h:884
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMadF16() const
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasPrivateSegmentBuffer() const
const MachineFunction & getMachineFunction() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2780
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:33
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96
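A hedged sketch of attaching such range metadata to a load (Ctx is an LLVMContext and LI a LoadInst assumed to be in scope):
  // Annotate LI with !range [0, 1024): the loaded i32 is known to be below 1024.
  MDBuilder MDB(Ctx);
  MDNode *Range = MDB.createRange(APInt(32, 0), APInt(32, 1024));
  LI->setMetadata(LLVMContext::MD_range, Range);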
Metadata node.
Definition Metadata.h:1077
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1445
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the element count of the given vector type is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
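For reference, a small sketch of the MVT factory and query helpers listed above (the wrapper function name is hypothetical; llvm::MVT from MachineValueType.h is assumed to be available):
  void mvtSketch() {
    MVT V4I32  = MVT::getVectorVT(MVT::i32, 4);   // v4i32
    MVT Elt    = V4I32.getScalarType();           // i32
    MVT I64    = MVT::getIntegerVT(64);           // i64
    unsigned N = V4I32.getVectorNumElements();    // 4
    (void)Elt; (void)I64; (void)N;
  }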
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition ModRef.h:221
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:215
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition ModRef.h:218
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition Register.h:67
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:78
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if N can be combined with another node to form an ISD::FMAD; N may be an ISD::FADD, ISD::FSUB, or an ISD::FMUL which will be distributed into an fadd/fsub.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in a target-specific way.
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0-terminated array of rounding control registers that can be attached to a strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store in a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load in a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given that the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known never to be any NaN; if SNaN is true, returns whether Op is known never to be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
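Illustrative usage sketch (not code from this file): getSetCC and getSelect are typically paired inside a custom lowering routine. DAG, DL, LHS, and RHS are assumed to be the usual SelectionDAG, debug location, and i32 operands already in scope.
  // Hypothetical: pick the smaller of two i32 values.
  SDValue Cond = DAG.getSetCC(DL, MVT::i1, LHS, RHS, ISD::SETLT);
  SDValue Min  = DAG.getSelect(DL, MVT::i32, Cond, LHS, RHS);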
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
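A minimal sketch of building a load (Chain, Ptr, DL, and DAG are assumed in-scope values of a lowering function; the pointer info and alignment are placeholders):
  SDValue Load = DAG.getLoad(MVT::i32, DL, Chain, Ptr,
                             MachinePointerInfo(), Align(4));
  SDValue NewChain = Load.getValue(1); // loads also produce an output chain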
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
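A sketch of the split-and-recombine pattern this enables, along the lines of splitBinaryVectorOp above (Op is assumed to be a wide two-operand vector node; flags handling omitted):
  auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
  auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
  SDLoc SL(Op);
  SDValue OpLo = DAG.getNode(Op.getOpcode(), SL, Lo0.getValueType(), Lo0, Lo1);
  SDValue OpHi = DAG.getNode(Op.getOpcode(), SL, Hi0.getValueType(), Hi0, Hi1);
  SDValue Joined =
      DAG.getNode(ISD::CONCAT_VECTORS, SL, Op.getValueType(), OpLo, OpHi);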
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
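Illustrative sketch (A, B, C, D, DL, and DAG are assumed in-scope i32 SDValues and context):
  SDValue Elts[] = {A, B, C, D};
  SDValue Vec   = DAG.getBuildVector(MVT::v4i32, DL, Elts);
  SDValue Splat = DAG.getSplatBuildVector(MVT::v4i32, DL, A); // see getSplatBuildVector below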
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
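Illustrative sketch (Chain, Val, Ptr, DL are assumed in-scope SDValues; pointer info and alignment are placeholders):
  SDValue Store = DAG.getStore(Chain, DL, Val, Ptr,
                               MachinePointerInfo(), Align(4));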
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts a new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
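Illustrative sketch (LHS and RHS are assumed i32 SDValues in scope): compute a maximum as a single SELECT_CC node.
  SDValue Max = DAG.getSelectCC(DL, LHS, RHS, LHS, RHS, ISD::SETGT);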
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
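A hedged usage sketch (Ptr is an assumed in-scope SDValue): known-bits queries are how lowering code proves facts such as alignment.
  KnownBits Known = DAG.computeKnownBits(Ptr);
  if (Known.countMinTrailingZeros() >= 2) {
    // The low two bits are provably zero, i.e. Ptr is at least 4-byte aligned.
  }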
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
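Illustrative sketch (Val is an assumed i64 SDValue in scope): split into 32-bit halves and reassemble.
  auto [Lo, Hi] = DAG.SplitScalar(Val, DL, MVT::i32, MVT::i32);
  SDValue Rejoined = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi);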
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition StringRef.h:862
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:154
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
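A generic usage sketch; the constraint strings and return values below are hypothetical, not this file's actual mapping, and Constraint is an assumed in-scope StringRef (requires llvm/ADT/StringSwitch.h):
  unsigned Kind = StringSwitch<unsigned>(Constraint)
                      .Case("s", 1)  // e.g. a scalar-register constraint
                      .Case("v", 2)  // e.g. a vector-register constraint
                      .Default(0);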
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
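A minimal sketch of how a target constructor marks operations per type; the opcode/type/action combinations below are illustrative, not this file's exact choices:
  setOperationAction(ISD::FADD, MVT::v2f16, Legal);      // natively supported
  setOperationAction(ISD::SELECT, MVT::i1, Promote);     // widen the type first
  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);  // rewritten into other nodes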
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
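Illustrative sketch (the specific value/memory type pairs are assumptions, not this file's exact list):
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);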
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
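A minimal sketch of registering interest in custom combines; the node list here is illustrative only:
  setTargetDAGCombine({ISD::FADD, ISD::AND, ISD::ZERO_EXTEND});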
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
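A sketch of the usual pattern inside a target combine (DCI and N are the customary DAGCombinerInfo and SDNode parameters; demanding only the low 16 bits of an i32 result is an arbitrary example):
  APInt Demanded = APInt::getLowBitsSet(32, 16);
  KnownBits Known;
  TargetLowering::TargetLoweringOpt TLO(DCI.DAG, !DCI.isBeforeLegalize(),
                                        !DCI.isBeforeLegalizeOps());
  if (SimplifyDemandedBits(SDValue(N, 0), Demanded, Known, TLO)) {
    DCI.CommitTargetLoweringOpt(TLO); // commit any simplifications that were queued
    return SDValue(N, 0);
  }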
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:420
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition Type.h:258
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:107
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI void set(Value *Val)
Definition Value.h:905
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
const Use & getOperandUse(unsigned i) const
Definition User.h:245
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1101
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:396
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isZero() const
Definition TypeSize.h:154
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:134
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ GlobalAddress
Definition ISDOpcodes.h:88
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:656
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:773
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:528
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:535
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
Definition ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
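An illustrative SDPatternMatch sketch (the pattern is arbitrary and N is an assumed SDNode*): match a left shift and capture its operands.
  using namespace llvm::SDPatternMatch;
  SDValue X, Y;
  if (sd_match(N, m_Shl(m_Value(X), m_Value(Y)))) {
    // N is (shl X, Y); X and Y now hold the operands.
  }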
Offsets
Offsets in bytes from the start of the input buffer.
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
Definition MathExtras.h:54
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:310
@ Offset
Definition DWP.cpp:477
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition Analysis.cpp:241
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:307
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:833
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
@ Done
Definition Threading.h:60
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition MathExtras.h:232
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:270
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2118
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and congruent to Skew modulo Align.
Definition MathExtras.h:557
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
Definition ModRef.h:296
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:396
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:157
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:282
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1714
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:203
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition Analysis.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
unsigned getUndefRegState(bool B)
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:155
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition MathExtras.h:241
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1740
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1879
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:212
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:86
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853
#define N
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:304
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
Definition APFloat.cpp:264
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:85
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition SCCPSolver.h:42
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:113
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
unsigned getOrigArgIndex() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:165
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:218
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:173
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:340
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:241
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs