1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
19#include "AMDGPUTargetMachine.h"
20#include "GCNSubtarget.h"
23#include "SIRegisterInfo.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/Statistic.h"
42#include "llvm/IR/IRBuilder.h"
44#include "llvm/IR/IntrinsicsAMDGPU.h"
45#include "llvm/IR/IntrinsicsR600.h"
46#include "llvm/IR/MDBuilder.h"
49#include "llvm/Support/ModRef.h"
51#include <optional>
52
53using namespace llvm;
54using namespace llvm::SDPatternMatch;
55
56#define DEBUG_TYPE "si-lower"
57
58STATISTIC(NumTailCalls, "Number of tail calls");
59
60static cl::opt<bool>
61 DisableLoopAlignment("amdgpu-disable-loop-alignment",
62 cl::desc("Do not align and prefetch loops"),
63 cl::init(false));
64
65static cl::opt<bool> UseDivergentRegisterIndexing(
66 "amdgpu-use-divergent-register-indexing", cl::Hidden,
67 cl::desc("Use indirect register addressing for divergent indexes"),
68 cl::init(false));
69
70static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
71 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
72 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
73}
74
75static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
76 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
77 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
78}
79
80static unsigned findFirstFreeSGPR(CCState &CCInfo) {
81 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
82 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
83 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
84 return AMDGPU::SGPR0 + Reg;
85 }
86 }
87 llvm_unreachable("Cannot allocate sgpr");
88}
89
90SITargetLowering::SITargetLowering(const TargetMachine &TM,
91 const GCNSubtarget &STI)
92 : AMDGPUTargetLowering(TM, STI, STI), Subtarget(&STI) {
93 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
94 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
95
96 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
97
98 const SIRegisterInfo *TRI = STI.getRegisterInfo();
99 const TargetRegisterClass *V32RegClass =
100 TRI->getDefaultVectorSuperClassForBitWidth(32);
101 addRegisterClass(MVT::f32, V32RegClass);
102
103 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
104
105 const TargetRegisterClass *V64RegClass =
106 TRI->getDefaultVectorSuperClassForBitWidth(64);
107
108 addRegisterClass(MVT::f64, V64RegClass);
109 addRegisterClass(MVT::v2f32, V64RegClass);
110 addRegisterClass(MVT::Untyped, V64RegClass);
111
112 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
113 addRegisterClass(MVT::v3f32, TRI->getDefaultVectorSuperClassForBitWidth(96));
114
115 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
116 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
117
118 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
119 addRegisterClass(MVT::v4f32, TRI->getDefaultVectorSuperClassForBitWidth(128));
120
121 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
122 addRegisterClass(MVT::v5f32, TRI->getDefaultVectorSuperClassForBitWidth(160));
123
124 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
125 addRegisterClass(MVT::v6f32, TRI->getDefaultVectorSuperClassForBitWidth(192));
126
127 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
128 addRegisterClass(MVT::v3f64, TRI->getDefaultVectorSuperClassForBitWidth(192));
129
130 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
131 addRegisterClass(MVT::v7f32, TRI->getDefaultVectorSuperClassForBitWidth(224));
132
133 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
134 addRegisterClass(MVT::v8f32, TRI->getDefaultVectorSuperClassForBitWidth(256));
135
136 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
137 addRegisterClass(MVT::v4f64, TRI->getDefaultVectorSuperClassForBitWidth(256));
138
139 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
140 addRegisterClass(MVT::v9f32, TRI->getDefaultVectorSuperClassForBitWidth(288));
141
142 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
143 addRegisterClass(MVT::v10f32,
144 TRI->getDefaultVectorSuperClassForBitWidth(320));
145
146 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
147 addRegisterClass(MVT::v11f32,
148 TRI->getDefaultVectorSuperClassForBitWidth(352));
149
150 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
151 addRegisterClass(MVT::v12f32,
152 TRI->getDefaultVectorSuperClassForBitWidth(384));
153
154 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
155 addRegisterClass(MVT::v16f32,
156 TRI->getDefaultVectorSuperClassForBitWidth(512));
157
158 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
159 addRegisterClass(MVT::v8f64, TRI->getDefaultVectorSuperClassForBitWidth(512));
160
161 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
162 addRegisterClass(MVT::v16f64,
163 TRI->getDefaultVectorSuperClassForBitWidth(1024));
164
165 if (Subtarget->has16BitInsts()) {
166 if (Subtarget->useRealTrue16Insts()) {
167 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
168 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
169 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
170 } else {
171 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
172 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
173 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
174 }
175
176 // Unless there are also VOP3P operations, no operations on these types are really legal.
177 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
178 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
179 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
180 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
181 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
182 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
183 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
184 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
185 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
186 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
187 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
188 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
189 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
190 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
191 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
192 }
193
194 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
195 addRegisterClass(MVT::v32f32,
196 TRI->getDefaultVectorSuperClassForBitWidth(1024));
197
198 computeRegisterProperties(Subtarget->getRegisterInfo());
199
200 // The boolean content concept here is too inflexible. Compares only ever
201 // really produce a 1-bit result. Any copy/extend from these will turn into a
202 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
203 // it's what most targets use.
204 setBooleanContents(ZeroOrOneBooleanContent);
205 setBooleanVectorContents(ZeroOrOneBooleanContent);
206
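// Illustrative sketch (not part of the original source): with 0/1 booleans,
// (zext (setcc ...)) needs no extra code beyond the compare itself, since the
// compare result is already materialized as 0 or 1 via a cndmask-style select.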
207 // We need to custom lower vector stores from local memory
208 setOperationAction(ISD::LOAD,
209 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
210 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
211 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
212 MVT::i1, MVT::v32i32},
213 Custom);
214
215 setOperationAction(ISD::STORE,
216 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
217 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
218 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
219 MVT::i1, MVT::v32i32},
220 Custom);
221
222 if (isTypeLegal(MVT::bf16)) {
223 for (unsigned Opc :
225 ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
226 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
227 ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
228 ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
229 ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
230 ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
231 ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
232 ISD::SETCC}) {
233 // FIXME: The promoted to type shouldn't need to be explicit
234 setOperationAction(Opc, MVT::bf16, Promote);
235 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
236 }
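// Illustrative sketch (not from the original source): promoting e.g. ISD::FADD
// from bf16 to f32 makes the legalizer rewrite (fadd bf16 %a, %b) as roughly
// (fp_round (fadd (fp_extend %a), (fp_extend %b)), bf16), so only f32
// arithmetic needs native support.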
237
239
241 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
242
243 setOperationAction(ISD::FABS, MVT::bf16, Legal);
244 setOperationAction(ISD::FNEG, MVT::bf16, Legal);
246
247 // We only need to custom lower because we can't specify an action for bf16
248 // sources.
251 }
252
253 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
254 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
255 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
256 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
257 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
258 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
259 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
260 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
261 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
262 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
263 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
264 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
265 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
266 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
267 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
268 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
269
270 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
271 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
272 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
273 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
274 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
275 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
276 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
277
278 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
279
283 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
284
285 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
286
288 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
289
291 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
292 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
293
295 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
296 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
297 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
298 Expand);
300 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
301 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
302 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
303 Expand);
304
306 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
307 MVT::v3i16, MVT::v4i16, MVT::Other},
308 Custom);
309
310 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
311 setOperationAction(ISD::BR_CC,
312 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
313
315
317
319 Expand);
320
321#if 0
323#endif
324
325 // We only support LOAD/STORE and vector manipulation ops for vectors
326 // with > 4 elements.
327 for (MVT VT :
328 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
329 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
330 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
331 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
332 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
333 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
334 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
335 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
336 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
337 switch (Op) {
338 case ISD::LOAD:
339 case ISD::STORE:
341 case ISD::BITCAST:
342 case ISD::UNDEF:
346 case ISD::IS_FPCLASS:
347 break;
352 break;
353 default:
355 break;
356 }
357 }
358 }
359
360 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
361
362 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
363 // is expanded to avoid having two separate loops in case the index is a VGPR.
364
365 // Most operations are naturally 32-bit vector operations. We only support
366 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
367 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
369 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
370
372 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
373
375 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
376
378 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
379 }
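// Hypothetical example of the promotion above: a v2i64 BUILD_VECTOR is handled
// as a v4i32 BUILD_VECTOR of the 32-bit halves and bitcast back, since only
// 32-bit vector operations are natively supported.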
380
381 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
383 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
384
386 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
387
389 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
390
392 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
393 }
394
395 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
397 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
398
400 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
401
403 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
404
406 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
407 }
408
409 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
411 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
412
414 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
415
417 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
418
420 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
421 }
422
423 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
425 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
426
428 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
429
431 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
432
434 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
435 }
436
438 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
439 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
440 Custom);
441
442 if (Subtarget->hasPkMovB32()) {
443 // TODO: 16-bit element vectors should be legal with even aligned elements.
444 // TODO: Can be legal with wider source types than the result with
445 // subregister extracts.
446 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
447 }
448
450 // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
451 // instead lower to cndmask in SITargetLowering::LowerSELECT().
453 // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
454 // alignbit.
455 setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
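// Sketch (assumption, not from this file): MatchRotate turns patterns such as
// (or (srl x, c), (shl x, 32 - c)) into ISD::ROTR, which then selects to
// v_alignbit_b32 with both 32-bit sources set to x.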
456
457 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
458 Custom);
459
460 // Avoid stack access for these.
461 // TODO: Generalize to more vector types.
463 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
464 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
465 Custom);
466
467 // Deal with vec3 vector operations when widened to vec4.
469 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
470
471 // Deal with vec5/6/7 vector operations when widened to vec8.
473 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
474 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
475 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
476 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
477 Custom);
478
479 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
480 // and output demarshalling
481 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
482
483 // We can't return success/failure, only the old value,
484 // let LLVM add the comparison
485 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64},
486 Expand);
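// In practice (sketch): the expansion reuses the plain ATOMIC_CMP_SWAP result
// and rebuilds the success flag as (setcc old, expected, seteq), since the
// hardware instruction only returns the old value.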
487
488 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
489
490 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
491
492 // FIXME: This should be narrowed to i32, but that only happens if i64 is
493 // illegal.
494 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
495 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
496
497 // This is s_memtime on SI and s_memrealtime on VI.
498 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
499
500 if (Subtarget->hasSMemRealTime() ||
501 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
502 setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
503 setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom);
504
505 if (Subtarget->has16BitInsts()) {
506 setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
507 setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
508 } else {
509 setOperationAction(ISD::FSQRT, MVT::f16, Custom);
510 }
511
512 if (Subtarget->hasMadMacF32Insts())
513 setOperationAction(ISD::FMAD, MVT::f32, Legal);
514
515 if (!Subtarget->hasBFI())
516 // fcopysign can be done in a single instruction with BFI.
517 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
518
519 if (!Subtarget->hasBCNT(32))
521
522 if (!Subtarget->hasBCNT(64))
524
525 if (Subtarget->hasFFBH())
527
528 if (Subtarget->hasFFBL())
530
531 // We only really have 32-bit BFE instructions (and 16-bit on VI).
532 //
533 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
534 // effort to match them now. We want this to be false for i64 cases when the
535 // extraction isn't restricted to the upper or lower half. Ideally we would
536 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
537 // span the midpoint are probably relatively rare, so don't worry about them
538 // for now.
539 if (Subtarget->hasBFE())
540 setHasExtractBitsInsn(true);
541
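// Example of what this enables (illustrative only):
// (and (srl x, 8), 0xff) --> v_bfe_u32 x, 8, 8
// i.e. a single 32-bit bitfield extract with offset 8 and width 8.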
542 // Clamp modifier on add/sub
543 if (Subtarget->hasIntClamp())
545
546 if (Subtarget->hasAddNoCarry())
547 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
548 Legal);
549
551 {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
552 {MVT::f32, MVT::f64}, Custom);
553
554 // These are really only legal for ieee_mode functions. We should be avoiding
555 // them for functions that don't have ieee_mode enabled, so just say they are
556 // legal.
557 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
558 {MVT::f32, MVT::f64}, Legal);
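// Background note (assumption): in IEEE mode v_min_f32/v_max_f32 quiet
// signaling NaNs and return the non-NaN operand, which matches the
// FMINNUM_IEEE/FMAXNUM_IEEE semantics being declared Legal here.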
559
560 if (Subtarget->haveRoundOpsF64())
561 setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64,
562 Legal);
563 else
564 setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
565 MVT::f64, Custom);
566
567 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
568 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
569 Legal);
570 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
571
572 setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
574
575 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
576 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
577
578 // Custom lower these because we can't specify a rule based on an illegal
579 // source bf16.
580 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom);
581 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom);
582
583 if (Subtarget->has16BitInsts()) {
586 MVT::i16, Legal);
587
588 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
589
591 MVT::i16, Expand);
592
596 ISD::CTPOP},
597 MVT::i16, Promote);
598
599 setOperationAction(ISD::LOAD, MVT::i16, Custom);
600
601 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
602
603 setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
604 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
605 setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
606 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
607
611
613
614 // F16 - Constant Actions.
617
618 // F16 - Load/Store Actions.
619 setOperationAction(ISD::LOAD, MVT::f16, Promote);
620 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
621 setOperationAction(ISD::STORE, MVT::f16, Promote);
622 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
623
624 // BF16 - Load/Store Actions.
625 setOperationAction(ISD::LOAD, MVT::bf16, Promote);
626 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
627 setOperationAction(ISD::STORE, MVT::bf16, Promote);
628 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
629
630 // F16 - VOP1 Actions.
632 ISD::FSIN, ISD::FROUND},
633 MVT::f16, Custom);
634
635 // BF16 - VOP1 Actions.
636 if (Subtarget->hasBF16TransInsts())
637 setOperationAction({ISD::FCOS, ISD::FSIN, ISD::FDIV}, MVT::bf16, Custom);
638
641
642 // F16 - VOP2 Actions.
643 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
644 Expand);
645 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);
646 setOperationAction(ISD::FFREXP, MVT::f16, Custom);
648
649 // F16 - VOP3 Actions.
651 if (STI.hasMadF16())
653
654 for (MVT VT :
655 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
656 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
657 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
658 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
659 switch (Op) {
660 case ISD::LOAD:
661 case ISD::STORE:
663 case ISD::BITCAST:
664 case ISD::UNDEF:
669 case ISD::IS_FPCLASS:
670 break;
674 break;
675 default:
677 break;
678 }
679 }
680 }
681
682 // v_perm_b32 can handle either of these.
683 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
685
686 // XXX - Do these do anything? Vector constants turn into build_vector.
687 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
688
689 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
690 Legal);
691
692 setOperationAction(ISD::STORE, MVT::v2i16, Promote);
693 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
694 setOperationAction(ISD::STORE, MVT::v2f16, Promote);
695 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
696
697 setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
698 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
699 setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
700 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
701
702 setOperationAction(ISD::AND, MVT::v2i16, Promote);
703 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
704 setOperationAction(ISD::OR, MVT::v2i16, Promote);
705 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
706 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
707 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
708
709 setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
710 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
711 setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
712 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
713 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
714 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
715
716 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
717 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
718 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
719 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
720 setOperationAction(ISD::STORE, MVT::v4bf16, Promote);
721 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
722
723 setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
724 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
725 setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
726 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
727 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
728 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
729
730 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
731 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
732 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
733 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
734
735 setOperationAction(ISD::STORE, MVT::v8i16, Promote);
736 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
737 setOperationAction(ISD::STORE, MVT::v8f16, Promote);
738 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
739 setOperationAction(ISD::STORE, MVT::v8bf16, Promote);
740 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
741
742 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
743 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
744 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
745 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
746 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
747 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
748
749 setOperationAction(ISD::STORE, MVT::v16i16, Promote);
750 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
751 setOperationAction(ISD::STORE, MVT::v16f16, Promote);
752 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
753 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
754 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
755
756 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
757 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
758 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
759 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
760 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
761 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
762
763 setOperationAction(ISD::STORE, MVT::v32i16, Promote);
764 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
765 setOperationAction(ISD::STORE, MVT::v32f16, Promote);
766 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
767 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
768 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
769
771 MVT::v2i32, Expand);
772 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
773
775 MVT::v4i32, Expand);
776
778 MVT::v8i32, Expand);
779
780 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
781 Subtarget->hasVOP3PInsts() ? Legal : Custom);
782
783 setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
784 // This isn't really legal, but this avoids the legalizer unrolling it (and
785 // allows matching fneg (fabs x) patterns)
786 setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);
787
788 // Can do this in one BFI plus a constant materialize.
790 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
791 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
792 MVT::v32f16, MVT::v32bf16},
793 Custom);
794
796 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
797 MVT::f16, Custom);
798 setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
799
800 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
801 ISD::FMAXIMUMNUM},
802 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
803 Custom);
804
805 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
806 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
807 Expand);
808
809 for (MVT Vec16 :
810 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
811 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
814 Vec16, Custom);
816 }
817 }
818
819 if (Subtarget->hasVOP3PInsts()) {
823 MVT::v2i16, Legal);
824
825 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
826 ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
827 MVT::v2f16, Legal);
828
830 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
831
833 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
834 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
835 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
836 Custom);
837
838 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
839 // Split vector operations.
844 VT, Custom);
845
846 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
847 // Split vector operations.
849 VT, Custom);
850
852 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
853 {MVT::v2f16, MVT::v4f16}, Custom);
854
855 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
856 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
857 Custom);
858
859 if (Subtarget->hasBF16PackedInsts()) {
860 for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
861 // Split vector operations.
863 VT, Custom);
864 }
865
866 if (Subtarget->hasPackedFP32Ops()) {
868 MVT::v2f32, Legal);
870 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
871 Custom);
872 }
873 }
874
875 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);
876
877 if (Subtarget->has16BitInsts()) {
879 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
881 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
882 } else {
883 // Legalization hack.
884 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
885
886 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
887 }
888
890 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
891 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
892 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
893 MVT::v32f16, MVT::v32bf16},
894 Custom);
895
897
898 if (Subtarget->hasVectorMulU64())
900 else if (Subtarget->hasScalarSMulU64())
902
903 if (Subtarget->hasMad64_32())
905
906 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
907 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
908
909 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
910 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
911 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
912 } else {
913 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
914 if (Subtarget->hasMinimum3Maximum3F32())
915 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);
916
917 if (Subtarget->hasMinimum3Maximum3PKF16()) {
918 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal);
919
920 // If only the vector form is available, we need to widen to a vector.
921 if (!Subtarget->hasMinimum3Maximum3F16())
922 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16, Custom);
923 }
924 }
925
926 if (Subtarget->hasVOP3PInsts()) {
927 // We want to break these into v2f16 pieces, not scalarize.
928 setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
929 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
930 Custom);
931 }
932
933 if (Subtarget->hasIntMinMax64())
935 Legal);
936
938 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
939 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
940 MVT::i8},
941 Custom);
942
944 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
945 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
946 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
947 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
948 Custom);
949
951 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
952 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
953 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
954 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
955 Custom);
956
957 setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
959 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
960 setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
961 setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);
962
963 // TODO: Could move this to custom lowering, could benefit from combines on
964 // extract of relevant bits.
965 setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
966
968
969 if (Subtarget->hasBF16ConversionInsts()) {
970 setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
972 }
973
974 if (Subtarget->hasBF16PackedInsts()) {
976 {ISD::FADD, ISD::FMUL, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMA},
977 MVT::v2bf16, Legal);
978 }
979
980 if (Subtarget->hasBF16TransInsts()) {
981 setOperationAction({ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, MVT::bf16, Legal);
982 }
983
984 if (Subtarget->hasCvtPkF16F32Inst()) {
986 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
987 Custom);
988 }
989
991 ISD::PTRADD,
993 ISD::SUB,
995 ISD::MUL,
996 ISD::FADD,
997 ISD::FSUB,
998 ISD::FDIV,
999 ISD::FMUL,
1000 ISD::FMINNUM,
1001 ISD::FMAXNUM,
1002 ISD::FMINNUM_IEEE,
1003 ISD::FMAXNUM_IEEE,
1004 ISD::FMINIMUM,
1005 ISD::FMAXIMUM,
1006 ISD::FMINIMUMNUM,
1007 ISD::FMAXIMUMNUM,
1008 ISD::FMA,
1009 ISD::SMIN,
1010 ISD::SMAX,
1011 ISD::UMIN,
1012 ISD::UMAX,
1013 ISD::SETCC,
1015 ISD::SMIN,
1016 ISD::SMAX,
1017 ISD::UMIN,
1018 ISD::UMAX,
1019 ISD::AND,
1020 ISD::OR,
1021 ISD::XOR,
1022 ISD::SHL,
1023 ISD::SRL,
1024 ISD::SRA,
1025 ISD::FSHR,
1035
1036 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1038
1039 // All memory operations. Some folding on the pointer operand is done to
1040 // help match the constant offsets in the addressing modes.
1041 setTargetDAGCombine({ISD::LOAD,
1042 ISD::STORE,
1043 ISD::ATOMIC_LOAD,
1044 ISD::ATOMIC_STORE,
1045 ISD::ATOMIC_CMP_SWAP,
1046 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
1047 ISD::ATOMIC_SWAP,
1048 ISD::ATOMIC_LOAD_ADD,
1049 ISD::ATOMIC_LOAD_SUB,
1050 ISD::ATOMIC_LOAD_AND,
1051 ISD::ATOMIC_LOAD_OR,
1052 ISD::ATOMIC_LOAD_XOR,
1053 ISD::ATOMIC_LOAD_NAND,
1054 ISD::ATOMIC_LOAD_MIN,
1055 ISD::ATOMIC_LOAD_MAX,
1056 ISD::ATOMIC_LOAD_UMIN,
1057 ISD::ATOMIC_LOAD_UMAX,
1058 ISD::ATOMIC_LOAD_FADD,
1059 ISD::ATOMIC_LOAD_FMIN,
1060 ISD::ATOMIC_LOAD_FMAX,
1061 ISD::ATOMIC_LOAD_UINC_WRAP,
1062 ISD::ATOMIC_LOAD_UDEC_WRAP,
1063 ISD::ATOMIC_LOAD_USUB_COND,
1064 ISD::ATOMIC_LOAD_USUB_SAT,
1067
1068 // FIXME: In other contexts we pretend this is a per-function property.
1070
1072}
1073
1074const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1075
1076ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
1077 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1078 return RCRegs;
1079}
1080
1081//===----------------------------------------------------------------------===//
1082// TargetLowering queries
1083//===----------------------------------------------------------------------===//
1084
1085// v_mad_mix* support a conversion from f16 to f32.
1086//
1087// There is only one special case, when denormals are enabled, that we don't
1088// currently handle where this would still be OK to use.
1089bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1090 EVT DestVT, EVT SrcVT) const {
1091 return DestVT.getScalarType() == MVT::f32 &&
1092 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1093 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1094 SrcVT.getScalarType() == MVT::f16) ||
1095 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1096 SrcVT.getScalarType() == MVT::bf16)) &&
1097 // TODO: This probably only requires no input flushing?
1098 denormalModeIsFlushAllF32(DAG.getMachineFunction());
1099}
1100
1101bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
1102 LLT DestTy, LLT SrcTy) const {
1103 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1104 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1105 DestTy.getScalarSizeInBits() == 32 &&
1106 SrcTy.getScalarSizeInBits() == 16 &&
1107 // TODO: This probably only requires no input flushing?
1108 denormalModeIsFlushAllF32(*MI.getMF());
1109}
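// Illustrative only: this hook is what allows a node like
// (fma (fp_extend f16 %a), (fp_extend f16 %b), f32 %c)
// to select a single v_fma_mix_f32, folding the conversions into
// op_sel/op_sel_hi source modifiers instead of separate v_cvt instructions.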
1110
1111bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
1112 // SI has some legal vector types, but no legal vector operations. Say no
1113 // shuffles are legal in order to prefer scalarizing some vector operations.
1114 return false;
1115}
1116
1117MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
1118 CallingConv::ID CC,
1119 EVT VT) const {
1120 if (CC == CallingConv::AMDGPU_KERNEL)
1121 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1122
1123 if (VT.isVector()) {
1124 EVT ScalarVT = VT.getScalarType();
1125 unsigned Size = ScalarVT.getSizeInBits();
1126 if (Size == 16) {
1127 if (Subtarget->has16BitInsts()) {
1128 if (VT.isInteger())
1129 return MVT::v2i16;
1130 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1131 }
1132 return VT.isInteger() ? MVT::i32 : MVT::f32;
1133 }
1134
1135 if (Size < 16)
1136 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1137 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1138 }
1139
1140 if (VT.getSizeInBits() > 32)
1141 return MVT::i32;
1142
1143 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1144}
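// Worked example (not in the original source): for a non-kernel call with
// VT = v3f16 on a subtarget with 16-bit instructions, the scalar size is 16,
// so the register type reported here is v2f16 (i32 for bf16 elements).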
1145
1146unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
1147 CallingConv::ID CC,
1148 EVT VT) const {
1149 if (CC == CallingConv::AMDGPU_KERNEL)
1150 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1151
1152 if (VT.isVector()) {
1153 unsigned NumElts = VT.getVectorNumElements();
1154 EVT ScalarVT = VT.getScalarType();
1155 unsigned Size = ScalarVT.getSizeInBits();
1156
1157 // FIXME: Should probably promote 8-bit vectors to i16.
1158 if (Size == 16 && Subtarget->has16BitInsts())
1159 return (NumElts + 1) / 2;
1160
1161 if (Size <= 32)
1162 return NumElts;
1163
1164 if (Size > 32)
1165 return NumElts * ((Size + 31) / 32);
1166 } else if (VT.getSizeInBits() > 32)
1167 return (VT.getSizeInBits() + 31) / 32;
1168
1169 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1170}
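// Continuing the v3f16 example above: the count is (3 + 1) / 2 = 2 registers,
// i.e. two packed v2f16 values with the last lane undefined.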
1171
1172unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
1173 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1174 unsigned &NumIntermediates, MVT &RegisterVT) const {
1175 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1176 unsigned NumElts = VT.getVectorNumElements();
1177 EVT ScalarVT = VT.getScalarType();
1178 unsigned Size = ScalarVT.getSizeInBits();
1179 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1180 // support, but unless we can properly handle 3-vectors, it will still be
1181 // inconsistent.
1182 if (Size == 16 && Subtarget->has16BitInsts()) {
1183 if (ScalarVT == MVT::bf16) {
1184 RegisterVT = MVT::i32;
1185 IntermediateVT = MVT::v2bf16;
1186 } else {
1187 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1188 IntermediateVT = RegisterVT;
1189 }
1190 NumIntermediates = (NumElts + 1) / 2;
1191 return NumIntermediates;
1192 }
1193
1194 if (Size == 32) {
1195 RegisterVT = ScalarVT.getSimpleVT();
1196 IntermediateVT = RegisterVT;
1197 NumIntermediates = NumElts;
1198 return NumIntermediates;
1199 }
1200
1201 if (Size < 16 && Subtarget->has16BitInsts()) {
1202 // FIXME: Should probably form v2i16 pieces
1203 RegisterVT = MVT::i16;
1204 IntermediateVT = ScalarVT;
1205 NumIntermediates = NumElts;
1206 return NumIntermediates;
1207 }
1208
1209 if (Size != 16 && Size <= 32) {
1210 RegisterVT = MVT::i32;
1211 IntermediateVT = ScalarVT;
1212 NumIntermediates = NumElts;
1213 return NumIntermediates;
1214 }
1215
1216 if (Size > 32) {
1217 RegisterVT = MVT::i32;
1218 IntermediateVT = RegisterVT;
1219 NumIntermediates = NumElts * ((Size + 31) / 32);
1220 return NumIntermediates;
1221 }
1222 }
1223
1224 return TargetLowering::getVectorTypeBreakdownForCallingConv(
1225 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1226}
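// Illustrative breakdown: v5f32 takes the Size == 32 path, giving
// RegisterVT = IntermediateVT = f32 and NumIntermediates = 5.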
1227
1228static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
1229 const DataLayout &DL, Type *Ty,
1230 unsigned MaxNumLanes) {
1231 assert(MaxNumLanes != 0);
1232
1233 LLVMContext &Ctx = Ty->getContext();
1234 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1235 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1236 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1237 NumElts);
1238 }
1239
1240 return TLI.getValueType(DL, Ty);
1241}
1242
1243// Peek through TFE struct returns to only use the data size.
1244static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
1245 const DataLayout &DL, Type *Ty,
1246 unsigned MaxNumLanes) {
1247 auto *ST = dyn_cast<StructType>(Ty);
1248 if (!ST)
1249 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1250
1251 // TFE intrinsics return an aggregate type.
1252 assert(ST->getNumContainedTypes() == 2 &&
1253 ST->getContainedType(1)->isIntegerTy(32));
1254 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1255}
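// Example (assumed return shape): a TFE image load returning {<4 x float>, i32}
// only counts the data member, so memVT is v4f32, or fewer lanes if the dmask
// restricts the count.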
1256
1257/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1258/// in-memory representation. This return value is a custom type because there
1259/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1260/// could cause issues during codegen, these address space 7 pointers will be
1261/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1262/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1263/// for cost modeling, to work. (This also sets us up decently for doing the
1264/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1265MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
1266 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1267 return MVT::amdgpuBufferFatPointer;
1268 if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1269 DL.getPointerSizeInBits(AS) == 192)
1270 return MVT::amdgpuBufferStridedPointer;
1271 return AMDGPUTargetLowering::getPointerTy(DL, AS);
1272}
1273/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1274/// v8i32 when padding is added.
1275/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1276/// also v8i32 with padding.
1277MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
1278 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1279 DL.getPointerSizeInBits(AS) == 160) ||
1280 (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1281 DL.getPointerSizeInBits(AS) == 192))
1282 return MVT::v8i32;
1283 return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
1284}
1285
1286static unsigned getIntrMemWidth(unsigned IntrID) {
1287 switch (IntrID) {
1288 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1289 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1290 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1291 return 8;
1292 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1293 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1294 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1295 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1296 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1297 return 32;
1298 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1299 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1300 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1301 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1302 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1303 return 64;
1304 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1305 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1306 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1307 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1308 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1309 return 128;
1310 default:
1311 llvm_unreachable("Unknown width");
1312 }
1313}
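// Usage sketch: getIntrMemWidth(Intrinsic::amdgcn_global_load_async_to_lds_b64)
// returns 64, so the callers below build an i64 memVT for that access.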
1314
1315static void getCoopAtomicOperandsInfo(const CallBase &CI, bool IsLoad,
1317 Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2);
1318 unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
1319 switch (AtomicOrderingCABI(Ord)) {
1322 break;
1325 break;
1328 break;
1329 default:
1331 break;
1332 }
1333
1334 Info.flags =
1336 Info.flags |= MOCooperative;
1337
1338 MDNode *ScopeMD = cast<MDNode>(
1339 cast<MetadataAsValue>(CI.getArgOperand(IsLoad ? 2 : 3))->getMetadata());
1340 StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
1341 Info.ssid = CI.getContext().getOrInsertSyncScopeID(Scope);
1342}
1343
1344bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
1345 const CallBase &CI,
1346 MachineFunction &MF,
1347 unsigned IntrID) const {
1348 Info.flags = MachineMemOperand::MONone;
1349 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1350 Info.flags |= MachineMemOperand::MOInvariant;
1351 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1353 Info.flags |= getTargetMMOFlags(CI);
1354
1355 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1357 AttributeSet Attr =
1359 MemoryEffects ME = Attr.getMemoryEffects();
1360 if (ME.doesNotAccessMemory())
1361 return false;
1362
1363 // TODO: Should images get their own address space?
1364 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1365
1366 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1367 if (RsrcIntr->IsImage) {
1368 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1370 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1371 Info.align.reset();
1372 }
1373
1374 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1375 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1376 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1377 // We conservatively set the memory operand of a buffer intrinsic to the
1378 // base resource pointer, so that we can access alias information about
1379 // those pointers. Cases like "this points at the same value
1380 // but with a different offset" are handled in
1381 // areMemAccessesTriviallyDisjoint.
1382 Info.ptrVal = RsrcArg;
1383 }
1384
1385 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1386 if (!IsSPrefetch) {
1387 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1388 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1389 Info.flags |= MachineMemOperand::MOVolatile;
1390 }
1391
1393 if (ME.onlyReadsMemory()) {
1394 if (RsrcIntr->IsImage) {
1395 unsigned MaxNumLanes = 4;
1396
1397 if (!BaseOpcode->Gather4) {
1398 // If this isn't a gather, we may have excess loaded elements in the
1399 // IR type. Check the dmask for the real number of elements loaded.
1400 unsigned DMask =
1401 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1402 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1403 }
1404
1405 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1406 CI.getType(), MaxNumLanes);
1407 } else {
1408 Info.memVT =
1410 std::numeric_limits<unsigned>::max());
1411 }
1412
1413 // FIXME: What does alignment mean for an image?
1414 Info.opc = ISD::INTRINSIC_W_CHAIN;
1415 Info.flags |= MachineMemOperand::MOLoad;
1416 } else if (ME.onlyWritesMemory()) {
1417 Info.opc = ISD::INTRINSIC_VOID;
1418
1419 Type *DataTy = CI.getArgOperand(0)->getType();
1420 if (RsrcIntr->IsImage) {
1421 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1422 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1423 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1424 DMaskLanes);
1425 } else
1426 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1427
1428 Info.flags |= MachineMemOperand::MOStore;
1429 } else {
1430 // Atomic, NoReturn Sampler or prefetch
1431 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1433 Info.flags |=
1435
1436 if (!IsSPrefetch)
1437 Info.flags |= MachineMemOperand::MOStore;
1438
1439 switch (IntrID) {
1440 default:
1441 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1442 // Fake memory access type for no return sampler intrinsics
1443 Info.memVT = MVT::i32;
1444 } else {
1445 // XXX - Should this be volatile without known ordering?
1446 Info.flags |= MachineMemOperand::MOVolatile;
1447 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1448 }
1449 break;
1450 case Intrinsic::amdgcn_raw_buffer_load_lds:
1451 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1452 case Intrinsic::amdgcn_struct_buffer_load_lds:
1453 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1454 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1455 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1456 Info.ptrVal = CI.getArgOperand(1);
1457 return true;
1458 }
1459 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1460 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1461 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1462 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1463 Info.memVT =
1465 std::numeric_limits<unsigned>::max());
1466 Info.flags &= ~MachineMemOperand::MOStore;
1467 return true;
1468 }
1469 }
1470 }
1471 return true;
1472 }
1473
1474 switch (IntrID) {
1475 case Intrinsic::amdgcn_ds_ordered_add:
1476 case Intrinsic::amdgcn_ds_ordered_swap: {
1477 Info.opc = ISD::INTRINSIC_W_CHAIN;
1478 Info.memVT = MVT::getVT(CI.getType());
1479 Info.ptrVal = CI.getOperand(0);
1480 Info.align.reset();
1482
1483 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1484 if (!Vol->isZero())
1485 Info.flags |= MachineMemOperand::MOVolatile;
1486
1487 return true;
1488 }
1489 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1490 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1491 Info.opc = ISD::INTRINSIC_W_CHAIN;
1492 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1493 Info.ptrVal = nullptr;
1494 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1496 return true;
1497 }
1498 case Intrinsic::amdgcn_ds_append:
1499 case Intrinsic::amdgcn_ds_consume: {
1500 Info.opc = ISD::INTRINSIC_W_CHAIN;
1501 Info.memVT = MVT::getVT(CI.getType());
1502 Info.ptrVal = CI.getOperand(0);
1503 Info.align.reset();
1505
1506 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1507 if (!Vol->isZero())
1508 Info.flags |= MachineMemOperand::MOVolatile;
1509
1510 return true;
1511 }
1512 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1513 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1514 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1517 Info.memVT = MVT::getVT(CI.getType());
1518 Info.ptrVal = CI.getOperand(0);
1519 Info.memVT = MVT::i64;
1520 Info.size = 8;
1521 Info.align.reset();
1523 return true;
1524 }
1525 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1526 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1527 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1528 Info.opc = ISD::INTRINSIC_W_CHAIN;
1529 Info.memVT =
1530 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1531 ? CI.getType()
1533 ->getElementType(0)); // XXX: what is correct VT?
1534
1535 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1536 Info.align.reset();
1537 Info.flags |=
1539 return true;
1540 }
1541 case Intrinsic::amdgcn_global_atomic_fmin_num:
1542 case Intrinsic::amdgcn_global_atomic_fmax_num:
1543 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1544 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1545 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
1546 Info.opc = ISD::INTRINSIC_W_CHAIN;
1547 Info.memVT = MVT::getVT(CI.getType());
1548 Info.ptrVal = CI.getOperand(0);
1549 Info.align.reset();
1553 return true;
1554 }
1555 case Intrinsic::amdgcn_flat_load_monitor_b32:
1556 case Intrinsic::amdgcn_flat_load_monitor_b64:
1557 case Intrinsic::amdgcn_flat_load_monitor_b128:
1558 case Intrinsic::amdgcn_global_load_monitor_b32:
1559 case Intrinsic::amdgcn_global_load_monitor_b64:
1560 case Intrinsic::amdgcn_global_load_monitor_b128:
1561 case Intrinsic::amdgcn_cluster_load_b32:
1562 case Intrinsic::amdgcn_cluster_load_b64:
1563 case Intrinsic::amdgcn_cluster_load_b128:
1564 case Intrinsic::amdgcn_ds_load_tr6_b96:
1565 case Intrinsic::amdgcn_ds_load_tr4_b64:
1566 case Intrinsic::amdgcn_ds_load_tr8_b64:
1567 case Intrinsic::amdgcn_ds_load_tr16_b128:
1568 case Intrinsic::amdgcn_global_load_tr6_b96:
1569 case Intrinsic::amdgcn_global_load_tr4_b64:
1570 case Intrinsic::amdgcn_global_load_tr_b64:
1571 case Intrinsic::amdgcn_global_load_tr_b128:
1572 case Intrinsic::amdgcn_ds_read_tr4_b64:
1573 case Intrinsic::amdgcn_ds_read_tr6_b96:
1574 case Intrinsic::amdgcn_ds_read_tr8_b64:
1575 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1576 Info.opc = ISD::INTRINSIC_W_CHAIN;
1577 Info.memVT = MVT::getVT(CI.getType());
1578 Info.ptrVal = CI.getOperand(0);
1579 Info.align.reset();
1580 Info.flags |= MachineMemOperand::MOLoad;
1581 return true;
1582 }
1583 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1584 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1585 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1586 Info.opc = ISD::INTRINSIC_W_CHAIN;
1587 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1588 Info.ptrVal = CI.getOperand(0);
1589 Info.align.reset();
1590 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/true, Info);
1591 return true;
1592 }
1593 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1594 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1595 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1596 Info.opc = ISD::INTRINSIC_VOID;
1597 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1598 Info.ptrVal = CI.getArgOperand(0);
1599 Info.align.reset();
1600 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/false, Info);
1601 return true;
1602 }
1603 case Intrinsic::amdgcn_ds_gws_init:
1604 case Intrinsic::amdgcn_ds_gws_barrier:
1605 case Intrinsic::amdgcn_ds_gws_sema_v:
1606 case Intrinsic::amdgcn_ds_gws_sema_br:
1607 case Intrinsic::amdgcn_ds_gws_sema_p:
1608 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1609 Info.opc = ISD::INTRINSIC_VOID;
1610
1611 const GCNTargetMachine &TM =
1612 static_cast<const GCNTargetMachine &>(getTargetMachine());
1613
1615 Info.ptrVal = MFI->getGWSPSV(TM);
1616
1617 // This is an abstract access, but we need to specify a type and size.
1618 Info.memVT = MVT::i32;
1619 Info.size = 4;
1620 Info.align = Align(4);
1621
1622 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1623 Info.flags |= MachineMemOperand::MOLoad;
1624 else
1625 Info.flags |= MachineMemOperand::MOStore;
1626 return true;
1627 }
1628 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1629 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1630 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1631 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1632 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1633 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1634 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1635 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1636 Info.opc = ISD::INTRINSIC_VOID;
1637 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1638 Info.ptrVal = CI.getArgOperand(1);
1640 return true;
1641 }
1642 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1643 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1644 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1645 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1646 Info.opc = ISD::INTRINSIC_VOID;
1647 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1648 Info.ptrVal = CI.getArgOperand(0);
1650 return true;
1651 }
1652 case Intrinsic::amdgcn_load_to_lds:
1653 case Intrinsic::amdgcn_global_load_lds: {
1654 Info.opc = ISD::INTRINSIC_VOID;
1655 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1656 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1657 Info.ptrVal = CI.getArgOperand(1);
1659 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1660 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1661 Info.flags |= MachineMemOperand::MOVolatile;
1662 return true;
1663 }
1664 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1665 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1666 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1667 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1668 Info.opc = ISD::INTRINSIC_W_CHAIN;
1669
1670 const GCNTargetMachine &TM =
1671 static_cast<const GCNTargetMachine &>(getTargetMachine());
1672
1674 Info.ptrVal = MFI->getGWSPSV(TM);
1675
1676 // This is an abstract access, but we need to specify a type and size.
1677 Info.memVT = MVT::i32;
1678 Info.size = 4;
1679 Info.align = Align(4);
1680
1682 return true;
1683 }
1684 case Intrinsic::amdgcn_s_prefetch_data:
1685 case Intrinsic::amdgcn_flat_prefetch:
1686 case Intrinsic::amdgcn_global_prefetch: {
1687 Info.opc = ISD::INTRINSIC_VOID;
1688 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1689 Info.ptrVal = CI.getArgOperand(0);
1690 Info.flags |= MachineMemOperand::MOLoad;
1691 return true;
1692 }
1693 default:
1694 return false;
1695 }
1696}
1697
1698void SITargetLowering::CollectTargetIntrinsicOperands(
1699 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1700 switch (I.getIntrinsicID()) {
1701 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1702 // The DAG's ValueType loses the addrspaces.
1703 // Add them as 2 extra Constant operands "from" and "to".
1704 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1705 unsigned DstAS = I.getType()->getPointerAddressSpace();
1706 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1707 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1708 break;
1709 }
1710 default:
1711 break;
1712 }
1713}
1714
1715bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
1716 SmallVectorImpl<Value *> &Ops,
1717 Type *&AccessTy) const {
1718 Value *Ptr = nullptr;
1719 switch (II->getIntrinsicID()) {
1720 case Intrinsic::amdgcn_cluster_load_b128:
1721 case Intrinsic::amdgcn_cluster_load_b64:
1722 case Intrinsic::amdgcn_cluster_load_b32:
1723 case Intrinsic::amdgcn_ds_append:
1724 case Intrinsic::amdgcn_ds_consume:
1725 case Intrinsic::amdgcn_ds_load_tr8_b64:
1726 case Intrinsic::amdgcn_ds_load_tr16_b128:
1727 case Intrinsic::amdgcn_ds_load_tr4_b64:
1728 case Intrinsic::amdgcn_ds_load_tr6_b96:
1729 case Intrinsic::amdgcn_ds_read_tr4_b64:
1730 case Intrinsic::amdgcn_ds_read_tr6_b96:
1731 case Intrinsic::amdgcn_ds_read_tr8_b64:
1732 case Intrinsic::amdgcn_ds_read_tr16_b64:
1733 case Intrinsic::amdgcn_ds_ordered_add:
1734 case Intrinsic::amdgcn_ds_ordered_swap:
1735 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1736 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1737 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1738 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1739 case Intrinsic::amdgcn_flat_load_monitor_b128:
1740 case Intrinsic::amdgcn_flat_load_monitor_b32:
1741 case Intrinsic::amdgcn_flat_load_monitor_b64:
1742 case Intrinsic::amdgcn_global_atomic_fmax_num:
1743 case Intrinsic::amdgcn_global_atomic_fmin_num:
1744 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1745 case Intrinsic::amdgcn_global_load_monitor_b128:
1746 case Intrinsic::amdgcn_global_load_monitor_b32:
1747 case Intrinsic::amdgcn_global_load_monitor_b64:
1748 case Intrinsic::amdgcn_global_load_tr_b64:
1749 case Intrinsic::amdgcn_global_load_tr_b128:
1750 case Intrinsic::amdgcn_global_load_tr4_b64:
1751 case Intrinsic::amdgcn_global_load_tr6_b96:
1752 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1753 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1754 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1755 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1756 Ptr = II->getArgOperand(0);
1757 break;
1758 case Intrinsic::amdgcn_load_to_lds:
1759 case Intrinsic::amdgcn_global_load_lds:
1760 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1761 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1762 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1763 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1764 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1765 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1766 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1767 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1768 Ptr = II->getArgOperand(1);
1769 break;
1770 default:
1771 return false;
1772 }
1773 AccessTy = II->getType();
1774 Ops.push_back(Ptr);
1775 return true;
1776}
1777
1779 unsigned AddrSpace) const {
1780 if (!Subtarget->hasFlatInstOffsets()) {
1781 // Flat instructions do not have offsets, and only have the register
1782 // address.
1783 return AM.BaseOffs == 0 && AM.Scale == 0;
1784 }
1785
1786 decltype(SIInstrFlags::FLAT) FlatVariant =
1790
1791 return AM.Scale == 0 &&
1792 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1793 AM.BaseOffs, AddrSpace, FlatVariant));
1794}
1795
1797 if (Subtarget->hasFlatGlobalInsts())
1799
1800 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1802    // Assume we will use FLAT for all global memory accesses
1802 // on VI.
1803 // FIXME: This assumption is currently wrong. On VI we still use
1804 // MUBUF instructions for the r + i addressing mode. As currently
1805 // implemented, the MUBUF instructions only work on buffer < 4GB.
1806 // It may be possible to support > 4GB buffers with MUBUF instructions,
1807 // by setting the stride value in the resource descriptor which would
1808 // increase the size limit to (stride * 4GB). However, this is risky,
1809 // because it has never been validated.
1811 }
1812
1813 return isLegalMUBUFAddressingMode(AM);
1814}
1815
1816bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1817 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1818 // additionally can do r + r + i with addr64. 32-bit has more addressing
1819 // mode options. Depending on the resource constant, it can also do
1820 // (i64 r0) + (i32 r1) * (i14 i).
1821 //
1822 // Private arrays end up using a scratch buffer most of the time, so also
1823 // assume those use MUBUF instructions. Scratch loads / stores are currently
1824  // implemented as mubuf instructions with the offen bit set, so they are
1825  // slightly different from the normal addr64 form.
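  // Illustrative examples (assuming the immediate passes the MUBUF offset
  // check below): {BaseReg, Scale = 1, BaseOffs = 16} is accepted as
  // r + r + 16; {Scale = 2, no BaseReg, BaseOffs = 8} is accepted since
  // 2 * r folds to r + r + 8; {BaseReg, Scale = 2} is rejected because
  // 2 * r + r is not encodable; any Scale > 2 is rejected as well.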
1826 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1827 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1828 return false;
1829
1830 // FIXME: Since we can split immediate into soffset and immediate offset,
1831 // would it make sense to allow any immediate?
1832
1833 switch (AM.Scale) {
1834 case 0: // r + i or just i, depending on HasBaseReg.
1835 return true;
1836 case 1:
1837 return true; // We have r + r or r + i.
1838 case 2:
1839 if (AM.HasBaseReg) {
1840 // Reject 2 * r + r.
1841 return false;
1842 }
1843
1844 // Allow 2 * r as r + r
1845 // Or 2 * r + i is allowed as r + r + i.
1846 return true;
1847 default: // Don't allow n * r
1848 return false;
1849 }
1850}
1851
1853 const AddrMode &AM, Type *Ty,
1854 unsigned AS,
1855 Instruction *I) const {
1856 // No global is ever allowed as a base.
1857 if (AM.BaseGV)
1858 return false;
1859
1860 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1861 return isLegalGlobalAddressingMode(AM);
1862
1863 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1867 // If the offset isn't a multiple of 4, it probably isn't going to be
1868 // correctly aligned.
1869 // FIXME: Can we get the real alignment here?
1870 if (AM.BaseOffs % 4 != 0)
1871 return isLegalMUBUFAddressingMode(AM);
1872
1873 if (!Subtarget->hasScalarSubwordLoads()) {
1874 // There are no SMRD extloads, so if we have to do a small type access we
1875 // will use a MUBUF load.
1876 // FIXME?: We also need to do this if unaligned, but we don't know the
1877 // alignment here.
1878 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1879 return isLegalGlobalAddressingMode(AM);
1880 }
1881
1882 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1883 // SMRD instructions have an 8-bit, dword offset on SI.
1884 if (!isUInt<8>(AM.BaseOffs / 4))
1885 return false;
1886 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1887 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1888      // in 8 bits, it can use a smaller encoding.
1889 if (!isUInt<32>(AM.BaseOffs / 4))
1890 return false;
1891 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1892 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1893 if (!isUInt<20>(AM.BaseOffs))
1894 return false;
1895 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1896 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1897 // for S_BUFFER_* instructions).
1898 if (!isInt<21>(AM.BaseOffs))
1899 return false;
1900 } else {
1901 // On GFX12, all offsets are signed 24-bit in bytes.
1902 if (!isInt<24>(AM.BaseOffs))
1903 return false;
1904 }
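    // Worked example: BaseOffs = 1020 is dword offset 255 and fits the 8-bit
    // SI encoding, while BaseOffs = 1024 (dword offset 256) does not and is
    // rejected on SI; that same 1024-byte offset is accepted on VI (20-bit
    // byte offset), GFX9..GFX11 (signed 21-bit) and GFX12 (signed 24-bit).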
1905
1906 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1908 AM.BaseOffs < 0) {
1909 // Scalar (non-buffer) loads can only use a negative offset if
1910 // soffset+offset is non-negative. Since the compiler can only prove that
1911 // in a few special cases, it is safer to claim that negative offsets are
1912 // not supported.
1913 return false;
1914 }
1915
1916 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1917 return true;
1918
1919 if (AM.Scale == 1 && AM.HasBaseReg)
1920 return true;
1921
1922 return false;
1923 }
1924
1925 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1926 return Subtarget->enableFlatScratch()
1928 : isLegalMUBUFAddressingMode(AM);
1929
1930 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1931 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1932 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1933 // field.
1934 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1935 // an 8-bit dword offset but we don't know the alignment here.
1936 if (!isUInt<16>(AM.BaseOffs))
1937 return false;
1938
1939 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1940 return true;
1941
1942 if (AM.Scale == 1 && AM.HasBaseReg)
1943 return true;
1944
1945 return false;
1946 }
1947
1949 // For an unknown address space, this usually means that this is for some
1950 // reason being used for pure arithmetic, and not based on some addressing
1951 // computation. We don't have instructions that compute pointers with any
1952 // addressing modes, so treat them as having no offset like flat
1953 // instructions.
1955 }
1956
1957 // Assume a user alias of global for unknown address spaces.
1958 return isLegalGlobalAddressingMode(AM);
1959}
1960
1962 const MachineFunction &MF) const {
1964 return (MemVT.getSizeInBits() <= 4 * 32);
1965 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1966 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1967 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1968 }
1970 return (MemVT.getSizeInBits() <= 2 * 32);
1971 return true;
1972}
1973
1975 unsigned Size, unsigned AddrSpace, Align Alignment,
1976 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1977 if (IsFast)
1978 *IsFast = 0;
1979
1980 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1981 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1982 // Check if alignment requirements for ds_read/write instructions are
1983 // disabled.
1984 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1985 return false;
1986
1987 Align RequiredAlignment(
1988 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
1989 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1990 Alignment < RequiredAlignment)
1991 return false;
1992
1993    // Either the alignment requirements are "enabled", or there is an
1994    // unaligned-LDS-access hardware bug even though the alignment requirements
1995    // are "disabled". In either case, we need to check for proper alignment
1996    // requirements.
1997 //
1998 switch (Size) {
1999 case 64:
2000 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
2001 // address is negative, then the instruction is incorrectly treated as
2002 // out-of-bounds even if base + offsets is in bounds. Split vectorized
2003 // loads here to avoid emitting ds_read2_b32. We may re-combine the
2004 // load later in the SILoadStoreOptimizer.
2005 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
2006 return false;
2007
2008      // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
2009      // can do a 4-byte aligned, 8-byte access in a single operation using
2010      // ds_read2/write2_b32 with adjacent offsets.
2011 RequiredAlignment = Align(4);
2012
2013 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2014 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
2015 // ds_write2_b32 depending on the alignment. In either case with either
2016 // alignment there is no faster way of doing this.
2017
2018        // The numbers returned here and below are not additive; they form a 'speed
2019        // rank'. They are only meant to be compared to decide if a certain way
2020        // of lowering an operation is faster than another. For that purpose a
2021        // naturally aligned operation gets its bitsize to indicate that "it
2022        // operates with a speed comparable to an N-bit wide load". With full
2023        // alignment ds128 is slower than ds96, for example. If underaligned, it
2024        // is comparable to the speed of a single dword access, which would then
2025        // mean 32 < 128 and it is faster to issue a wide load regardless.
2026        // 1 simply means "slow, don't do it". I.e. when comparing an aligned load
2027        // to a wider load that will no longer be aligned, the latter is slower.
2028 if (IsFast)
2029 *IsFast = (Alignment >= RequiredAlignment) ? 64
2030 : (Alignment < Align(4)) ? 32
2031 : 1;
2032 return true;
2033 }
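      // For example, for this 64-bit access: 4-byte or better alignment
      // reports rank 64 (ds_read_b64 and ds_read2_b32 are equally good),
      // while sub-dword alignment reports 32, i.e. it is still preferable to
      // issue the wide access rather than split it into narrower ones.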
2034
2035 break;
2036 case 96:
2037 if (!Subtarget->hasDS96AndDS128())
2038 return false;
2039
2040      // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
2041 // gfx8 and older.
2042
2043 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2044        // Naturally aligned access is fastest. However, also report it as Fast
2045        // if memory is aligned to less than a DWORD. A narrow load or store will
2046        // be just as slow as a single ds_read_b96/ds_write_b96, but there will
2047        // be more of them, so overall we will pay less penalty issuing a single
2048        // instruction.
2049
2050 // See comment on the values above.
2051 if (IsFast)
2052 *IsFast = (Alignment >= RequiredAlignment) ? 96
2053 : (Alignment < Align(4)) ? 32
2054 : 1;
2055 return true;
2056 }
2057
2058 break;
2059 case 128:
2060 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2061 return false;
2062
2063      // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
2064      // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
2065      // single operation using ds_read2/write2_b64.
2066 RequiredAlignment = Align(8);
2067
2068 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2069        // Naturally aligned access is fastest. However, also report it as Fast
2070        // if memory is aligned to less than a DWORD. A narrow load or store will
2071        // be just as slow as a single ds_read_b128/ds_write_b128, but there
2072        // will be more of them, so overall we will pay less penalty issuing a
2073        // single instruction.
2074
2075 // See comment on the values above.
2076 if (IsFast)
2077 *IsFast = (Alignment >= RequiredAlignment) ? 128
2078 : (Alignment < Align(4)) ? 32
2079 : 1;
2080 return true;
2081 }
2082
2083 break;
2084 default:
2085 if (Size > 32)
2086 return false;
2087
2088 break;
2089 }
2090
2091 // See comment on the values above.
2092    // Note that we have a single-dword or sub-dword access here, so if
2093    // underaligned it is the slowest possible access, hence the returned value is 0.
2094 if (IsFast)
2095 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2096
2097 return Alignment >= RequiredAlignment ||
2098 Subtarget->hasUnalignedDSAccessEnabled();
2099 }
2100
2101 // FIXME: We have to be conservative here and assume that flat operations
2102 // will access scratch. If we had access to the IR function, then we
2103 // could determine if any private memory was used in the function.
2104 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2105 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2106 bool AlignedBy4 = Alignment >= Align(4);
2107 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2108 if (IsFast)
2109 *IsFast = AlignedBy4 ? Size : 1;
2110 return true;
2111 }
2112
2113 if (IsFast)
2114 *IsFast = AlignedBy4;
2115
2116 return AlignedBy4;
2117 }
2118
2119 // So long as they are correct, wide global memory operations perform better
2120 // than multiple smaller memory ops -- even when misaligned
2121 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
2122 if (IsFast)
2123 *IsFast = Size;
2124
2125 return Alignment >= Align(4) ||
2126 Subtarget->hasUnalignedBufferAccessEnabled();
2127 }
2128
2129 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
2130 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
2131 // out-of-bounds behavior, but in the edge case where an access starts
2132  // out-of-bounds and then enters in-bounds, the entire access would be treated
2133 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2134 // natural alignment of buffer accesses.
2135 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2136 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2137 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2138 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2139 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
2140 return false;
2141 }
2142
2143 // Smaller than dword value must be aligned.
2144 if (Size < 32)
2145 return false;
2146
2147 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2148 // byte-address are ignored, thus forcing Dword alignment.
2149 // This applies to private, global, and constant memory.
2150 if (IsFast)
2151 *IsFast = 1;
2152
2153 return Size >= 32 && Alignment >= Align(4);
2154}
2155
2157 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2158 unsigned *IsFast) const {
2160 Alignment, Flags, IsFast);
2161}
2162
2164 LLVMContext &Context, const MemOp &Op,
2165 const AttributeList &FuncAttributes) const {
2166 // FIXME: Should account for address space here.
2167
2168 // The default fallback uses the private pointer size as a guess for a type to
2169 // use. Make sure we switch these to 64-bit accesses.
2170
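  // For example, a 24-byte memcpy whose destination is 4-byte aligned gets
  // v4i32 here, while a 12-byte one gets v2i32.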
2171 if (Op.size() >= 16 &&
2172 Op.isDstAligned(Align(4))) // XXX: Should only do for global
2173 return MVT::v4i32;
2174
2175 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2176 return MVT::v2i32;
2177
2178 // Use the default.
2179 return MVT::Other;
2180}
2181
2183 const MemSDNode *MemNode = cast<MemSDNode>(N);
2184 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2185}
2186
2191
2193 unsigned DestAS) const {
2194 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2195 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2196 Subtarget->hasGloballyAddressableScratch()) {
2197 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2198 return false;
2199 }
2200
2201 // Flat -> private/local is a simple truncate.
2202 // Flat -> global is no-op
2203 return true;
2204 }
2205
2206 const GCNTargetMachine &TM =
2207 static_cast<const GCNTargetMachine &>(getTargetMachine());
2208 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2209}
2210
2218
2220 Type *Ty) const {
2221 // FIXME: Could be smarter if called for vector constants.
2222 return true;
2223}
2224
2226 unsigned Index) const {
2228 return false;
2229
2230 // TODO: Add more cases that are cheap.
2231 return Index == 0;
2232}
2233
2234bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2235 // TODO: This should be more aggressive, particular for 16-bit element
2236 // vectors. However there are some mixed improvements and regressions.
2237 EVT EltTy = VT.getVectorElementType();
2238 return EltTy.getSizeInBits() % 32 == 0;
2239}
2240
2242 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2243 switch (Op) {
2244 case ISD::LOAD:
2245 case ISD::STORE:
2246 return true;
2247 default:
2248 return false;
2249 }
2250 }
2251
2252 // SimplifySetCC uses this function to determine whether or not it should
2253 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2254 if (VT == MVT::i1 && Op == ISD::SETCC)
2255 return false;
2256
2258}
2259
2262 // This isn't really a constant pool but close enough.
2265 return PtrInfo;
2266}
2267
2268SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2269 const SDLoc &SL,
2270 SDValue Chain,
2271 uint64_t Offset) const {
2272 const DataLayout &DL = DAG.getDataLayout();
2276
2277 auto [InputPtrReg, RC, ArgTy] =
2278 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2279
2280 // We may not have the kernarg segment argument if we have no kernel
2281 // arguments.
2282 if (!InputPtrReg)
2283 return DAG.getConstant(Offset, SL, PtrVT);
2284
2286 SDValue BasePtr = DAG.getCopyFromReg(
2287 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2288
2289 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2290}
2291
2292SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2293 const SDLoc &SL) const {
2296 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2297}
2298
2299SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2300 const SDLoc &SL) const {
2301
2303 std::optional<uint32_t> KnownSize =
2305 if (KnownSize.has_value())
2306 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2307 return SDValue();
2308}
2309
2310SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2311 const SDLoc &SL, SDValue Val,
2312 bool Signed,
2313 const ISD::InputArg *Arg) const {
2314 // First, if it is a widened vector, narrow it.
2315 if (VT.isVector() &&
2317 EVT NarrowedVT =
2320 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2321 DAG.getConstant(0, SL, MVT::i32));
2322 }
2323
2324 // Then convert the vector elements or scalar value.
2325 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2326 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2327 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2328 }
2329
2330 if (MemVT.isFloatingPoint())
2331 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2332 else if (Signed)
2333 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2334 else
2335 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2336
2337 return Val;
2338}
2339
2340SDValue SITargetLowering::lowerKernargMemParameter(
2341 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2342 uint64_t Offset, Align Alignment, bool Signed,
2343 const ISD::InputArg *Arg) const {
2344
2345 MachinePointerInfo PtrInfo =
2347
2348 // Try to avoid using an extload by loading earlier than the argument address,
2349 // and extracting the relevant bits. The load should hopefully be merged with
2350 // the previous argument.
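  // For example, an i16 argument at byte offset 6 with 2-byte alignment loads
  // the dword at offset 4 (AlignDownOffset), shifts it right by
  // OffsetDiff * 8 = 16 bits and truncates the result to 16 bits.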
2351 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2352 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2353 int64_t AlignDownOffset = alignDown(Offset, 4);
2354 int64_t OffsetDiff = Offset - AlignDownOffset;
2355
2356 EVT IntVT = MemVT.changeTypeToInteger();
2357
2358 // TODO: If we passed in the base kernel offset we could have a better
2359 // alignment than 4, but we don't really need it.
2360 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2361 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr,
2362 PtrInfo.getWithOffset(AlignDownOffset), Align(4),
2365
2366 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2367 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2368
2369 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2370 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2371 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2372
2373 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2374 }
2375
2376 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2377 SDValue Load = DAG.getLoad(
2378 MemVT, SL, Chain, Ptr, PtrInfo.getWithOffset(Offset), Alignment,
2380
2381 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2382 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2383}
2384
2385/// Coerce an argument which was passed in a different ABI type to the original
2386/// expected value type.
2387SDValue SITargetLowering::convertABITypeToValueType(SelectionDAG &DAG,
2388 SDValue Val,
2389 CCValAssign &VA,
2390 const SDLoc &SL) const {
2391 EVT ValVT = VA.getValVT();
2392
2393 // If this is an 8 or 16-bit value, it is really passed promoted
2394 // to 32 bits. Insert an assert[sz]ext to capture this, then
2395 // truncate to the right size.
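  // For example, an i8 value sign-extended into a 32-bit location comes back
  // as a truncate of (AssertSext Val, i8) via the SExt case below.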
2396 switch (VA.getLocInfo()) {
2397 case CCValAssign::Full:
2398 return Val;
2399 case CCValAssign::BCvt:
2400 return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);
2401 case CCValAssign::SExt:
2402 Val = DAG.getNode(ISD::AssertSext, SL, VA.getLocVT(), Val,
2403 DAG.getValueType(ValVT));
2404 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2405 case CCValAssign::ZExt:
2406 Val = DAG.getNode(ISD::AssertZext, SL, VA.getLocVT(), Val,
2407 DAG.getValueType(ValVT));
2408 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2409 case CCValAssign::AExt:
2410 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2411 default:
2412 llvm_unreachable("Unknown loc info!");
2413 }
2414}
2415
2416SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2417 CCValAssign &VA, const SDLoc &SL,
2418 SDValue Chain,
2419 const ISD::InputArg &Arg) const {
2420 MachineFunction &MF = DAG.getMachineFunction();
2421 MachineFrameInfo &MFI = MF.getFrameInfo();
2422
2423 if (Arg.Flags.isByVal()) {
2424 unsigned Size = Arg.Flags.getByValSize();
2425 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2426 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2427 }
2428
2429 unsigned ArgOffset = VA.getLocMemOffset();
2430 unsigned ArgSize = VA.getValVT().getStoreSize();
2431
2432 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2433
2434 // Create load nodes to retrieve arguments from the stack.
2435 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2436
2437  // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2439 MVT MemVT = VA.getValVT();
2440
2441 switch (VA.getLocInfo()) {
2442 default:
2443 break;
2444 case CCValAssign::BCvt:
2445 MemVT = VA.getLocVT();
2446 break;
2447 case CCValAssign::SExt:
2448 ExtType = ISD::SEXTLOAD;
2449 break;
2450 case CCValAssign::ZExt:
2451 ExtType = ISD::ZEXTLOAD;
2452 break;
2453 case CCValAssign::AExt:
2454 ExtType = ISD::EXTLOAD;
2455 break;
2456 }
2457
2458 SDValue ArgValue = DAG.getExtLoad(
2459 ExtType, SL, VA.getLocVT(), Chain, FIN,
2461
2462 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2463 if (ConvertedVal == ArgValue)
2464 return ConvertedVal;
2465
2466 return DAG.getMergeValues({ConvertedVal, ArgValue.getValue(1)}, SL);
2467}
2468
2469SDValue SITargetLowering::lowerWorkGroupId(
2470 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2473 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
2474 if (!Subtarget->hasClusters())
2475 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2476
2477 // Clusters are supported. Return the global position in the grid. If clusters
2478  // are enabled, WorkGroupIdPV returns the cluster ID, not the workgroup ID.
2479
2480 // WorkGroupIdXYZ = ClusterId == 0 ?
2481 // ClusterIdXYZ :
2482 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
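  // For example, with 4 workgroups per cluster in X (ClusterMaxIdXYZ = 3),
  // ClusterIdXYZ = 2 and ClusterWorkGroupIdXYZ = 1 yield workgroup id
  // 2 * (3 + 1) + 1 = 9.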
2483 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2484 SDLoc SL(ClusterIdXYZ);
2485 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2486 SDValue One = DAG.getConstant(1, SL, VT);
2487 SDValue ClusterSizeXYZ = DAG.getNode(ISD::ADD, SL, VT, ClusterMaxIdXYZ, One);
2488 SDValue ClusterWorkGroupIdXYZ =
2489 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2490 SDValue GlobalIdXYZ =
2491 DAG.getNode(ISD::ADD, SL, VT, ClusterWorkGroupIdXYZ,
2492 DAG.getNode(ISD::MUL, SL, VT, ClusterIdXYZ, ClusterSizeXYZ));
2493
2494 switch (MFI.getClusterDims().getKind()) {
2497 return GlobalIdXYZ;
2499 return ClusterIdXYZ;
2501 using namespace AMDGPU::Hwreg;
2502 SDValue ClusterIdField =
2503 DAG.getTargetConstant(HwregEncoding::encode(ID_IB_STS2, 6, 4), SL, VT);
2504 SDNode *GetReg =
2505 DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2506 SDValue ClusterId(GetReg, 0);
2507 SDValue Zero = DAG.getConstant(0, SL, VT);
2508 return DAG.getNode(ISD::SELECT_CC, SL, VT, ClusterId, Zero, ClusterIdXYZ,
2509 GlobalIdXYZ, DAG.getCondCode(ISD::SETEQ));
2510 }
2511 }
2512
2513 llvm_unreachable("nothing should reach here");
2514}
2515
2516SDValue SITargetLowering::getPreloadedValue(
2517 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2519 const ArgDescriptor *Reg = nullptr;
2520 const TargetRegisterClass *RC;
2521 LLT Ty;
2522
2524 const ArgDescriptor WorkGroupIDX =
2525 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2526 // If GridZ is not programmed in an entry function then the hardware will set
2527 // it to all zeros, so there is no need to mask the GridY value in the low
2528 // order bits.
2529 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2530 AMDGPU::TTMP7,
2531 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2532 const ArgDescriptor WorkGroupIDZ =
2533 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2534 const ArgDescriptor ClusterWorkGroupIDX =
2535 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
2536 const ArgDescriptor ClusterWorkGroupIDY =
2537 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
2538 const ArgDescriptor ClusterWorkGroupIDZ =
2539 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
2540 const ArgDescriptor ClusterWorkGroupMaxIDX =
2541 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
2542 const ArgDescriptor ClusterWorkGroupMaxIDY =
2543 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
2544 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2545 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
2546 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2547 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
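  // In this layout TTMP7 packs WorkGroupIDY in bits [15:0] and WorkGroupIDZ in
  // bits [31:16], while TTMP6 packs the cluster workgroup IDs and their maxima
  // into consecutive 4-bit fields, as the masks above show.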
2548
2549 auto LoadConstant = [&](unsigned N) {
2550 return DAG.getConstant(N, SDLoc(), VT);
2551 };
2552
2553 if (Subtarget->hasArchitectedSGPRs() &&
2555 AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims();
2556 bool HasFixedDims = ClusterDims.isFixedDims();
2557
2558 switch (PVID) {
2560 Reg = &WorkGroupIDX;
2561 RC = &AMDGPU::SReg_32RegClass;
2562 Ty = LLT::scalar(32);
2563 break;
2565 Reg = &WorkGroupIDY;
2566 RC = &AMDGPU::SReg_32RegClass;
2567 Ty = LLT::scalar(32);
2568 break;
2570 Reg = &WorkGroupIDZ;
2571 RC = &AMDGPU::SReg_32RegClass;
2572 Ty = LLT::scalar(32);
2573 break;
2575 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
2576 return LoadConstant(0);
2577 Reg = &ClusterWorkGroupIDX;
2578 RC = &AMDGPU::SReg_32RegClass;
2579 Ty = LLT::scalar(32);
2580 break;
2582 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
2583 return LoadConstant(0);
2584 Reg = &ClusterWorkGroupIDY;
2585 RC = &AMDGPU::SReg_32RegClass;
2586 Ty = LLT::scalar(32);
2587 break;
2589 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
2590 return LoadConstant(0);
2591 Reg = &ClusterWorkGroupIDZ;
2592 RC = &AMDGPU::SReg_32RegClass;
2593 Ty = LLT::scalar(32);
2594 break;
2596 if (HasFixedDims)
2597 return LoadConstant(ClusterDims.getDims()[0] - 1);
2598 Reg = &ClusterWorkGroupMaxIDX;
2599 RC = &AMDGPU::SReg_32RegClass;
2600 Ty = LLT::scalar(32);
2601 break;
2603 if (HasFixedDims)
2604 return LoadConstant(ClusterDims.getDims()[1] - 1);
2605 Reg = &ClusterWorkGroupMaxIDY;
2606 RC = &AMDGPU::SReg_32RegClass;
2607 Ty = LLT::scalar(32);
2608 break;
2610 if (HasFixedDims)
2611 return LoadConstant(ClusterDims.getDims()[2] - 1);
2612 Reg = &ClusterWorkGroupMaxIDZ;
2613 RC = &AMDGPU::SReg_32RegClass;
2614 Ty = LLT::scalar(32);
2615 break;
2617 Reg = &ClusterWorkGroupMaxFlatID;
2618 RC = &AMDGPU::SReg_32RegClass;
2619 Ty = LLT::scalar(32);
2620 break;
2621 default:
2622 break;
2623 }
2624 }
2625
2626 if (!Reg)
2627 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2628 if (!Reg) {
2630 // It's possible for a kernarg intrinsic call to appear in a kernel with
2631 // no allocated segment, in which case we do not add the user sgpr
2632 // argument, so just return null.
2633 return DAG.getConstant(0, SDLoc(), VT);
2634 }
2635
2636 // It's undefined behavior if a function marked with the amdgpu-no-*
2637 // attributes uses the corresponding intrinsic.
2638 return DAG.getPOISON(VT);
2639 }
2640
2641 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2642}
2643
2645 CallingConv::ID CallConv,
2646 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2647 FunctionType *FType,
2649 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2650 const ISD::InputArg *Arg = &Ins[I];
2651
2652 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2653 "vector type argument should have been split");
2654
2655 // First check if it's a PS input addr.
2656 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2657 PSInputNum <= 15) {
2658 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2659
2660 // Inconveniently only the first part of the split is marked as isSplit,
2661 // so skip to the end. We only want to increment PSInputNum once for the
2662 // entire split argument.
2663 if (Arg->Flags.isSplit()) {
2664 while (!Arg->Flags.isSplitEnd()) {
2665 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2666 "unexpected vector split in ps argument type");
2667 if (!SkipArg)
2668 Splits.push_back(*Arg);
2669 Arg = &Ins[++I];
2670 }
2671 }
2672
2673 if (SkipArg) {
2674 // We can safely skip PS inputs.
2675 Skipped.set(Arg->getOrigArgIndex());
2676 ++PSInputNum;
2677 continue;
2678 }
2679
2680 Info->markPSInputAllocated(PSInputNum);
2681 if (Arg->Used)
2682 Info->markPSInputEnabled(PSInputNum);
2683
2684 ++PSInputNum;
2685 }
2686
2687 Splits.push_back(*Arg);
2688 }
2689}
2690
2691// Allocate special inputs passed in VGPRs.
2693 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2694 SIMachineFunctionInfo &Info) const {
2695 const LLT S32 = LLT::scalar(32);
2697
2698 if (Info.hasWorkItemIDX()) {
2699 Register Reg = AMDGPU::VGPR0;
2700 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2701
2702 CCInfo.AllocateReg(Reg);
2703 unsigned Mask =
2704 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2705 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2706 }
2707
2708 if (Info.hasWorkItemIDY()) {
2709 assert(Info.hasWorkItemIDX());
2710 if (Subtarget->hasPackedTID()) {
2711 Info.setWorkItemIDY(
2712 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2713 } else {
2714 unsigned Reg = AMDGPU::VGPR1;
2715 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2716
2717 CCInfo.AllocateReg(Reg);
2718 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2719 }
2720 }
2721
2722 if (Info.hasWorkItemIDZ()) {
2723 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2724 if (Subtarget->hasPackedTID()) {
2725 Info.setWorkItemIDZ(
2726 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2727 } else {
2728 unsigned Reg = AMDGPU::VGPR2;
2729 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2730
2731 CCInfo.AllocateReg(Reg);
2732 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2733 }
2734 }
2735}
2736
2737// Try to allocate a VGPR at the end of the argument list, or if no argument
2738// VGPRs are left, allocate a stack slot.
2739// If \p Mask is given, it indicates the bitfield position in the register.
2740// If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2741static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2742 ArgDescriptor Arg = ArgDescriptor()) {
2743 if (Arg.isSet())
2744 return ArgDescriptor::createArg(Arg, Mask);
2745
2746 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2747 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2748 if (RegIdx == ArgVGPRs.size()) {
2749 // Spill to stack required.
2750 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2751
2752 return ArgDescriptor::createStack(Offset, Mask);
2753 }
2754
2755 unsigned Reg = ArgVGPRs[RegIdx];
2756 Reg = CCInfo.AllocateReg(Reg);
2757 assert(Reg != AMDGPU::NoRegister);
2758
2759 MachineFunction &MF = CCInfo.getMachineFunction();
2760 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2761 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2762 return ArgDescriptor::createRegister(Reg, Mask);
2763}
2764
2766 const TargetRegisterClass *RC,
2767 unsigned NumArgRegs) {
2768 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2769 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2770 if (RegIdx == ArgSGPRs.size())
2771 report_fatal_error("ran out of SGPRs for arguments");
2772
2773 unsigned Reg = ArgSGPRs[RegIdx];
2774 Reg = CCInfo.AllocateReg(Reg);
2775 assert(Reg != AMDGPU::NoRegister);
2776
2777 MachineFunction &MF = CCInfo.getMachineFunction();
2778 MF.addLiveIn(Reg, RC);
2780}
2781
2782// If this has a fixed position, we still should allocate the register in the
2783// CCInfo state. Technically we could get away with this for values passed
2784// outside of the normal argument range.
2786 const TargetRegisterClass *RC,
2787 MCRegister Reg) {
2788 Reg = CCInfo.AllocateReg(Reg);
2789 assert(Reg != AMDGPU::NoRegister);
2790 MachineFunction &MF = CCInfo.getMachineFunction();
2791 MF.addLiveIn(Reg, RC);
2792}
2793
2794static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2795 if (Arg) {
2796 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2797 Arg.getRegister());
2798 } else
2799 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2800}
2801
2802static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2803 if (Arg) {
2804 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2805 Arg.getRegister());
2806 } else
2807 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2808}
2809
2810/// Allocate implicit function VGPR arguments at the end of allocated user
2811/// arguments.
2813 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2814 SIMachineFunctionInfo &Info) const {
2815 const unsigned Mask = 0x3ff;
2816 ArgDescriptor Arg;
2817
2818 if (Info.hasWorkItemIDX()) {
2819 Arg = allocateVGPR32Input(CCInfo, Mask);
2820 Info.setWorkItemIDX(Arg);
2821 }
2822
2823 if (Info.hasWorkItemIDY()) {
2824 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2825 Info.setWorkItemIDY(Arg);
2826 }
2827
2828 if (Info.hasWorkItemIDZ())
2829 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2830}
2831
2832/// Allocate implicit function VGPR arguments in fixed registers.
2834 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2835 SIMachineFunctionInfo &Info) const {
2836 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2837 if (!Reg)
2838 report_fatal_error("failed to allocate VGPR for implicit arguments");
2839
2840 const unsigned Mask = 0x3ff;
2841 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2842 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2843 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
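  // With this fixed layout a single VGPR carries all three workitem IDs,
  // packed as X in bits [9:0], Y in bits [19:10] and Z in bits [29:20].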
2844}
2845
2847 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2848 SIMachineFunctionInfo &Info) const {
2849 auto &ArgInfo = Info.getArgInfo();
2850 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2851
2852 // TODO: Unify handling with private memory pointers.
2853 if (UserSGPRInfo.hasDispatchPtr())
2854 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2855
2856 if (UserSGPRInfo.hasQueuePtr())
2857 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2858
2859 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2860 // constant offset from the kernarg segment.
2861 if (Info.hasImplicitArgPtr())
2862 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2863
2864 if (UserSGPRInfo.hasDispatchID())
2865 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2866
2867 // flat_scratch_init is not applicable for non-kernel functions.
2868
2869 if (Info.hasWorkGroupIDX())
2870 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2871
2872 if (Info.hasWorkGroupIDY())
2873 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2874
2875 if (Info.hasWorkGroupIDZ())
2876 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2877
2878 if (Info.hasLDSKernelId())
2879 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2880}
2881
2882// Allocate special inputs passed in user SGPRs.
2884 MachineFunction &MF,
2885 const SIRegisterInfo &TRI,
2886 SIMachineFunctionInfo &Info) const {
2887 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2888 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2889 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2890 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2891 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2892 }
2893
2894 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2895 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2896 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2897 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2898 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2899 }
2900
2901 if (UserSGPRInfo.hasDispatchPtr()) {
2902 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2903 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2904 CCInfo.AllocateReg(DispatchPtrReg);
2905 }
2906
2907 if (UserSGPRInfo.hasQueuePtr()) {
2908 Register QueuePtrReg = Info.addQueuePtr(TRI);
2909 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2910 CCInfo.AllocateReg(QueuePtrReg);
2911 }
2912
2913 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2915 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2916 CCInfo.AllocateReg(InputPtrReg);
2917
2918 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2919 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2920 }
2921
2922 if (UserSGPRInfo.hasDispatchID()) {
2923 Register DispatchIDReg = Info.addDispatchID(TRI);
2924 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2925 CCInfo.AllocateReg(DispatchIDReg);
2926 }
2927
2928 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2929 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2930 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2931 CCInfo.AllocateReg(FlatScratchInitReg);
2932 }
2933
2934 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2935 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2936 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2937 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2938 }
2939
2940 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2941 // these from the dispatch pointer.
2942}
2943
2944// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2945// sequential starting from the first argument.
2947 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2949 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2950 Function &F = MF.getFunction();
2951 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2952 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2953 bool InPreloadSequence = true;
2954 unsigned InIdx = 0;
2955 bool AlignedForImplictArgs = false;
2956 unsigned ImplicitArgOffset = 0;
2957 for (auto &Arg : F.args()) {
2958 if (!InPreloadSequence || !Arg.hasInRegAttr())
2959 break;
2960
2961 unsigned ArgIdx = Arg.getArgNo();
2962 // Don't preload non-original args or parts not in the current preload
2963 // sequence.
2964 if (InIdx < Ins.size() &&
2965 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2966 break;
2967
2968 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2969 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2970 InIdx++) {
2971 assert(ArgLocs[ArgIdx].isMemLoc());
2972 auto &ArgLoc = ArgLocs[InIdx];
2973 const Align KernelArgBaseAlign = Align(16);
2974 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2975 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2976 unsigned NumAllocSGPRs =
2977 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2978
2979 // Fix alignment for hidden arguments.
2980 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2981 if (!AlignedForImplictArgs) {
2982 ImplicitArgOffset =
2983 alignTo(LastExplicitArgOffset,
2984 Subtarget->getAlignmentForImplicitArgPtr()) -
2985 LastExplicitArgOffset;
2986 AlignedForImplictArgs = true;
2987 }
2988 ArgOffset += ImplicitArgOffset;
2989 }
2990
2991 // Arg is preloaded into the previous SGPR.
2992 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2993 assert(InIdx >= 1 && "No previous SGPR");
2994 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2995 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2996 continue;
2997 }
2998
2999 unsigned Padding = ArgOffset - LastExplicitArgOffset;
3000 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
3001 // Check for free user SGPRs for preloading.
3002 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
3003 InPreloadSequence = false;
3004 break;
3005 }
3006
3007 // Preload this argument.
3008 const TargetRegisterClass *RC =
3009 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
3010 SmallVectorImpl<MCRegister> *PreloadRegs =
3011 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
3012
3013 if (PreloadRegs->size() > 1)
3014 RC = &AMDGPU::SGPR_32RegClass;
3015 for (auto &Reg : *PreloadRegs) {
3016 assert(Reg);
3017 MF.addLiveIn(Reg, RC);
3018 CCInfo.AllocateReg(Reg);
3019 }
3020
3021 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3022 }
3023 }
3024}
3025
3027 const SIRegisterInfo &TRI,
3028 SIMachineFunctionInfo &Info) const {
3029 // Always allocate this last since it is a synthetic preload.
3030 if (Info.hasLDSKernelId()) {
3031 Register Reg = Info.addLDSKernelId();
3032 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3033 CCInfo.AllocateReg(Reg);
3034 }
3035}
3036
3037// Allocate special input registers that are initialized per-wave.
3040 CallingConv::ID CallConv,
3041 bool IsShader) const {
3042 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3043 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
3044 // Note: user SGPRs are handled by the front-end for graphics shaders
3045 // Pad up the used user SGPRs with dead inputs.
3046
3047 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
3048 // before enabling architected SGPRs for workgroup IDs.
3049 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
3050
3051 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3052 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
3053 // rely on it to reach 16 since if we end up having no stack usage, it will
3054 // not really be added.
3055 unsigned NumRequiredSystemSGPRs =
3056 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3057 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3058 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3059 Register Reg = Info.addReservedUserSGPR();
3060 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3061 CCInfo.AllocateReg(Reg);
3062 }
3063 }
3064
3065 if (!HasArchitectedSGPRs) {
3066 if (Info.hasWorkGroupIDX()) {
3067 Register Reg = Info.addWorkGroupIDX();
3068 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3069 CCInfo.AllocateReg(Reg);
3070 }
3071
3072 if (Info.hasWorkGroupIDY()) {
3073 Register Reg = Info.addWorkGroupIDY();
3074 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3075 CCInfo.AllocateReg(Reg);
3076 }
3077
3078 if (Info.hasWorkGroupIDZ()) {
3079 Register Reg = Info.addWorkGroupIDZ();
3080 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3081 CCInfo.AllocateReg(Reg);
3082 }
3083 }
3084
3085 if (Info.hasWorkGroupInfo()) {
3086 Register Reg = Info.addWorkGroupInfo();
3087 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3088 CCInfo.AllocateReg(Reg);
3089 }
3090
3091 if (Info.hasPrivateSegmentWaveByteOffset()) {
3092 // Scratch wave offset passed in system SGPR.
3093 unsigned PrivateSegmentWaveByteOffsetReg;
3094
3095 if (IsShader) {
3096 PrivateSegmentWaveByteOffsetReg =
3097 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3098
3099 // This is true if the scratch wave byte offset doesn't have a fixed
3100 // location.
3101 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3102 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
3103 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3104 }
3105 } else
3106 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3107
3108 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3109 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
3110 }
3111
3112 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
3113 Info.getNumPreloadedSGPRs() >= 16);
3114}
3115
3117 MachineFunction &MF,
3118 const SIRegisterInfo &TRI,
3120 // Now that we've figured out where the scratch register inputs are, see if
3121  // we should reserve the arguments and use them directly.
3122 MachineFrameInfo &MFI = MF.getFrameInfo();
3123 bool HasStackObjects = MFI.hasStackObjects();
3124 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3125
3126 // Record that we know we have non-spill stack objects so we don't need to
3127 // check all stack objects later.
3128 if (HasStackObjects)
3129 Info.setHasNonSpillStackObjects(true);
3130
3131 // Everything live out of a block is spilled with fast regalloc, so it's
3132 // almost certain that spilling will be required.
3134 HasStackObjects = true;
3135
3136 // For now assume stack access is needed in any callee functions, so we need
3137 // the scratch registers to pass in.
3138 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
3139
3140 if (!ST.enableFlatScratch()) {
3141 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
3142 // If we have stack objects, we unquestionably need the private buffer
3143 // resource. For the Code Object V2 ABI, this will be the first 4 user
3144 // SGPR inputs. We can reserve those and use them directly.
3145
3146 Register PrivateSegmentBufferReg =
3148 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3149 } else {
3150 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
3151      // We tentatively reserve the last registers (skipping those that may
3152      // contain VCC, FLAT_SCR, and XNACK). After register allocation,
3153      // we'll replace these with the ones immediately after those which were
3154      // really allocated. In the prologue, copies will be inserted from the
3155      // argument to these reserved registers.
3156
3157 // Without HSA, relocations are used for the scratch pointer and the
3158 // buffer resource setup is always inserted in the prologue. Scratch wave
3159 // offset is still in an input SGPR.
3160 Info.setScratchRSrcReg(ReservedBufferReg);
3161 }
3162 }
3163
3165
3166 // For entry functions we have to set up the stack pointer if we use it,
3167 // whereas non-entry functions get this "for free". This means there is no
3168 // intrinsic advantage to using S32 over S34 in cases where we do not have
3169 // calls but do need a frame pointer (i.e. if we are requested to have one
3170 // because frame pointer elimination is disabled). To keep things simple we
3171 // only ever use S32 as the call ABI stack pointer, and so using it does not
3172 // imply we need a separate frame pointer.
3173 //
3174 // Try to use s32 as the SP, but move it if it would interfere with input
3175 // arguments. This won't work with calls though.
3176 //
3177 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
3178 // registers.
3179 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
3180 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3181 } else {
3183
3184 if (MFI.hasCalls())
3185 report_fatal_error("call in graphics shader with too many input SGPRs");
3186
3187 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3188 if (!MRI.isLiveIn(Reg)) {
3189 Info.setStackPtrOffsetReg(Reg);
3190 break;
3191 }
3192 }
3193
3194 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3195 report_fatal_error("failed to find register for SP");
3196 }
3197
3198 // hasFP should be accurate for entry functions even before the frame is
3199 // finalized, because it does not rely on the known stack size, only
3200 // properties like whether variable sized objects are present.
3201 if (ST.getFrameLowering()->hasFP(MF)) {
3202 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3203 }
3204}
3205
3208 return !Info->isEntryFunction();
3209}
3210
3212
3214 MachineBasicBlock *Entry,
3215 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
3217
3218 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3219 if (!IStart)
3220 return;
3221
3222 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3223 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
3224 MachineBasicBlock::iterator MBBI = Entry->begin();
3225 for (const MCPhysReg *I = IStart; *I; ++I) {
3226 const TargetRegisterClass *RC = nullptr;
3227 if (AMDGPU::SReg_64RegClass.contains(*I))
3228 RC = &AMDGPU::SGPR_64RegClass;
3229 else if (AMDGPU::SReg_32RegClass.contains(*I))
3230 RC = &AMDGPU::SGPR_32RegClass;
3231 else
3232 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3233
3234 Register NewVR = MRI->createVirtualRegister(RC);
3235 // Create copy from CSR to a virtual register.
3236 Entry->addLiveIn(*I);
3237 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
3238 .addReg(*I);
3239
3240 // Insert the copy-back instructions right before the terminator.
3241 for (auto *Exit : Exits)
3242 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
3243 TII->get(TargetOpcode::COPY), *I)
3244 .addReg(NewVR);
3245 }
3246}
3247
3249 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3250 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3251 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3253
3255 const Function &Fn = MF.getFunction();
3258 bool IsError = false;
3259
3260 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
3262 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3263 IsError = true;
3264 }
3265
3268 BitVector Skipped(Ins.size());
3269 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3270 *DAG.getContext());
3271
3272 bool IsGraphics = AMDGPU::isGraphics(CallConv);
3273 bool IsKernel = AMDGPU::isKernel(CallConv);
3274 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
3275
3276 if (IsGraphics) {
3277 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3278 assert(!UserSGPRInfo.hasDispatchPtr() &&
3279 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3280 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3281 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3282 (void)UserSGPRInfo;
3283 if (!Subtarget->enableFlatScratch())
3284 assert(!UserSGPRInfo.hasFlatScratchInit());
3285 if ((CallConv != CallingConv::AMDGPU_CS &&
3286 CallConv != CallingConv::AMDGPU_Gfx &&
3287 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3288 !Subtarget->hasArchitectedSGPRs())
3289 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3290 !Info->hasWorkGroupIDZ());
3291 }
3292
3293 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3294
3295 if (CallConv == CallingConv::AMDGPU_PS) {
3296 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3297
3298 // At least one interpolation mode must be enabled or else the GPU will
3299 // hang.
3300 //
3301 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3302    // sets PSInputAddr, the user wants to enable some bits after the compilation
3303    // based on run-time states. Since we can't know what the final PSInputEna
3304    // will look like, we shouldn't do anything here and the user should take
3305 // responsibility for the correct programming.
3306 //
3307 // Otherwise, the following restrictions apply:
3308 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3309 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3310 // enabled too.
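    // For example, a PSInputAddr of 0x800 (only POS_W_FLOAT) has none of the
    // PERSP_*/LINEAR_* bits set, so input 0 (PERSP_SAMPLE) is force-enabled
    // below to keep the GPU from hanging.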
3311 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3312 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3313 CCInfo.AllocateReg(AMDGPU::VGPR0);
3314 CCInfo.AllocateReg(AMDGPU::VGPR1);
3315 Info->markPSInputAllocated(0);
3316 Info->markPSInputEnabled(0);
3317 }
3318 if (Subtarget->isAmdPalOS()) {
3319 // For isAmdPalOS, the user does not enable some bits after compilation
3320 // based on run-time states; the register values being generated here are
3321 // the final ones set in hardware. Therefore we need to apply the
3322 // workaround to PSInputAddr and PSInputEnable together. (The case where
3323 // a bit is set in PSInputAddr but not PSInputEnable is where the
3324 // frontend set up an input arg for a particular interpolation mode, but
3325 // nothing uses that input arg. Really we should have an earlier pass
3326 // that removes such an arg.)
3327 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3328 if ((PsInputBits & 0x7F) == 0 ||
3329 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3330 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
3331 }
3332 } else if (IsKernel) {
3333 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3334 } else {
3335 Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3336 Ins.end());
3337 }
3338
3339 if (IsKernel)
3340 analyzeFormalArgumentsCompute(CCInfo, Ins);
3341
3342 if (IsEntryFunc) {
3343 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
3344 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
3345 if (IsKernel && Subtarget->hasKernargPreload())
3346 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
3347
3348 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
3349 } else if (!IsGraphics) {
3350 // For the fixed ABI, pass workitem IDs in the last argument register.
3351 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3352
3353 // FIXME: Sink this into allocateSpecialInputSGPRs
3354 if (!Subtarget->enableFlatScratch())
3355 CCInfo.AllocateReg(Info->getScratchRSrcReg());
3356
3357 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3358 }
3359
3360 if (!IsKernel) {
3361 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3362 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3363
3364 // This assumes the registers are allocated by CCInfo in ascending order
3365 // with no gaps.
3366 Info->setNumWaveDispatchSGPRs(
3367 CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3368 Info->setNumWaveDispatchVGPRs(
3369 CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3370 } else if (Info->getNumKernargPreloadedSGPRs()) {
3371 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3372 }
3373
3375
3376 if (IsWholeWaveFunc) {
3377 SDValue Setup = DAG.getNode(AMDGPUISD::WHOLE_WAVE_SETUP, DL,
3378 {MVT::i1, MVT::Other}, Chain);
3379 InVals.push_back(Setup.getValue(0));
3380 Chains.push_back(Setup.getValue(1));
3381 }
3382
3383 // FIXME: This is the minimum kernel argument alignment. We should improve
3384 // this to the maximum alignment of the arguments.
3385 //
3386  // FIXME: Alignment of explicit arguments is totally broken with a non-0
3387  // explicit kern arg offset.
3388 const Align KernelArgBaseAlign = Align(16);
3389
3390 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3391 ++i) {
3392 const ISD::InputArg &Arg = Ins[i];
3393 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3394 InVals.push_back(DAG.getPOISON(Arg.VT));
3395 continue;
3396 }
3397
3398 CCValAssign &VA = ArgLocs[ArgIdx++];
3399 MVT VT = VA.getLocVT();
3400
3401 if (IsEntryFunc && VA.isMemLoc()) {
3402 VT = Ins[i].VT;
3403 EVT MemVT = VA.getLocVT();
3404
3405 const uint64_t Offset = VA.getLocMemOffset();
3406 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
3407
3408 if (Arg.Flags.isByRef()) {
3409 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3410
3411 const GCNTargetMachine &TM =
3412 static_cast<const GCNTargetMachine &>(getTargetMachine());
3413 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3414 Arg.Flags.getPointerAddrSpace())) {
3417 }
3418
3419 InVals.push_back(Ptr);
3420 continue;
3421 }
3422
3423 SDValue NewArg;
3424 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3425 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3426 // In this case the argument is packed into the previous preload SGPR.
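 // Illustration (hypothetical values): an i16 argument at kernarg offset 2
 // gives AlignDownOffset = 0 and OffsetDiff = 2, so the value sits in the
 // upper half of the SGPR preloaded for offset 0 and is recovered below by
 // shifting right by OffsetDiff * 8 = 16 and truncating.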
3427 int64_t AlignDownOffset = alignDown(Offset, 4);
3428 int64_t OffsetDiff = Offset - AlignDownOffset;
3429 EVT IntVT = MemVT.changeTypeToInteger();
3430
3431 const SIMachineFunctionInfo *Info =
3434 Register Reg =
3435 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3436
3437 assert(Reg);
3438 Register VReg = MRI.getLiveInVirtReg(Reg);
3439 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3440
3441 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3442 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3443
3444 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3445 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3446 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3447 Ins[i].Flags.isSExt(), &Ins[i]);
3448
3449 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3450 } else {
3451 const SIMachineFunctionInfo *Info =
3454 const SmallVectorImpl<MCRegister> &PreloadRegs =
3455 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3456
3457 SDValue Copy;
3458 if (PreloadRegs.size() == 1) {
3459 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3460 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3461 NewArg = DAG.getCopyFromReg(
3462 Chain, DL, VReg,
3464 TRI->getRegSizeInBits(*RC)));
3465
3466 } else {
3467 // If the kernarg alignment does not match the alignment of the SGPR
3468 // tuple RC that can accommodate this argument, it will be built up
3469 // via copies from the individual SGPRs that the argument was
3470 // preloaded to.
3472 for (auto Reg : PreloadRegs) {
3473 Register VReg = MRI.getLiveInVirtReg(Reg);
3474 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3475 Elts.push_back(Copy);
3476 }
3477 NewArg =
3478 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3479 PreloadRegs.size()),
3480 DL, Elts);
3481 }
3482
3483 // If the argument was preloaded to multiple consecutive 32-bit
3484 // registers because of misalignment between addressable SGPR tuples
3485 // and the argument size, we can still assume, because of kernarg
3486 // segment alignment restrictions, that NewArg's size is the same as
3487 // MemVT and just do a bitcast. If MemVT is less than 32 bits we add a
3488 // truncate since we cannot preload to less than a single SGPR and the
3489 // MemVT may be smaller.
3490 EVT MemVTInt =
3492 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3493 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3494
3495 NewArg = DAG.getBitcast(MemVT, NewArg);
3496 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3497 Ins[i].Flags.isSExt(), &Ins[i]);
3498 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3499 }
3500 } else {
3501 // Hidden arguments that are in the kernel signature must be preloaded
3502 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3503 // the argument list and is not preloaded.
3504 if (Arg.isOrigArg()) {
3505 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3506 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3508 *OrigArg->getParent(),
3509 "hidden argument in kernel signature was not preloaded",
3510 DL.getDebugLoc()));
3511 }
3512 }
3513
3514 NewArg =
3515 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3516 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3517 }
3518 Chains.push_back(NewArg.getValue(1));
3519
3520 auto *ParamTy =
3521 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3522 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3523 ParamTy &&
3524 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3525 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3526 // On SI local pointers are just offsets into LDS, so they are always
3527 // less than 16-bits. On CI and newer they could potentially be
3528 // real pointers, so we can't guarantee their size.
3529 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3530 DAG.getValueType(MVT::i16));
3531 }
3532
3533 InVals.push_back(NewArg);
3534 continue;
3535 }
3536 if (!IsEntryFunc && VA.isMemLoc()) {
3537 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3538 InVals.push_back(Val);
3539 if (!Arg.Flags.isByVal())
3540 Chains.push_back(Val.getValue(1));
3541 continue;
3542 }
3543
3544 assert(VA.isRegLoc() && "Parameter must be in a register!");
3545
3546 Register Reg = VA.getLocReg();
3547 const TargetRegisterClass *RC = nullptr;
3548 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3549 RC = &AMDGPU::VGPR_32RegClass;
3550 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3551 RC = &AMDGPU::SGPR_32RegClass;
3552 else
3553 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3554
3555 Reg = MF.addLiveIn(Reg, RC);
3556 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3557
3558 if (Arg.Flags.isSRet()) {
3559 // The return object should be reasonably addressable.
3560
3561 // FIXME: This helps when the return is a real sret. If it is an
3562 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3563 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3564 unsigned NumBits =
3566 Val = DAG.getNode(
3567 ISD::AssertZext, DL, VT, Val,
3568 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3569 }
3570
3571 Val = convertABITypeToValueType(DAG, Val, VA, DL);
3572 InVals.push_back(Val);
3573 }
3574
3575 // Start adding system SGPRs.
3576 if (IsEntryFunc)
3577 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3578
3579 if (DAG.getPass()) {
3580 auto &ArgUsageInfo =
3582 ArgUsageInfo.getArgUsageInfo().setFuncArgInfo(Fn, Info->getArgInfo());
3583 } else if (auto *MFAM = DAG.getMFAM()) {
3584 Module &M = *MF.getFunction().getParent();
3585 auto *ArgUsageInfo =
3587 .getCachedResult<AMDGPUArgumentUsageAnalysis>(M);
3588 if (ArgUsageInfo)
3589 ArgUsageInfo->setFuncArgInfo(Fn, Info->getArgInfo());
3590 }
3591
3592 unsigned StackArgSize = CCInfo.getStackSize();
3593 Info->setBytesInStackArgArea(StackArgSize);
3594
3595 return Chains.empty() ? Chain
3596 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3597}
3598
3599// TODO: If return values can't fit in registers, we should return as many as
3600// possible in registers before passing on stack.
3602 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3603 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3604 const Type *RetTy) const {
3605 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3606 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3607 // for shaders. Vector types should be explicitly handled by CC.
3608 if (AMDGPU::isEntryFunctionCC(CallConv))
3609 return true;
3610
3612 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3613 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3614 return false;
3615
3616 // We must use the stack if return would require unavailable registers.
3617 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3618 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3619 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3620 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3621 return false;
3622
3623 return true;
3624}
3625
3626SDValue
3628 bool isVarArg,
3630 const SmallVectorImpl<SDValue> &OutVals,
3631 const SDLoc &DL, SelectionDAG &DAG) const {
3635
3636 if (AMDGPU::isKernel(CallConv)) {
3637 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3638 OutVals, DL, DAG);
3639 }
3640
3641 bool IsShader = AMDGPU::isShader(CallConv);
3642
3643 Info->setIfReturnsVoid(Outs.empty());
3644 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3645
3646 // CCValAssign - represent the assignment of the return value to a location.
3648
3649 // CCState - Info about the registers and stack slots.
3650 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3651 *DAG.getContext());
3652
3653 // Analyze outgoing return values.
3654 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3655
3656 SDValue Glue;
3658 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3659
3660 SDValue ReadFirstLane =
3661 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3662 // Copy the result values into the output registers.
3663 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3664 ++I, ++RealRVLocIdx) {
3665 CCValAssign &VA = RVLocs[I];
3666 assert(VA.isRegLoc() && "Can only return in registers!");
3667 // TODO: Partially return in registers if return values don't fit.
3668 SDValue Arg = OutVals[RealRVLocIdx];
3669
3670 // Copied from other backends.
3671 switch (VA.getLocInfo()) {
3672 case CCValAssign::Full:
3673 break;
3674 case CCValAssign::BCvt:
3675 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3676 break;
3677 case CCValAssign::SExt:
3678 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3679 break;
3680 case CCValAssign::ZExt:
3681 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3682 break;
3683 case CCValAssign::AExt:
3684 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3685 break;
3686 default:
3687 llvm_unreachable("Unknown loc info!");
3688 }
3689 if (TRI->isSGPRPhysReg(VA.getLocReg()))
3691 ReadFirstLane, Arg);
3692 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3693 Glue = Chain.getValue(1);
3694 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3695 }
3696
3697 // FIXME: Does sret work properly?
3698 if (!Info->isEntryFunction()) {
3699 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3700 const MCPhysReg *I =
3701 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3702 if (I) {
3703 for (; *I; ++I) {
3704 if (AMDGPU::SReg_64RegClass.contains(*I))
3705 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3706 else if (AMDGPU::SReg_32RegClass.contains(*I))
3707 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3708 else
3709 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3710 }
3711 }
3712 }
3713
3714 // Update chain and glue.
3715 RetOps[0] = Chain;
3716 if (Glue.getNode())
3717 RetOps.push_back(Glue);
3718
3719 unsigned Opc = AMDGPUISD::ENDPGM;
3720 if (!IsWaveEnd)
3721 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3722 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3723 : AMDGPUISD::RET_GLUE;
3724 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3725}
3726
3728 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3729 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3730 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3731 SDValue ThisVal) const {
3732 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3733
3734 // Assign locations to each value returned by this call.
3736 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3737 *DAG.getContext());
3738 CCInfo.AnalyzeCallResult(Ins, RetCC);
3739
3740 // Copy all of the result registers out of their specified physreg.
3741 for (CCValAssign VA : RVLocs) {
3742 SDValue Val;
3743
3744 if (VA.isRegLoc()) {
3745 Val =
3746 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3747 Chain = Val.getValue(1);
3748 InGlue = Val.getValue(2);
3749 } else if (VA.isMemLoc()) {
3750 report_fatal_error("TODO: return values in memory");
3751 } else
3752 llvm_unreachable("unknown argument location type");
3753
3754 switch (VA.getLocInfo()) {
3755 case CCValAssign::Full:
3756 break;
3757 case CCValAssign::BCvt:
3758 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3759 break;
3760 case CCValAssign::ZExt:
3761 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3762 DAG.getValueType(VA.getValVT()));
3763 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3764 break;
3765 case CCValAssign::SExt:
3766 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3767 DAG.getValueType(VA.getValVT()));
3768 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3769 break;
3770 case CCValAssign::AExt:
3771 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3772 break;
3773 default:
3774 llvm_unreachable("Unknown loc info!");
3775 }
3776
3777 InVals.push_back(Val);
3778 }
3779
3780 return Chain;
3781}
3782
3783// Add code to pass special inputs required depending on used features, separate
3784// from the explicit user arguments present in the IR.
3786 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3787 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3788 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3789 // If we don't have a call site, this was a call inserted by
3790 // legalization. These can never use special inputs.
3791 if (!CLI.CB)
3792 return;
3793
3794 SelectionDAG &DAG = CLI.DAG;
3795 const SDLoc &DL = CLI.DL;
3796 const Function &F = DAG.getMachineFunction().getFunction();
3797
3798 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3799 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3800
3801 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3803 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3804 if (DAG.getPass()) {
3805 auto &ArgUsageInfo =
3807 CalleeArgInfo =
3808 &ArgUsageInfo.getArgUsageInfo().lookupFuncArgInfo(*CalleeFunc);
3809 } else if (auto *MFAM = DAG.getMFAM()) {
3811 auto *ArgUsageInfo =
3813 DAG.getMachineFunction())
3814 .getCachedResult<AMDGPUArgumentUsageAnalysis>(M);
3815 if (ArgUsageInfo)
3816 CalleeArgInfo = &ArgUsageInfo->lookupFuncArgInfo(*CalleeFunc);
3817 }
3818 }
3819
3820 // TODO: Unify with private memory register handling. This is complicated by
3821 // the fact that at least in kernels, the input argument is not necessarily
3822 // in the same location as the input.
3823 // clang-format off
3824 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3825 std::array<StringLiteral, 2>> ImplicitAttrs[] = {
3826 {AMDGPUFunctionArgInfo::DISPATCH_PTR, {"amdgpu-no-dispatch-ptr", ""}},
3827 {AMDGPUFunctionArgInfo::QUEUE_PTR, {"amdgpu-no-queue-ptr", ""}},
3828 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, {"amdgpu-no-implicitarg-ptr", ""}},
3829 {AMDGPUFunctionArgInfo::DISPATCH_ID, {"amdgpu-no-dispatch-id", ""}},
3830 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"}},
3831 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"}},
3832 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, {"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"}},
3833 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, {"amdgpu-no-lds-kernel-id", ""}},
3834 };
3835 // clang-format on
3836
3837 for (auto [InputID, Attrs] : ImplicitAttrs) {
3838 // If the callee does not use the attribute value, skip copying the value.
3839 if (all_of(Attrs, [&](StringRef Attr) {
3840 return Attr.empty() || CLI.CB->hasFnAttr(Attr);
3841 }))
3842 continue;
3843
3844 const auto [OutgoingArg, ArgRC, ArgTy] =
3845 CalleeArgInfo->getPreloadedValue(InputID);
3846 if (!OutgoingArg)
3847 continue;
3848
3849 const auto [IncomingArg, IncomingArgRC, Ty] =
3850 CallerArgInfo.getPreloadedValue(InputID);
3851 assert(IncomingArgRC == ArgRC);
3852
3853 // All special arguments are ints for now.
3854 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3855 SDValue InputReg;
3856
3857 if (IncomingArg) {
3858 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3859 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3860 // The implicit arg ptr is special because it doesn't have a corresponding
3861 // input for kernels, and is computed from the kernarg segment pointer.
3862 InputReg = getImplicitArgPtr(DAG, DL);
3863 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3864 std::optional<uint32_t> Id =
3866 if (Id.has_value()) {
3867 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3868 } else {
3869 InputReg = DAG.getPOISON(ArgVT);
3870 }
3871 } else {
3872 // We may have proven the input wasn't needed, although the ABI is
3873 // requiring it. We just need to allocate the register appropriately.
3874 InputReg = DAG.getPOISON(ArgVT);
3875 }
3876
3877 if (OutgoingArg->isRegister()) {
3878 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3879 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3880 report_fatal_error("failed to allocate implicit input argument");
3881 } else {
3882 unsigned SpecialArgOffset =
3883 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3884 SDValue ArgStore =
3885 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3886 MemOpChains.push_back(ArgStore);
3887 }
3888 }
3889
3890 // Pack workitem IDs into a single register or pass it as is if already
3891 // packed.
3892
3893 auto [OutgoingArg, ArgRC, Ty] =
3895 if (!OutgoingArg)
3896 std::tie(OutgoingArg, ArgRC, Ty) =
3898 if (!OutgoingArg)
3899 std::tie(OutgoingArg, ArgRC, Ty) =
3901 if (!OutgoingArg)
3902 return;
3903
3904 const ArgDescriptor *IncomingArgX = std::get<0>(
3906 const ArgDescriptor *IncomingArgY = std::get<0>(
3908 const ArgDescriptor *IncomingArgZ = std::get<0>(
3910
3911 SDValue InputReg;
3912 SDLoc SL;
3913
3914 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3915 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3916 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3917
3918 // If incoming ids are not packed we need to pack them.
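 // The packed layout produced below is X in bits [9:0], Y in bits [19:10]
 // and Z in bits [29:20] of a single i32, matching the SHL-by-10/20 and OR
 // nodes that follow.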
3919 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3920 NeedWorkItemIDX) {
3921 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3922 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3923 } else {
3924 InputReg = DAG.getConstant(0, DL, MVT::i32);
3925 }
3926 }
3927
3928 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3929 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3930 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3931 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3932 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3933 InputReg = InputReg.getNode()
3934 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3935 : Y;
3936 }
3937
3938 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3939 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3940 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3941 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3942 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3943 InputReg = InputReg.getNode()
3944 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3945 : Z;
3946 }
3947
3948 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3949 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3950 // We're in a situation where the outgoing function requires the workitem
3951 // ID, but the calling function does not have it (e.g. a graphics function
3952 // calling a C calling convention function). This is illegal, but we need
3953 // to produce something.
3954 InputReg = DAG.getPOISON(MVT::i32);
3955 } else {
3956 // Workitem ids are already packed; any of the present incoming
3957 // arguments will carry all required fields.
3958 ArgDescriptor IncomingArg =
3959 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3960 : IncomingArgY ? *IncomingArgY
3961 : *IncomingArgZ,
3962 ~0u);
3963 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3964 }
3965 }
3966
3967 if (OutgoingArg->isRegister()) {
3968 if (InputReg)
3969 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3970
3971 CCInfo.AllocateReg(OutgoingArg->getRegister());
3972 } else {
3973 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3974 if (InputReg) {
3975 SDValue ArgStore =
3976 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3977 MemOpChains.push_back(ArgStore);
3978 }
3979 }
3980}
3981
3983 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3985 const SmallVectorImpl<SDValue> &OutVals,
3986 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3987 if (AMDGPU::isChainCC(CalleeCC))
3988 return true;
3989
3990 if (!AMDGPU::mayTailCallThisCC(CalleeCC))
3991 return false;
3992
3993 // For a divergent call target, we need to do a waterfall loop over the
3994 // possible callees, which precludes us from using a simple jump.
3995 if (Callee->isDivergent())
3996 return false;
3997
3999 const Function &CallerF = MF.getFunction();
4000 CallingConv::ID CallerCC = CallerF.getCallingConv();
4002 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4003
4004 // Kernels aren't callable, and don't have a live-in return address, so it
4005 // doesn't make sense to do a tail call with entry functions.
4006 if (!CallerPreserved)
4007 return false;
4008
4009 bool CCMatch = CallerCC == CalleeCC;
4010
4012 if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
4013 return true;
4014 return false;
4015 }
4016
4017 // TODO: Can we handle var args?
4018 if (IsVarArg)
4019 return false;
4020
4021 for (const Argument &Arg : CallerF.args()) {
4022 if (Arg.hasByValAttr())
4023 return false;
4024 }
4025
4026 LLVMContext &Ctx = *DAG.getContext();
4027
4028 // Check that the call results are passed in the same way.
4029 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
4030 CCAssignFnForCall(CalleeCC, IsVarArg),
4031 CCAssignFnForCall(CallerCC, IsVarArg)))
4032 return false;
4033
4034 // The callee has to preserve all registers the caller needs to preserve.
4035 if (!CCMatch) {
4036 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4037 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4038 return false;
4039 }
4040
4041 // Nothing more to check if the callee is taking no arguments.
4042 if (Outs.empty())
4043 return true;
4044
4046 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4047
4048 // FIXME: We are not allocating special input registers, so we will be
4049 // deciding based on incorrect register assignments.
4050 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
4051
4052 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4053 // If the stack arguments for this call do not fit into our own save area then
4054 // the call cannot be made tail.
4055 // TODO: Is this really necessary?
4056 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
4057 return false;
4058
4059 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
4060 // FIXME: What about inreg arguments that end up passed in memory?
4061 if (!CCVA.isRegLoc())
4062 continue;
4063
4064 // If we are passing an argument in an SGPR, and the value is divergent,
4065 // this call requires a waterfall loop.
4066 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4067 LLVM_DEBUG(
4068 dbgs() << "Cannot tail call due to divergent outgoing argument in "
4069 << printReg(CCVA.getLocReg(), TRI) << '\n');
4070 return false;
4071 }
4072 }
4073
4074 const MachineRegisterInfo &MRI = MF.getRegInfo();
4075 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
4076}
4077
4079 if (!CI->isTailCall())
4080 return false;
4081
4082 const Function *ParentFn = CI->getFunction();
4084 return false;
4085 return true;
4086}
4087
4088namespace {
4089// Chain calls have special arguments that we need to handle. These tag
4090// along at the end of the arguments list(s), after the SGPR and VGPR
4091// arguments (index 0 and 1 respectively).
4092enum ChainCallArgIdx {
4093 Exec = 2,
4094 Flags,
4095 NumVGPRs,
4096 FallbackExec,
4097 FallbackCallee
4098};
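// For example, per the indices above and the flag handling in LowerCall:
// CLI.Args[2] is the EXEC mask to install, CLI.Args[3] the flags, and, when
// flag bit 0 (dynamic VGPRs) is set, CLI.Args[4..6] carry the VGPR count,
// the fallback EXEC mask and the fallback callee.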
4099} // anonymous namespace
4100
4101// The wave scratch offset register is used as the global base pointer.
4103 SmallVectorImpl<SDValue> &InVals) const {
4104 CallingConv::ID CallConv = CLI.CallConv;
4105 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
4106
4107 SelectionDAG &DAG = CLI.DAG;
4108
4109 const SDLoc &DL = CLI.DL;
4110 SDValue Chain = CLI.Chain;
4111 SDValue Callee = CLI.Callee;
4112
4113 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
4114 bool UsesDynamicVGPRs = false;
4115 if (IsChainCallConv) {
4116 // The last arguments should be the value that we need to put in EXEC,
4117 // followed by the flags and any other arguments with special meanings.
4118 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
4119 // we don't treat them like the "real" arguments.
4120 auto RequestedExecIt =
4121 llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
4122 return Arg.OrigArgIndex == 2;
4123 });
4124 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
4125
4126 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
4127 CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
4128 CLI.OutVals.end());
4129 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
4130
4131 assert(CLI.Outs.back().OrigArgIndex < 2 &&
4132 "Haven't popped all the special args");
4133
4134 TargetLowering::ArgListEntry RequestedExecArg =
4135 CLI.Args[ChainCallArgIdx::Exec];
4136 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
4137 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
4138
4139 // Convert constants into TargetConstants, so they become immediate operands
4140 // instead of being selected into S_MOV.
4141 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
4142 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
4143 ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
4144 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
4145 } else
4146 ChainCallSpecialArgs.push_back(Arg.Node);
4147 };
4148
4149 PushNodeOrTargetConstant(RequestedExecArg);
4150
4151 // Process any other special arguments depending on the value of the flags.
4152 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
4153
4154 const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
4155 if (FlagsValue.isZero()) {
4156 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
4157 return lowerUnhandledCall(CLI, InVals,
4158 "no additional args allowed if flags == 0");
4159 } else if (FlagsValue.isOneBitSet(0)) {
4160 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4161 return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
4162 }
4163
4164 if (!Subtarget->isWave32()) {
4165 return lowerUnhandledCall(
4166 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
4167 }
4168
4169 UsesDynamicVGPRs = true;
4170 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
4171 CLI.Args.end(), PushNodeOrTargetConstant);
4172 }
4173 }
4174
4176 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
4178 bool &IsTailCall = CLI.IsTailCall;
4179 bool IsVarArg = CLI.IsVarArg;
4180 bool IsSibCall = false;
4182
4183 if (Callee.isUndef() || isNullConstant(Callee)) {
4184 if (!CLI.IsTailCall) {
4185 for (ISD::InputArg &Arg : CLI.Ins)
4186 InVals.push_back(DAG.getPOISON(Arg.VT));
4187 }
4188
4189 return Chain;
4190 }
4191
4192 if (IsVarArg) {
4193 return lowerUnhandledCall(CLI, InVals,
4194 "unsupported call to variadic function ");
4195 }
4196
4197 if (!CLI.CB)
4198 return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
4199
4200 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
4201 return lowerUnhandledCall(CLI, InVals,
4202 "unsupported required tail call to function ");
4203 }
4204
4205 if (IsTailCall) {
4206 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
4207 Outs, OutVals, Ins, DAG);
4208 if (!IsTailCall &&
4209 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
4210 report_fatal_error("failed to perform tail call elimination on a call "
4211 "site marked musttail or on llvm.amdgcn.cs.chain");
4212 }
4213
4214 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4215
4216 // A sibling call is one where we're under the usual C ABI and not planning
4217 // to change that but can still do a tail call:
4218 if (!TailCallOpt && IsTailCall)
4219 IsSibCall = true;
4220
4221 if (IsTailCall)
4222 ++NumTailCalls;
4223 }
4224
4227 SmallVector<SDValue, 8> MemOpChains;
4228
4229 // Analyze operands of the call, assigning locations to each operand.
4231 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4232 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
4233
4234 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
4236 // With a fixed ABI, allocate fixed registers before user arguments.
4237 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
4238 }
4239
4240 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
4241
4242 // Get a count of how many bytes are to be pushed on the stack.
4243 unsigned NumBytes = CCInfo.getStackSize();
4244
4245 if (IsSibCall) {
4246 // Since we're not changing the ABI to make this a tail call, the memory
4247 // operands are already available in the caller's incoming argument space.
4248 NumBytes = 0;
4249 }
4250
4251 // FPDiff is the byte offset of the call's argument area from the callee's.
4252 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4253 // by this amount for a tail call. In a sibling call it must be 0 because the
4254 // caller will deallocate the entire stack and the callee still expects its
4255 // arguments to begin at SP+0. Completely unused for non-tail calls.
4256 int32_t FPDiff = 0;
4257 MachineFrameInfo &MFI = MF.getFrameInfo();
4258 auto *TRI = Subtarget->getRegisterInfo();
4259
4260 // Adjust the stack pointer for the new arguments...
4261 // These operations are automatically eliminated by the prolog/epilog pass
4262 if (!IsSibCall)
4263 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
4264
4265 if (!IsSibCall || IsChainCallConv) {
4266 if (!Subtarget->enableFlatScratch()) {
4267 SmallVector<SDValue, 4> CopyFromChains;
4268
4269 // In the HSA case, this should be an identity copy.
4270 SDValue ScratchRSrcReg =
4271 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
4272 RegsToPass.emplace_back(IsChainCallConv
4273 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4274 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4275 ScratchRSrcReg);
4276 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
4277 Chain = DAG.getTokenFactor(DL, CopyFromChains);
4278 }
4279 }
4280
4281 const unsigned NumSpecialInputs = RegsToPass.size();
4282
4283 MVT PtrVT = MVT::i32;
4284
4285 // Walk the register/memloc assignments, inserting copies/loads.
4286 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4287 CCValAssign &VA = ArgLocs[i];
4288 SDValue Arg = OutVals[i];
4289
4290 // Promote the value if needed.
4291 switch (VA.getLocInfo()) {
4292 case CCValAssign::Full:
4293 break;
4294 case CCValAssign::BCvt:
4295 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4296 break;
4297 case CCValAssign::ZExt:
4298 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4299 break;
4300 case CCValAssign::SExt:
4301 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
4302 break;
4303 case CCValAssign::AExt:
4304 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
4305 break;
4306 case CCValAssign::FPExt:
4307 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
4308 break;
4309 default:
4310 llvm_unreachable("Unknown loc info!");
4311 }
4312
4313 if (VA.isRegLoc()) {
4314 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
4315 } else {
4316 assert(VA.isMemLoc());
4317
4318 SDValue DstAddr;
4319 MachinePointerInfo DstInfo;
4320
4321 unsigned LocMemOffset = VA.getLocMemOffset();
4322 int32_t Offset = LocMemOffset;
4323
4324 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
4325 MaybeAlign Alignment;
4326
4327 if (IsTailCall) {
4328 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4329 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4330 : VA.getValVT().getStoreSize();
4331
4332 // FIXME: We can have better than the minimum byval required alignment.
4333 Alignment =
4334 Flags.isByVal()
4335 ? Flags.getNonZeroByValAlign()
4336 : commonAlignment(Subtarget->getStackAlignment(), Offset);
4337
4338 Offset = Offset + FPDiff;
4339 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
4340
4341 DstAddr = DAG.getFrameIndex(FI, PtrVT);
4342 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4343
4344 // Make sure any stack arguments overlapping with where we're storing
4345 // are loaded before this eventual operation. Otherwise they'll be
4346 // clobbered.
4347
4348 // FIXME: Why is this really necessary? This seems to just result in a
4349 // lot of code to copy the stack and write them back to the same
4350 // locations, which are supposed to be immutable?
4351 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
4352 } else {
4353 // Stores to the argument stack area are relative to the stack pointer.
4354 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
4355 MVT::i32);
4356 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
4357 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
4358 Alignment =
4359 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
4360 }
4361
4362 if (Outs[i].Flags.isByVal()) {
4363 SDValue SizeNode =
4364 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4365 SDValue Cpy =
4366 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
4367 Outs[i].Flags.getNonZeroByValAlign(),
4368 /*isVol = */ false, /*AlwaysInline = */ true,
4369 /*CI=*/nullptr, std::nullopt, DstInfo,
4371
4372 MemOpChains.push_back(Cpy);
4373 } else {
4374 SDValue Store =
4375 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4376 MemOpChains.push_back(Store);
4377 }
4378 }
4379 }
4380
4381 if (!MemOpChains.empty())
4382 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4383
4384 SDValue ReadFirstLaneID =
4385 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4386
4387 SDValue TokenGlue;
4388 if (CLI.ConvergenceControlToken) {
4389 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4391 }
4392
4393 // Build a sequence of copy-to-reg nodes chained together with token chain
4394 // and flag operands which copy the outgoing args into the appropriate regs.
4395 SDValue InGlue;
4396
4397 unsigned ArgIdx = 0;
4398 for (auto [Reg, Val] : RegsToPass) {
4399 if (ArgIdx++ >= NumSpecialInputs &&
4400 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4401 // For chain calls, the inreg arguments are required to be
4402 // uniform. Speculatively insert a readfirstlane in case we cannot prove
4403 // they are uniform.
4404 //
4405 // For other calls, if an inreg argument is known to be uniform,
4406 // speculatively insert a readfirstlane in case it is in a VGPR.
4407 //
4408 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4409 // value, so let that continue to produce invalid code.
4410
4411 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4412 if (TokenGlue)
4413 ReadfirstlaneArgs.push_back(TokenGlue);
4415 ReadfirstlaneArgs);
4416 }
4417
4418 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
4419 InGlue = Chain.getValue(1);
4420 }
4421
4422 // We don't usually want to end the call-sequence here because we would tidy
4423 // the frame up *after* the call. However, in the ABI-changing tail-call case
4424 // we've carefully laid out the parameters so that when sp is reset they'll be
4425 // in the correct location.
4426 if (IsTailCall && !IsSibCall) {
4427 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
4428 InGlue = Chain.getValue(1);
4429 }
4430
4431 std::vector<SDValue> Ops({Chain});
4432
4433 // Add a redundant copy of the callee global which will not be legalized, as
4434 // we need direct access to the callee later.
4436 const GlobalValue *GV = GSD->getGlobal();
4437 Ops.push_back(Callee);
4438 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
4439 } else {
4440 if (IsTailCall) {
4441 // isEligibleForTailCallOptimization considered whether the call target is
4442 // divergent, but we may still end up with a uniform value in a VGPR.
4443 // Insert a readfirstlane just in case.
4444 SDValue ReadFirstLaneID =
4445 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4446
4447 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4448 if (TokenGlue)
4449 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
4450 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
4451 ReadfirstlaneArgs);
4452 }
4453
4454 Ops.push_back(Callee);
4455 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
4456 }
4457
4458 if (IsTailCall) {
4459 // Each tail call may have to adjust the stack by a different amount, so
4460 // this information must travel along with the operation for eventual
4461 // consumption by emitEpilogue.
4462 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4463 }
4464
4465 if (IsChainCallConv)
4466 llvm::append_range(Ops, ChainCallSpecialArgs);
4467
4468 // Add argument registers to the end of the list so that they are known live
4469 // into the call.
4470 for (auto &[Reg, Val] : RegsToPass)
4471 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
4472
4473 // Add a register mask operand representing the call-preserved registers.
4474 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4475 assert(Mask && "Missing call preserved mask for calling convention");
4476 Ops.push_back(DAG.getRegisterMask(Mask));
4477
4478 if (SDValue Token = CLI.ConvergenceControlToken) {
4480 GlueOps.push_back(Token);
4481 if (InGlue)
4482 GlueOps.push_back(InGlue);
4483
4484 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
4485 MVT::Glue, GlueOps),
4486 0);
4487 }
4488
4489 if (InGlue)
4490 Ops.push_back(InGlue);
4491
4492 // If we're doing a tail call, use a TC_RETURN here rather than an
4493 // actual call instruction.
4494 if (IsTailCall) {
4495 MFI.setHasTailCall();
4496 unsigned OPC = AMDGPUISD::TC_RETURN;
4497 switch (CallConv) {
4499 OPC = AMDGPUISD::TC_RETURN_GFX;
4500 break;
4503 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4504 : AMDGPUISD::TC_RETURN_CHAIN;
4505 break;
4506 }
4507
4508 // If the caller is a whole wave function, we need to use a special opcode
4509 // so we can patch up EXEC.
4510 if (Info->isWholeWaveFunction())
4511 OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;
4512
4513 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4514 }
4515
4516 // Returns a chain and a flag for retval copy to use.
4517 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4518 Chain = Call.getValue(0);
4519 InGlue = Call.getValue(1);
4520
4521 uint64_t CalleePopBytes = NumBytes;
4522 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4523 if (!Ins.empty())
4524 InGlue = Chain.getValue(1);
4525
4526 // Handle result values, copying them out of physregs into vregs that we
4527 // return.
4528 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4529 InVals, /*IsThisReturn=*/false, SDValue());
4530}
4531
4532// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4533// except for:
4534// 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4535// 2. Scaling the size, where scaled size = wave-reduction(alloca-size) * wave-size
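// A minimal numeric sketch of point 2 (illustrative): on a wave64 target a
// 16-byte per-lane alloca advances the wave-level stack pointer by
// 16 << 6 = 1024 bytes, since every lane of the wave needs its own swizzled
// copy of the allocation.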
4537 SelectionDAG &DAG) const {
4538 const MachineFunction &MF = DAG.getMachineFunction();
4540
4541 SDLoc dl(Op);
4542 EVT VT = Op.getValueType();
4543 SDValue Chain = Op.getOperand(0);
4544 Register SPReg = Info->getStackPtrOffsetReg();
4545
4546 // Chain the dynamic stack allocation so that it doesn't modify the stack
4547 // pointer when other instructions are using the stack.
4548 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4549
4550 SDValue Size = Op.getOperand(1);
4551 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4552 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4553
4554 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4556 "Stack grows upwards for AMDGPU");
4557
4558 Chain = BaseAddr.getValue(1);
4559 Align StackAlign = TFL->getStackAlign();
4560 if (Alignment > StackAlign) {
4561 uint64_t ScaledAlignment = Alignment.value()
4562 << Subtarget->getWavefrontSizeLog2();
4563 uint64_t StackAlignMask = ScaledAlignment - 1;
4564 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4565 DAG.getConstant(StackAlignMask, dl, VT));
4566 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4567 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4568 }
4569
4570 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4571 SDValue NewSP;
4573 // For constant sized alloca, scale alloca size by wave-size
4574 SDValue ScaledSize = DAG.getNode(
4575 ISD::SHL, dl, VT, Size,
4576 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4577 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4578 } else {
4579 // For a dynamically sized alloca, perform a wave-wide reduction to get the
4580 // maximum of the (divergent) alloca size across the wave, then scale it by
4581 // the wave size.
4581 SDValue WaveReduction =
4582 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4583 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4584 Size, DAG.getConstant(0, dl, MVT::i32));
4585 SDValue ScaledSize = DAG.getNode(
4586 ISD::SHL, dl, VT, Size,
4587 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4588 NewSP =
4589 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4590 SDValue ReadFirstLaneID =
4591 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4592 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4593 NewSP);
4594 }
4595
4596 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4597 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4598
4599 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4600}
4601
4603 if (Op.getValueType() != MVT::i32)
4604 return Op; // Defer to cannot select error.
4605
4607 SDLoc SL(Op);
4608
4609 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4610
4611 // Convert from wave uniform to swizzled vector address. This should protect
4612 // from any edge cases where the stacksave result isn't directly used with
4613 // stackrestore.
4614 SDValue VectorAddress =
4615 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4616 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4617}
4618
4620 SelectionDAG &DAG) const {
4621 SDLoc SL(Op);
4622 assert(Op.getValueType() == MVT::i32);
4623
4624 uint32_t BothRoundHwReg =
4626 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4627
4628 SDValue IntrinID =
4629 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4630 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4631 Op.getOperand(0), IntrinID, GetRoundBothImm);
4632
4633 // There are two rounding modes, one for f32 and one for f64/f16. We only
4634 // report in the standard value range if both are the same.
4635 //
4636 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4637 // ties away from zero is not supported, and the other values are rotated by
4638 // 1.
4639 //
4640 // If the two rounding modes are not the same, report a target defined value.
4641
4642 // Mode register rounding mode fields:
4643 //
4644 // [1:0] Single-precision round mode.
4645 // [3:2] Double/Half-precision round mode.
4646 //
4647 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4648 //
4649 // Hardware Spec
4650 // Toward-0 3 0
4651 // Nearest Even 0 1
4652 // +Inf 1 2
4653 // -Inf 2 3
4654 // NearestAway0 N/A 4
4655 //
4656 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4657 // table we can index by the raw hardware mode.
4658 //
4659 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
4660
4661 SDValue BitTable =
4663
4664 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4665 SDValue RoundModeTimesNumBits =
4666 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4667
4668 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4669 // knew only one mode was demanded.
4670 SDValue TableValue =
4671 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4672 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4673
4674 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4675 SDValue TableEntry =
4676 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4677
4678 // There's a gap between the 4-bit encoded table entries and the actual enum
4679 // values, so offset if it's an extended value.
4680 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4681 SDValue IsStandardValue =
4682 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4683 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4684 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4685 TableEntry, EnumOffset);
4686
4687 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4688}
4689
4691 SelectionDAG &DAG) const {
4692 SDLoc SL(Op);
4693
4694 SDValue NewMode = Op.getOperand(1);
4695 assert(NewMode.getValueType() == MVT::i32);
4696
4697 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4698 // hardware MODE.fp_round values.
4699 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4700 uint32_t ClampedVal = std::min(
4701 static_cast<uint32_t>(ConstMode->getZExtValue()),
4703 NewMode = DAG.getConstant(
4704 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4705 } else {
4706 // If we know the input can only be one of the supported standard modes in
4707 // the range 0-3, we can use a simplified mapping to hardware values.
4708 KnownBits KB = DAG.computeKnownBits(NewMode);
4709 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4710 // The supported standard values are 0-3. The extended values start at 8. We
4711 // need to offset by 4 if the value is in the extended range.
4712
4713 if (UseReducedTable) {
4714 // Truncate to the low 32-bits.
4715 SDValue BitTable = DAG.getConstant(
4716 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4717
4718 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4719 SDValue RoundModeTimesNumBits =
4720 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4721
4722 NewMode =
4723 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4724
4725 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4726 // the table extracted bits into inline immediates.
4727 } else {
4728 // table_index = umin(value, value - 4)
4729 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
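 // The umin works because the subtraction is unsigned: for the standard
 // values 0-3, value - 4 wraps around to a huge number and umin keeps the
 // value itself, while for the extended values (8 and up) value - 4 is
 // smaller and becomes the table index, closing the gap between the two
 // ranges.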
4730 SDValue BitTable =
4732
4733 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4734 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4735 SDValue IndexVal =
4736 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4737
4738 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4739 SDValue RoundModeTimesNumBits =
4740 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4741
4742 SDValue TableValue =
4743 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4744 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4745
4746 // No need to mask out the high bits since the setreg will ignore them
4747 // anyway.
4748 NewMode = TruncTable;
4749 }
4750
4751 // Insert a readfirstlane in case the value is in a VGPR. We could do this
4752 // earlier and keep more operations scalar, but that interferes with
4753 // combining the source.
4754 SDValue ReadFirstLaneID =
4755 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4756 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4757 ReadFirstLaneID, NewMode);
4758 }
4759
4760 // N.B. The setreg will be later folded into s_round_mode on supported
4761 // targets.
4762 SDValue IntrinID =
4763 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4764 uint32_t BothRoundHwReg =
4766 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4767
4768 SDValue SetReg =
4769 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4770 IntrinID, RoundBothImm, NewMode);
4771
4772 return SetReg;
4773}
4774
4776 if (Op->isDivergent() &&
4777 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4778 // Cannot do I$ prefetch with divergent pointer.
4779 return SDValue();
4780
4781 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4785 break;
4787 if (Subtarget->hasSafeSmemPrefetch())
4788 break;
4789 [[fallthrough]];
4790 default:
4791 return SDValue();
4792 }
4793
4794 // I$ prefetch
4795 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4796 return SDValue();
4797
4798 return Op;
4799}
4800
4801// Work around DAG legality rules only based on the result type.
4803 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4804 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4805 EVT SrcVT = Src.getValueType();
4806
4807 if (SrcVT.getScalarType() != MVT::bf16)
4808 return Op;
4809
4810 SDLoc SL(Op);
4811 SDValue BitCast =
4812 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4813
4814 EVT DstVT = Op.getValueType();
4815 if (IsStrict)
4816 llvm_unreachable("Need STRICT_BF16_TO_FP");
4817
4818 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4819}
4820
4822 SDLoc SL(Op);
4823 if (Op.getValueType() != MVT::i64)
4824 return Op;
4825
4826 uint32_t ModeHwReg =
4828 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4829 uint32_t TrapHwReg =
4831 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4832
4833 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4834 SDValue IntrinID =
4835 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4836 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4837 Op.getOperand(0), IntrinID, ModeHwRegImm);
4838 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4839 Op.getOperand(0), IntrinID, TrapHwRegImm);
4840 SDValue TokenReg =
4841 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4842 GetTrapReg.getValue(1));
4843
4844 SDValue CvtPtr =
4845 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4846 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4847
4848 return DAG.getMergeValues({Result, TokenReg}, SL);
4849}
4850
4852 SDLoc SL(Op);
4853 if (Op.getOperand(1).getValueType() != MVT::i64)
4854 return Op;
4855
4856 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4857 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4858 DAG.getConstant(0, SL, MVT::i32));
4859 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4860 DAG.getConstant(1, SL, MVT::i32));
4861
4862 SDValue ReadFirstLaneID =
4863 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4864 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4865 ReadFirstLaneID, NewModeReg);
4866 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4867 ReadFirstLaneID, NewTrapReg);
4868
4869 unsigned ModeHwReg =
4871 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4872 unsigned TrapHwReg =
4874 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4875
4876 SDValue IntrinID =
4877 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4878 SDValue SetModeReg =
4879 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4880 IntrinID, ModeHwRegImm, NewModeReg);
4881 SDValue SetTrapReg =
4882 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4883 IntrinID, TrapHwRegImm, NewTrapReg);
4884 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4885}
4886
4888 const MachineFunction &MF) const {
4889 const Function &Fn = MF.getFunction();
4890
4892 .Case("m0", AMDGPU::M0)
4893 .Case("exec", AMDGPU::EXEC)
4894 .Case("exec_lo", AMDGPU::EXEC_LO)
4895 .Case("exec_hi", AMDGPU::EXEC_HI)
4896 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4897 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4898 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4899 .Default(Register());
4900 if (!Reg)
4901 return Reg;
4902
4903 if (!Subtarget->hasFlatScrRegister() &&
4904 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4905 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
4906 "\" for subtarget."));
4907 }
4908
4909 switch (Reg) {
4910 case AMDGPU::M0:
4911 case AMDGPU::EXEC_LO:
4912 case AMDGPU::EXEC_HI:
4913 case AMDGPU::FLAT_SCR_LO:
4914 case AMDGPU::FLAT_SCR_HI:
4915 if (VT.getSizeInBits() == 32)
4916 return Reg;
4917 break;
4918 case AMDGPU::EXEC:
4919 case AMDGPU::FLAT_SCR:
4920 if (VT.getSizeInBits() == 64)
4921 return Reg;
4922 break;
4923 default:
4924 llvm_unreachable("missing register type checking");
4925 }
4926
4928 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4929}
4930
4931// If kill is not the last instruction, split the block so kill is always a
4932// proper terminator.
4935 MachineBasicBlock *BB) const {
4936 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
4938 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4939 return SplitBB;
4940}
4941
4942// Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4943// \p MI will be the only instruction in the loop body block. Otherwise, it will
4944// be the first instruction in the remainder block.
4945//
4946/// \returns { LoopBody, Remainder }
4947static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4949 MachineFunction *MF = MBB.getParent();
4951
4952 // To insert the loop we need to split the block. Move everything after this
4953 // point to a new block, and insert a new empty block between the two.
4955 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4957 ++MBBI;
4958
4959 MF->insert(MBBI, LoopBB);
4960 MF->insert(MBBI, RemainderBB);
4961
4962 LoopBB->addSuccessor(LoopBB);
4963 LoopBB->addSuccessor(RemainderBB);
4964
4965 // Move the rest of the block into a new block.
4966 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4967
4968 if (InstInLoop) {
4969 auto Next = std::next(I);
4970
4971 // Move instruction to loop body.
4972 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4973
4974 // Move the rest of the block.
4975 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4976 } else {
4977 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4978 }
4979
4980 MBB.addSuccessor(LoopBB);
4981
4982 return std::pair(LoopBB, RemainderBB);
4983}
4984
4985/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4987 MachineBasicBlock *MBB = MI.getParent();
4989 auto I = MI.getIterator();
4990 auto E = std::next(I);
4991
4992 // clang-format off
4993 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4994 .addImm(0);
4995 // clang-format on
4996
4997 MIBundleBuilder Bundler(*MBB, I, E);
4998 finalizeBundle(*MBB, Bundler.begin());
4999}
5000
5003 MachineBasicBlock *BB) const {
5004 const DebugLoc &DL = MI.getDebugLoc();
5005
5007
5009
5010 // Apparently kill flags are only valid if the def is in the same block?
5011 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
5012 Src->setIsKill(false);
5013
5014 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
5015
5016 MachineBasicBlock::iterator I = LoopBB->end();
5017
5018 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
5020
5021 // Clear TRAP_STS.MEM_VIOL
5022 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
5023 .addImm(0)
5024 .addImm(EncodedReg);
5025
5027
5028 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5029
5030 // Load and check TRAP_STS.MEM_VIOL
5031 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
5032 .addImm(EncodedReg);
5033
5034 // FIXME: Do we need to use an isel pseudo that may clobber scc?
5035 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5036 .addReg(Reg, RegState::Kill)
5037 .addImm(0);
5038 // clang-format off
5039 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5040 .addMBB(LoopBB);
5041 // clang-format on
5042
5043 return RemainderBB;
5044}
5045
5046// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
5047// wavefront. If the value is uniform and just happens to be in a VGPR, this
5048// will only do one iteration. In the worst case, this will loop 64 times.
5049//
5050// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
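// Shape of the emitted loop body, roughly (register names here are
// illustrative only):
//   %cur  = V_READFIRSTLANE_B32 %idx     ; index value held by one active lane
//   %cond = V_CMP_EQ_U32 %cur, %idx      ; all lanes that hold the same index
//   S_AND_SAVEEXEC %cond                 ; run only those lanes this iteration
//   ...move %cur (plus any offset) into M0 or an SGPR index register...
//   exec ^= <saved mask>                 ; retire the lanes just handled
//   S_CBRANCH_EXECNZ LoopBB              ; repeat while any lane remains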
5051 static MachineBasicBlock::iterator
5052 emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
5053 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
5054 const DebugLoc &DL, const MachineOperand &Idx,
5055 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
5056 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
5057 Register &SGPRIdxReg) {
5058
5059 MachineFunction *MF = OrigBB.getParent();
5060 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5061 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5064
5065 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5066 Register PhiExec = MRI.createVirtualRegister(BoolRC);
5067 Register NewExec = MRI.createVirtualRegister(BoolRC);
5068 Register CurrentIdxReg =
5069 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5070 Register CondReg = MRI.createVirtualRegister(BoolRC);
5071
5072 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
5073 .addReg(InitReg)
5074 .addMBB(&OrigBB)
5075 .addReg(ResultReg)
5076 .addMBB(&LoopBB);
5077
5078 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
5079 .addReg(InitSaveExecReg)
5080 .addMBB(&OrigBB)
5081 .addReg(NewExec)
5082 .addMBB(&LoopBB);
5083
5084 // Read the next variant <- also loop target.
5085 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5086 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
5087
5088 // Compare the just read M0 value to all possible Idx values.
5089 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5090 .addReg(CurrentIdxReg)
5091 .addReg(Idx.getReg(), 0, Idx.getSubReg());
5092
5093 // Update EXEC, save the original EXEC value to VCC.
5094 BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec)
5095 .addReg(CondReg, RegState::Kill);
5096
5097 MRI.setSimpleHint(NewExec, CondReg);
5098
5099 if (UseGPRIdxMode) {
5100 if (Offset == 0) {
5101 SGPRIdxReg = CurrentIdxReg;
5102 } else {
5103 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5104 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5105 .addReg(CurrentIdxReg, RegState::Kill)
5106 .addImm(Offset);
5107 }
5108 } else {
5109 // Move index from VCC into M0
5110 if (Offset == 0) {
5111 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5112 .addReg(CurrentIdxReg, RegState::Kill);
5113 } else {
5114 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5115 .addReg(CurrentIdxReg, RegState::Kill)
5116 .addImm(Offset);
5117 }
5118 }
5119
5120 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
5121 MachineInstr *InsertPt =
5122 BuildMI(LoopBB, I, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg)
5123 .addReg(LMC.ExecReg)
5124 .addReg(NewExec);
5125
5126 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
5127 // s_cbranch_scc0?
5128
5129 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
5130 // clang-format off
5131 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5132 .addMBB(&LoopBB);
5133 // clang-format on
5134
5135 return InsertPt->getIterator();
5136}
5137
5138 // This has slightly sub-optimal register allocation when the source vector is
5139 // killed by the read. The register allocator does not understand that the kill
5140 // is per-workitem, so the vector is kept alive for the whole loop and we end up
5141 // not re-using a subregister from it, using one more VGPR than necessary. That
5142 // extra VGPR was avoided when this was expanded after register allocation.
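// Overall structure produced here, roughly:
//   entry:       %save = COPY exec
//   LoopBB:      waterfall over the unique index values (see the loop above)
//   LandingPad:  exec = COPY %save         ; restore the full exec mask
//   RemainderBB: the rest of the original block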
5143 static MachineBasicBlock::iterator
5144 loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
5145 unsigned InitResultReg, unsigned PhiReg, int Offset,
5146 bool UseGPRIdxMode, Register &SGPRIdxReg) {
5147 MachineFunction *MF = MBB.getParent();
5148 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5149 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5151 const DebugLoc &DL = MI.getDebugLoc();
5153
5154 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
5155 Register DstReg = MI.getOperand(0).getReg();
5156 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
5157 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
5159
5160 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
5161
5162 // Save the EXEC mask
5163 // clang-format off
5164 BuildMI(MBB, I, DL, TII->get(LMC.MovOpc), SaveExec)
5165 .addReg(LMC.ExecReg);
5166 // clang-format on
5167
5168 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
5169
5170 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5171
5172 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
5173 InitResultReg, DstReg, PhiReg, TmpExec,
5174 Offset, UseGPRIdxMode, SGPRIdxReg);
5175
5176 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
5178 ++MBBI;
5179 MF->insert(MBBI, LandingPad);
5180 LoopBB->removeSuccessor(RemainderBB);
5181 LandingPad->addSuccessor(RemainderBB);
5182 LoopBB->addSuccessor(LandingPad);
5183 MachineBasicBlock::iterator First = LandingPad->begin();
5184 // clang-format off
5185 BuildMI(*LandingPad, First, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
5186 .addReg(SaveExec);
5187 // clang-format on
5188
5189 return InsPt;
5190}
5191
5192// Returns subreg index, offset
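// For example: with a 128-bit (4 x 32-bit) super-register, Offset 2 yields
// (sub2, 0), i.e. a statically known element; an out-of-range Offset such as
// 5 is returned unchanged as (sub0, 5) and resolved by the dynamic indexing.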
5193static std::pair<unsigned, int>
5195 const TargetRegisterClass *SuperRC, unsigned VecReg,
5196 int Offset) {
5197 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
5198
5199 // Skip out of bounds offsets, or else we would end up using an undefined
5200 // register.
5201 if (Offset >= NumElts || Offset < 0)
5202 return std::pair(AMDGPU::sub0, Offset);
5203
5204 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
5205}
5206
5209 int Offset) {
5210 MachineBasicBlock *MBB = MI.getParent();
5211 const DebugLoc &DL = MI.getDebugLoc();
5213
5214 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5215
5216 assert(Idx->getReg() != AMDGPU::NoRegister);
5217
5218 if (Offset == 0) {
5219 // clang-format off
5220 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5221 .add(*Idx);
5222 // clang-format on
5223 } else {
5224 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5225 .add(*Idx)
5226 .addImm(Offset);
5227 }
5228}
5229
5232 int Offset) {
5233 MachineBasicBlock *MBB = MI.getParent();
5234 const DebugLoc &DL = MI.getDebugLoc();
5236
5237 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5238
5239 if (Offset == 0)
5240 return Idx->getReg();
5241
5242 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5243 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
5244 .add(*Idx)
5245 .addImm(Offset);
5246 return Tmp;
5247}
5248
5249 static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
5250 MachineBasicBlock &MBB,
5251 const GCNSubtarget &ST) {
5252 const SIInstrInfo *TII = ST.getInstrInfo();
5253 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5254 MachineFunction *MF = MBB.getParent();
5256
5257 Register Dst = MI.getOperand(0).getReg();
5258 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5259 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5260 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5261
5262 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
5263 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5264
5265 unsigned SubReg;
5266 std::tie(SubReg, Offset) =
5267 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
5268
5269 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5270
5271 // Check for a SGPR index.
5272 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5274 const DebugLoc &DL = MI.getDebugLoc();
5275
5276 if (UseGPRIdxMode) {
5277 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5278 // to avoid interfering with other uses, so probably requires a new
5279 // optimization pass.
5281
5282 const MCInstrDesc &GPRIDXDesc =
5283 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5284 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5285 .addReg(SrcReg)
5286 .addReg(Idx)
5287 .addImm(SubReg);
5288 } else {
5290
5291 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5292 .addReg(SrcReg, 0, SubReg)
5293 .addReg(SrcReg, RegState::Implicit);
5294 }
5295
5296 MI.eraseFromParent();
5297
5298 return &MBB;
5299 }
5300
5301 // Control flow needs to be inserted if indexing with a VGPR.
5302 const DebugLoc &DL = MI.getDebugLoc();
5304
5305 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5306 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5307
5308 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
5309
5310 Register SGPRIdxReg;
5311 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
5312 UseGPRIdxMode, SGPRIdxReg);
5313
5314 MachineBasicBlock *LoopBB = InsPt->getParent();
5315
5316 if (UseGPRIdxMode) {
5317 const MCInstrDesc &GPRIDXDesc =
5318 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5319
5320 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5321 .addReg(SrcReg)
5322 .addReg(SGPRIdxReg)
5323 .addImm(SubReg);
5324 } else {
5325 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5326 .addReg(SrcReg, 0, SubReg)
5327 .addReg(SrcReg, RegState::Implicit);
5328 }
5329
5330 MI.eraseFromParent();
5331
5332 return LoopBB;
5333}
5334
5335 static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
5336 MachineBasicBlock &MBB,
5337 const GCNSubtarget &ST) {
5338 const SIInstrInfo *TII = ST.getInstrInfo();
5339 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5340 MachineFunction *MF = MBB.getParent();
5342
5343 Register Dst = MI.getOperand(0).getReg();
5344 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
5345 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5346 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
5347 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5348 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
5349 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5350
5351 // This can be an immediate, but will be folded later.
5352 assert(Val->getReg());
5353
5354 unsigned SubReg;
5355 std::tie(SubReg, Offset) =
5356 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
5357 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5358
5359 if (Idx->getReg() == AMDGPU::NoRegister) {
5361 const DebugLoc &DL = MI.getDebugLoc();
5362
5363 assert(Offset == 0);
5364
5365 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
5366 .add(*SrcVec)
5367 .add(*Val)
5368 .addImm(SubReg);
5369
5370 MI.eraseFromParent();
5371 return &MBB;
5372 }
5373
5374 // Check for a SGPR index.
5375 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5377 const DebugLoc &DL = MI.getDebugLoc();
5378
5379 if (UseGPRIdxMode) {
5381
5382 const MCInstrDesc &GPRIDXDesc =
5383 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5384 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5385 .addReg(SrcVec->getReg())
5386 .add(*Val)
5387 .addReg(Idx)
5388 .addImm(SubReg);
5389 } else {
5391
5392 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5393 TRI.getRegSizeInBits(*VecRC), 32, false);
5394 BuildMI(MBB, I, DL, MovRelDesc, Dst)
5395 .addReg(SrcVec->getReg())
5396 .add(*Val)
5397 .addImm(SubReg);
5398 }
5399 MI.eraseFromParent();
5400 return &MBB;
5401 }
5402
5403 // Control flow needs to be inserted if indexing with a VGPR.
5404 if (Val->isReg())
5405 MRI.clearKillFlags(Val->getReg());
5406
5407 const DebugLoc &DL = MI.getDebugLoc();
5408
5409 Register PhiReg = MRI.createVirtualRegister(VecRC);
5410
5411 Register SGPRIdxReg;
5412 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5413 UseGPRIdxMode, SGPRIdxReg);
5414 MachineBasicBlock *LoopBB = InsPt->getParent();
5415
5416 if (UseGPRIdxMode) {
5417 const MCInstrDesc &GPRIDXDesc =
5418 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5419
5420 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5421 .addReg(PhiReg)
5422 .add(*Val)
5423 .addReg(SGPRIdxReg)
5424 .addImm(SubReg);
5425 } else {
5426 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5427 TRI.getRegSizeInBits(*VecRC), 32, false);
5428 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5429 .addReg(PhiReg)
5430 .add(*Val)
5431 .addImm(SubReg);
5432 }
5433
5434 MI.eraseFromParent();
5435 return LoopBB;
5436}
5437
5438 static MachineBasicBlock *Expand64BitScalarArithmetic(MachineInstr &MI,
5439 MachineBasicBlock *BB) {
5440 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5441 // For GFX12, we emit s_add_u64 and s_sub_u64.
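// On the pre-GFX12 path the expansion is, schematically:
//   %lo  = S_ADD_U32  %src0.sub0, %src1.sub0   ; sets SCC to the carry-out
//   %hi  = S_ADDC_U32 %src0.sub1, %src1.sub1   ; consumes SCC as carry-in
//   %dst = REG_SEQUENCE %lo, sub0, %hi, sub1
// (S_SUB_U32 / S_SUBB_U32 for the subtract case.)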
5442 MachineFunction *MF = BB->getParent();
5443 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5444 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5446 const DebugLoc &DL = MI.getDebugLoc();
5447 MachineOperand &Dest = MI.getOperand(0);
5448 MachineOperand &Src0 = MI.getOperand(1);
5449 MachineOperand &Src1 = MI.getOperand(2);
5450 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5451 if (ST.hasScalarAddSub64()) {
5452 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5453 // clang-format off
5454 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5455 .add(Src0)
5456 .add(Src1);
5457 // clang-format on
5458 } else {
5459 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5460 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5461
5462 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5463 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5464
5465 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5466 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5467 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5468 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5469
5470 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5471 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5472 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5473 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5474
5475 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5476 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5477 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
5478 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
5479 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5480 .addReg(DestSub0)
5481 .addImm(AMDGPU::sub0)
5482 .addReg(DestSub1)
5483 .addImm(AMDGPU::sub1);
5484 }
5485 MI.eraseFromParent();
5486 return BB;
5487}
5488
5489 static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
5490 switch (Opc) {
5491 case AMDGPU::S_MIN_U32:
5492 return std::numeric_limits<uint32_t>::max();
5493 case AMDGPU::S_MIN_I32:
5494 return std::numeric_limits<int32_t>::max();
5495 case AMDGPU::S_MAX_U32:
5496 return std::numeric_limits<uint32_t>::min();
5497 case AMDGPU::S_MAX_I32:
5498 return std::numeric_limits<int32_t>::min();
5499 case AMDGPU::V_ADD_F32_e64: // -0.0
5500 return 0x80000000;
5501 case AMDGPU::V_SUB_F32_e64: // +0.0
5502 return 0x0;
5503 case AMDGPU::S_ADD_I32:
5504 case AMDGPU::S_SUB_I32:
5505 case AMDGPU::S_OR_B32:
5506 case AMDGPU::S_XOR_B32:
5507 return std::numeric_limits<uint32_t>::min();
5508 case AMDGPU::S_AND_B32:
5509 return std::numeric_limits<uint32_t>::max();
5510 case AMDGPU::V_MIN_F32_e64:
5511 case AMDGPU::V_MAX_F32_e64:
5512 return 0x7fc00000; // qNAN
5513 default:
5514 llvm_unreachable(
5515 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5516 }
5517}
5518
5519 static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) {
5520 switch (Opc) {
5521 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5522 return std::numeric_limits<uint64_t>::max();
5523 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5524 return std::numeric_limits<int64_t>::max();
5525 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5526 return std::numeric_limits<uint64_t>::min();
5527 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5528 return std::numeric_limits<int64_t>::min();
5529 case AMDGPU::S_ADD_U64_PSEUDO:
5530 case AMDGPU::S_SUB_U64_PSEUDO:
5531 case AMDGPU::S_OR_B64:
5532 case AMDGPU::S_XOR_B64:
5533 return std::numeric_limits<uint64_t>::min();
5534 case AMDGPU::S_AND_B64:
5535 return std::numeric_limits<uint64_t>::max();
5536 default:
5537 llvm_unreachable(
5538 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5539 }
5540}
5541
5542static bool is32bitWaveReduceOperation(unsigned Opc) {
5543 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5544 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5545 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5546 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5547 Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
5548 Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
5549 Opc == AMDGPU::V_SUB_F32_e64;
5550}
5551
5553 return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
5554 Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64;
5555}
5556
5557 static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5558 MachineBasicBlock &BB,
5559 const GCNSubtarget &ST,
5560 unsigned Opc) {
5562 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5563 const DebugLoc &DL = MI.getDebugLoc();
5564 const SIInstrInfo *TII = ST.getInstrInfo();
5565
5566 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5567 Register SrcReg = MI.getOperand(1).getReg();
5568 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5569 Register DstReg = MI.getOperand(0).getReg();
5570 MachineBasicBlock *RetBB = nullptr;
5571 if (isSGPR) {
5572 switch (Opc) {
5573 case AMDGPU::S_MIN_U32:
5574 case AMDGPU::S_MIN_I32:
5575 case AMDGPU::V_MIN_F32_e64:
5576 case AMDGPU::S_MAX_U32:
5577 case AMDGPU::S_MAX_I32:
5578 case AMDGPU::V_MAX_F32_e64:
5579 case AMDGPU::S_AND_B32:
5580 case AMDGPU::S_OR_B32: {
5581 // Idempotent operations.
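// For a uniform (SGPR) input every active lane contributes the same value,
// and min/max/and/or of a value with itself is that value, so the reduction
// collapses to a plain move.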
5582 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5583 RetBB = &BB;
5584 break;
5585 }
5586 case AMDGPU::V_CMP_LT_U64_e64: // umin
5587 case AMDGPU::V_CMP_LT_I64_e64: // min
5588 case AMDGPU::V_CMP_GT_U64_e64: // umax
5589 case AMDGPU::V_CMP_GT_I64_e64: // max
5590 case AMDGPU::S_AND_B64:
5591 case AMDGPU::S_OR_B64: {
5592 // Idempotent operations.
5593 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
5594 RetBB = &BB;
5595 break;
5596 }
5597 case AMDGPU::S_XOR_B32:
5598 case AMDGPU::S_XOR_B64:
5599 case AMDGPU::S_ADD_I32:
5600 case AMDGPU::S_ADD_U64_PSEUDO:
5601 case AMDGPU::V_ADD_F32_e64:
5602 case AMDGPU::S_SUB_I32:
5603 case AMDGPU::S_SUB_U64_PSEUDO:
5604 case AMDGPU::V_SUB_F32_e64: {
5605 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5606 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5607 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5608 Register NumActiveLanes =
5609 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5610
5611 bool IsWave32 = ST.isWave32();
5612 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5613 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5614 unsigned BitCountOpc =
5615 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5616
5617 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5618
5619 auto NewAccumulator =
5620 BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
5621 .addReg(ExecMask);
5622
5623 switch (Opc) {
5624 case AMDGPU::S_XOR_B32:
5625 case AMDGPU::S_XOR_B64: {
5626 // Performing an XOR operation on a uniform value
5627 // depends on the parity of the number of active lanes.
5628 // For even parity the result will be 0; for odd
5629 // parity the result will be the same as the input value.
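// For example: with 3 active lanes, v ^ v ^ v == v; with 4 active lanes the
// terms cancel to 0. Hence the result is Src * (NumActiveLanes & 1), computed
// below (per 32-bit half in the 64-bit case).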
5630 Register ParityRegister =
5631 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5632
5633 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5634 .addReg(NewAccumulator->getOperand(0).getReg())
5635 .addImm(1)
5636 .setOperandDead(3); // Dead scc
5637 if (Opc == AMDGPU::S_XOR_B32) {
5638 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5639 .addReg(SrcReg)
5640 .addReg(ParityRegister);
5641 } else {
5642 Register DestSub0 =
5643 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5644 Register DestSub1 =
5645 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5646
5647 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5648 const TargetRegisterClass *SrcSubRC =
5649 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5650
5651 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5652 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5653 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5654 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5655
5656 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5657 .add(Op1L)
5658 .addReg(ParityRegister);
5659
5660 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5661 .add(Op1H)
5662 .addReg(ParityRegister);
5663
5664 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5665 .addReg(DestSub0)
5666 .addImm(AMDGPU::sub0)
5667 .addReg(DestSub1)
5668 .addImm(AMDGPU::sub1);
5669 }
5670 break;
5671 }
5672 case AMDGPU::S_SUB_I32: {
5673 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5674
5675 // Take the negation of the source operand.
5676 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5677 .addImm(0)
5678 .addReg(SrcReg);
5679 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5680 .addReg(NegatedVal)
5681 .addReg(NewAccumulator->getOperand(0).getReg());
5682 break;
5683 }
5684 case AMDGPU::S_ADD_I32: {
5685 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5686 .addReg(SrcReg)
5687 .addReg(NewAccumulator->getOperand(0).getReg());
5688 break;
5689 }
5690 case AMDGPU::S_ADD_U64_PSEUDO:
5691 case AMDGPU::S_SUB_U64_PSEUDO: {
5692 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5693 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5694 Register Op1H_Op0L_Reg =
5695 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5696 Register Op1L_Op0H_Reg =
5697 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5698 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5699 Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5700 Register NegatedValLo =
5701 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5702 Register NegatedValHi =
5703 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5704
5705 const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
5706 const TargetRegisterClass *Src1SubRC =
5707 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5708
5709 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5710 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5711 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5712 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5713
5714 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5715 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5716 .addImm(0)
5717 .addReg(NewAccumulator->getOperand(0).getReg())
5718 .setOperandDead(3); // Dead scc
5719 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5720 .addReg(NegatedValLo)
5721 .addImm(31)
5722 .setOperandDead(3); // Dead scc
5723 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5724 .add(Op1L)
5725 .addReg(NegatedValHi);
5726 }
5727 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5728 ? NegatedValLo
5729 : NewAccumulator->getOperand(0).getReg();
5730 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5731 .add(Op1L)
5732 .addReg(LowOpcode);
5733 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5734 .add(Op1L)
5735 .addReg(LowOpcode);
5736 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5737 .add(Op1H)
5738 .addReg(LowOpcode);
5739
5740 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5741 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5742 .addReg(CarryReg)
5743 .addReg(Op1H_Op0L_Reg)
5744 .setOperandDead(3); // Dead scc
5745
5746 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5747 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5748 .addReg(HiVal)
5749 .addReg(Op1L_Op0H_Reg)
5750 .setOperandDead(3); // Dead scc
5751 }
5752 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5753 .addReg(DestSub0)
5754 .addImm(AMDGPU::sub0)
5755 .addReg(DestSub1)
5756 .addImm(AMDGPU::sub1);
5757 break;
5758 }
5759 case AMDGPU::V_ADD_F32_e64:
5760 case AMDGPU::V_SUB_F32_e64: {
5761 Register ActiveLanesVreg =
5762 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5763 Register DstVreg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5764 // Get number of active lanes as a float val.
5765 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_CVT_F32_I32_e64),
5766 ActiveLanesVreg)
5767 .addReg(NewAccumulator->getOperand(0).getReg())
5768 .addImm(0) // clamp
5769 .addImm(0); // output-modifier
5770
5771 // Take negation of input for SUB reduction
5772 unsigned srcMod = Opc == AMDGPU::V_SUB_F32_e64 ? 1 : 0;
5773 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MUL_F32_e64), DstVreg)
5774 .addImm(srcMod) // src0 modifier
5775 .addReg(SrcReg)
5776 .addImm(0) // src1 modifier
5777 .addReg(ActiveLanesVreg)
5778 .addImm(0) // clamp
5779 .addImm(0); // output-mod
5780 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5781 .addReg(DstVreg);
5782 }
5783 }
5784 RetBB = &BB;
5785 }
5786 }
5787 } else {
5789 // TODO: Implement the DPP strategy and switch based on the immediate strategy
5790 // operand. For now, for all the cases (Default, Iterative and DPP) we use the
5791 // iterative approach.
5791
5792 // To reduce the VGPR using the iterative approach, we need to iterate
5793 // over all the active lanes. The lowering consists of a ComputeLoop,
5794 // which iterates over only the active lanes. We use a copy of the EXEC
5795 // register as the induction variable, and each visited lane clears its bit
5796 // with bitset0 so that the next iteration picks the next active lane.
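// Shape of the emitted loop, roughly (register names here are illustrative):
//   ComputeLoop:
//     %acc       = PHI [identity, entry], [%newacc, ComputeLoop]
//     %active    = PHI [copy of exec, entry], [%remaining, ComputeLoop]
//     %lane      = S_FF1_I32 %active          ; lowest remaining active lane
//     %val       = V_READLANE_B32 %src, %lane
//     %newacc    = <op> %acc, %val
//     %remaining = S_BITSET0 %lane, %active   ; clear that lane's bit
//     S_CMP_LG %remaining, 0
//     S_CBRANCH_SCC1 ComputeLoop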
5798 Register SrcReg = MI.getOperand(1).getReg();
5799 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5801
5802 // Create the control flow for the loop:
5803 // split MI's machine basic block to form the loop.
5804 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
5805
5806 // Create virtual registers required for lowering.
5807 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5808 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5809 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5810 Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
5811 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5812 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5813 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5814 Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5815 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
5816
5817 bool IsWave32 = ST.isWave32();
5818 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5819 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5820
5821 // Create the initial values of the induction variable (from EXEC) and of the
5822 // accumulator, and insert a branch to the newly created ComputeLoop block.
5823 BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
5824 if (is32BitOpc) {
5826 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5827 .addImm(IdentityValue);
5828 } else {
5830 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
5831 .addImm(IdentityValue);
5832 }
5833 // clang-format off
5834 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5835 .addMBB(ComputeLoop);
5836 // clang-format on
5837
5838 // Start constructing ComputeLoop
5839 I = ComputeLoop->begin();
5840 auto Accumulator =
5841 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5842 .addReg(IdentityValReg)
5843 .addMBB(&BB);
5844 auto ActiveBits =
5845 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5846 .addReg(LoopIterator)
5847 .addMBB(&BB);
5848
5849 I = ComputeLoop->end();
5850 MachineInstr *NewAccumulator;
5851 // Perform the computations
5852 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5853 BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5854 .addReg(ActiveBitsReg);
5855 if (is32BitOpc) {
5856 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5857 LaneValueReg)
5858 .addReg(SrcReg)
5859 .addReg(FF1Reg);
5860 if (isFPOp) {
5861 Register LaneValVreg =
5862 MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
5863 Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
5864 // Get the Lane Value in VGPR to avoid the Constant Bus Restriction
5865 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),
5866 LaneValVreg)
5867 .addReg(LaneValueReg);
5868 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
5869 .addImm(0) // src0 modifier
5870 .addReg(Accumulator->getOperand(0).getReg())
5871 .addImm(0) // src1 modifier
5872 .addReg(LaneValVreg)
5873 .addImm(0) // clamp
5874 .addImm(0); // omod
5875 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5876 TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5877 .addReg(DstVreg);
5878 } else {
5879 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5880 .addReg(Accumulator->getOperand(0).getReg())
5881 .addReg(LaneValueReg);
5882 }
5883 } else {
5884 Register LaneValueLoReg =
5885 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5886 Register LaneValueHiReg =
5887 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5888 Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5889 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5890 const TargetRegisterClass *SrcSubRC =
5891 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5892 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5893 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5894 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5895 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5896 // lane value input should be in an sgpr
5897 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5898 LaneValueLoReg)
5899 .add(Op1L)
5900 .addReg(FF1Reg);
5901 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5902 LaneValueHiReg)
5903 .add(Op1H)
5904 .addReg(FF1Reg);
5905 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5906 TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
5907 .addReg(LaneValueLoReg)
5908 .addImm(AMDGPU::sub0)
5909 .addReg(LaneValueHiReg)
5910 .addImm(AMDGPU::sub1);
5911 switch (Opc) {
5912 case AMDGPU::S_OR_B64:
5913 case AMDGPU::S_AND_B64:
5914 case AMDGPU::S_XOR_B64: {
5915 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5916 .addReg(Accumulator->getOperand(0).getReg())
5917 .addReg(LaneValue->getOperand(0).getReg())
5918 .setOperandDead(3); // Dead scc
5919 break;
5920 }
5921 case AMDGPU::V_CMP_GT_I64_e64:
5922 case AMDGPU::V_CMP_GT_U64_e64:
5923 case AMDGPU::V_CMP_LT_I64_e64:
5924 case AMDGPU::V_CMP_LT_U64_e64: {
5925 Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
5926 Register ComparisonResultReg =
5927 MRI.createVirtualRegister(WaveMaskRegClass);
5928 const TargetRegisterClass *VregClass = TRI->getVGPR64Class();
5929 const TargetRegisterClass *VSubRegClass =
5930 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
5931 Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
5932 MachineOperand SrcReg0Sub0 =
5933 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5934 VregClass, AMDGPU::sub0, VSubRegClass);
5935 MachineOperand SrcReg0Sub1 =
5936 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5937 VregClass, AMDGPU::sub1, VSubRegClass);
5938 BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
5939 AccumulatorVReg)
5940 .add(SrcReg0Sub0)
5941 .addImm(AMDGPU::sub0)
5942 .add(SrcReg0Sub1)
5943 .addImm(AMDGPU::sub1);
5944 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
5945 .addReg(LaneValue->getOperand(0).getReg())
5946 .addReg(AccumulatorVReg);
5947
5948 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5949 BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
5950 .addReg(LaneMaskReg)
5951 .addReg(ActiveBitsReg);
5952
5953 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5954 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
5955 .addReg(LaneValue->getOperand(0).getReg())
5956 .addReg(Accumulator->getOperand(0).getReg());
5957 break;
5958 }
5959 case AMDGPU::S_ADD_U64_PSEUDO:
5960 case AMDGPU::S_SUB_U64_PSEUDO: {
5961 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5962 .addReg(Accumulator->getOperand(0).getReg())
5963 .addReg(LaneValue->getOperand(0).getReg());
5964 ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
5965 break;
5966 }
5967 }
5968 }
5969 // Manipulate the iterator to get the next active lane
5970 unsigned BITSETOpc =
5971 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5972 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5973 .addReg(FF1Reg)
5974 .addReg(ActiveBitsReg);
5975
5976 // Add phi nodes
5977 Accumulator.addReg(DstReg).addMBB(ComputeLoop);
5978 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
5979
5980 // Create the loop back-branch.
5981 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5982 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5983 .addReg(NewActiveBitsReg)
5984 .addImm(0);
5985 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5986 .addMBB(ComputeLoop);
5987
5988 RetBB = ComputeEnd;
5989 }
5990 MI.eraseFromParent();
5991 return RetBB;
5992}
5993
5994 MachineBasicBlock *
5995 SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
5996 MachineBasicBlock *BB) const {
5997 MachineFunction *MF = BB->getParent();
5999 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
6001 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
6003 const DebugLoc &DL = MI.getDebugLoc();
6004
6005 switch (MI.getOpcode()) {
6006 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
6007 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
6008 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
6009 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
6010 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
6011 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
6012 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
6013 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
6014 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
6015 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MIN_F32_e64);
6016 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
6017 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
6018 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
6019 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
6020 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
6021 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
6022 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
6023 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
6024 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
6025 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MAX_F32_e64);
6026 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
6027 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
6028 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
6029 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
6030 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
6031 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64);
6032 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
6033 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
6034 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
6035 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
6036 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
6037 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64);
6038 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
6039 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
6040 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
6041 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
6042 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
6043 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
6044 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
6045 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
6046 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
6047 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
6048 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
6049 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
6050 case AMDGPU::S_UADDO_PSEUDO:
6051 case AMDGPU::S_USUBO_PSEUDO: {
6052 MachineOperand &Dest0 = MI.getOperand(0);
6053 MachineOperand &Dest1 = MI.getOperand(1);
6054 MachineOperand &Src0 = MI.getOperand(2);
6055 MachineOperand &Src1 = MI.getOperand(3);
6056
6057 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
6058 ? AMDGPU::S_ADD_U32
6059 : AMDGPU::S_SUB_U32;
6060 // clang-format off
6061 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
6062 .add(Src0)
6063 .add(Src1);
6064 // clang-format on
6065
6066 unsigned SelOpc =
6067 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6068 BuildMI(*BB, MI, DL, TII->get(SelOpc), Dest1.getReg()).addImm(-1).addImm(0);
6069
6070 MI.eraseFromParent();
6071 return BB;
6072 }
6073 case AMDGPU::S_ADD_U64_PSEUDO:
6074 case AMDGPU::S_SUB_U64_PSEUDO: {
6075 return Expand64BitScalarArithmetic(MI, BB);
6076 }
6077 case AMDGPU::V_ADD_U64_PSEUDO:
6078 case AMDGPU::V_SUB_U64_PSEUDO: {
6079 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
6080
6081 MachineOperand &Dest = MI.getOperand(0);
6082 MachineOperand &Src0 = MI.getOperand(1);
6083 MachineOperand &Src1 = MI.getOperand(2);
6084
6085 if (ST.hasAddSubU64Insts()) {
6086 auto I = BuildMI(*BB, MI, DL,
6087 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
6088 : AMDGPU::V_SUB_U64_e64),
6089 Dest.getReg())
6090 .add(Src0)
6091 .add(Src1)
6092 .addImm(0); // clamp
6093 TII->legalizeOperands(*I);
6094 MI.eraseFromParent();
6095 return BB;
6096 }
6097
6098 if (IsAdd && ST.hasLshlAddU64Inst()) {
6099 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
6100 Dest.getReg())
6101 .add(Src0)
6102 .addImm(0)
6103 .add(Src1);
6104 TII->legalizeOperands(*Add);
6105 MI.eraseFromParent();
6106 return BB;
6107 }
6108
6109 const auto *CarryRC = TRI->getWaveMaskRegClass();
6110
6111 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6112 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6113
6114 Register CarryReg = MRI.createVirtualRegister(CarryRC);
6115 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6116
6117 const TargetRegisterClass *Src0RC = Src0.isReg()
6118 ? MRI.getRegClass(Src0.getReg())
6119 : &AMDGPU::VReg_64RegClass;
6120 const TargetRegisterClass *Src1RC = Src1.isReg()
6121 ? MRI.getRegClass(Src1.getReg())
6122 : &AMDGPU::VReg_64RegClass;
6123
6124 const TargetRegisterClass *Src0SubRC =
6125 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6126 const TargetRegisterClass *Src1SubRC =
6127 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6128
6129 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
6130 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6131 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
6132 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6133
6134 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
6135 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6136 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
6137 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6138
6139 unsigned LoOpc =
6140 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6141 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
6142 .addReg(CarryReg, RegState::Define)
6143 .add(SrcReg0Sub0)
6144 .add(SrcReg1Sub0)
6145 .addImm(0); // clamp bit
6146
6147 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6148 MachineInstr *HiHalf =
6149 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
6150 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
6151 .add(SrcReg0Sub1)
6152 .add(SrcReg1Sub1)
6153 .addReg(CarryReg, RegState::Kill)
6154 .addImm(0); // clamp bit
6155
6156 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
6157 .addReg(DestSub0)
6158 .addImm(AMDGPU::sub0)
6159 .addReg(DestSub1)
6160 .addImm(AMDGPU::sub1);
6161 TII->legalizeOperands(*LoHalf);
6162 TII->legalizeOperands(*HiHalf);
6163 MI.eraseFromParent();
6164 return BB;
6165 }
6166 case AMDGPU::S_ADD_CO_PSEUDO:
6167 case AMDGPU::S_SUB_CO_PSEUDO: {
6168 // This pseudo can only be selected
6169 // from a uniform add/subcarry node. All the VGPR operands
6170 // are therefore assumed to be splat vectors.
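// Because the operands are uniform, V_READFIRSTLANE_B32 below yields the value
// every lane holds, so the carry can be evaluated with scalar instructions.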
6172 MachineOperand &Dest = MI.getOperand(0);
6173 MachineOperand &CarryDest = MI.getOperand(1);
6174 MachineOperand &Src0 = MI.getOperand(2);
6175 MachineOperand &Src1 = MI.getOperand(3);
6176 MachineOperand &Src2 = MI.getOperand(4);
6177 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
6178 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6179 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6180 .addReg(Src0.getReg());
6181 Src0.setReg(RegOp0);
6182 }
6183 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
6184 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6185 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6186 .addReg(Src1.getReg());
6187 Src1.setReg(RegOp1);
6188 }
6189 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6190 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
6191 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6192 .addReg(Src2.getReg());
6193 Src2.setReg(RegOp2);
6194 }
6195
6196 if (ST.isWave64()) {
6197 if (ST.hasScalarCompareEq64()) {
6198 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
6199 .addReg(Src2.getReg())
6200 .addImm(0);
6201 } else {
6202 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
6203 const TargetRegisterClass *SubRC =
6204 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6205 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
6206 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6207 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
6208 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6209 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6210
6211 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
6212 .add(Src2Sub0)
6213 .add(Src2Sub1);
6214
6215 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6216 .addReg(Src2_32, RegState::Kill)
6217 .addImm(0);
6218 }
6219 } else {
6220 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6221 .addReg(Src2.getReg())
6222 .addImm(0);
6223 }
6224
6225 unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
6226 ? AMDGPU::S_ADDC_U32
6227 : AMDGPU::S_SUBB_U32;
6228
6229 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
6230
6231 unsigned SelOpc =
6232 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6233
6234 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
6235 .addImm(-1)
6236 .addImm(0);
6237
6238 MI.eraseFromParent();
6239 return BB;
6240 }
6241 case AMDGPU::SI_INIT_M0: {
6242 MachineOperand &M0Init = MI.getOperand(0);
6243 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6244 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6245 AMDGPU::M0)
6246 .add(M0Init);
6247 MI.eraseFromParent();
6248 return BB;
6249 }
6250 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6251 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6252 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6253 TII->get(AMDGPU::S_CMP_EQ_U32))
6254 .addImm(0)
6255 .addImm(0);
6256 return BB;
6257 }
6258 case AMDGPU::GET_GROUPSTATICSIZE: {
6259 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
6260 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
6261 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
6262 .add(MI.getOperand(0))
6263 .addImm(MFI->getLDSSize());
6264 MI.eraseFromParent();
6265 return BB;
6266 }
6267 case AMDGPU::GET_SHADERCYCLESHILO: {
6269 // The algorithm is:
6270 //
6271 // hi1 = getreg(SHADER_CYCLES_HI)
6272 // lo1 = getreg(SHADER_CYCLES_LO)
6273 // hi2 = getreg(SHADER_CYCLES_HI)
6274 //
6275 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
6276 // Otherwise there was overflow and the result is hi2:0. In both cases the
6277 // result should represent the actual time at some point during the sequence
6278 // of three getregs.
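// For example: if the low half wrapped between the two HI reads (hi1 != hi2),
// then lo1 cannot be paired with hi2; hi2:0 is still a value the counter held
// at some point between the first and last read, so it is returned instead.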
6279 using namespace AMDGPU::Hwreg;
6280 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6281 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
6282 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6283 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6284 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
6285 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6286 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6287 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
6288 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6289 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
6290 .addReg(RegHi1)
6291 .addReg(RegHi2);
6292 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6293 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
6294 .addReg(RegLo1)
6295 .addImm(0);
6296 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
6297 .add(MI.getOperand(0))
6298 .addReg(RegLo)
6299 .addImm(AMDGPU::sub0)
6300 .addReg(RegHi2)
6301 .addImm(AMDGPU::sub1);
6302 MI.eraseFromParent();
6303 return BB;
6304 }
6305 case AMDGPU::SI_INDIRECT_SRC_V1:
6306 case AMDGPU::SI_INDIRECT_SRC_V2:
6307 case AMDGPU::SI_INDIRECT_SRC_V4:
6308 case AMDGPU::SI_INDIRECT_SRC_V8:
6309 case AMDGPU::SI_INDIRECT_SRC_V9:
6310 case AMDGPU::SI_INDIRECT_SRC_V10:
6311 case AMDGPU::SI_INDIRECT_SRC_V11:
6312 case AMDGPU::SI_INDIRECT_SRC_V12:
6313 case AMDGPU::SI_INDIRECT_SRC_V16:
6314 case AMDGPU::SI_INDIRECT_SRC_V32:
6315 return emitIndirectSrc(MI, *BB, *getSubtarget());
6316 case AMDGPU::SI_INDIRECT_DST_V1:
6317 case AMDGPU::SI_INDIRECT_DST_V2:
6318 case AMDGPU::SI_INDIRECT_DST_V4:
6319 case AMDGPU::SI_INDIRECT_DST_V8:
6320 case AMDGPU::SI_INDIRECT_DST_V9:
6321 case AMDGPU::SI_INDIRECT_DST_V10:
6322 case AMDGPU::SI_INDIRECT_DST_V11:
6323 case AMDGPU::SI_INDIRECT_DST_V12:
6324 case AMDGPU::SI_INDIRECT_DST_V16:
6325 case AMDGPU::SI_INDIRECT_DST_V32:
6326 return emitIndirectDst(MI, *BB, *getSubtarget());
6327 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6328 case AMDGPU::SI_KILL_I1_PSEUDO:
6329 return splitKillBlock(MI, BB);
6330 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6331 Register Dst = MI.getOperand(0).getReg();
6332 const MachineOperand &Src0 = MI.getOperand(1);
6333 const MachineOperand &Src1 = MI.getOperand(2);
6334 Register SrcCond = MI.getOperand(3).getReg();
6335
6336 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6337 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6338 const auto *CondRC = TRI->getWaveMaskRegClass();
6339 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
6340
6341 const TargetRegisterClass *Src0RC = Src0.isReg()
6342 ? MRI.getRegClass(Src0.getReg())
6343 : &AMDGPU::VReg_64RegClass;
6344 const TargetRegisterClass *Src1RC = Src1.isReg()
6345 ? MRI.getRegClass(Src1.getReg())
6346 : &AMDGPU::VReg_64RegClass;
6347
6348 const TargetRegisterClass *Src0SubRC =
6349 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6350 const TargetRegisterClass *Src1SubRC =
6351 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6352
6353 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
6354 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6355 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
6356 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6357
6358 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
6359 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6360 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
6361 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6362
6363 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
6364 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
6365 .addImm(0)
6366 .add(Src0Sub0)
6367 .addImm(0)
6368 .add(Src1Sub0)
6369 .addReg(SrcCondCopy);
6370 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
6371 .addImm(0)
6372 .add(Src0Sub1)
6373 .addImm(0)
6374 .add(Src1Sub1)
6375 .addReg(SrcCondCopy);
6376
6377 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
6378 .addReg(DstLo)
6379 .addImm(AMDGPU::sub0)
6380 .addReg(DstHi)
6381 .addImm(AMDGPU::sub1);
6382 MI.eraseFromParent();
6383 return BB;
6384 }
6385 case AMDGPU::SI_BR_UNDEF: {
6386 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
6387 .add(MI.getOperand(0));
6388 Br->getOperand(1).setIsUndef(); // read undef SCC
6389 MI.eraseFromParent();
6390 return BB;
6391 }
6392 case AMDGPU::ADJCALLSTACKUP:
6393 case AMDGPU::ADJCALLSTACKDOWN: {
6395 MachineInstrBuilder MIB(*MF, &MI);
6396 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
6397 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
6398 return BB;
6399 }
6400 case AMDGPU::SI_CALL_ISEL: {
6401 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
6402
6404 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6405
6406 for (const MachineOperand &MO : MI.operands())
6407 MIB.add(MO);
6408
6409 MIB.cloneMemRefs(MI);
6410 MI.eraseFromParent();
6411 return BB;
6412 }
6413 case AMDGPU::V_ADD_CO_U32_e32:
6414 case AMDGPU::V_SUB_CO_U32_e32:
6415 case AMDGPU::V_SUBREV_CO_U32_e32: {
6416 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
6417 unsigned Opc = MI.getOpcode();
6418
6419 bool NeedClampOperand = false;
6420 if (TII->pseudoToMCOpcode(Opc) == -1) {
6422 NeedClampOperand = true;
6423 }
6424
6425 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
6426 if (TII->isVOP3(*I)) {
6427 I.addReg(TRI->getVCC(), RegState::Define);
6428 }
6429 I.add(MI.getOperand(1)).add(MI.getOperand(2));
6430 if (NeedClampOperand)
6431 I.addImm(0); // clamp bit for e64 encoding
6432
6433 TII->legalizeOperands(*I);
6434
6435 MI.eraseFromParent();
6436 return BB;
6437 }
6438 case AMDGPU::V_ADDC_U32_e32:
6439 case AMDGPU::V_SUBB_U32_e32:
6440 case AMDGPU::V_SUBBREV_U32_e32:
6441 // These instructions have an implicit use of vcc which counts towards the
6442 // constant bus limit.
6443 TII->legalizeOperands(MI);
6444 return BB;
6445 case AMDGPU::DS_GWS_INIT:
6446 case AMDGPU::DS_GWS_SEMA_BR:
6447 case AMDGPU::DS_GWS_BARRIER:
6448 case AMDGPU::DS_GWS_SEMA_V:
6449 case AMDGPU::DS_GWS_SEMA_P:
6450 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6451 // An s_waitcnt 0 is required to be the instruction immediately following.
6452 if (getSubtarget()->hasGWSAutoReplay()) {
6453 bundleInstWithWaitcnt(MI);
6454 return BB;
6455 }
6456
6457 return emitGWSMemViolTestLoop(MI, BB);
6458 case AMDGPU::S_SETREG_B32: {
6459 // Try to optimize cases that only set the denormal mode or rounding mode.
6460 //
6461 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
6462 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
6463 // instead.
6464 //
6465 // FIXME: This could be done with predicates on the immediate, but tablegen
6466 // doesn't allow you to have a no-side-effect instruction in the output of a
6467 // side-effecting pattern.
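// For example: a setreg whose source is a known constant and whose hwreg
// field covers only MODE[3:0] (the FP rounding mode) can be emitted as
// s_round_mode <imm>, and one that covers only the FP denormal-mode bits can
// be emitted as s_denorm_mode <imm>.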
6468 auto [ID, Offset, Width] =
6469 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
6471 return BB;
6472
6473 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
6474 const unsigned SetMask = WidthMask << Offset;
6475
6476 if (getSubtarget()->hasDenormModeInst()) {
6477 unsigned SetDenormOp = 0;
6478 unsigned SetRoundOp = 0;
6479
6480 // The dedicated instructions can only set the whole denorm or round mode
6481 // at once, not a subset of bits in either.
6482 if (SetMask ==
6484 // If this fully sets both the round and denorm mode, emit the two
6485 // dedicated instructions for these.
6486 SetRoundOp = AMDGPU::S_ROUND_MODE;
6487 SetDenormOp = AMDGPU::S_DENORM_MODE;
6488 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
6489 SetRoundOp = AMDGPU::S_ROUND_MODE;
6490 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
6491 SetDenormOp = AMDGPU::S_DENORM_MODE;
6492 }
6493
6494 if (SetRoundOp || SetDenormOp) {
6495 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
6496 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6497 unsigned ImmVal = Def->getOperand(1).getImm();
6498 if (SetRoundOp) {
6499 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
6500 .addImm(ImmVal & 0xf);
6501
6502 // If we also have the denorm mode, get just the denorm mode bits.
6503 ImmVal >>= 4;
6504 }
6505
6506 if (SetDenormOp) {
6507 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
6508 .addImm(ImmVal & 0xf);
6509 }
6510
6511 MI.eraseFromParent();
6512 return BB;
6513 }
6514 }
6515 }
6516
6517 // If only FP bits are touched, use the no-side-effects pseudo.
6518 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
6519 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
6520 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
6521
6522 return BB;
6523 }
6524 case AMDGPU::S_INVERSE_BALLOT_U32:
6525 case AMDGPU::S_INVERSE_BALLOT_U64:
6526 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
6527 // necessary. After that they are equivalent to a COPY.
6528 MI.setDesc(TII->get(AMDGPU::COPY));
6529 return BB;
6530 case AMDGPU::ENDPGM_TRAP: {
6531 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
6532 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
6533 MI.addOperand(MachineOperand::CreateImm(0));
6534 return BB;
6535 }
6536
6537 // We need a block split to make the real endpgm a terminator. We also don't
6538 // want to break phis in successor blocks, so we can't just delete to the
6539 // end of the block.
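// Resulting structure, roughly:
//   BB:      ... S_CBRANCH_EXECNZ %TrapBB
//   SplitBB: the instructions that followed ENDPGM_TRAP
//   TrapBB:  S_ENDPGM 0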
6540
6541 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
6543 MF->push_back(TrapBB);
6544 // clang-format off
6545 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
6546 .addImm(0);
6547 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
6548 .addMBB(TrapBB);
6549 // clang-format on
6550
6551 BB->addSuccessor(TrapBB);
6552 MI.eraseFromParent();
6553 return SplitBB;
6554 }
6555 case AMDGPU::SIMULATED_TRAP: {
6556 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6557 MachineBasicBlock *SplitBB =
6558 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6559 MI.eraseFromParent();
6560 return SplitBB;
6561 }
6562 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6563 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6565
6566 // During ISel, it's difficult to propagate the original EXEC mask to use as
6567 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
6568 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
6569 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6570 Register OriginalExec = Setup->getOperand(0).getReg();
6571 MF->getRegInfo().clearKillFlags(OriginalExec);
6572 MI.getOperand(0).setReg(OriginalExec);
6573 return BB;
6574 }
6575 default:
6576 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
6577 if (!MI.mayStore())
6579 return BB;
6580 }
6582 }
6583}
6584
6586 // This currently forces unfolding various combinations of fsub into fma with
6587 // free fneg'd operands. As long as we have fast FMA (controlled by
6588 // isFMAFasterThanFMulAndFAdd), we should perform these.
6589
6590 // For f64, where fma is quarter rate and add / sub are at best half rate,
6591 // most of these combines appear to be cycle neutral but save on instruction
6592 // count / code size.
6593 return true;
6594}
6595
6597
6599 EVT VT) const {
6600 if (!VT.isVector()) {
6601 return MVT::i1;
6602 }
6603 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
6604}
6605
6607 // TODO: Should i16 be used always if legal? For now it would force VALU
6608 // shifts.
6609 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6610}
6611
6613 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6614 ? Ty.changeElementSize(16)
6615 : Ty.changeElementSize(32);
6616}
6617
6618 // Answering this is somewhat tricky and depends on the specific device, as
6619 // different devices have different rates for fma and for f64 operations in general.
6620//
6621// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
6622// regardless of which device (although the number of cycles differs between
6623// devices), so it is always profitable for f64.
6624//
6625// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
6626// only on full rate devices. Normally, we should prefer selecting v_mad_f32
6627// which we can always do even without fused FP ops since it returns the same
6628// result as the separate operations and since it is always full
6629// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
6630// however does not support denormals, so we do report fma as faster if we have
6631// a fast fma device and require denormals.
6632//
6633 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
6634 EVT VT) const {
6635 VT = VT.getScalarType();
6636
6637 switch (VT.getSimpleVT().SimpleTy) {
6638 case MVT::f32: {
6639 // If mad is not available, this depends only on whether f32 fma is full rate.
6640 if (!Subtarget->hasMadMacF32Insts())
6641 return Subtarget->hasFastFMAF32();
6642
6643 // Otherwise f32 mad is always full rate and returns the same result as
6644 // the separate operations, so it should be preferred over fma.
6645 // However, it does not support denormals.
6646 if (!denormalModeIsFlushAllF32(MF))
6647 return Subtarget->hasFastFMAF32();
6648
6649 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
6650 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6651 }
6652 case MVT::f64:
6653 return true;
6654 case MVT::f16:
6655 case MVT::bf16:
6656 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
6657 default:
6658 break;
6659 }
6660
6661 return false;
6662}
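
// Illustrative sketch (not part of the original source): the f32 case above,
// restated as a standalone predicate over invented capability flags so the
// decision table is easy to read in isolation. It only mirrors the logic.
static bool preferF32FMAOverMulAdd(bool HasMadMacF32, bool HasFastFMAF32,
                                   bool F32DenormalsFlushed, bool HasDLInsts) {
  if (!HasMadMacF32)        // No v_mad_f32: only fma's rate matters.
    return HasFastFMAF32;
  if (!F32DenormalsFlushed) // v_mad_f32 flushes denormals; fma preserves them.
    return HasFastFMAF32;
  // With denormals flushed, v_mad_f32 is always full rate; fma only wins when
  // the target also has v_fmac_f32 (DL insts) and fast f32 fma.
  return HasFastFMAF32 && HasDLInsts;
}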
6663
6665 LLT Ty) const {
6666 switch (Ty.getScalarSizeInBits()) {
6667 case 16:
6668 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
6669 case 32:
6670 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
6671 case 64:
6672 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
6673 default:
6674 break;
6675 }
6676
6677 return false;
6678}
6679
6681 if (!Ty.isScalar())
6682 return false;
6683
6684 if (Ty.getScalarSizeInBits() == 16)
6685 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
6686 if (Ty.getScalarSizeInBits() == 32)
6687 return Subtarget->hasMadMacF32Insts() &&
6688 denormalModeIsFlushAllF32(*MI.getMF());
6689
6690 return false;
6691}
6692
6694 const SDNode *N) const {
6695 // TODO: Check future ftz flag
6696 // v_mad_f32/v_mac_f32 do not support denormals.
6697 EVT VT = N->getValueType(0);
6698 if (VT == MVT::f32)
6699 return Subtarget->hasMadMacF32Insts() &&
6701 if (VT == MVT::f16) {
6702 return Subtarget->hasMadF16() &&
6704 }
6705
6706 return false;
6707}
6708
6709//===----------------------------------------------------------------------===//
6710// Custom DAG Lowering Operations
6711//===----------------------------------------------------------------------===//
6712
6713// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6714// wider vector type is legal.
6716 SelectionDAG &DAG) const {
6717 unsigned Opc = Op.getOpcode();
6718 EVT VT = Op.getValueType();
6719 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6720 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6721 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6722 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6723 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6724 VT == MVT::v32bf16);
6725
6726 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
6727
6728 SDLoc SL(Op);
6729 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
6730 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
6731
6732 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6733}
6734
6735 // Enable lowering of ROTR for vxi32 types. This is a workaround for a
6736 // regression whereby extra unnecessary instructions were added to the codegen
6737 // for rotr operations, caused by legalizing v2i32 or operations. This resulted
6738 // in extra instructions to extract the result from the vector.
6740 [[maybe_unused]] EVT VT = Op.getValueType();
6741
6742 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6743 VT == MVT::v16i32) &&
6744 "Unexpected ValueType.");
6745
6746 return DAG.UnrollVectorOp(Op.getNode());
6747}
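
// Illustrative sketch (not part of the original source): what unrolling a
// v4i32 rotr amounts to at the value level - one independent scalar rotate
// per element. The helper names are invented for the example.
#include <array>
#include <cstdint>

static uint32_t rotr32(uint32_t X, uint32_t Amt) {
  Amt &= 31;
  return Amt ? (X >> Amt) | (X << (32 - Amt)) : X;
}

static std::array<uint32_t, 4> unrolledRotrV4I32(std::array<uint32_t, 4> Val,
                                                 std::array<uint32_t, 4> Amt) {
  std::array<uint32_t, 4> Out{};
  for (unsigned I = 0; I != 4; ++I)
    Out[I] = rotr32(Val[I], Amt[I]); // one scalar op per lane
  return Out;
}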
6748
6749// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6750// wider vector type is legal.
6752 SelectionDAG &DAG) const {
6753 unsigned Opc = Op.getOpcode();
6754 EVT VT = Op.getValueType();
6755 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6756 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6757 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6758 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6759 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6760 VT == MVT::v32bf16);
6761
6762 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
6763 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6764
6765 SDLoc SL(Op);
6766
6767 SDValue OpLo =
6768 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6769 SDValue OpHi =
6770 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6771
6772 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6773}
6774
6776 SelectionDAG &DAG) const {
6777 unsigned Opc = Op.getOpcode();
6778 EVT VT = Op.getValueType();
6779 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6780 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6781 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6782 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6783 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6784 VT == MVT::v32bf16);
6785
6786 SDValue Op0 = Op.getOperand(0);
6787 auto [Lo0, Hi0] = Op0.getValueType().isVector()
6788 ? DAG.SplitVectorOperand(Op.getNode(), 0)
6789 : std::pair(Op0, Op0);
6790
6791 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6792 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
6793
6794 SDLoc SL(Op);
6795 auto ResVT = DAG.GetSplitDestVTs(VT);
6796
6797 SDValue OpLo =
6798 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
6799 SDValue OpHi =
6800 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
6801
6802 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6803}
6804
6805 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
6806 switch (Op.getOpcode()) {
6807 default:
6808 return AMDGPUTargetLowering::LowerOperation(Op, DAG);
6809 case ISD::BRCOND:
6810 return LowerBRCOND(Op, DAG);
6811 case ISD::RETURNADDR:
6812 return LowerRETURNADDR(Op, DAG);
6813 case ISD::LOAD: {
6814 SDValue Result = LowerLOAD(Op, DAG);
6815 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6816 "Load should return a value and a chain");
6817 return Result;
6818 }
6819 case ISD::FSQRT: {
6820 EVT VT = Op.getValueType();
6821 if (VT == MVT::f32)
6822 return lowerFSQRTF32(Op, DAG);
6823 if (VT == MVT::f64)
6824 return lowerFSQRTF64(Op, DAG);
6825 return SDValue();
6826 }
6827 case ISD::FSIN:
6828 case ISD::FCOS:
6829 return LowerTrig(Op, DAG);
6830 case ISD::SELECT:
6831 return LowerSELECT(Op, DAG);
6832 case ISD::FDIV:
6833 return LowerFDIV(Op, DAG);
6834 case ISD::FFREXP:
6835 return LowerFFREXP(Op, DAG);
6836 case ISD::ATOMIC_CMP_SWAP:
6837 return LowerATOMIC_CMP_SWAP(Op, DAG);
6838 case ISD::STORE:
6839 return LowerSTORE(Op, DAG);
6840 case ISD::GlobalAddress: {
6841 MachineFunction &MF = DAG.getMachineFunction();
6842 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6843 return LowerGlobalAddress(MFI, Op, DAG);
6844 }
6845 case ISD::INTRINSIC_WO_CHAIN:
6846 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6847 case ISD::INTRINSIC_W_CHAIN:
6848 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6849 case ISD::INTRINSIC_VOID:
6850 return LowerINTRINSIC_VOID(Op, DAG);
6851 case ISD::ADDRSPACECAST:
6852 return lowerADDRSPACECAST(Op, DAG);
6853 case ISD::INSERT_SUBVECTOR:
6854 return lowerINSERT_SUBVECTOR(Op, DAG);
6855 case ISD::INSERT_VECTOR_ELT:
6856 return lowerINSERT_VECTOR_ELT(Op, DAG);
6857 case ISD::EXTRACT_VECTOR_ELT:
6858 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
6859 case ISD::VECTOR_SHUFFLE:
6860 return lowerVECTOR_SHUFFLE(Op, DAG);
6861 case ISD::SCALAR_TO_VECTOR:
6862 return lowerSCALAR_TO_VECTOR(Op, DAG);
6863 case ISD::BUILD_VECTOR:
6864 return lowerBUILD_VECTOR(Op, DAG);
6865 case ISD::FP_ROUND:
6866 case ISD::STRICT_FP_ROUND:
6867 return lowerFP_ROUND(Op, DAG);
6868 case ISD::TRAP:
6869 return lowerTRAP(Op, DAG);
6870 case ISD::DEBUGTRAP:
6871 return lowerDEBUGTRAP(Op, DAG);
6872 case ISD::ABS:
6873 case ISD::FABS:
6874 case ISD::FNEG:
6875 case ISD::FCANONICALIZE:
6876 case ISD::BSWAP:
6877 return splitUnaryVectorOp(Op, DAG);
6878 case ISD::FMINNUM:
6879 case ISD::FMAXNUM:
6880 return lowerFMINNUM_FMAXNUM(Op, DAG);
6881 case ISD::FMINIMUMNUM:
6882 case ISD::FMAXIMUMNUM:
6883 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
6884 case ISD::FMINIMUM:
6885 case ISD::FMAXIMUM:
6886 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
6887 case ISD::FLDEXP:
6888 case ISD::STRICT_FLDEXP:
6889 return lowerFLDEXP(Op, DAG);
6890 case ISD::FMA:
6891 return splitTernaryVectorOp(Op, DAG);
6892 case ISD::FP_TO_SINT:
6893 case ISD::FP_TO_UINT:
6894 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11 &&
6895 Op.getValueType() == MVT::i16 &&
6896 Op.getOperand(0).getValueType() == MVT::f32) {
6897 // Make f32->i16 legal so we can select V_CVT_PK_[IU]16_F32.
6898 return Op;
6899 }
6900 return LowerFP_TO_INT(Op, DAG);
6901 case ISD::SHL:
6902 case ISD::SRA:
6903 case ISD::SRL:
6904 case ISD::ADD:
6905 case ISD::SUB:
6906 case ISD::SMIN:
6907 case ISD::SMAX:
6908 case ISD::UMIN:
6909 case ISD::UMAX:
6910 case ISD::FADD:
6911 case ISD::FMUL:
6912 case ISD::FMINNUM_IEEE:
6913 case ISD::FMAXNUM_IEEE:
6914 case ISD::UADDSAT:
6915 case ISD::USUBSAT:
6916 case ISD::SADDSAT:
6917 case ISD::SSUBSAT:
6918 return splitBinaryVectorOp(Op, DAG);
6919 case ISD::FCOPYSIGN:
6920 return lowerFCOPYSIGN(Op, DAG);
6921 case ISD::MUL:
6922 return lowerMUL(Op, DAG);
6923 case ISD::SMULO:
6924 case ISD::UMULO:
6925 return lowerXMULO(Op, DAG);
6926 case ISD::SMUL_LOHI:
6927 case ISD::UMUL_LOHI:
6928 return lowerXMUL_LOHI(Op, DAG);
6929 case ISD::DYNAMIC_STACKALLOC:
6930 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6931 case ISD::STACKSAVE:
6932 return LowerSTACKSAVE(Op, DAG);
6933 case ISD::GET_ROUNDING:
6934 return lowerGET_ROUNDING(Op, DAG);
6935 case ISD::SET_ROUNDING:
6936 return lowerSET_ROUNDING(Op, DAG);
6937 case ISD::PREFETCH:
6938 return lowerPREFETCH(Op, DAG);
6939 case ISD::FP_EXTEND:
6940 case ISD::STRICT_FP_EXTEND:
6941 return lowerFP_EXTEND(Op, DAG);
6942 case ISD::GET_FPENV:
6943 return lowerGET_FPENV(Op, DAG);
6944 case ISD::SET_FPENV:
6945 return lowerSET_FPENV(Op, DAG);
6946 case ISD::ROTR:
6947 return lowerROTR(Op, DAG);
6948 }
6949 return SDValue();
6950}
6951
6952// Used for D16: Casts the result of an instruction into the right vector,
6953// packs values if loads return unpacked values.
6955 const SDLoc &DL, SelectionDAG &DAG,
6956 bool Unpacked) {
6957 if (!LoadVT.isVector())
6958 return Result;
6959
6960 // Cast back to the original packed type or to a larger type that is a
6961 // multiple of 32 bits for D16. Widening the return type is required for
6962 // legalization.
6963 EVT FittingLoadVT = LoadVT;
6964 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6965 FittingLoadVT =
6967 LoadVT.getVectorNumElements() + 1);
6968 }
6969
6970 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6971 // Truncate to v2i16/v4i16.
6972 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6973
6974 // Workaround legalizer not scalarizing truncate after vector op
6975 // legalization but not creating intermediate vector trunc.
6977 DAG.ExtractVectorElements(Result, Elts);
6978 for (SDValue &Elt : Elts)
6979 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6980
6981 // Pad illegal v1i16/v3f16 to v4i16
6982 if ((LoadVT.getVectorNumElements() % 2) == 1)
6983 Elts.push_back(DAG.getPOISON(MVT::i16));
6984
6985 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6986
6987 // Bitcast to original type (v2f16/v4f16).
6988 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6989 }
6990
6991 // Cast back to the original packed type.
6992 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6993}
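
// Illustrative sketch (not part of the original source): with unpacked D16
// results, each 16-bit value comes back in the low half of a 32-bit lane, so
// the repack above amounts to truncating every lane to 16 bits and rebuilding
// the vector. Names are invented for the example.
#include <cstdint>
#include <vector>

static std::vector<uint16_t>
repackUnpackedD16(const std::vector<uint32_t> &Lanes) {
  std::vector<uint16_t> Packed;
  Packed.reserve(Lanes.size());
  for (uint32_t L : Lanes)
    Packed.push_back(static_cast<uint16_t>(L)); // keep only the low 16 bits
  return Packed;
}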
6994
6995SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6996 SelectionDAG &DAG,
6998 bool IsIntrinsic) const {
6999 SDLoc DL(M);
7000
7001 bool Unpacked = Subtarget->hasUnpackedD16VMem();
7002 EVT LoadVT = M->getValueType(0);
7003
7004 EVT EquivLoadVT = LoadVT;
7005 if (LoadVT.isVector()) {
7006 if (Unpacked) {
7007 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
7008 LoadVT.getVectorNumElements());
7009 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
7010 // Widen v3f16 to legal type
7011 EquivLoadVT =
7013 LoadVT.getVectorNumElements() + 1);
7014 }
7015 }
7016
7017 // Change from v4f16/v2f16 to EquivLoadVT.
7018 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
7019
7021 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
7022 M->getMemoryVT(), M->getMemOperand());
7023
7024 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
7025
7026 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
7027}
7028
7029SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
7030 SelectionDAG &DAG,
7031 ArrayRef<SDValue> Ops) const {
7032 SDLoc DL(M);
7033 EVT LoadVT = M->getValueType(0);
7034 EVT EltType = LoadVT.getScalarType();
7035 EVT IntVT = LoadVT.changeTypeToInteger();
7036
7037 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
7038
7039 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
7040 bool IsTFE = M->getNumValues() == 3;
7041
7042 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
7043 : AMDGPUISD::BUFFER_LOAD_FORMAT)
7044 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
7045 : AMDGPUISD::BUFFER_LOAD;
7046
7047 if (IsD16) {
7048 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
7049 }
7050
7051 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
7052 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
7053 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
7054 IsTFE);
7055
7056 if (isTypeLegal(LoadVT)) {
7057 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
7058 M->getMemOperand(), DAG);
7059 }
7060
7061 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
7062 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
7063 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
7064 M->getMemOperand(), DAG);
7065 return DAG.getMergeValues(
7066 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
7067 DL);
7068}
7069
7071 SelectionDAG &DAG) {
7072 EVT VT = N->getValueType(0);
7073 unsigned CondCode = N->getConstantOperandVal(3);
7074 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
7075 return DAG.getPOISON(VT);
7076
7077 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
7078
7079 SDValue LHS = N->getOperand(1);
7080 SDValue RHS = N->getOperand(2);
7081
7082 SDLoc DL(N);
7083
7084 EVT CmpVT = LHS.getValueType();
7085 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
7086 unsigned PromoteOp =
7087 ICmpInst::isSigned(IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7088 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
7089 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
7090 }
7091
7092 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
7093
7094 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7095 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7096
7097 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
7098 DAG.getCondCode(CCOpcode));
7099 if (VT.bitsEq(CCVT))
7100 return SetCC;
7101 return DAG.getZExtOrTrunc(SetCC, DL, VT);
7102}
7103
7105 SelectionDAG &DAG) {
7106 EVT VT = N->getValueType(0);
7107
7108 unsigned CondCode = N->getConstantOperandVal(3);
7109 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
7110 return DAG.getPOISON(VT);
7111
7112 SDValue Src0 = N->getOperand(1);
7113 SDValue Src1 = N->getOperand(2);
7114 EVT CmpVT = Src0.getValueType();
7115 SDLoc SL(N);
7116
7117 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
7118 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7119 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7120 }
7121
7122 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
7123 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
7124 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7125 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7126 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7127 DAG.getCondCode(CCOpcode));
7128 if (VT.bitsEq(CCVT))
7129 return SetCC;
7130 return DAG.getZExtOrTrunc(SetCC, SL, VT);
7131}
7132
7134 SelectionDAG &DAG) {
7135 EVT VT = N->getValueType(0);
7136 SDValue Src = N->getOperand(1);
7137 SDLoc SL(N);
7138
7139 if (Src.getOpcode() == ISD::SETCC) {
7140 SDValue Op0 = Src.getOperand(0);
7141 SDValue Op1 = Src.getOperand(1);
7142 // Need to expand bfloat to float for comparison (setcc).
7143 if (Op0.getValueType() == MVT::bf16) {
7144 Op0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0);
7145 Op1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1);
7146 }
7147 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
7148 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Op0, Op1, Src.getOperand(2));
7149 }
7150 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
7151 // (ballot 0) -> 0
7152 if (Arg->isZero())
7153 return DAG.getConstant(0, SL, VT);
7154
7155 // (ballot 1) -> EXEC/EXEC_LO
7156 if (Arg->isOne()) {
7157 Register Exec;
7158 if (VT.getScalarSizeInBits() == 32)
7159 Exec = AMDGPU::EXEC_LO;
7160 else if (VT.getScalarSizeInBits() == 64)
7161 Exec = AMDGPU::EXEC;
7162 else
7163 return SDValue();
7164
7165 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
7166 }
7167 }
7168
7169 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
7170 // ISD::SETNE)
7171 return DAG.getNode(
7172 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
7173 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
7174}
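
// Illustrative sketch (not part of the original source): a host-side model of
// what ballot computes - one bit per lane that is both active in EXEC and has
// a true condition. The wave size and names are assumptions of the example.
#include <cstdint>

static uint64_t emulateBallot(const bool Cond[], uint64_t Exec,
                              unsigned WaveSize /* 32 or 64 */) {
  uint64_t Mask = 0;
  for (unsigned Lane = 0; Lane != WaveSize; ++Lane)
    if (((Exec >> Lane) & 1) && Cond[Lane])
      Mask |= uint64_t(1) << Lane;
  return Mask; // (ballot 1) == EXEC and (ballot 0) == 0, matching the folds above
}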
7175
7177 SelectionDAG &DAG) {
7178 EVT VT = N->getValueType(0);
7179 unsigned ValSize = VT.getSizeInBits();
7180 unsigned IID = N->getConstantOperandVal(0);
7181 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7182 IID == Intrinsic::amdgcn_permlanex16;
7183 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7184 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7185 SDLoc SL(N);
7186 MVT IntVT = MVT::getIntegerVT(ValSize);
7187 const GCNSubtarget *ST = TLI.getSubtarget();
7188 unsigned SplitSize = 32;
7189 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7190 ST->hasDPALU_DPP() &&
7191 AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
7192 SplitSize = 64;
7193
7194 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
7195 SDValue Src2, MVT ValT) -> SDValue {
7196 SmallVector<SDValue, 8> Operands;
7197 switch (IID) {
7198 case Intrinsic::amdgcn_permlane16:
7199 case Intrinsic::amdgcn_permlanex16:
7200 case Intrinsic::amdgcn_update_dpp:
7201 Operands.push_back(N->getOperand(6));
7202 Operands.push_back(N->getOperand(5));
7203 Operands.push_back(N->getOperand(4));
7204 [[fallthrough]];
7205 case Intrinsic::amdgcn_writelane:
7206 Operands.push_back(Src2);
7207 [[fallthrough]];
7208 case Intrinsic::amdgcn_readlane:
7209 case Intrinsic::amdgcn_set_inactive:
7210 case Intrinsic::amdgcn_set_inactive_chain_arg:
7211 case Intrinsic::amdgcn_mov_dpp8:
7212 Operands.push_back(Src1);
7213 [[fallthrough]];
7214 case Intrinsic::amdgcn_readfirstlane:
7215 case Intrinsic::amdgcn_permlane64:
7216 Operands.push_back(Src0);
7217 break;
7218 default:
7219 llvm_unreachable("unhandled lane op");
7220 }
7221
7222 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
7223 std::reverse(Operands.begin(), Operands.end());
7224
7225 if (SDNode *GL = N->getGluedNode()) {
7226 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7227 GL = GL->getOperand(0).getNode();
7228 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7229 SDValue(GL, 0)));
7230 }
7231
7232 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
7233 };
7234
7235 SDValue Src0 = N->getOperand(1);
7236 SDValue Src1, Src2;
7237 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7238 IID == Intrinsic::amdgcn_mov_dpp8 ||
7239 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7240 Src1 = N->getOperand(2);
7241 if (IID == Intrinsic::amdgcn_writelane ||
7242 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7243 Src2 = N->getOperand(3);
7244 }
7245
7246 if (ValSize == SplitSize) {
7247 // Already legal
7248 return SDValue();
7249 }
7250
7251 if (ValSize < 32) {
7252 bool IsFloat = VT.isFloatingPoint();
7253 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
7254 SL, MVT::i32);
7255
7256 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7257 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
7258 SL, MVT::i32);
7259 }
7260
7261 if (IID == Intrinsic::amdgcn_writelane) {
7262 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
7263 SL, MVT::i32);
7264 }
7265
7266 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7267 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
7268 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
7269 }
7270
7271 if (ValSize % SplitSize != 0)
7272 return SDValue();
7273
7274 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
7275 EVT VT = N->getValueType(0);
7276 unsigned NE = VT.getVectorNumElements();
7277 EVT EltVT = VT.getVectorElementType();
7279 unsigned NumOperands = N->getNumOperands();
7280 SmallVector<SDValue, 4> Operands(NumOperands);
7281 SDNode *GL = N->getGluedNode();
7282
7283 // only handle convergencectrl_glue
7284 assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7285
7286 for (unsigned i = 0; i != NE; ++i) {
7287 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7288 ++j) {
7289 SDValue Operand = N->getOperand(j);
7290 EVT OperandVT = Operand.getValueType();
7291 if (OperandVT.isVector()) {
7292 // A vector operand; extract a single element.
7293 EVT OperandEltVT = OperandVT.getVectorElementType();
7294 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
7295 Operand, DAG.getVectorIdxConstant(i, SL));
7296 } else {
7297 // A scalar operand; just use it as is.
7298 Operands[j] = Operand;
7299 }
7300 }
7301
7302 if (GL)
7303 Operands[NumOperands - 1] =
7304 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7305 SDValue(GL->getOperand(0).getNode(), 0));
7306
7307 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
7308 }
7309
7310 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
7311 return DAG.getBuildVector(VecVT, SL, Scalars);
7312 };
7313
7314 if (VT.isVector()) {
7315 switch (MVT::SimpleValueType EltTy =
7317 case MVT::i32:
7318 case MVT::f32:
7319 if (SplitSize == 32) {
7320 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
7321 return unrollLaneOp(LaneOp.getNode());
7322 }
7323 [[fallthrough]];
7324 case MVT::i16:
7325 case MVT::f16:
7326 case MVT::bf16: {
7327 unsigned SubVecNumElt =
7328 SplitSize / VT.getVectorElementType().getSizeInBits();
7329 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
7331 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7332 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7333 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
7334 DAG.getConstant(EltIdx, SL, MVT::i32));
7335
7336 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7337 IsPermLane16)
7338 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
7339 DAG.getConstant(EltIdx, SL, MVT::i32));
7340
7341 if (IID == Intrinsic::amdgcn_writelane)
7342 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
7343 DAG.getConstant(EltIdx, SL, MVT::i32));
7344
7345 Pieces.push_back(
7346 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7347 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7348 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7349 EltIdx += SubVecNumElt;
7350 }
7351 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
7352 }
7353 default:
7354 // Handle all other cases by bitcasting to i32 vectors
7355 break;
7356 }
7357 }
7358
7359 MVT VecVT =
7360 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
7361 Src0 = DAG.getBitcast(VecVT, Src0);
7362
7363 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7364 Src1 = DAG.getBitcast(VecVT, Src1);
7365
7366 if (IID == Intrinsic::amdgcn_writelane)
7367 Src2 = DAG.getBitcast(VecVT, Src2);
7368
7369 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7370 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
7371 return DAG.getBitcast(VT, UnrolledLaneOp);
7372}
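
// Illustrative sketch (not part of the original source): the splitting shape
// used above, on a plain 64-bit value - apply a 32-bit lane operation to each
// half and recombine. PieceOp stands in for whichever 32-bit lane intrinsic
// is being legalized; the names are invented for the example.
#include <cstdint>

static uint64_t splitLaneOp64(uint64_t Val, uint32_t (*PieceOp)(uint32_t)) {
  uint32_t Lo = PieceOp(static_cast<uint32_t>(Val));
  uint32_t Hi = PieceOp(static_cast<uint32_t>(Val >> 32));
  return (uint64_t(Hi) << 32) | Lo;
}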
7373
7376 SelectionDAG &DAG) const {
7377 switch (N->getOpcode()) {
7379 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
7380 Results.push_back(Res);
7381 return;
7382 }
7384 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
7385 Results.push_back(Res);
7386 return;
7387 }
7389 unsigned IID = N->getConstantOperandVal(0);
7390 switch (IID) {
7391 case Intrinsic::amdgcn_make_buffer_rsrc:
7392 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
7393 return;
7394 case Intrinsic::amdgcn_cvt_pkrtz: {
7395 SDValue Src0 = N->getOperand(1);
7396 SDValue Src1 = N->getOperand(2);
7397 SDLoc SL(N);
7398 SDValue Cvt =
7399 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
7400 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
7401 return;
7402 }
7403 case Intrinsic::amdgcn_cvt_pknorm_i16:
7404 case Intrinsic::amdgcn_cvt_pknorm_u16:
7405 case Intrinsic::amdgcn_cvt_pk_i16:
7406 case Intrinsic::amdgcn_cvt_pk_u16: {
7407 SDValue Src0 = N->getOperand(1);
7408 SDValue Src1 = N->getOperand(2);
7409 SDLoc SL(N);
7410 unsigned Opcode;
7411
7412 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7413 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7414 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7415 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7416 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7417 Opcode = AMDGPUISD::CVT_PK_I16_I32;
7418 else
7419 Opcode = AMDGPUISD::CVT_PK_U16_U32;
7420
7421 EVT VT = N->getValueType(0);
7422 if (isTypeLegal(VT))
7423 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
7424 else {
7425 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
7426 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
7427 }
7428 return;
7429 }
7430 case Intrinsic::amdgcn_s_buffer_load: {
7431 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
7432 // s_buffer_load_u8 for both signed and unsigned loads. Next, the DAG
7433 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
7434 // (performSignExtendInRegCombine()) and replaces s_buffer_load_u8 with
7435 // s_buffer_load_i8.
7436 if (!Subtarget->hasScalarSubwordLoads())
7437 return;
7438 SDValue Op = SDValue(N, 0);
7439 SDValue Rsrc = Op.getOperand(1);
7440 SDValue Offset = Op.getOperand(2);
7441 SDValue CachePolicy = Op.getOperand(3);
7442 EVT VT = Op.getValueType();
7443 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
7444 SDLoc DL(Op);
7446 const DataLayout &DataLayout = DAG.getDataLayout();
7447 Align Alignment =
7453 VT.getStoreSize(), Alignment);
7454 SDValue LoadVal;
7455 if (!Offset->isDivergent()) {
7456 SDValue Ops[] = {Rsrc, // source register
7457 Offset, CachePolicy};
7458 SDValue BufferLoad =
7459 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
7460 DAG.getVTList(MVT::i32), Ops, VT, MMO);
7461 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
7462 } else {
7463 SDValue Ops[] = {
7464 DAG.getEntryNode(), // Chain
7465 Rsrc, // rsrc
7466 DAG.getConstant(0, DL, MVT::i32), // vindex
7467 {}, // voffset
7468 {}, // soffset
7469 {}, // offset
7470 CachePolicy, // cachepolicy
7471 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7472 };
7473 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
7474 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
7475 }
7476 Results.push_back(LoadVal);
7477 return;
7478 }
7479 case Intrinsic::amdgcn_dead: {
7480 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
7481 Results.push_back(DAG.getPOISON(N->getValueType(I)));
7482 return;
7483 }
7484 }
7485 break;
7486 }
7488 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
7489 if (Res.getOpcode() == ISD::MERGE_VALUES) {
7490 // FIXME: Hacky
7491 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
7492 Results.push_back(Res.getOperand(I));
7493 }
7494 } else {
7495 Results.push_back(Res);
7496 Results.push_back(Res.getValue(1));
7497 }
7498 return;
7499 }
7500
7501 break;
7502 }
7503 case ISD::SELECT: {
7504 SDLoc SL(N);
7505 EVT VT = N->getValueType(0);
7506 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
7507 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
7508 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
7509
7510 EVT SelectVT = NewVT;
7511 if (NewVT.bitsLT(MVT::i32)) {
7512 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
7513 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
7514 SelectVT = MVT::i32;
7515 }
7516
7517 SDValue NewSelect =
7518 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
7519
7520 if (NewVT != SelectVT)
7521 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
7522 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
7523 return;
7524 }
7525 case ISD::FNEG: {
7526 if (N->getValueType(0) != MVT::v2f16)
7527 break;
7528
7529 SDLoc SL(N);
7530 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7531
7532 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
7533 DAG.getConstant(0x80008000, SL, MVT::i32));
7534 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7535 return;
7536 }
7537 case ISD::FABS: {
7538 if (N->getValueType(0) != MVT::v2f16)
7539 break;
7540
7541 SDLoc SL(N);
7542 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7543
7544 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
7545 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
7546 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7547 return;
7548 }
7549 case ISD::FSQRT: {
7550 if (N->getValueType(0) != MVT::f16)
7551 break;
7552 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
7553 break;
7554 }
7555 default:
7556 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
7557 break;
7558 }
7559}
7560
7561/// Helper function for LowerBRCOND
7562static SDNode *findUser(SDValue Value, unsigned Opcode) {
7563
7564 for (SDUse &U : Value->uses()) {
7565 if (U.get() != Value)
7566 continue;
7567
7568 if (U.getUser()->getOpcode() == Opcode)
7569 return U.getUser();
7570 }
7571 return nullptr;
7572}
7573
7574unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7575 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
7576 switch (Intr->getConstantOperandVal(1)) {
7577 case Intrinsic::amdgcn_if:
7578 return AMDGPUISD::IF;
7579 case Intrinsic::amdgcn_else:
7580 return AMDGPUISD::ELSE;
7581 case Intrinsic::amdgcn_loop:
7582 return AMDGPUISD::LOOP;
7583 case Intrinsic::amdgcn_end_cf:
7584 llvm_unreachable("should not occur");
7585 default:
7586 return 0;
7587 }
7588 }
7589
7590 // break, if_break, else_break are all only used as inputs to loop, not
7591 // directly as branch conditions.
7592 return 0;
7593}
7594
7601
7603 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7604 return false;
7605
7606 // FIXME: Either avoid relying on address space here or change the default
7607 // address space for functions to avoid the explicit check.
7608 return (GV->getValueType()->isFunctionTy() ||
7611}
7612
7614 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
7615}
7616
7618 if (!GV->hasExternalLinkage())
7619 return true;
7620
7621 const auto OS = getTargetMachine().getTargetTriple().getOS();
7622 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
7623}
7624
7625 /// This transforms the control flow intrinsics to get the branch destination as
7626 /// the last parameter; it also switches the branch target with BR if the need arises.
7627SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
7628 SDLoc DL(BRCOND);
7629
7630 SDNode *Intr = BRCOND.getOperand(1).getNode();
7631 SDValue Target = BRCOND.getOperand(2);
7632 SDNode *BR = nullptr;
7633 SDNode *SetCC = nullptr;
7634
7635 switch (Intr->getOpcode()) {
7636 case ISD::SETCC: {
7637 // As long as we negate the condition everything is fine
7638 SetCC = Intr;
7639 Intr = SetCC->getOperand(0).getNode();
7640 break;
7641 }
7642 case ISD::XOR: {
7643 // Similar to SETCC, if we have (xor c, -1), we will be fine.
7644 SDValue LHS = Intr->getOperand(0);
7645 SDValue RHS = Intr->getOperand(1);
7646 if (auto *C = dyn_cast<ConstantSDNode>(RHS); C && C->getZExtValue()) {
7647 Intr = LHS.getNode();
7648 break;
7649 }
7650 [[fallthrough]];
7651 }
7652 default: {
7653 // Get the target from BR if we don't negate the condition
7654 BR = findUser(BRCOND, ISD::BR);
7655 assert(BR && "brcond missing unconditional branch user");
7656 Target = BR->getOperand(1);
7657 }
7658 }
7659
7660 unsigned CFNode = isCFIntrinsic(Intr);
7661 if (CFNode == 0) {
7662 // This is a uniform branch so we don't need to legalize.
7663 return BRCOND;
7664 }
7665
7666 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
7668
7669 assert(!SetCC ||
7670 (SetCC->getConstantOperandVal(1) == 1 &&
7671 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
7672 ISD::SETNE));
7673
7674 // operands of the new intrinsic call
7676 if (HaveChain)
7677 Ops.push_back(BRCOND.getOperand(0));
7678
7679 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
7680 Ops.push_back(Target);
7681
7682 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
7683
7684 // build the new intrinsic call
7685 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
7686
7687 if (!HaveChain) {
7688 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
7689
7691 }
7692
7693 if (BR) {
7694 // Give the branch instruction our target
7695 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
7696 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
7697 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
7698 }
7699
7700 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
7701
7702 // Copy the intrinsic results to registers
7703 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
7704 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
7705 if (!CopyToReg)
7706 continue;
7707
7708 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
7709 SDValue(Result, i - 1), SDValue());
7710
7711 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
7712 }
7713
7714 // Remove the old intrinsic from the chain
7715 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
7716 Intr->getOperand(0));
7717
7718 return Chain;
7719}
7720
7721SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
7722 MVT VT = Op.getSimpleValueType();
7723 SDLoc DL(Op);
7724 // Checking the depth
7725 if (Op.getConstantOperandVal(0) != 0)
7726 return DAG.getConstant(0, DL, VT);
7727
7728 MachineFunction &MF = DAG.getMachineFunction();
7729 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7730 // Check for kernel and shader functions
7731 if (Info->isEntryFunction())
7732 return DAG.getConstant(0, DL, VT);
7733
7734 MachineFrameInfo &MFI = MF.getFrameInfo();
7735 // There is a call to @llvm.returnaddress in this function
7736 MFI.setReturnAddressIsTaken(true);
7737
7738 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
7739 // Get the return address reg and mark it as an implicit live-in
7740 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
7741 getRegClassFor(VT, Op.getNode()->isDivergent()));
7742
7743 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
7744}
7745
7746SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
7747 const SDLoc &DL, EVT VT) const {
7748 return Op.getValueType().bitsLE(VT)
7749 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
7750 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
7751 DAG.getTargetConstant(0, DL, MVT::i32));
7752}
7753
7754SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
7755 SelectionDAG &DAG) const {
7756 EVT DstVT = Op.getValueType();
7757 unsigned NumElts = DstVT.getVectorNumElements();
7758 assert(NumElts > 2 && isPowerOf2_32(NumElts));
7759
7760 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
7761
7762 SDLoc DL(Op);
7763 unsigned Opc = Op.getOpcode();
7764 SDValue Flags = Op.getOperand(1);
7765 EVT HalfDstVT =
7766 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
7767 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
7768 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
7769
7770 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
7771}
7772
7773SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
7774 SDValue Src = Op.getOperand(0);
7775 EVT SrcVT = Src.getValueType();
7776 EVT DstVT = Op.getValueType();
7777
7778 if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
7779 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
7780 if (SrcVT.getScalarType() != MVT::f32)
7781 return SDValue();
7782 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
7783 }
7784
7785 if (SrcVT.getScalarType() != MVT::f64)
7786 return Op;
7787
7788 SDLoc DL(Op);
7789 if (DstVT == MVT::f16) {
7790 // TODO: Handle strictfp
7791 if (Op.getOpcode() != ISD::FP_ROUND)
7792 return Op;
7793
7794 if (!Subtarget->has16BitInsts()) {
7795 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
7796 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7797 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7798 }
7799 if (Op->getFlags().hasApproximateFuncs()) {
7800 SDValue Flags = Op.getOperand(1);
7801 SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
7802 return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
7803 }
7804 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
7805 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7806 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7807 }
7808
7809 assert(DstVT.getScalarType() == MVT::bf16 &&
7810 "custom lower FP_ROUND for f16 or bf16");
7811 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
7812
7813 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
7814 // hardware f32 -> bf16 instruction.
7815 EVT F32VT = SrcVT.isVector() ? SrcVT.changeVectorElementType(MVT::f32)
7816 : MVT::f32;
7817 SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
7818 return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
7819 DAG.getTargetConstant(0, DL, MVT::i32));
7820}
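
// Illustrative sketch (not part of the original source): a host-side model of
// the round-inexact-to-odd f64 -> f32 step used above. Rounding to nearest
// twice (f64 -> f32 -> bf16) can double-round; rounding the intermediate
// result to odd keeps enough information for the final rounding to come out
// right. A production version would also need #pragma STDC FENV_ACCESS ON.
#include <cfenv>
#include <cmath>
#include <cstdint>
#include <cstring>

static float roundToOddF32(double X) {
  const int OldMode = std::fegetround();
  std::fesetround(FE_TOWARDZERO);
  float F = static_cast<float>(X); // truncate the extra mantissa bits
  std::fesetround(OldMode);
  if (std::isfinite(F) && static_cast<double>(F) != X) {
    uint32_t Bits;
    std::memcpy(&Bits, &F, sizeof(Bits));
    Bits |= 1u; // fold the lost (sticky) bits into the mantissa LSB
    std::memcpy(&F, &Bits, sizeof(F));
  }
  return F;
}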
7821
7822SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
7823 SelectionDAG &DAG) const {
7824 EVT VT = Op.getValueType();
7825 const MachineFunction &MF = DAG.getMachineFunction();
7826 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7827 bool IsIEEEMode = Info->getMode().IEEE;
7828
7829 // FIXME: Assert during selection that this is only selected for
7830 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
7831 // mode functions, but this happens to be OK since it's only done in cases
7832 // where it is known that there are no sNaNs.
7833 if (IsIEEEMode)
7834 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
7835
7836 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7837 VT == MVT::v16bf16)
7838 return splitBinaryVectorOp(Op, DAG);
7839 return Op;
7840}
7841
7842SDValue
7843SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
7844 SelectionDAG &DAG) const {
7845 EVT VT = Op.getValueType();
7846 const MachineFunction &MF = DAG.getMachineFunction();
7847 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7848 bool IsIEEEMode = Info->getMode().IEEE;
7849
7850 if (IsIEEEMode)
7851 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
7852
7853 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7854 VT == MVT::v16bf16)
7855 return splitBinaryVectorOp(Op, DAG);
7856 return Op;
7857}
7858
7859SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
7860 SelectionDAG &DAG) const {
7861 EVT VT = Op.getValueType();
7862 if (VT.isVector())
7863 return splitBinaryVectorOp(Op, DAG);
7864
7865 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7866 !Subtarget->hasMinimum3Maximum3F16() &&
7867 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7868 "should not need to widen f16 minimum/maximum to v2f16");
7869
7870 // Widen f16 operation to v2f16
7871
7872 // fminimum f16:x, f16:y ->
7873 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
7874 // (v2f16 (scalar_to_vector y))), 0
7875 SDLoc SL(Op);
7876 SDValue WideSrc0 =
7877 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
7878 SDValue WideSrc1 =
7879 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
7880
7881 SDValue Widened =
7882 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7883
7884 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
7885 DAG.getConstant(0, SL, MVT::i32));
7886}
7887
7888SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
7889 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
7890 EVT VT = Op.getValueType();
7891 assert(VT == MVT::f16);
7892
7893 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
7894 EVT ExpVT = Exp.getValueType();
7895 if (ExpVT == MVT::i16)
7896 return Op;
7897
7898 SDLoc DL(Op);
7899
7900 // Correct the exponent type for f16 to i16.
7901 // Clamp the range of the exponent to the instruction's range.
7902
7903 // TODO: This should be a generic narrowing legalization, and can easily be
7904 // done for GlobalISel.
7905
7906 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
7907 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
7908
7909 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
7910 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
7911
7912 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
7913
7914 if (IsStrict) {
7915 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
7916 {Op.getOperand(0), Op.getOperand(1), TruncExp});
7917 }
7918
7919 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
7920}
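
// Illustrative sketch (not part of the original source): the exponent clamp
// above, modeled on the host. Clamping a wide exponent to the i16 range
// before narrowing cannot change the result, since scaling any f16 value by
// 2^32767 (or 2^-32768) already saturates to infinity or zero. Names are
// invented for the example.
#include <algorithm>
#include <cmath>
#include <cstdint>

static float ldexpWithClampedExp(float Mant, int64_t WideExp) {
  int64_t Clamped = std::clamp<int64_t>(WideExp, INT16_MIN, INT16_MAX);
  return std::ldexp(Mant, static_cast<int>(Clamped));
}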
7921
7923 switch (Op->getOpcode()) {
7924 case ISD::SRA:
7925 case ISD::SMIN:
7926 case ISD::SMAX:
7927 return ISD::SIGN_EXTEND;
7928 case ISD::SRL:
7929 case ISD::UMIN:
7930 case ISD::UMAX:
7931 return ISD::ZERO_EXTEND;
7932 case ISD::ADD:
7933 case ISD::SUB:
7934 case ISD::AND:
7935 case ISD::OR:
7936 case ISD::XOR:
7937 case ISD::SHL:
7938 case ISD::SELECT:
7939 case ISD::MUL:
7940 // operation result won't be influenced by garbage high bits.
7941 // TODO: are all of those cases correct, and are there more?
7942 return ISD::ANY_EXTEND;
7943 case ISD::SETCC: {
7944 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7945 return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7946 }
7947 default:
7948 llvm_unreachable("unexpected opcode!");
7949 }
7950}
7951
7952SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
7953 DAGCombinerInfo &DCI) const {
7954 const unsigned Opc = Op.getOpcode();
7955 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
7956 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
7957 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
7958 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
7959 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
7960
7961 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
7962 : Op->getOperand(0).getValueType();
7963 auto ExtTy = OpTy.changeElementType(MVT::i32);
7964
7965 if (DCI.isBeforeLegalizeOps() ||
7966 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
7967 return SDValue();
7968
7969 auto &DAG = DCI.DAG;
7970
7971 SDLoc DL(Op);
7972 SDValue LHS;
7973 SDValue RHS;
7974 if (Opc == ISD::SELECT) {
7975 LHS = Op->getOperand(1);
7976 RHS = Op->getOperand(2);
7977 } else {
7978 LHS = Op->getOperand(0);
7979 RHS = Op->getOperand(1);
7980 }
7981
7982 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
7983 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
7984
7985 // Special case: for shifts, the RHS always needs a zext.
7986 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
7987 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
7988 else
7989 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
7990
7991 // setcc always returns i1 / i1 vec, so there is no need to truncate after.
7992 if (Opc == ISD::SETCC) {
7993 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7994 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
7995 }
7996
7997 // For other ops, we extend the operation's return type as well so we need to
7998 // truncate back to the original type.
7999 SDValue NewVal;
8000 if (Opc == ISD::SELECT)
8001 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
8002 else
8003 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
8004
8005 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
8006}
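
// Illustrative sketch (not part of the original source): the promotion
// pattern above on plain integers. The value operand may be any-extended
// because the final truncate discards the high bits again, but a shift
// amount needs a real zero-extend - stale high bits would change the count.
// Names are invented; assumes Amt < 16, as for a legal i16 shift.
#include <cstdint>

static uint16_t promotedShl16(uint16_t Val, uint16_t Amt) {
  uint32_t WideVal = Val;                // zero-extend (any extension works here)
  uint32_t WideAmt = Amt;                // the shift amount must be zero-extended
  uint32_t WideRes = WideVal << WideAmt; // do the op at 32 bits
  return static_cast<uint16_t>(WideRes); // truncate back to the original type
}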
8007
8008SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
8009 SDValue Mag = Op.getOperand(0);
8010 EVT MagVT = Mag.getValueType();
8011
8012 if (MagVT.getVectorNumElements() > 2)
8013 return splitBinaryVectorOp(Op, DAG);
8014
8015 SDValue Sign = Op.getOperand(1);
8016 EVT SignVT = Sign.getValueType();
8017
8018 if (MagVT == SignVT)
8019 return Op;
8020
8021 // fcopysign v2f16:mag, v2f32:sign ->
8022 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
8023
8024 SDLoc SL(Op);
8025 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
8026 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
8027
8028 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
8029
8030 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
8031}
8032
8033// Custom lowering for vector multiplications and s_mul_u64.
8034SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
8035 EVT VT = Op.getValueType();
8036
8037 // Split vector operands.
8038 if (VT.isVector())
8039 return splitBinaryVectorOp(Op, DAG);
8040
8041 assert(VT == MVT::i64 && "The following code is special for s_mul_u64");
8042
8043 // There are four ways to lower s_mul_u64:
8044 //
8045 // 1. If all the operands are uniform, then we lower it as it is.
8046 //
8047 // 2. If the operands are divergent, then we have to split s_mul_u64 into 32-bit
8048 // multiplications because there is no vector equivalent of s_mul_u64.
8049 //
8050 // 3. If the cost model decides that it is more efficient to use vector
8051 // registers, then we have to split s_mul_u64 into 32-bit multiplications.
8052 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp.
8053 //
8054 // 4. If the cost model decides to use vector registers and both of the
8055 // operands are zero-extended/sign-extended from 32 bits, then we split the
8056 // s_mul_u64 into two 32-bit multiplications. The problem is that it is not
8057 // possible to check if the operands are zero-extended or sign-extended in
8058 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
8059 // s_mul_u64_u32_pseudo if both operands are zero-extended, and we replace
8060 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
8061 // If the cost model decides that we have to use vector registers, then
8062 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32_pseudo/
8063 // s_mul_i64_i32_pseudo into two vector multiplications. If the cost model
8064 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
8065 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
8066 // SIInstrInfo.cpp.
8067
8068 if (Op->isDivergent())
8069 return SDValue();
8070
8071 SDValue Op0 = Op.getOperand(0);
8072 SDValue Op1 = Op.getOperand(1);
8073 // If all the operands are zero-extended to 32 bits, then we replace s_mul_u64
8074 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
8075 // 32 bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
8076 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
8077 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
8078 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
8079 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
8080 SDLoc SL(Op);
8081 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
8082 return SDValue(
8083 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
8084 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
8085 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
8086 if (Op0SignBits >= 33 && Op1SignBits >= 33)
8087 return SDValue(
8088 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
8089 // If all the operands are uniform, then we lower s_mul_u64 as it is.
8090 return Op;
8091}
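
// Illustrative sketch (not part of the original source): the 32-bit
// decomposition that the splitting paths described above rely on. An
// arbitrary 64x64 multiply needs three 32-bit partial products for its high
// half; when both operands are zero-extended 32-bit values, a single
// u32 x u32 -> u64 multiply already produces the full result.
#include <cstdint>

static uint64_t mul64ViaMul32(uint64_t A, uint64_t B) {
  uint32_t ALo = static_cast<uint32_t>(A), AHi = static_cast<uint32_t>(A >> 32);
  uint32_t BLo = static_cast<uint32_t>(B), BHi = static_cast<uint32_t>(B >> 32);
  uint64_t LoProd = uint64_t(ALo) * BLo; // 32 x 32 -> 64 low product
  uint32_t Hi = static_cast<uint32_t>(LoProd >> 32) // carry into the high half
                + ALo * BHi + AHi * BLo;            // low halves of cross terms
  return (uint64_t(Hi) << 32) | static_cast<uint32_t>(LoProd);
}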
8092
8093SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
8094 EVT VT = Op.getValueType();
8095 SDLoc SL(Op);
8096 SDValue LHS = Op.getOperand(0);
8097 SDValue RHS = Op.getOperand(1);
8098 bool isSigned = Op.getOpcode() == ISD::SMULO;
8099
8100 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
8101 const APInt &C = RHSC->getAPIntValue();
8102 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
8103 if (C.isPowerOf2()) {
8104 // smulo(x, signed_min) is the same as umulo(x, signed_min).
8105 bool UseArithShift = isSigned && !C.isMinSignedValue();
8106 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
8107 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
8108 SDValue Overflow =
8109 DAG.getSetCC(SL, MVT::i1,
8110 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
8111 Result, ShiftAmt),
8112 LHS, ISD::SETNE);
8113 return DAG.getMergeValues({Result, Overflow}, SL);
8114 }
8115 }
8116
8117 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
8118 SDValue Top =
8119 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
8120
8121 SDValue Sign = isSigned
8122 ? DAG.getNode(ISD::SRA, SL, VT, Result,
8123 DAG.getConstant(VT.getScalarSizeInBits() - 1,
8124 SL, MVT::i32))
8125 : DAG.getConstant(0, SL, VT);
8126 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
8127
8128 return DAG.getMergeValues({Result, Overflow}, SL);
8129}
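
// Illustrative sketch (not part of the original source): the power-of-two
// overflow check used above, written for plain unsigned 32-bit values. The
// product is X << S, and it overflowed exactly when shifting back down fails
// to recover X (the signed case uses an arithmetic shift instead). Assumes
// S < 32; names are invented for the example.
#include <cstdint>
#include <utility>

static std::pair<uint32_t, bool> umuloPow2(uint32_t X, unsigned S) {
  uint32_t Result = X << S;           // multiply by (1u << S)
  bool Overflow = (Result >> S) != X; // high bits were lost
  return {Result, Overflow};
}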
8130
8131SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
8132 if (Op->isDivergent()) {
8133 // Select to V_MAD_[IU]64_[IU]32.
8134 return Op;
8135 }
8136 if (Subtarget->hasSMulHi()) {
8137 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
8138 return SDValue();
8139 }
8140 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
8141 // calculate the high part, so we might as well do the whole thing with
8142 // V_MAD_[IU]64_[IU]32.
8143 return Op;
8144}
8145
8146SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
8147 if (!Subtarget->isTrapHandlerEnabled() ||
8148 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
8149 return lowerTrapEndpgm(Op, DAG);
8150
8151 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
8152 : lowerTrapHsaQueuePtr(Op, DAG);
8153}
8154
8155SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
8156 SDLoc SL(Op);
8157 SDValue Chain = Op.getOperand(0);
8158 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8159}
8160
8161SDValue
8162SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
8163 const SDLoc &DL, Align Alignment,
8164 ImplicitParameter Param) const {
8165 MachineFunction &MF = DAG.getMachineFunction();
8166 uint64_t Offset = getImplicitParameterOffset(MF, Param);
8167 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
8168 MachinePointerInfo PtrInfo =
8170 return DAG.getLoad(
8171 VT, DL, DAG.getEntryNode(), Ptr, PtrInfo.getWithOffset(Offset), Alignment,
8173}
8174
8175SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
8176 SelectionDAG &DAG) const {
8177 SDLoc SL(Op);
8178 SDValue Chain = Op.getOperand(0);
8179
8180 SDValue QueuePtr;
8181 // For code object version 5, QueuePtr is passed through implicit kernarg.
8182 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8184 QueuePtr =
8185 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
8186 } else {
8187 MachineFunction &MF = DAG.getMachineFunction();
8188 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8189 Register UserSGPR = Info->getQueuePtrUserSGPR();
8190
8191 if (UserSGPR == AMDGPU::NoRegister) {
8192 // We probably are in a function incorrectly marked with
8193 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
8194 // trap, so just use a null pointer.
8195 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
8196 } else {
8197 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
8198 MVT::i64);
8199 }
8200 }
8201
8202 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
8203 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
8204
8205 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8206 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
8207 ToReg.getValue(1)};
8208 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8209}
8210
8211SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
8212 SDLoc SL(Op);
8213 SDValue Chain = Op.getOperand(0);
8214
8215 // We need to simulate the 's_trap 2' instruction on targets that run in
8216 // PRIV=1 (where it is treated as a nop).
8217 if (Subtarget->hasPrivEnabledTrap2NopBug())
8218 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
8219
8220 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8221 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8222 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8223}
8224
8225SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
8226 SDLoc SL(Op);
8227 SDValue Chain = Op.getOperand(0);
8228 MachineFunction &MF = DAG.getMachineFunction();
8229
8230 if (!Subtarget->isTrapHandlerEnabled() ||
8231 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
8232 LLVMContext &Ctx = MF.getFunction().getContext();
8233 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
8234 "debugtrap handler not supported",
8235 Op.getDebugLoc(), DS_Warning));
8236 return Chain;
8237 }
8238
8239 uint64_t TrapID =
8240 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
8241 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8242 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8243}
8244
8245SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
8246 SelectionDAG &DAG) const {
8247 if (Subtarget->hasApertureRegs()) {
8248 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
8249 ? AMDGPU::SRC_SHARED_BASE
8250 : AMDGPU::SRC_PRIVATE_BASE;
8251 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8252 !Subtarget->hasGloballyAddressableScratch()) &&
8253 "Cannot use src_private_base with globally addressable scratch!");
8254 // Note: this feature (register) is broken. When used as a 32-bit operand,
8255 // it returns a wrong value (all zeroes?). The real value is in the upper 32
8256 // bits.
8257 //
8258    // To work around the issue, emit a 64-bit copy from this register and
8259    // then extract the high bits. Note that this shouldn't even result in a
8260    // shift being emitted; it should simply become a pair of register moves (e.g.):
8261 // s_mov_b64 s[6:7], src_shared_base
8262 // v_mov_b32_e32 v1, s7
8263 SDValue Copy =
8264 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ApertureRegNo, MVT::v2i32);
8265 return DAG.getExtractVectorElt(DL, MVT::i32, Copy, 1);
8266 }
8267
8268 // For code object version 5, private_base and shared_base are passed through
8269 // implicit kernargs.
8270 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8271  if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
8272    ImplicitParameter Param =
8273        (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
8274 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
8275 }
8276
8277 MachineFunction &MF = DAG.getMachineFunction();
8278 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8279 Register UserSGPR = Info->getQueuePtrUserSGPR();
8280 if (UserSGPR == AMDGPU::NoRegister) {
8281    // We are probably in a function incorrectly marked with
8282    // amdgpu-no-queue-ptr. This is undefined behavior.
8283 return DAG.getPOISON(MVT::i32);
8284 }
8285
8286 SDValue QueuePtr =
8287 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
8288
8289 // Offset into amd_queue_t for group_segment_aperture_base_hi /
8290 // private_segment_aperture_base_hi.
8291 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
8292
8293 SDValue Ptr =
8294 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
8295
8296 // TODO: Use custom target PseudoSourceValue.
8297  // TODO: We should use the value from the IR intrinsic call, but it might not
8298  // be available here, and it is unclear how to get it.
8299 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
8300 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
8301 commonAlignment(Align(64), StructOffset),
8302                     MachineMemOperand::MODereferenceable |
8303                         MachineMemOperand::MOInvariant);
8304}
8305
8306/// Return true if the value is a known valid address, such that a null check is
8307/// not necessary.
8308static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
8309 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
8310  if (isa<FrameIndexSDNode, GlobalAddressSDNode, BasicBlockSDNode>(Val))
8311 return true;
8312
8313 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
8314 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
8315
8316 // TODO: Search through arithmetic, handle arguments and loads
8317 // marked nonnull.
8318 return false;
8319}
8320
8321SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
8322 SelectionDAG &DAG) const {
8323 SDLoc SL(Op);
8324
8325 const AMDGPUTargetMachine &TM =
8326 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
8327
8328 unsigned DestAS, SrcAS;
8329 SDValue Src;
8330 bool IsNonNull = false;
8331 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
8332 SrcAS = ASC->getSrcAddressSpace();
8333 Src = ASC->getOperand(0);
8334 DestAS = ASC->getDestAddressSpace();
8335 } else {
8336 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
8337 Op.getConstantOperandVal(0) ==
8338 Intrinsic::amdgcn_addrspacecast_nonnull);
8339 Src = Op->getOperand(1);
8340 SrcAS = Op->getConstantOperandVal(2);
8341 DestAS = Op->getConstantOperandVal(3);
8342 IsNonNull = true;
8343 }
8344
8345 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
8346
8347 // flat -> local/private
8348 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
8349 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
8350 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
8351 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8352
8353 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
8354 Subtarget->hasGloballyAddressableScratch()) {
8355 // flat -> private with globally addressable scratch: subtract
8356 // src_flat_scratch_base_lo.
8357 SDValue FlatScratchBaseLo(
8358 DAG.getMachineNode(
8359 AMDGPU::S_MOV_B32, SL, MVT::i32,
8360 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8361 0);
8362 Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
8363 }
8364
8365 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8366 return Ptr;
8367
8368 unsigned NullVal = TM.getNullPointerValue(DestAS);
8369 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8370 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
8371
8372 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
8373 SegmentNullPtr);
8374 }
8375 }
8376
8377 // local/private -> flat
8378 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
8379 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
8380 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
8381 SDValue CvtPtr;
8382 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
8383 Subtarget->hasGloballyAddressableScratch()) {
8384 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
8385 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
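        // e.g. for wave32 the shift below is 57 - 32 - 5 = 20, so after the
        // BUILD_VECTOR the TID lands at bit 20 + 32 = 52 of the 64-bit
        // address; for wave64 it is 57 - 32 - 6 = 19, i.e. bit 51.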
8386 SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
8387 SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
8388 ThreadID = DAG.getNode(
8389 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8390 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
8391 AllOnes, ThreadID);
8392 if (Subtarget->isWave64())
8393 ThreadID = DAG.getNode(
8394 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8395 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
8396 AllOnes, ThreadID);
8397 SDValue ShAmt = DAG.getShiftAmountConstant(
8398 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8399 SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
8400 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
8401 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8402 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
8403 // 64-bit hi:lo value.
8404 SDValue FlatScratchBase = {
8405 DAG.getMachineNode(
8406 AMDGPU::S_MOV_B64, SL, MVT::i64,
8407 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8408 0};
8409 CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8410 } else {
8411 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8412 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
8413 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8414 }
8415
8416 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8417 return CvtPtr;
8418
8419 unsigned NullVal = TM.getNullPointerValue(SrcAS);
8420 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8421
8422 SDValue NonNull =
8423 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
8424
8425 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
8426 FlatNullPtr);
8427 }
8428 }
8429
8430 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8431 Op.getValueType() == MVT::i64) {
8432 const SIMachineFunctionInfo *Info =
8433 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
8434 if (Info->get32BitAddressHighBits() == 0)
8435 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, Src);
8436
8437 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
8438 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
8439 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
8440 }
8441
8442 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8443 Src.getValueType() == MVT::i64)
8444 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8445
8446 // global <-> flat are no-ops and never emitted.
8447
8448 // Invalid casts are poison.
8449 return DAG.getPOISON(Op->getValueType(0));
8450}
8451
8452// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
8453// the small vector and inserting them into the big vector. That is better than
8454// the default expansion of doing it via a stack slot. Even though the use of
8455// the stack slot would be optimized away afterwards, the stack slot itself
8456// remains.
8457SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
8458 SelectionDAG &DAG) const {
8459 SDValue Vec = Op.getOperand(0);
8460 SDValue Ins = Op.getOperand(1);
8461 SDValue Idx = Op.getOperand(2);
8462 EVT VecVT = Vec.getValueType();
8463 EVT InsVT = Ins.getValueType();
8464 EVT EltVT = VecVT.getVectorElementType();
8465 unsigned InsNumElts = InsVT.getVectorNumElements();
8466 unsigned IdxVal = Idx->getAsZExtVal();
8467 SDLoc SL(Op);
8468
8469 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
8470 // Insert 32-bit registers at a time.
8471 assert(InsNumElts % 2 == 0 && "expect legal vector types");
8472
8473 unsigned VecNumElts = VecVT.getVectorNumElements();
8474 EVT NewVecVT =
8475 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
8476 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8477                                   : EVT::getVectorVT(*DAG.getContext(),
8478 MVT::i32, InsNumElts / 2);
8479
8480 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
8481 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
8482
8483 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
8484 SDValue Elt;
8485 if (InsNumElts == 2) {
8486 Elt = Ins;
8487 } else {
8488 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
8489 DAG.getConstant(I, SL, MVT::i32));
8490 }
8491 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
8492 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
8493 }
8494
8495 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
8496 }
8497
8498 for (unsigned I = 0; I != InsNumElts; ++I) {
8499 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
8500 DAG.getConstant(I, SL, MVT::i32));
8501 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
8502 DAG.getConstant(IdxVal + I, SL, MVT::i32));
8503 }
8504 return Vec;
8505}
8506
8507SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
8508 SelectionDAG &DAG) const {
8509 SDValue Vec = Op.getOperand(0);
8510 SDValue InsVal = Op.getOperand(1);
8511 SDValue Idx = Op.getOperand(2);
8512 EVT VecVT = Vec.getValueType();
8513 EVT EltVT = VecVT.getVectorElementType();
8514 unsigned VecSize = VecVT.getSizeInBits();
8515 unsigned EltSize = EltVT.getSizeInBits();
8516 SDLoc SL(Op);
8517
8518 // Specially handle the case of v4i16 with static indexing.
8519 unsigned NumElts = VecVT.getVectorNumElements();
8520 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
8521 if (NumElts == 4 && EltSize == 16 && KIdx) {
8522 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
8523
8524 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8525 DAG.getConstant(0, SL, MVT::i32));
8526 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8527 DAG.getConstant(1, SL, MVT::i32));
8528
8529 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
8530 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
8531
8532 unsigned Idx = KIdx->getZExtValue();
8533 bool InsertLo = Idx < 2;
8534 SDValue InsHalf = DAG.getNode(
8535 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
8536 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
8537 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8538
8539 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
8540
8541 SDValue Concat =
8542 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
8543 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8544
8545 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
8546 }
8547
8548 // Static indexing does not lower to stack access, and hence there is no need
8549 // for special custom lowering to avoid stack access.
8550 if (isa<ConstantSDNode>(Idx))
8551 return SDValue();
8552
8553 // Avoid stack access for dynamic indexing by custom lowering to
8554 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
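  //
  // For example, for <2 x i16> with a dynamic index the sequence is roughly:
  //   ScaledIdx = idx << 4                 (bit index; EltSize == 16)
  //   BFM       = 0xffff << ScaledIdx      (mask selecting the target element)
  //   Result    = (BFM & splat(val)) | (~BFM & vec)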
8555
8556 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
8557
8558 MVT IntVT = MVT::getIntegerVT(VecSize);
8559
8560 // Convert vector index to bit-index and get the required bit mask.
8561 assert(isPowerOf2_32(EltSize));
8562 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
8563 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8564 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8565 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
8566 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
8567
8568 // 1. Create a congruent vector with the target value in each element.
8569 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
8570 DAG.getSplatBuildVector(VecVT, SL, InsVal));
8571
8572 // 2. Mask off all other indices except the required index within (1).
8573 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
8574
8575 // 3. Mask off the required index within the target vector.
8576 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8577 SDValue RHS =
8578 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
8579
8580 // 4. Get (2) and (3) ORed into the target vector.
8581 SDValue BFI =
8582 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
8583
8584 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
8585}
8586
8587SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
8588 SelectionDAG &DAG) const {
8589 SDLoc SL(Op);
8590
8591 EVT ResultVT = Op.getValueType();
8592 SDValue Vec = Op.getOperand(0);
8593 SDValue Idx = Op.getOperand(1);
8594 EVT VecVT = Vec.getValueType();
8595 unsigned VecSize = VecVT.getSizeInBits();
8596 EVT EltVT = VecVT.getVectorElementType();
8597
8598 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
8599
8600 // Make sure we do any optimizations that will make it easier to fold
8601 // source modifiers before obscuring it with bit operations.
8602
8603 // XXX - Why doesn't this get called when vector_shuffle is expanded?
8604 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
8605 return Combined;
8606
8607 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8608 SDValue Lo, Hi;
8609 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
8610
8611 if (VecSize == 128) {
8612 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
8613 Lo = DAG.getBitcast(LoVT,
8614 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8615 DAG.getConstant(0, SL, MVT::i32)));
8616 Hi = DAG.getBitcast(HiVT,
8617 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8618 DAG.getConstant(1, SL, MVT::i32)));
8619 } else if (VecSize == 256) {
8620 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
8621 SDValue Parts[4];
8622 for (unsigned P = 0; P < 4; ++P) {
8623 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8624 DAG.getConstant(P, SL, MVT::i32));
8625 }
8626
8627 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8628 Parts[0], Parts[1]));
8629 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8630 Parts[2], Parts[3]));
8631 } else {
8632 assert(VecSize == 512);
8633
8634 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
8635 SDValue Parts[8];
8636 for (unsigned P = 0; P < 8; ++P) {
8637 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8638 DAG.getConstant(P, SL, MVT::i32));
8639 }
8640
8641 Lo = DAG.getBitcast(LoVT,
8642 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8643 Parts[0], Parts[1], Parts[2], Parts[3]));
8644 Hi = DAG.getBitcast(HiVT,
8645 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8646 Parts[4], Parts[5], Parts[6], Parts[7]));
8647 }
8648
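    // Select the half that holds the element and re-index into it. For
    // example, with NElem == 8: IdxMask == 3, so indices 0..3 read from Lo
    // and indices 4..7 read from Hi via (Idx & 3).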
8649 EVT IdxVT = Idx.getValueType();
8650 unsigned NElem = VecVT.getVectorNumElements();
8651 assert(isPowerOf2_32(NElem));
8652 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
8653 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
8654 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
8655 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
8656 }
8657
8658 assert(VecSize <= 64);
8659
8660 MVT IntVT = MVT::getIntegerVT(VecSize);
8661
8662 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
8663 SDValue VecBC = peekThroughBitcasts(Vec);
8664 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8665 SDValue Src = VecBC.getOperand(0);
8666 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8667 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
8668 }
8669
8670 unsigned EltSize = EltVT.getSizeInBits();
8671 assert(isPowerOf2_32(EltSize));
8672
8673 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8674
8675 // Convert vector index to bit-index (* EltSize)
8676 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8677
8678 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8679 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
8680
8681 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8682 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
8683 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
8684 }
8685
8686 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
8687}
8688
8689static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
8690 assert(Elt % 2 == 0);
8691 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8692}
8693
8694static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
8695 assert(Elt % 2 == 0);
8696 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8697 !(Mask[Elt + 1] & 1);
8698}
8699
8700SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
8701 SelectionDAG &DAG) const {
8702 SDLoc SL(Op);
8703 EVT ResultVT = Op.getValueType();
8704 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
8705 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
8706 const int NewSrcNumElts = 2;
8707 MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
8708 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
8709
8710  // Break up the shuffle into register-sized pieces.
8711 //
8712 // We're trying to form sub-shuffles that the register allocation pipeline
8713 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
8714 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
8715 // pair of copies into a consecutive register copy, so use the ordinary
8716 // extract_vector_elt lowering unless we can use the shuffle.
8717 //
8718 // TODO: This is a bit of hack, and we should probably always use
8719 // extract_subvector for the largest possible subvector we can (or at least
8720 // use it for PackVT aligned pieces). However we have worse support for
8721  // combines on them and don't directly treat extract_subvector / insert_subvector
8722 // as legal. The DAG scheduler also ends up doing a worse job with the
8723 // extract_subvectors.
8724 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
8725
8726 // vector_shuffle <0,1,6,7> lhs, rhs
8727 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
8728 //
8729 // vector_shuffle <6,7,2,3> lhs, rhs
8730 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
8731 //
8732 // vector_shuffle <6,7,0,1> lhs, rhs
8733 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
8734
8735 // Avoid scalarizing when both halves are reading from consecutive elements.
8736
8737 // If we're treating 2 element shuffles as legal, also create odd-to-even
8738 // shuffles of neighboring pairs.
8739 //
8740 // vector_shuffle <3,2,7,6> lhs, rhs
8741 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
8742 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
8743
8744  SmallVector<SDValue, 16> Pieces;
8745 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
8746 if (ShouldUseConsecutiveExtract &&
8747        elementPairIsContiguous(SVN->getMask(), I)) {
8748 const int Idx = SVN->getMaskElt(I);
8749 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8750 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8751 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
8752 SVN->getOperand(VecIdx),
8753 DAG.getConstant(EltIdx, SL, MVT::i32));
8754 Pieces.push_back(SubVec);
8755 } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
8756               isOperationLegal(ISD::VECTOR_SHUFFLE, PackVT)) {
8757 int Idx0 = SVN->getMaskElt(I);
8758 int Idx1 = SVN->getMaskElt(I + 1);
8759
8760 SDValue SrcOp0 = SVN->getOperand(0);
8761 SDValue SrcOp1 = SrcOp0;
8762 if (Idx0 >= SrcNumElts) {
8763 SrcOp0 = SVN->getOperand(1);
8764 Idx0 -= SrcNumElts;
8765 }
8766
8767 if (Idx1 >= SrcNumElts) {
8768 SrcOp1 = SVN->getOperand(1);
8769 Idx1 -= SrcNumElts;
8770 }
8771
8772 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8773 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8774
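      // e.g. for mask element 3 taken from lhs: AlignedIdx0 == 2, so we
      // extract the pair <2,3> and remap the index to 3 - 2 == 1 within it.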
8775 // Extract nearest even aligned piece.
8776 SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
8777 DAG.getConstant(AlignedIdx0, SL, MVT::i32));
8778 SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
8779 DAG.getConstant(AlignedIdx1, SL, MVT::i32));
8780
8781 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8782 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8783
8784 SDValue Result0 = SubVec0;
8785 SDValue Result1 = SubVec0;
8786
8787 if (SubVec0 != SubVec1) {
8788 NewMaskIdx1 += NewSrcNumElts;
8789 Result1 = SubVec1;
8790 } else {
8791 Result1 = DAG.getPOISON(PackVT);
8792 }
8793
8794 SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
8795 {NewMaskIdx0, NewMaskIdx1});
8796 Pieces.push_back(Shuf);
8797 } else {
8798 const int Idx0 = SVN->getMaskElt(I);
8799 const int Idx1 = SVN->getMaskElt(I + 1);
8800 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8801 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8802 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8803 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8804
8805 SDValue Vec0 = SVN->getOperand(VecIdx0);
8806 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
8807 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
8808
8809 SDValue Vec1 = SVN->getOperand(VecIdx1);
8810 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
8811 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
8812 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
8813 }
8814 }
8815
8816 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
8817}
8818
8819SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
8820 SelectionDAG &DAG) const {
8821 SDValue SVal = Op.getOperand(0);
8822 EVT ResultVT = Op.getValueType();
8823 EVT SValVT = SVal.getValueType();
8824 SDValue UndefVal = DAG.getPOISON(SValVT);
8825 SDLoc SL(Op);
8826
8827  SmallVector<SDValue, 8> VElts;
8828 VElts.push_back(SVal);
8829 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
8830 VElts.push_back(UndefVal);
8831
8832 return DAG.getBuildVector(ResultVT, SL, VElts);
8833}
8834
8835SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
8836 SelectionDAG &DAG) const {
8837 SDLoc SL(Op);
8838 EVT VT = Op.getValueType();
8839
8840 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8841 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
8842
8843 SDValue Lo = Op.getOperand(0);
8844 SDValue Hi = Op.getOperand(1);
8845
8846 // Avoid adding defined bits with the zero_extend.
8847 if (Hi.isUndef()) {
8848 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8849 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
8850 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
8851 }
8852
8853 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
8854 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
8855
8856 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
8857 DAG.getConstant(16, SL, MVT::i32));
8858 if (Lo.isUndef())
8859 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
8860
8861 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8862 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
8863
8864 SDValue Or =
8865 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
8866 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
8867 }
8868
8869 // Split into 2-element chunks.
8870 const unsigned NumParts = VT.getVectorNumElements() / 2;
8871 EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
8872 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
8873
8874  SmallVector<SDValue> Casts;
8875 for (unsigned P = 0; P < NumParts; ++P) {
8876 SDValue Vec = DAG.getBuildVector(
8877 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
8878 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
8879 }
8880
8881 SDValue Blend =
8882 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
8883 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
8884}
8885
8886bool SITargetLowering::isOffsetFoldingLegal(
8887 const GlobalAddressSDNode *GA) const {
8888 // OSes that use ELF REL relocations (instead of RELA) can only store a
8889 // 32-bit addend in the instruction, so it is not safe to allow offset folding
8890 // which can create arbitrary 64-bit addends. (This is only a problem for
8891 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
8892 // the high 32 bits of the addend.)
8893 //
8894 // This should be kept in sync with how HasRelocationAddend is initialized in
8895 // the constructor of ELFAMDGPUAsmBackend.
8896 if (!Subtarget->isAmdHsaOS())
8897 return false;
8898
8899 // We can fold offsets for anything that doesn't require a GOT relocation.
8900 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
8901          GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
8902          GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
8903         !shouldEmitGOTReloc(GA->getGlobal());
8904}
8905
8906static SDValue
8907buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
8908 const SDLoc &DL, int64_t Offset, EVT PtrVT,
8909 unsigned GAFlags = SIInstrInfo::MO_NONE) {
8910 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
8911 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
8912 // lowered to the following code sequence:
8913 //
8914 // For constant address space:
8915 // s_getpc_b64 s[0:1]
8916 // s_add_u32 s0, s0, $symbol
8917 // s_addc_u32 s1, s1, 0
8918 //
8919 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8920 // a fixup or relocation is emitted to replace $symbol with a literal
8921 // constant, which is a pc-relative offset from the encoding of the $symbol
8922 // operand to the global variable.
8923 //
8924 // For global address space:
8925 // s_getpc_b64 s[0:1]
8926 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
8927 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
8928 //
8929 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8930 // fixups or relocations are emitted to replace $symbol@*@lo and
8931 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
8932 // which is a 64-bit pc-relative offset from the encoding of the $symbol
8933 // operand to the global variable.
8934 if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
8935 assert(GAFlags != SIInstrInfo::MO_NONE);
8936
8937 SDValue Ptr =
8938 DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
8939 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
8940 }
8941
8942 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
8943 SDValue PtrHi;
8944 if (GAFlags == SIInstrInfo::MO_NONE)
8945 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
8946 else
8947 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
8948 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
8949}
8950
8951SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
8952 SDValue Op,
8953 SelectionDAG &DAG) const {
8954 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
8955 SDLoc DL(GSD);
8956 EVT PtrVT = Op.getValueType();
8957
8958 const GlobalValue *GV = GSD->getGlobal();
8959  if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8960       shouldUseLDSConstAddress(GV)) ||
8961      GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
8962      GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
8963    if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8964 GV->hasExternalLinkage()) {
8965 Type *Ty = GV->getValueType();
8966      // HIP uses an unsized array `extern __shared__ T s[]` (or a similar
8967      // zero-sized type in other languages) to declare dynamic shared
8968      // memory whose size is not known at compile time. Such arrays are
8969      // allocated by the runtime and placed directly after the statically
8970      // allocated ones, so they all share the same offset.
8971 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
8972 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
8973 // Adjust alignment for that dynamic shared memory array.
8976 MFI->setUsesDynamicLDS(true);
8977 return SDValue(
8978 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
8979 }
8980 }
8981    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
8982 }
8983
8984  if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
8985 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
8986                                            SIInstrInfo::MO_ABS32_LO);
8987 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
8988 }
8989
8990 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
8991 if (Subtarget->has64BitLiterals()) {
8992      SDValue Addr = DAG.getTargetGlobalAddress(
8993 GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
8994 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
8995 0);
8996 }
8997
8998 SDValue AddrLo = DAG.getTargetGlobalAddress(
8999 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
9000 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
9001
9002 SDValue AddrHi = DAG.getTargetGlobalAddress(
9003 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
9004 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
9005
9006 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
9007 }
9008
9009 if (shouldEmitFixup(GV))
9010 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
9011
9012 if (shouldEmitPCReloc(GV))
9013 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
9014                                   SIInstrInfo::MO_REL32);
9015
9016 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
9017                                            SIInstrInfo::MO_GOTPCREL32);
9018 PointerType *PtrTy =
9019      PointerType::get(*DAG.getContext(), AMDGPUAS::CONSTANT_ADDRESS);
9020 const DataLayout &DataLayout = DAG.getDataLayout();
9021 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
9022 MachinePointerInfo PtrInfo =
9023      MachinePointerInfo::getGOT(DAG.getMachineFunction());
9024
9025 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
9026                     MachineMemOperand::MODereferenceable |
9027                         MachineMemOperand::MOInvariant);
9028}
9029
9030SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
9031 const SDLoc &DL, SDValue V) const {
9032 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
9033 // the destination register.
9034 //
9035 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
9036 // so we will end up with redundant moves to m0.
9037 //
9038 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
9039
9040 // A Null SDValue creates a glue result.
9041 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
9042 V, Chain);
9043 return SDValue(M0, 0);
9044}
9045
9046SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
9047 MVT VT,
9048 unsigned Offset) const {
9049 SDLoc SL(Op);
9050 SDValue Param = lowerKernargMemParameter(
9051 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
9052  // The local size values will have the high 16 bits as zero.
9053 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
9054 DAG.getValueType(VT));
9055}
9056
9057static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
9058 EVT VT) {
9059  DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9060      DAG.getMachineFunction().getFunction(),
9061 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
9062 return DAG.getPOISON(VT);
9063}
9064
9065static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
9066 EVT VT) {
9067  DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9068      DAG.getMachineFunction().getFunction(),
9069 "intrinsic not supported on subtarget", DL.getDebugLoc()));
9070 return DAG.getPOISON(VT);
9071}
9072
9073static SDValue getBuildDwordsVector(SelectionDAG &DAG, const SDLoc &DL,
9074 ArrayRef<SDValue> Elts) {
9075 assert(!Elts.empty());
9076 MVT Type;
9077 unsigned NumElts = Elts.size();
9078
9079 if (NumElts <= 12) {
9080 Type = MVT::getVectorVT(MVT::f32, NumElts);
9081 } else {
9082 assert(Elts.size() <= 16);
9083 Type = MVT::v16f32;
9084 NumElts = 16;
9085 }
9086
9087 SmallVector<SDValue, 16> VecElts(NumElts);
9088 for (unsigned i = 0; i < Elts.size(); ++i) {
9089 SDValue Elt = Elts[i];
9090 if (Elt.getValueType() != MVT::f32)
9091 Elt = DAG.getBitcast(MVT::f32, Elt);
9092 VecElts[i] = Elt;
9093 }
9094 for (unsigned i = Elts.size(); i < NumElts; ++i)
9095 VecElts[i] = DAG.getPOISON(MVT::f32);
9096
9097 if (NumElts == 1)
9098 return VecElts[0];
9099 return DAG.getBuildVector(Type, DL, VecElts);
9100}
9101
9102static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
9103 SDValue Src, int ExtraElts) {
9104 EVT SrcVT = Src.getValueType();
9105
9106  SmallVector<SDValue, 8> Elts;
9107
9108 if (SrcVT.isVector())
9109 DAG.ExtractVectorElements(Src, Elts);
9110 else
9111 Elts.push_back(Src);
9112
9113 SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
9114 while (ExtraElts--)
9115 Elts.push_back(Undef);
9116
9117 return DAG.getBuildVector(CastVT, DL, Elts);
9118}
9119
9120// Re-construct the required return value for an image load intrinsic.
9121// This is more complicated due to the optional use of TexFailCtrl, which means
9122// the required return type is an aggregate.
9123static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
9124 ArrayRef<EVT> ResultTypes, bool IsTexFail,
9125 bool Unpacked, bool IsD16, int DMaskPop,
9126 int NumVDataDwords, bool IsAtomicPacked16Bit,
9127 const SDLoc &DL) {
9128 // Determine the required return type. This is the same regardless of
9129  // the IsTexFail flag.
9130 EVT ReqRetVT = ResultTypes[0];
9131 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
9132 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9133 ? (ReqRetNumElts + 1) / 2
9134 : ReqRetNumElts;
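  // e.g. a packed (non-unpacked) D16 load of 3 components needs
  // (3 + 1) / 2 == 2 data dwords.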
9135
9136 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9137
9138 MVT DataDwordVT =
9139 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
9140
9141 MVT MaskPopVT =
9142 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
9143
9144 SDValue Data(Result, 0);
9145 SDValue TexFail;
9146
9147 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
9148 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
9149 if (MaskPopVT.isVector()) {
9150 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
9151 SDValue(Result, 0), ZeroIdx);
9152 } else {
9153 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
9154 SDValue(Result, 0), ZeroIdx);
9155 }
9156 }
9157
9158 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
9159 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
9160 NumDataDwords - MaskPopDwords);
9161
9162 if (IsD16)
9163 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
9164
9165 EVT LegalReqRetVT = ReqRetVT;
9166 if (!ReqRetVT.isVector()) {
9167 if (!Data.getValueType().isInteger())
9168 Data = DAG.getNode(ISD::BITCAST, DL,
9169 Data.getValueType().changeTypeToInteger(), Data);
9170 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
9171 } else {
9172 // We need to widen the return vector to a legal type
9173 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
9174 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
9175 LegalReqRetVT =
9176          EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
9177 ReqRetVT.getVectorNumElements() + 1);
9178 }
9179 }
9180 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
9181
9182 if (IsTexFail) {
9183 TexFail =
9184 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
9185 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
9186
9187 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
9188 }
9189
9190 if (Result->getNumValues() == 1)
9191 return Data;
9192
9193 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
9194}
9195
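// Decode the texfailctrl immediate: bit 0 enables TFE and bit 1 enables LWE.
// Returns false if any other bit is set, i.e. the control value is invalid.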
9196static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
9197 SDValue *LWE, bool &IsTexFail) {
9198 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
9199
9200 uint64_t Value = TexFailCtrlConst->getZExtValue();
9201 if (Value) {
9202 IsTexFail = true;
9203 }
9204
9205 SDLoc DL(TexFailCtrlConst);
9206 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
9207 Value &= ~(uint64_t)0x1;
9208 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
9209 Value &= ~(uint64_t)0x2;
9210
9211 return Value == 0;
9212}
9213
9214static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
9215 MVT PackVectorVT,
9216 SmallVectorImpl<SDValue> &PackedAddrs,
9217 unsigned DimIdx, unsigned EndIdx,
9218 unsigned NumGradients) {
9219 SDLoc DL(Op);
9220 for (unsigned I = DimIdx; I < EndIdx; I++) {
9221 SDValue Addr = Op.getOperand(I);
9222
9223 // Gradients are packed with undef for each coordinate.
9224 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
9225 // 1D: undef,dx/dh; undef,dx/dv
9226 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
9227 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
9228 if (((I + 1) >= EndIdx) ||
9229 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
9230 I == DimIdx + NumGradients - 1))) {
9231 if (Addr.getValueType() != MVT::i16)
9232 Addr = DAG.getBitcast(MVT::i16, Addr);
9233 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
9234 } else {
9235 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
9236 I++;
9237 }
9238 Addr = DAG.getBitcast(MVT::f32, Addr);
9239 PackedAddrs.push_back(Addr);
9240 }
9241}
9242
9243SDValue SITargetLowering::lowerImage(SDValue Op,
9244                                     const AMDGPU::ImageDimIntrinsicInfo *Intr,
9245 SelectionDAG &DAG, bool WithChain) const {
9246 SDLoc DL(Op);
9247 MachineFunction &MF = DAG.getMachineFunction();
9248 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
9249 unsigned IntrOpcode = Intr->BaseOpcode;
9250 // For image atomic: use no-return opcode if result is unused.
9251 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode &&
9252 !Op.getNode()->hasAnyUseOfValue(0))
9253 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
9254 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9255      AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode);
9256 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
9257 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
9258 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9259 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9260
9261 SmallVector<EVT, 3> ResultTypes(Op->values());
9262 SmallVector<EVT, 3> OrigResultTypes(Op->values());
9263 if (BaseOpcode->NoReturn && BaseOpcode->Atomic)
9264 ResultTypes.erase(&ResultTypes[0]);
9265
9266 bool IsD16 = false;
9267 bool IsG16 = false;
9268 bool IsA16 = false;
9269 SDValue VData;
9270 int NumVDataDwords = 0;
9271 bool AdjustRetType = false;
9272 bool IsAtomicPacked16Bit = false;
9273
9274 // Offset of intrinsic arguments
9275 const unsigned ArgOffset = WithChain ? 2 : 1;
9276
9277 unsigned DMask;
9278 unsigned DMaskLanes = 0;
9279
9280 if (BaseOpcode->Atomic) {
9281 VData = Op.getOperand(2);
9282
9283 IsAtomicPacked16Bit =
9284 (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9285 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
9286 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
9287 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
9288
9289 bool Is64Bit = VData.getValueSizeInBits() == 64;
9290 if (BaseOpcode->AtomicX2) {
9291 SDValue VData2 = Op.getOperand(3);
9292 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
9293 {VData, VData2});
9294 if (Is64Bit)
9295 VData = DAG.getBitcast(MVT::v4i32, VData);
9296
9297 if (!BaseOpcode->NoReturn)
9298 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9299
9300 DMask = Is64Bit ? 0xf : 0x3;
9301 NumVDataDwords = Is64Bit ? 4 : 2;
9302 } else {
9303 DMask = Is64Bit ? 0x3 : 0x1;
9304 NumVDataDwords = Is64Bit ? 2 : 1;
9305 }
9306 } else {
9307 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
9308 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
9309
9310 if (BaseOpcode->Store) {
9311 VData = Op.getOperand(2);
9312
9313 MVT StoreVT = VData.getSimpleValueType();
9314 if (StoreVT.getScalarType() == MVT::f16) {
9315 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9316 return Op; // D16 is unsupported for this instruction
9317
9318 IsD16 = true;
9319 VData = handleD16VData(VData, DAG, true);
9320 }
9321
9322 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
9323 } else if (!BaseOpcode->NoReturn) {
9324      // Work out the number of dwords based on the dmask popcount and underlying type
9325 // and whether packing is supported.
9326 MVT LoadVT = ResultTypes[0].getSimpleVT();
9327 if (LoadVT.getScalarType() == MVT::f16) {
9328 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9329 return Op; // D16 is unsupported for this instruction
9330
9331 IsD16 = true;
9332 }
9333
9334 // Confirm that the return type is large enough for the dmask specified
9335 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
9336 (!LoadVT.isVector() && DMaskLanes > 1))
9337 return Op;
9338
9339      // The sq block of gfx8 and gfx9 does not estimate register use correctly
9340 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
9341 // instructions.
9342 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9343 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9344 NumVDataDwords = (DMaskLanes + 1) / 2;
9345 else
9346 NumVDataDwords = DMaskLanes;
9347
9348 AdjustRetType = true;
9349 }
9350 }
9351
9352 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
9353  SmallVector<SDValue, 4> VAddrs;
9354
9355 // Check for 16 bit addresses or derivatives and pack if true.
9356 MVT VAddrVT =
9357 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
9358 MVT VAddrScalarVT = VAddrVT.getScalarType();
9359 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9360 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9361
9362 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
9363 VAddrScalarVT = VAddrVT.getScalarType();
9364 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9365 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9366
9367 // Push back extra arguments.
9368 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
9369 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
9370 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
9371 // Special handling of bias when A16 is on. Bias is of type half but
9372      // occupies a full 32 bits.
9373 SDValue Bias = DAG.getBuildVector(
9374 MVT::v2f16, DL,
9375 {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
9376 VAddrs.push_back(Bias);
9377 } else {
9378 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
9379 "Bias needs to be converted to 16 bit in A16 mode");
9380 VAddrs.push_back(Op.getOperand(ArgOffset + I));
9381 }
9382 }
9383
9384 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
9385 // 16 bit gradients are supported, but are tied to the A16 control
9386 // so both gradients and addresses must be 16 bit
9387 LLVM_DEBUG(
9388 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
9389 "require 16 bit args for both gradients and addresses");
9390 return Op;
9391 }
9392
9393 if (IsA16) {
9394 if (!ST->hasA16()) {
9395 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
9396 "support 16 bit addresses\n");
9397 return Op;
9398 }
9399 }
9400
9401  // We've dealt with incorrect input, so we know that if IsA16 or IsG16 is
9402  // set then we have to compress/pack the operands (either the addresses,
9403  // the gradients, or both).
9404  // In the case where A16 and gradients are tied (no G16 support), we have
9405  // already verified that both IsA16 and IsG16 are true.
9406 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
9407 // Activate g16
9408 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9409      AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
9410 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
9411 }
9412
9413 // Add gradients (packed or unpacked)
9414 if (IsG16) {
9415 // Pack the gradients
9416 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
9417 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
9418 ArgOffset + Intr->GradientStart,
9419 ArgOffset + Intr->CoordStart, Intr->NumGradients);
9420 } else {
9421 for (unsigned I = ArgOffset + Intr->GradientStart;
9422 I < ArgOffset + Intr->CoordStart; I++)
9423 VAddrs.push_back(Op.getOperand(I));
9424 }
9425
9426 // Add addresses (packed or unpacked)
9427 if (IsA16) {
9428 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
9429 ArgOffset + Intr->CoordStart, VAddrEnd,
9430 0 /* No gradients */);
9431 } else {
9432 // Add uncompressed address
9433 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
9434 VAddrs.push_back(Op.getOperand(I));
9435 }
9436
9437 // If the register allocator cannot place the address registers contiguously
9438 // without introducing moves, then using the non-sequential address encoding
9439 // is always preferable, since it saves VALU instructions and is usually a
9440 // wash in terms of code size or even better.
9441 //
9442 // However, we currently have no way of hinting to the register allocator that
9443 // MIMG addresses should be placed contiguously when it is possible to do so,
9444 // so force non-NSA for the common 2-address case as a heuristic.
9445 //
9446 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
9447 // allocation when possible.
9448 //
9449 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
9450 // set of the remaining addresses.
9451 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
9452 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
9453 const bool UseNSA = ST->hasNSAEncoding() &&
9454 VAddrs.size() >= ST->getNSAThreshold(MF) &&
9455 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
9456 const bool UsePartialNSA =
9457 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
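  // With partial NSA, the first NSAMaxSize - 1 addresses remain separate
  // operands and the remaining addresses are packed into one contiguous
  // vector register built below with getBuildDwordsVector.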
9458
9459 SDValue VAddr;
9460 if (UsePartialNSA) {
9461 VAddr = getBuildDwordsVector(DAG, DL,
9462 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9463 } else if (!UseNSA) {
9464 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
9465 }
9466
9467 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
9468 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
9469 SDValue Unorm;
9470 if (!BaseOpcode->Sampler) {
9471 Unorm = True;
9472 } else {
9473 uint64_t UnormConst =
9474 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
9475
9476 Unorm = UnormConst ? True : False;
9477 }
9478
9479 SDValue TFE;
9480 SDValue LWE;
9481 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
9482 bool IsTexFail = false;
9483 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9484 return Op;
9485
9486 if (IsTexFail) {
9487 if (!DMaskLanes) {
9488      // We expect to get an error flag since TFC is on and dmask is 0.
9489      // Force dmask to be at least 1, otherwise the instruction will fail.
9490 DMask = 0x1;
9491 DMaskLanes = 1;
9492 NumVDataDwords = 1;
9493 }
9494 NumVDataDwords += 1;
9495 AdjustRetType = true;
9496 }
9497
9498  // Something earlier tagged that the return type needs adjusting.
9499  // This happens if the instruction is a load or has TexFailCtrl flags set.
9500 if (AdjustRetType) {
9501 // NumVDataDwords reflects the true number of dwords required in the return
9502 // type
9503 if (DMaskLanes == 0 && !BaseOpcode->Store) {
9504 // This is a no-op load. This can be eliminated
9505 SDValue Undef = DAG.getPOISON(Op.getValueType());
9506 if (isa<MemSDNode>(Op))
9507 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
9508 return Undef;
9509 }
9510
9511 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
9512 MVT::i32, NumVDataDwords)
9513 : MVT::i32;
9514
9515 ResultTypes[0] = NewVT;
9516 if (ResultTypes.size() == 3) {
9517      // The original result was an aggregate type used for TexFailCtrl results.
9518      // The actual instruction returns a vector type, which has now been
9519      // created. Remove the aggregate result.
9520 ResultTypes.erase(&ResultTypes[1]);
9521 }
9522 }
9523
9524 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
9525 // Keep GLC only when the atomic's result is actually used.
9526 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
9527    CPol |= AMDGPU::CPol::GLC;
9528 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
9529               AMDGPU::CPol::VOLATILE))
9530 return Op;
9531  SmallVector<SDValue, 26> Ops;
9531
9533 if (BaseOpcode->Store || BaseOpcode->Atomic)
9534 Ops.push_back(VData); // vdata
9535 if (UsePartialNSA) {
9536 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
9537 Ops.push_back(VAddr);
9538 } else if (UseNSA)
9539 append_range(Ops, VAddrs);
9540 else
9541 Ops.push_back(VAddr);
9542 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
9543 EVT RsrcVT = Rsrc.getValueType();
9544 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9545 return Op;
9546 Ops.push_back(Rsrc);
9547 if (BaseOpcode->Sampler) {
9548 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
9549 if (Samp.getValueType() != MVT::v4i32)
9550 return Op;
9551 Ops.push_back(Samp);
9552 }
9553 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
9554 if (IsGFX10Plus)
9555 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
9556 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9557 Ops.push_back(Unorm);
9558 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
9559 Ops.push_back(IsA16 && // r128, a16 for gfx9
9560 ST->hasFeature(AMDGPU::FeatureR128A16)
9561 ? True
9562 : False);
9563 if (IsGFX10Plus)
9564 Ops.push_back(IsA16 ? True : False);
9565
9566 if (!Subtarget->hasGFX90AInsts())
9567 Ops.push_back(TFE); // tfe
9568 else if (TFE->getAsZExtVal()) {
9569 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9570        DAG.getMachineFunction().getFunction(),
9571 "TFE is not supported on this GPU", DL.getDebugLoc()));
9572 }
9573
9574 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9575 Ops.push_back(LWE); // lwe
9576 if (!IsGFX10Plus)
9577 Ops.push_back(DimInfo->DA ? True : False);
9578 if (BaseOpcode->HasD16)
9579 Ops.push_back(IsD16 ? True : False);
9580 if (isa<MemSDNode>(Op))
9581 Ops.push_back(Op.getOperand(0)); // chain
9582
9583 int NumVAddrDwords =
9584 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
9585 int Opcode = -1;
9586
9587 if (IsGFX12Plus) {
9588 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
9589 NumVDataDwords, NumVAddrDwords);
9590 } else if (IsGFX11Plus) {
9591 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9592 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9593 : AMDGPU::MIMGEncGfx11Default,
9594 NumVDataDwords, NumVAddrDwords);
9595 } else if (IsGFX10Plus) {
9596 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9597 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9598 : AMDGPU::MIMGEncGfx10Default,
9599 NumVDataDwords, NumVAddrDwords);
9600 } else {
9601 if (Subtarget->hasGFX90AInsts()) {
9602 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
9603 NumVDataDwords, NumVAddrDwords);
9604 if (Opcode == -1) {
9605 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9606            DAG.getMachineFunction().getFunction(),
9607 "requested image instruction is not supported on this GPU",
9608 DL.getDebugLoc()));
9609
9610 unsigned Idx = 0;
9611 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
9612 for (EVT VT : OrigResultTypes) {
9613 if (VT == MVT::Other)
9614 RetValues[Idx++] = Op.getOperand(0); // Chain
9615 else
9616 RetValues[Idx++] = DAG.getPOISON(VT);
9617 }
9618
9619 return DAG.getMergeValues(RetValues, DL);
9620 }
9621 }
9622 if (Opcode == -1 &&
9623 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9624 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
9625 NumVDataDwords, NumVAddrDwords);
9626 if (Opcode == -1)
9627 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
9628 NumVDataDwords, NumVAddrDwords);
9629 }
9630 if (Opcode == -1)
9631 return Op;
9632
9633 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
9634 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
9635 MachineMemOperand *MemRef = MemOp->getMemOperand();
9636 DAG.setNodeMemRefs(NewNode, {MemRef});
9637 }
9638
9639 if (BaseOpcode->NoReturn) {
9640 if (BaseOpcode->Atomic)
9641 return DAG.getMergeValues(
9642 {DAG.getPOISON(OrigResultTypes[0]), SDValue(NewNode, 0)}, DL);
9643
9644 return SDValue(NewNode, 0);
9645 }
9646
9647 if (BaseOpcode->AtomicX2) {
9648    SmallVector<SDValue, 1> Elt;
9649 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
9650 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
9651 }
9652
9653 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
9654 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9655 NumVDataDwords, IsAtomicPacked16Bit, DL);
9656}
9657
9658SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
9659 SDValue Offset, SDValue CachePolicy,
9660 SelectionDAG &DAG) const {
9661 MachineFunction &MF = DAG.getMachineFunction();
9662
9663 const DataLayout &DataLayout = DAG.getDataLayout();
9664 Align Alignment =
9665 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
9666
9667 MachineMemOperand *MMO = MF.getMachineMemOperand(
9668 MachinePointerInfo(),
9669      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
9670          MachineMemOperand::MOInvariant,
9671 VT.getStoreSize(), Alignment);
9672
9673 if (!Offset->isDivergent()) {
9674 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
9675
9676 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
9677 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
9678 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
9679 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
9680 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9681 SDValue BufferLoad =
9682 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
9683 DAG.getVTList(MVT::i32), Ops, VT, MMO);
9684 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
9685 }
9686
9687 // Widen vec3 load to vec4.
9688 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
9689 !Subtarget->hasScalarDwordx3Loads()) {
9690 EVT WidenedVT =
9691          EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
9692 auto WidenedOp = DAG.getMemIntrinsicNode(
9693 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
9694 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
9695 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
9696 DAG.getVectorIdxConstant(0, DL));
9697 return Subvector;
9698 }
9699
9700 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
9701 DAG.getVTList(VT), Ops, VT, MMO);
9702 }
9703
9704 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
9705 // assume that the buffer is unswizzled.
9706 SDValue Ops[] = {
9707 DAG.getEntryNode(), // Chain
9708 Rsrc, // rsrc
9709 DAG.getConstant(0, DL, MVT::i32), // vindex
9710 {}, // voffset
9711 {}, // soffset
9712 {}, // offset
9713 CachePolicy, // cachepolicy
9714 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9715 };
9716 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9717 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
9718 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
9719 }
9720  SmallVector<SDValue, 4> Loads;
9720
9722 unsigned NumLoads = 1;
9723 MVT LoadVT = VT.getSimpleVT();
9724 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
9725 assert((LoadVT.getScalarType() == MVT::i32 ||
9726 LoadVT.getScalarType() == MVT::f32));
9727
9728 if (NumElts == 8 || NumElts == 16) {
9729 NumLoads = NumElts / 4;
9730 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
9731 }
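  // e.g. an <8 x i32> load with a divergent offset becomes two <4 x i32>
  // buffer loads at offsets +0 and +16 that are concatenated below.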
9732
9733 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
9734
9735 // Use the alignment to ensure that the required offsets will fit into the
9736  // immediate offset fields.
9737 setBufferOffsets(Offset, DAG, &Ops[3],
9738 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
9739
9740 uint64_t InstOffset = Ops[5]->getAsZExtVal();
9741 for (unsigned i = 0; i < NumLoads; ++i) {
9742 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
9743 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
9744 LoadVT, MMO, DAG));
9745 }
9746
9747 if (NumElts == 8 || NumElts == 16)
9748 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
9749
9750 return Loads[0];
9751}
9752
9753SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
9754 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
9755 if (!Subtarget->hasArchitectedSGPRs())
9756 return {};
9757 SDLoc SL(Op);
9758 MVT VT = MVT::i32;
9759 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
9760 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
9761 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
9762}
9763
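// Read a hardware register field with S_GETREG_B32_const, encoding the
// (register, low bit, width) triple into the hwreg immediate operand.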
9764SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
9765 AMDGPU::Hwreg::Id HwReg,
9766 unsigned LowBit,
9767 unsigned Width) const {
9768 SDLoc SL(Op);
9769 using namespace AMDGPU::Hwreg;
9770 return {DAG.getMachineNode(
9771 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
9772 DAG.getTargetConstant(HwregEncoding::encode(HwReg, LowBit, Width),
9773 SL, MVT::i32)),
9774 0};
9775}
9776
9777SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
9778 unsigned Dim,
9779 const ArgDescriptor &Arg) const {
9780 SDLoc SL(Op);
9781 MachineFunction &MF = DAG.getMachineFunction();
9782 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
9783 if (MaxID == 0)
9784 return DAG.getConstant(0, SL, MVT::i32);
9785
9786 // It's undefined behavior if a function marked with the amdgpu-no-*
9787 // attributes uses the corresponding intrinsic.
9788 if (!Arg)
9789 return DAG.getPOISON(Op->getValueType(0));
9790
9791 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
9792 SDLoc(DAG.getEntryNode()), Arg);
9793
9794 // Don't bother inserting AssertZext for packed IDs since we're emitting the
9795 // masking operations anyway.
9796 //
9797 // TODO: We could assert the top bit is 0 for the source copy.
9798 if (Arg.isMasked())
9799 return Val;
9800
9801 // Preserve the known bits after expansion to a copy.
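  // e.g. if the maximum workitem ID in this dimension is 1023, assert that
  // the value fits in bit_width(1023) == 10 bits.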
9802 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
9803 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
9804 DAG.getValueType(SmallVT));
9805}
9806
9807SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
9808 SelectionDAG &DAG) const {
9809 MachineFunction &MF = DAG.getMachineFunction();
9810 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
9811
9812 EVT VT = Op.getValueType();
9813 SDLoc DL(Op);
9814 unsigned IntrinsicID = Op.getConstantOperandVal(0);
9815
9816 // TODO: Should this propagate fast-math-flags?
9817
9818 switch (IntrinsicID) {
9819 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9820 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
9821 return emitNonHSAIntrinsicError(DAG, DL, VT);
9822 return getPreloadedValue(DAG, *MFI, VT,
9823 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
9824 }
9825 case Intrinsic::amdgcn_dispatch_ptr:
9826 case Intrinsic::amdgcn_queue_ptr: {
9827 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
9828 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9829 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
9830 DL.getDebugLoc()));
9831 return DAG.getPOISON(VT);
9832 }
9833
9834 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9835 ? AMDGPUFunctionArgInfo::DISPATCH_PTR
9836 : AMDGPUFunctionArgInfo::QUEUE_PTR;
9837 return getPreloadedValue(DAG, *MFI, VT, RegID);
9838 }
9839 case Intrinsic::amdgcn_implicitarg_ptr: {
9840 if (MFI->isEntryFunction())
9841 return getImplicitArgPtr(DAG, DL);
9842 return getPreloadedValue(DAG, *MFI, VT,
9843 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
9844 }
9845 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9846 if (!AMDGPU::isKernel(MF.getFunction())) {
9847 // This only makes sense to call in a kernel, so just lower to null.
9848 return DAG.getConstant(0, DL, VT);
9849 }
9850
9851 return getPreloadedValue(DAG, *MFI, VT,
9852 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
9853 }
9854 case Intrinsic::amdgcn_dispatch_id: {
9855 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
9856 }
9857 case Intrinsic::amdgcn_rcp:
9858 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
9859 case Intrinsic::amdgcn_rsq:
9860 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9861 case Intrinsic::amdgcn_rsq_legacy:
9862 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9863 return emitRemovedIntrinsicError(DAG, DL, VT);
9864 return SDValue();
9865 case Intrinsic::amdgcn_rcp_legacy:
9866 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9867 return emitRemovedIntrinsicError(DAG, DL, VT);
9868 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
9869 case Intrinsic::amdgcn_rsq_clamp: {
9870 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9871 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
9872
9873 Type *Type = VT.getTypeForEVT(*DAG.getContext());
9874 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
9875 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
9876
9877 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9878 SDValue Tmp =
9879 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
9880 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
9881 DAG.getConstantFP(Min, DL, VT));
9882 }
9883 case Intrinsic::r600_read_ngroups_x:
9884 if (Subtarget->isAmdHsaOS())
9885 return emitNonHSAIntrinsicError(DAG, DL, VT);
9886
9887 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9888 SI::KernelInputOffsets::NGROUPS_X, Align(4),
9889 false);
9890 case Intrinsic::r600_read_ngroups_y:
9891 if (Subtarget->isAmdHsaOS())
9892 return emitNonHSAIntrinsicError(DAG, DL, VT);
9893
9894 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9895 SI::KernelInputOffsets::NGROUPS_Y, Align(4),
9896 false);
9897 case Intrinsic::r600_read_ngroups_z:
9898 if (Subtarget->isAmdHsaOS())
9899 return emitNonHSAIntrinsicError(DAG, DL, VT);
9900
9901 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9902 SI::KernelInputOffsets::NGROUPS_Z, Align(4),
9903 false);
9904 case Intrinsic::r600_read_local_size_x:
9905 if (Subtarget->isAmdHsaOS())
9906 return emitNonHSAIntrinsicError(DAG, DL, VT);
9907
9908 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9909 SI::KernelInputOffsets::LOCAL_SIZE_X);
9910 case Intrinsic::r600_read_local_size_y:
9911 if (Subtarget->isAmdHsaOS())
9912 return emitNonHSAIntrinsicError(DAG, DL, VT);
9913
9914 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9915 SI::KernelInputOffsets::LOCAL_SIZE_Y);
9916 case Intrinsic::r600_read_local_size_z:
9917 if (Subtarget->isAmdHsaOS())
9918 return emitNonHSAIntrinsicError(DAG, DL, VT);
9919
9920 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9921 SI::KernelInputOffsets::LOCAL_SIZE_Z);
9922 case Intrinsic::amdgcn_workgroup_id_x:
9923 return lowerWorkGroupId(DAG, *MFI, VT,
9927 case Intrinsic::amdgcn_workgroup_id_y:
9928 return lowerWorkGroupId(DAG, *MFI, VT,
9932 case Intrinsic::amdgcn_workgroup_id_z:
9933 return lowerWorkGroupId(DAG, *MFI, VT,
9937 case Intrinsic::amdgcn_cluster_id_x:
9938 return Subtarget->hasClusters()
9939 ? getPreloadedValue(DAG, *MFI, VT,
9940 AMDGPUFunctionArgInfo::CLUSTER_ID_X)
9941 : DAG.getPOISON(VT);
9942 case Intrinsic::amdgcn_cluster_id_y:
9943 return Subtarget->hasClusters()
9944 ? getPreloadedValue(DAG, *MFI, VT,
9945 AMDGPUFunctionArgInfo::CLUSTER_ID_Y)
9946 : DAG.getPOISON(VT);
9947 case Intrinsic::amdgcn_cluster_id_z:
9948 return Subtarget->hasClusters()
9949 ? getPreloadedValue(DAG, *MFI, VT,
9950 AMDGPUFunctionArgInfo::CLUSTER_ID_Z)
9951 : DAG.getPOISON(VT);
9952 case Intrinsic::amdgcn_cluster_workgroup_id_x:
9953 return Subtarget->hasClusters()
9954 ? getPreloadedValue(
9955 DAG, *MFI, VT,
9956 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X)
9957 : DAG.getPOISON(VT);
9958 case Intrinsic::amdgcn_cluster_workgroup_id_y:
9959 return Subtarget->hasClusters()
9960 ? getPreloadedValue(
9961 DAG, *MFI, VT,
9962 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y)
9963 : DAG.getPOISON(VT);
9964 case Intrinsic::amdgcn_cluster_workgroup_id_z:
9965 return Subtarget->hasClusters()
9966 ? getPreloadedValue(
9967 DAG, *MFI, VT,
9968 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z)
9969 : DAG.getPOISON(VT);
9970 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
9971 return Subtarget->hasClusters()
9972 ? lowerConstHwRegRead(DAG, Op, AMDGPU::Hwreg::ID_IB_STS2, 21, 4)
9973 : SDValue();
9974 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
9975 return Subtarget->hasClusters()
9976 ? getPreloadedValue(
9977 DAG, *MFI, VT,
9978 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X)
9979 : DAG.getPOISON(VT);
9980 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
9981 return Subtarget->hasClusters()
9982 ? getPreloadedValue(
9983 DAG, *MFI, VT,
9984 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y)
9985 : DAG.getPOISON(VT);
9986 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
9987 return Subtarget->hasClusters()
9988 ? getPreloadedValue(
9989 DAG, *MFI, VT,
9990 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z)
9991 : DAG.getPOISON(VT);
9992 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
9993 return Subtarget->hasClusters()
9994 ? getPreloadedValue(
9995 DAG, *MFI, VT,
9996 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID)
9997 : DAG.getPOISON(VT);
9998 case Intrinsic::amdgcn_wave_id:
9999 return lowerWaveID(DAG, Op);
10000 case Intrinsic::amdgcn_lds_kernel_id: {
10001 if (MFI->isEntryFunction())
10002 return getLDSKernelId(DAG, DL);
10003 return getPreloadedValue(DAG, *MFI, VT,
10004 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
10005 }
10006 case Intrinsic::amdgcn_workitem_id_x:
10007 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
10008 case Intrinsic::amdgcn_workitem_id_y:
10009 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
10010 case Intrinsic::amdgcn_workitem_id_z:
10011 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
10012 case Intrinsic::amdgcn_wavefrontsize:
10013 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
10014 SDLoc(Op), MVT::i32);
10015 case Intrinsic::amdgcn_s_buffer_load: {
10016 unsigned CPol = Op.getConstantOperandVal(3);
10017 // s_buffer_load, because of how it's optimized, can't be volatile
10018 // so reject ones with the volatile bit set.
10019 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
10020 ? AMDGPU::CPol::ALL
10021 : AMDGPU::CPol::ALL_pregfx12))
10022 return Op;
10023 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
10024 Op.getOperand(3), DAG);
10025 }
10026 case Intrinsic::amdgcn_fdiv_fast:
10027 return lowerFDIV_FAST(Op, DAG);
10028 case Intrinsic::amdgcn_sin:
10029 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
10030
10031 case Intrinsic::amdgcn_cos:
10032 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
10033
10034 case Intrinsic::amdgcn_mul_u24:
10035 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
10036 Op.getOperand(2));
10037 case Intrinsic::amdgcn_mul_i24:
10038 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
10039 Op.getOperand(2));
10040
10041 case Intrinsic::amdgcn_log_clamp: {
10042 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
10043 return SDValue();
10044
10045 return emitRemovedIntrinsicError(DAG, DL, VT);
10046 }
10047 case Intrinsic::amdgcn_fract:
10048 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
10049
10050 case Intrinsic::amdgcn_class:
10051 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
10052 Op.getOperand(2));
10053 case Intrinsic::amdgcn_div_fmas:
10054 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
10055 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
10056
10057 case Intrinsic::amdgcn_div_fixup:
10058 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
10059 Op.getOperand(2), Op.getOperand(3));
10060
10061 case Intrinsic::amdgcn_div_scale: {
10062 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
10063
10064 // Translate to the operands expected by the machine instruction. The
10065 // first parameter must be the same as the first instruction.
10066 SDValue Numerator = Op.getOperand(1);
10067 SDValue Denominator = Op.getOperand(2);
10068
10069 // Note this order is opposite of the machine instruction's operations,
10070 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
10071 // intrinsic has the numerator as the first operand to match a normal
10072 // division operation.
10073
10074 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
10075
10076 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
10077 Denominator, Numerator);
10078 }
10079 case Intrinsic::amdgcn_icmp: {
10080 // There is a Pat that handles this variant, so return it as-is.
10081 if (Op.getOperand(1).getValueType() == MVT::i1 &&
10082 Op.getConstantOperandVal(2) == 0 &&
10083 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
10084 return Op;
10085 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
10086 }
10087 case Intrinsic::amdgcn_fcmp: {
10088 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
10089 }
10090 case Intrinsic::amdgcn_ballot:
10091 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
10092 case Intrinsic::amdgcn_fmed3:
10093 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
10094 Op.getOperand(2), Op.getOperand(3));
10095 case Intrinsic::amdgcn_fdot2:
10096 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
10097 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
10098 case Intrinsic::amdgcn_fmul_legacy:
10099 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
10100 Op.getOperand(2));
10101 case Intrinsic::amdgcn_sffbh:
10102 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
10103 case Intrinsic::amdgcn_sbfe:
10104 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
10105 Op.getOperand(2), Op.getOperand(3));
10106 case Intrinsic::amdgcn_ubfe:
10107 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
10108 Op.getOperand(2), Op.getOperand(3));
10109 case Intrinsic::amdgcn_cvt_pkrtz:
10110 case Intrinsic::amdgcn_cvt_pknorm_i16:
10111 case Intrinsic::amdgcn_cvt_pknorm_u16:
10112 case Intrinsic::amdgcn_cvt_pk_i16:
10113 case Intrinsic::amdgcn_cvt_pk_u16: {
10114 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
10115 EVT VT = Op.getValueType();
10116 unsigned Opcode;
10117
10118 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10119 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
10120 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10121 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
10122 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10123 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
10124 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10125 Opcode = AMDGPUISD::CVT_PK_I16_I32;
10126 else
10127 Opcode = AMDGPUISD::CVT_PK_U16_U32;
10128
10129 if (isTypeLegal(VT))
10130 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
10131
10132 SDValue Node =
10133 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
10134 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
10135 }
10136 case Intrinsic::amdgcn_fmad_ftz:
10137 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
10138 Op.getOperand(2), Op.getOperand(3));
10139
10140 case Intrinsic::amdgcn_if_break:
10141 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
10142 Op->getOperand(1), Op->getOperand(2)),
10143 0);
10144
10145 case Intrinsic::amdgcn_groupstaticsize: {
10146 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
10147 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
10148 return Op;
10149
10150 const Module *M = MF.getFunction().getParent();
10151 const GlobalValue *GV =
10152 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
10153 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
10154 SIInstrInfo::MO_ABS32_LO);
10155 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10156 }
10157 case Intrinsic::amdgcn_is_shared:
10158 case Intrinsic::amdgcn_is_private: {
10159 SDLoc SL(Op);
10160 SDValue SrcVec =
10161 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10162 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
10163 DAG.getConstant(1, SL, MVT::i32));
10164
10165 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10166 ? AMDGPUAS::LOCAL_ADDRESS
10167 : AMDGPUAS::PRIVATE_ADDRESS;
10168 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
10169 Subtarget->hasGloballyAddressableScratch()) {
10170 SDValue FlatScratchBaseHi(
10171 DAG.getMachineNode(
10172 AMDGPU::S_MOV_B32, DL, MVT::i32,
10173 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10174 0);
10175 // Test bits 63..58 against the aperture address.
10176 return DAG.getSetCC(
10177 SL, MVT::i1,
10178 DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
10179 DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
10180 }
10181
10182 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10183 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
10184 }
10185 case Intrinsic::amdgcn_perm:
10186 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
10187 Op.getOperand(2), Op.getOperand(3));
10188 case Intrinsic::amdgcn_reloc_constant: {
10189 Module *M = MF.getFunction().getParent();
10190 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
10191 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
10192 auto *RelocSymbol = cast<GlobalVariable>(
10193 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
10194 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
10195 SIInstrInfo::MO_ABS32_LO);
10196 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10197 }
10198 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10199 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10200 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10201 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10202 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10203 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10204 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10205 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10206 if (Op.getOperand(4).getValueType() == MVT::i32)
10207 return SDValue();
10208
10209 SDLoc SL(Op);
10210 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
10211 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10212 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10213 Op.getOperand(3), IndexKeyi32);
10214 }
10215 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10216 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10217 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10218 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10219 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10220 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10221 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10222 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10223 if (Op.getOperand(4).getValueType() == MVT::i64)
10224 return SDValue();
10225
10226 SDLoc SL(Op);
10227 auto IndexKeyi64 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
10228 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10229 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10230 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10231 Op.getOperand(6)});
10232 }
10233 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10234 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10235 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10236 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10237 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10238 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10239 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10240 ? MVT::i64
10241 : MVT::i32;
10242 if (Op.getOperand(6).getValueType() == IndexKeyTy)
10243 return SDValue();
10244
10245 SDLoc SL(Op);
10246 auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
10247 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10248 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10249 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10250 IndexKey, Op.getOperand(7),
10251 Op.getOperand(8)}); // No clamp operand
10252 }
10253 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10254 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10255 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10256 if (Op.getOperand(6).getValueType() == MVT::i32)
10257 return SDValue();
10258
10259 SDLoc SL(Op);
10260 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
10261 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10262 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10263 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10264 IndexKeyi32, Op.getOperand(7)});
10265 }
10266 case Intrinsic::amdgcn_addrspacecast_nonnull:
10267 return lowerADDRSPACECAST(Op, DAG);
10268 case Intrinsic::amdgcn_readlane:
10269 case Intrinsic::amdgcn_readfirstlane:
10270 case Intrinsic::amdgcn_writelane:
10271 case Intrinsic::amdgcn_permlane16:
10272 case Intrinsic::amdgcn_permlanex16:
10273 case Intrinsic::amdgcn_permlane64:
10274 case Intrinsic::amdgcn_set_inactive:
10275 case Intrinsic::amdgcn_set_inactive_chain_arg:
10276 case Intrinsic::amdgcn_mov_dpp8:
10277 case Intrinsic::amdgcn_update_dpp:
10278 return lowerLaneOp(*this, Op.getNode(), DAG);
10279 case Intrinsic::amdgcn_dead: {
10280 SmallVector<SDValue, 8> Poisons;
10281 for (const EVT ValTy : Op.getNode()->values())
10282 Poisons.push_back(DAG.getPOISON(ValTy));
10283 return DAG.getMergeValues(Poisons, SDLoc(Op));
10284 }
10285 default:
10286 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10287 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
10288 return lowerImage(Op, ImageDimIntr, DAG, false);
10289
10290 return Op;
10291 }
10292}
10293
10294// On targets not supporting constant in soffset field, turn zero to
10295// SGPR_NULL to avoid generating an extra s_mov with zero.
10296 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
10297 const GCNSubtarget *Subtarget) {
10298 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
10299 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10300 return SOffset;
10301}
10302
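// Raw buffer atomics carry no vindex: it is hardwired to 0 with idxen false,
// while the struct variants below forward the intrinsic's vindex and set idxen.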
10303SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
10304 SelectionDAG &DAG,
10305 unsigned NewOpcode) const {
10306 SDLoc DL(Op);
10307
10308 SDValue VData = Op.getOperand(2);
10309 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10310 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10311 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10312 SDValue Ops[] = {
10313 Op.getOperand(0), // Chain
10314 VData, // vdata
10315 Rsrc, // rsrc
10316 DAG.getConstant(0, DL, MVT::i32), // vindex
10317 VOffset, // voffset
10318 SOffset, // soffset
10319 Offset, // offset
10320 Op.getOperand(6), // cachepolicy
10321 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10322 };
10323
10324 auto *M = cast<MemSDNode>(Op);
10325
10326 EVT MemVT = VData.getValueType();
10327 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10328 M->getMemOperand());
10329}
10330
10331SDValue
10332SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
10333 unsigned NewOpcode) const {
10334 SDLoc DL(Op);
10335
10336 SDValue VData = Op.getOperand(2);
10337 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10338 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10339 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10340 SDValue Ops[] = {
10341 Op.getOperand(0), // Chain
10342 VData, // vdata
10343 Rsrc, // rsrc
10344 Op.getOperand(4), // vindex
10345 VOffset, // voffset
10346 SOffset, // soffset
10347 Offset, // offset
10348 Op.getOperand(7), // cachepolicy
10349 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10350 };
10351
10352 auto *M = cast<MemSDNode>(Op);
10353
10354 EVT MemVT = VData.getValueType();
10355 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10356 M->getMemOperand());
10357}
10358
10359SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
10360 SelectionDAG &DAG) const {
10361 unsigned IntrID = Op.getConstantOperandVal(1);
10362 SDLoc DL(Op);
10363
10364 switch (IntrID) {
10365 case Intrinsic::amdgcn_ds_ordered_add:
10366 case Intrinsic::amdgcn_ds_ordered_swap: {
10367 MemSDNode *M = cast<MemSDNode>(Op);
10368 SDValue Chain = M->getOperand(0);
10369 SDValue M0 = M->getOperand(2);
10370 SDValue Value = M->getOperand(3);
10371 unsigned IndexOperand = M->getConstantOperandVal(7);
10372 unsigned WaveRelease = M->getConstantOperandVal(8);
10373 unsigned WaveDone = M->getConstantOperandVal(9);
10374
10375 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10376 IndexOperand &= ~0x3f;
10377 unsigned CountDw = 0;
10378
10379 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
10380 CountDw = (IndexOperand >> 24) & 0xf;
10381 IndexOperand &= ~(0xf << 24);
10382
10383 if (CountDw < 1 || CountDw > 4) {
10384 const Function &Fn = DAG.getMachineFunction().getFunction();
10385 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10386 Fn, "ds_ordered_count: dword count must be between 1 and 4",
10387 DL.getDebugLoc()));
10388 CountDw = 1;
10389 }
10390 }
10391
10392 if (IndexOperand) {
10393 const Function &Fn = DAG.getMachineFunction().getFunction();
10394 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10395 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
10396 }
10397
10398 if (WaveDone && !WaveRelease) {
10399 // TODO: Move this to IR verifier
10400 const Function &Fn = DAG.getMachineFunction().getFunction();
10401 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10402 Fn, "ds_ordered_count: wave_done requires wave_release",
10403 DL.getDebugLoc()));
10404 }
10405
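// Pack the DS_ORDERED_COUNT immediate: the low byte (offset0) holds the
// ordered-count index scaled to bytes; the high byte (offset1) encodes
// wave_release, wave_done, the add/swap opcode, the shader type on
// pre-GFX11 targets, and the dword count minus one on GFX10+.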
10406 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10407 unsigned ShaderType =
10408 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
10409 unsigned Offset0 = OrderedCountIndex << 2;
10410 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10411
10412 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
10413 Offset1 |= (CountDw - 1) << 6;
10414
10415 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
10416 Offset1 |= ShaderType << 2;
10417
10418 unsigned Offset = Offset0 | (Offset1 << 8);
10419
10420 SDValue Ops[] = {
10421 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
10422 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
10423 };
10424 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
10425 M->getVTList(), Ops, M->getMemoryVT(),
10426 M->getMemOperand());
10427 }
10428 case Intrinsic::amdgcn_raw_buffer_load:
10429 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10430 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10431 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10432 case Intrinsic::amdgcn_raw_buffer_load_format:
10433 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10434 const bool IsFormat =
10435 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10436 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10437
10438 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10439 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10440 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10441 SDValue Ops[] = {
10442 Op.getOperand(0), // Chain
10443 Rsrc, // rsrc
10444 DAG.getConstant(0, DL, MVT::i32), // vindex
10445 VOffset, // voffset
10446 SOffset, // soffset
10447 Offset, // offset
10448 Op.getOperand(5), // cachepolicy, swizzled buffer
10449 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10450 };
10451
10452 auto *M = cast<MemSDNode>(Op);
10453 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
10454 }
10455 case Intrinsic::amdgcn_struct_buffer_load:
10456 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10457 case Intrinsic::amdgcn_struct_buffer_load_format:
10458 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10459 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10460 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10461 const bool IsFormat =
10462 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10463 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10464
10465 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10466 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10467 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10468 SDValue Ops[] = {
10469 Op.getOperand(0), // Chain
10470 Rsrc, // rsrc
10471 Op.getOperand(3), // vindex
10472 VOffset, // voffset
10473 SOffset, // soffset
10474 Offset, // offset
10475 Op.getOperand(6), // cachepolicy, swizzled buffer
10476 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10477 };
10478
10479 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
10480 }
10481 case Intrinsic::amdgcn_raw_tbuffer_load:
10482 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10483 MemSDNode *M = cast<MemSDNode>(Op);
10484 EVT LoadVT = Op.getValueType();
10485 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10486 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10487 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10488
10489 SDValue Ops[] = {
10490 Op.getOperand(0), // Chain
10491 Rsrc, // rsrc
10492 DAG.getConstant(0, DL, MVT::i32), // vindex
10493 VOffset, // voffset
10494 SOffset, // soffset
10495 Offset, // offset
10496 Op.getOperand(5), // format
10497 Op.getOperand(6), // cachepolicy, swizzled buffer
10498 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10499 };
10500
10501 if (LoadVT.getScalarType() == MVT::f16)
10502 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10503 Ops);
10504 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10505 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10506 DAG);
10507 }
10508 case Intrinsic::amdgcn_struct_tbuffer_load:
10509 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10510 MemSDNode *M = cast<MemSDNode>(Op);
10511 EVT LoadVT = Op.getValueType();
10512 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10513 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10514 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10515
10516 SDValue Ops[] = {
10517 Op.getOperand(0), // Chain
10518 Rsrc, // rsrc
10519 Op.getOperand(3), // vindex
10520 VOffset, // voffset
10521 SOffset, // soffset
10522 Offset, // offset
10523 Op.getOperand(6), // format
10524 Op.getOperand(7), // cachepolicy, swizzled buffer
10525 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10526 };
10527
10528 if (LoadVT.getScalarType() == MVT::f16)
10529 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10530 Ops);
10531 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10532 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10533 DAG);
10534 }
10535 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10536 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10537 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
10538 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10539 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10540 return lowerStructBufferAtomicIntrin(Op, DAG,
10541 AMDGPUISD::BUFFER_ATOMIC_FADD);
10542 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10543 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10544 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
10545 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10546 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10547 return lowerStructBufferAtomicIntrin(Op, DAG,
10548 AMDGPUISD::BUFFER_ATOMIC_FMIN);
10549 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10550 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10551 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
10552 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10553 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10554 return lowerStructBufferAtomicIntrin(Op, DAG,
10555 AMDGPUISD::BUFFER_ATOMIC_FMAX);
10556 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10557 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10558 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
10559 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10560 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10561 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10562 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10563 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10564 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10565 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10566 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10567 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
10568 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10569 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10570 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
10571 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10572 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10573 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
10574 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10575 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10576 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
10577 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10578 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10579 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10580 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10581 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10582 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10583 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10584 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10585 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10586 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10587 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10588 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10589 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10590 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10591 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10592 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10593 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10594 return lowerStructBufferAtomicIntrin(Op, DAG,
10595 AMDGPUISD::BUFFER_ATOMIC_SWAP);
10596 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10597 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10598 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10599 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10600 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10601 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10602 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10603 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10604 return lowerStructBufferAtomicIntrin(Op, DAG,
10605 AMDGPUISD::BUFFER_ATOMIC_SMIN);
10606 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10607 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10608 return lowerStructBufferAtomicIntrin(Op, DAG,
10609 AMDGPUISD::BUFFER_ATOMIC_UMIN);
10610 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10611 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10612 return lowerStructBufferAtomicIntrin(Op, DAG,
10613 AMDGPUISD::BUFFER_ATOMIC_SMAX);
10614 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10615 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10616 return lowerStructBufferAtomicIntrin(Op, DAG,
10617 AMDGPUISD::BUFFER_ATOMIC_UMAX);
10618 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10619 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10620 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10621 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10622 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10623 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10624 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10625 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10626 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10627 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10628 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10629 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10630 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10631 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10632 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10633 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
10634 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
10635 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
10636 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
10637 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
10638 return lowerStructBufferAtomicIntrin(Op, DAG,
10639 AMDGPUISD::BUFFER_ATOMIC_CSUB);
10640 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10641 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
10642 return lowerRawBufferAtomicIntrin(Op, DAG,
10643 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10644 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10645 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
10646 return lowerStructBufferAtomicIntrin(Op, DAG,
10647 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10648 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10649 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10650 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
10651 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10652 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10653 SDValue Ops[] = {
10654 Op.getOperand(0), // Chain
10655 Op.getOperand(2), // src
10656 Op.getOperand(3), // cmp
10657 Rsrc, // rsrc
10658 DAG.getConstant(0, DL, MVT::i32), // vindex
10659 VOffset, // voffset
10660 SOffset, // soffset
10661 Offset, // offset
10662 Op.getOperand(7), // cachepolicy
10663 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10664 };
10665 EVT VT = Op.getValueType();
10666 auto *M = cast<MemSDNode>(Op);
10667
10668 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10669 Op->getVTList(), Ops, VT,
10670 M->getMemOperand());
10671 }
10672 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10673 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10674 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
10675 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
10676 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
10677 SDValue Ops[] = {
10678 Op.getOperand(0), // Chain
10679 Op.getOperand(2), // src
10680 Op.getOperand(3), // cmp
10681 Rsrc, // rsrc
10682 Op.getOperand(5), // vindex
10683 VOffset, // voffset
10684 SOffset, // soffset
10685 Offset, // offset
10686 Op.getOperand(8), // cachepolicy
10687 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10688 };
10689 EVT VT = Op.getValueType();
10690 auto *M = cast<MemSDNode>(Op);
10691
10692 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10693 Op->getVTList(), Ops, VT,
10694 M->getMemOperand());
10695 }
10696 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10697 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10698 MemSDNode *M = cast<MemSDNode>(Op);
10699 SDValue NodePtr = M->getOperand(2);
10700 SDValue RayExtent = M->getOperand(3);
10701 SDValue InstanceMask = M->getOperand(4);
10702 SDValue RayOrigin = M->getOperand(5);
10703 SDValue RayDir = M->getOperand(6);
10704 SDValue Offsets = M->getOperand(7);
10705 SDValue TDescr = M->getOperand(8);
10706
10707 assert(NodePtr.getValueType() == MVT::i64);
10708 assert(RayDir.getValueType() == MVT::v3f32);
10709
10710 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10711 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10712 return SDValue();
10713 }
10714
10715 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10716 const unsigned NumVDataDwords = 10;
10717 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10718 int Opcode = AMDGPU::getMIMGOpcode(
10719 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10720 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10721 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10722 assert(Opcode != -1);
10723
10724 SmallVector<SDValue, 7> Ops;
10725 Ops.push_back(NodePtr);
10726 Ops.push_back(DAG.getBuildVector(
10727 MVT::v2i32, DL,
10728 {DAG.getBitcast(MVT::i32, RayExtent),
10729 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10730 Ops.push_back(RayOrigin);
10731 Ops.push_back(RayDir);
10732 Ops.push_back(Offsets);
10733 Ops.push_back(TDescr);
10734 Ops.push_back(M->getChain());
10735
10736 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10737 MachineMemOperand *MemRef = M->getMemOperand();
10738 DAG.setNodeMemRefs(NewNode, {MemRef});
10739 return SDValue(NewNode, 0);
10740 }
10741 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10742 MemSDNode *M = cast<MemSDNode>(Op);
10743 SDValue NodePtr = M->getOperand(2);
10744 SDValue RayExtent = M->getOperand(3);
10745 SDValue RayOrigin = M->getOperand(4);
10746 SDValue RayDir = M->getOperand(5);
10747 SDValue RayInvDir = M->getOperand(6);
10748 SDValue TDescr = M->getOperand(7);
10749
10750 assert(NodePtr.getValueType() == MVT::i32 ||
10751 NodePtr.getValueType() == MVT::i64);
10752 assert(RayDir.getValueType() == MVT::v3f16 ||
10753 RayDir.getValueType() == MVT::v3f32);
10754
10755 if (!Subtarget->hasGFX10_AEncoding()) {
10756 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10757 return SDValue();
10758 }
10759
10760 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
10761 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
10762 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10763 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
10764 const bool Is64 = NodePtr.getValueType() == MVT::i64;
10765 const unsigned NumVDataDwords = 4;
10766 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
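// With A16 the ray direction and inverse direction are f16 and pack two
// lanes per dword, which is where the smaller address dword counts come from.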
10767 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10768 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10769 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
10770 IsGFX12Plus;
10771 const unsigned BaseOpcodes[2][2] = {
10772 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10773 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10774 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10775 int Opcode;
10776 if (UseNSA) {
10777 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10778 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10779 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10780 : AMDGPU::MIMGEncGfx10NSA,
10781 NumVDataDwords, NumVAddrDwords);
10782 } else {
10783 assert(!IsGFX12Plus);
10784 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10785 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10786 : AMDGPU::MIMGEncGfx10Default,
10787 NumVDataDwords, NumVAddrDwords);
10788 }
10789 assert(Opcode != -1);
10790
10791 SmallVector<SDValue, 16> Ops;
10792
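// packLanes flattens a 3-element vector into i32 address dwords: 32-bit
// lanes are bitcast one per dword; f16 lanes are packed in pairs, and when
// the element is not dword-aligned the first lane is merged into the
// previously pushed dword.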
10793 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
10794 SmallVector<SDValue, 3> Lanes;
10795 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
10796 if (Lanes[0].getValueSizeInBits() == 32) {
10797 for (unsigned I = 0; I < 3; ++I)
10798 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
10799 } else {
10800 if (IsAligned) {
10801 Ops.push_back(DAG.getBitcast(
10802 MVT::i32,
10803 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
10804 Ops.push_back(Lanes[2]);
10805 } else {
10806 SDValue Elt0 = Ops.pop_back_val();
10807 Ops.push_back(DAG.getBitcast(
10808 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
10809 Ops.push_back(DAG.getBitcast(
10810 MVT::i32,
10811 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
10812 }
10813 }
10814 };
10815
10816 if (UseNSA && IsGFX11Plus) {
10817 Ops.push_back(NodePtr);
10818 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10819 Ops.push_back(RayOrigin);
10820 if (IsA16) {
10821 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
10822 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
10823 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
10824 for (unsigned I = 0; I < 3; ++I) {
10825 MergedLanes.push_back(DAG.getBitcast(
10826 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
10827 {DirLanes[I], InvDirLanes[I]})));
10828 }
10829 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
10830 } else {
10831 Ops.push_back(RayDir);
10832 Ops.push_back(RayInvDir);
10833 }
10834 } else {
10835 if (Is64)
10836 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
10837 2);
10838 else
10839 Ops.push_back(NodePtr);
10840
10841 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10842 packLanes(RayOrigin, true);
10843 packLanes(RayDir, true);
10844 packLanes(RayInvDir, false);
10845 }
10846
10847 if (!UseNSA) {
10848 // Build a single vector containing all the operands so far prepared.
10849 if (NumVAddrDwords > 12) {
10850 SDValue Undef = DAG.getPOISON(MVT::i32);
10851 Ops.append(16 - Ops.size(), Undef);
10852 }
10853 assert(Ops.size() >= 8 && Ops.size() <= 12);
10854 SDValue MergedOps =
10855 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
10856 Ops.clear();
10857 Ops.push_back(MergedOps);
10858 }
10859
10860 Ops.push_back(TDescr);
10861 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
10862 Ops.push_back(M->getChain());
10863
10864 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10865 MachineMemOperand *MemRef = M->getMemOperand();
10866 DAG.setNodeMemRefs(NewNode, {MemRef});
10867 return SDValue(NewNode, 0);
10868 }
10869 case Intrinsic::amdgcn_global_atomic_fmin_num:
10870 case Intrinsic::amdgcn_global_atomic_fmax_num:
10871 case Intrinsic::amdgcn_flat_atomic_fmin_num:
10872 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10873 MemSDNode *M = cast<MemSDNode>(Op);
10874 SDValue Ops[] = {
10875 M->getOperand(0), // Chain
10876 M->getOperand(2), // Ptr
10877 M->getOperand(3) // Value
10878 };
10879 unsigned Opcode = 0;
10880 switch (IntrID) {
10881 case Intrinsic::amdgcn_global_atomic_fmin_num:
10882 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10883 Opcode = ISD::ATOMIC_LOAD_FMIN;
10884 break;
10885 }
10886 case Intrinsic::amdgcn_global_atomic_fmax_num:
10887 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10888 Opcode = ISD::ATOMIC_LOAD_FMAX;
10889 break;
10890 }
10891 default:
10892 llvm_unreachable("unhandled atomic opcode");
10893 }
10894 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
10895 Ops, M->getMemOperand());
10896 }
10897 case Intrinsic::amdgcn_s_get_barrier_state:
10898 case Intrinsic::amdgcn_s_get_named_barrier_state: {
10899 SDValue Chain = Op->getOperand(0);
10900 SmallVector<SDValue, 2> Ops;
10901 unsigned Opc;
10902
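// For named barriers the barrier id lives in bits [9:4] of the operand, so
// it is shifted down and masked to 6 bits before being used as the
// immediate or copied into M0.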
10903 if (isa<ConstantSDNode>(Op->getOperand(2))) {
10904 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
10905 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10906 BarID = (BarID >> 4) & 0x3F;
10907 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10908 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10909 Ops.push_back(K);
10910 Ops.push_back(Chain);
10911 } else {
10912 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10913 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
10914 SDValue M0Val;
10915 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
10916 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10917 M0Val = SDValue(
10918 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10919 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10920 0);
10921 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10922 } else
10923 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
10924 }
10925
10926 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10927 return SDValue(NewMI, 0);
10928 }
10929 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
10930 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
10931 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
10932 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
10933 SDValue Chain = Op->getOperand(0);
10934 SDValue Ptr = Op->getOperand(2);
10935 EVT VT = Op->getValueType(0);
10936 return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
10937 Chain, Ptr, MII->getMemOperand());
10938 }
10939 default:
10940
10941 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10942 AMDGPU::getImageDimIntrinsicInfo(IntrID))
10943 return lowerImage(Op, ImageDimIntr, DAG, true);
10944
10945 return SDValue();
10946 }
10947}
10948
10949// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
10950// dwordx4 if on SI and handle TFE loads.
10951SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
10952 SDVTList VTList,
10953 ArrayRef<SDValue> Ops, EVT MemVT,
10954 MachineMemOperand *MMO,
10955 SelectionDAG &DAG) const {
10956 LLVMContext &C = *DAG.getContext();
10957 MachineFunction &MF = DAG.getMachineFunction();
10958 EVT VT = VTList.VTs[0];
10959
10960 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
10961 bool IsTFE = VTList.NumVTs == 3;
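// With TFE the instruction returns an extra status dword, so load
// NumValueDWords + 1 i32s and split the value and status from the wider
// result afterwards.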
10962 if (IsTFE) {
10963 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
10964 unsigned NumOpDWords = NumValueDWords + 1;
10965 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
10966 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
10967 MachineMemOperand *OpDWordsMMO =
10968 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
10969 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
10970 OpDWordsVT, OpDWordsMMO, DAG);
10971 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10972 DAG.getVectorIdxConstant(NumValueDWords, DL));
10973 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
10974 SDValue ValueDWords =
10975 NumValueDWords == 1
10976 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
10977 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
10978 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
10979 ZeroIdx);
10980 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
10981 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10982 }
10983
10984 if (!Subtarget->hasDwordx3LoadStores() &&
10985 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10986 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
10987 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
10988 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
10989 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
10990 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
10991 WidenedMemVT, WidenedMMO);
10992 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
10993 DAG.getVectorIdxConstant(0, DL));
10994 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
10995 }
10996
10997 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
10998}
10999
11000SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
11001 bool ImageStore) const {
11002 EVT StoreVT = VData.getValueType();
11003
11004 // No change for f16 and legal vector D16 types.
11005 if (!StoreVT.isVector())
11006 return VData;
11007
11008 SDLoc DL(VData);
11009 unsigned NumElements = StoreVT.getVectorNumElements();
11010
11011 if (Subtarget->hasUnpackedD16VMem()) {
11012 // We need to unpack the packed data to store.
11013 EVT IntStoreVT = StoreVT.changeTypeToInteger();
11014 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11015
11016 EVT EquivStoreVT =
11017 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
11018 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
11019 return DAG.UnrollVectorOp(ZExt.getNode());
11020 }
11021
11022 // The sq block of gfx8.1 does not estimate register use correctly for d16
11023 // image store instructions. The data operand is computed as if it were not a
11024 // d16 image instruction.
11025 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
11026 // Bitcast to i16
11027 EVT IntStoreVT = StoreVT.changeTypeToInteger();
11028 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11029
11030 // Decompose into scalars
11031 SmallVector<SDValue, 4> Elts;
11032 DAG.ExtractVectorElements(IntVData, Elts);
11033
11034 // Group pairs of i16 into v2i16 and bitcast to i32
11035 SmallVector<SDValue, 4> PackedElts;
11036 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
11037 SDValue Pair =
11038 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
11039 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
11040 PackedElts.push_back(IntPair);
11041 }
11042 if ((NumElements % 2) == 1) {
11043 // Handle v3i16
11044 unsigned I = Elts.size() / 2;
11045 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
11046 {Elts[I * 2], DAG.getPOISON(MVT::i16)});
11047 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
11048 PackedElts.push_back(IntPair);
11049 }
11050
11051 // Pad with poison up to the unpacked element count
11052 PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));
11053
11054 // Build final vector
11055 EVT VecVT =
11056 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
11057 return DAG.getBuildVector(VecVT, DL, PackedElts);
11058 }
11059
11060 if (NumElements == 3) {
11061 EVT IntStoreVT =
11062 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
11063 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11064
11065 EVT WidenedStoreVT = EVT::getVectorVT(
11066 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
11067 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
11068 WidenedStoreVT.getStoreSizeInBits());
11069 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
11070 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
11071 }
11072
11073 assert(isTypeLegal(StoreVT));
11074 return VData;
11075}
11076
11077SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
11078 SelectionDAG &DAG) const {
11079 SDLoc DL(Op);
11080 SDValue Chain = Op.getOperand(0);
11081 unsigned IntrinsicID = Op.getConstantOperandVal(1);
11082 MachineFunction &MF = DAG.getMachineFunction();
11083
11084 switch (IntrinsicID) {
11085 case Intrinsic::amdgcn_exp_compr: {
11086 if (!Subtarget->hasCompressedExport()) {
11087 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
11089 "intrinsic not supported on subtarget", DL.getDebugLoc()));
11090 }
11091 SDValue Src0 = Op.getOperand(4);
11092 SDValue Src1 = Op.getOperand(5);
11093 // Hack around illegal type on SI by directly selecting it.
11094 if (isTypeLegal(Src0.getValueType()))
11095 return SDValue();
11096
11097 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
11098 SDValue Undef = DAG.getPOISON(MVT::f32);
11099 const SDValue Ops[] = {
11100 Op.getOperand(2), // tgt
11101 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
11102 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
11103 Undef, // src2
11104 Undef, // src3
11105 Op.getOperand(7), // vm
11106 DAG.getTargetConstant(1, DL, MVT::i1), // compr
11107 Op.getOperand(3), // en
11108 Op.getOperand(0) // Chain
11109 };
11110
11111 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
11112 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
11113 }
11114
11115 case Intrinsic::amdgcn_struct_tbuffer_store:
11116 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
11117 SDValue VData = Op.getOperand(2);
11118 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11119 if (IsD16)
11120 VData = handleD16VData(VData, DAG);
11121 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11122 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11123 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11124 SDValue Ops[] = {
11125 Chain,
11126 VData, // vdata
11127 Rsrc, // rsrc
11128 Op.getOperand(4), // vindex
11129 VOffset, // voffset
11130 SOffset, // soffset
11131 Offset, // offset
11132 Op.getOperand(7), // format
11133 Op.getOperand(8), // cachepolicy, swizzled buffer
11134 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11135 };
11136 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11137 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11138 MemSDNode *M = cast<MemSDNode>(Op);
11139 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11140 M->getMemoryVT(), M->getMemOperand());
11141 }
11142
11143 case Intrinsic::amdgcn_raw_tbuffer_store:
11144 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11145 SDValue VData = Op.getOperand(2);
11146 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11147 if (IsD16)
11148 VData = handleD16VData(VData, DAG);
11149 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11150 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11151 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11152 SDValue Ops[] = {
11153 Chain,
11154 VData, // vdata
11155 Rsrc, // rsrc
11156 DAG.getConstant(0, DL, MVT::i32), // vindex
11157 VOffset, // voffset
11158 SOffset, // soffset
11159 Offset, // offset
11160 Op.getOperand(6), // format
11161 Op.getOperand(7), // cachepolicy, swizzled buffer
11162 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11163 };
11164 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11165 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11166 MemSDNode *M = cast<MemSDNode>(Op);
11167 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11168 M->getMemoryVT(), M->getMemOperand());
11169 }
11170
11171 case Intrinsic::amdgcn_raw_buffer_store:
11172 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11173 case Intrinsic::amdgcn_raw_buffer_store_format:
11174 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11175 const bool IsFormat =
11176 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11177 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11178
11179 SDValue VData = Op.getOperand(2);
11180 EVT VDataVT = VData.getValueType();
11181 EVT EltType = VDataVT.getScalarType();
11182 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
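// Only the *_format variants carry D16 data; plain stores of sub-dword
// scalars fall through to the byte/short path below.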
11183 if (IsD16) {
11184 VData = handleD16VData(VData, DAG);
11185 VDataVT = VData.getValueType();
11186 }
11187
11188 if (!isTypeLegal(VDataVT)) {
11189 VData =
11190 DAG.getNode(ISD::BITCAST, DL,
11191 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11192 }
11193
11194 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11195 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11196 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11197 SDValue Ops[] = {
11198 Chain,
11199 VData,
11200 Rsrc,
11201 DAG.getConstant(0, DL, MVT::i32), // vindex
11202 VOffset, // voffset
11203 SOffset, // soffset
11204 Offset, // offset
11205 Op.getOperand(6), // cachepolicy, swizzled buffer
11206 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11207 };
11208 unsigned Opc =
11209 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
11210 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11211 MemSDNode *M = cast<MemSDNode>(Op);
11212
11213 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11214 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11215 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
11216
11217 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11218 M->getMemoryVT(), M->getMemOperand());
11219 }
11220
11221 case Intrinsic::amdgcn_struct_buffer_store:
11222 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11223 case Intrinsic::amdgcn_struct_buffer_store_format:
11224 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11225 const bool IsFormat =
11226 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11227 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11228
11229 SDValue VData = Op.getOperand(2);
11230 EVT VDataVT = VData.getValueType();
11231 EVT EltType = VDataVT.getScalarType();
11232 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11233
11234 if (IsD16) {
11235 VData = handleD16VData(VData, DAG);
11236 VDataVT = VData.getValueType();
11237 }
11238
11239 if (!isTypeLegal(VDataVT)) {
11240 VData =
11241 DAG.getNode(ISD::BITCAST, DL,
11242 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11243 }
11244
11245 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11246 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11247 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11248 SDValue Ops[] = {
11249 Chain,
11250 VData,
11251 Rsrc,
11252 Op.getOperand(4), // vindex
11253 VOffset, // voffset
11254 SOffset, // soffset
11255 Offset, // offset
11256 Op.getOperand(7), // cachepolicy, swizzled buffer
11257 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11258 };
11259 unsigned Opc =
11260 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
11261 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11262 MemSDNode *M = cast<MemSDNode>(Op);
11263
11264 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11265 EVT VDataType = VData.getValueType().getScalarType();
11266 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11267 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
11268
11269 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11270 M->getMemoryVT(), M->getMemOperand());
11271 }
11272 case Intrinsic::amdgcn_raw_buffer_load_lds:
11273 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11274 case Intrinsic::amdgcn_struct_buffer_load_lds:
11275 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
11276 if (!Subtarget->hasVMemToLDSLoad())
11277 return SDValue();
11278 unsigned Opc;
11279 bool HasVIndex =
11280 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11281 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
11282 unsigned OpOffset = HasVIndex ? 1 : 0;
11283 SDValue VOffset = Op.getOperand(5 + OpOffset);
11284 bool HasVOffset = !isNullConstant(VOffset);
11285 unsigned Size = Op->getConstantOperandVal(4);
11286
11287 switch (Size) {
11288 default:
11289 return SDValue();
11290 case 1:
11291 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11292 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11293 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11294 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11295 break;
11296 case 2:
11297 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11298 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11299 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11300 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11301 break;
11302 case 4:
11303 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11304 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11305 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11306 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11307 break;
11308 case 12:
11309 if (!Subtarget->hasLDSLoadB96_B128())
11310 return SDValue();
11311 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11312 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11313 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11314 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11315 break;
11316 case 16:
11317 if (!Subtarget->hasLDSLoadB96_B128())
11318 return SDValue();
11319 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11320 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11321 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11322 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
11323 break;
11324 }
11325
11326 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11327
11328 SmallVector<SDValue, 8> Ops;
11329
11330 if (HasVIndex && HasVOffset)
11331 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
11332 {Op.getOperand(5), // VIndex
11333 VOffset}));
11334 else if (HasVIndex)
11335 Ops.push_back(Op.getOperand(5));
11336 else if (HasVOffset)
11337 Ops.push_back(VOffset);
11338
11339 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11340 Ops.push_back(Rsrc);
11341 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
11342 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
11343 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
11344 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
11345 Ops.push_back(DAG.getTargetConstant(
11346 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
11347 DL, MVT::i8)); // cpol
11348 Ops.push_back(DAG.getTargetConstant(
11349 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
11350 ? 1
11351 : 0,
11352 DL, MVT::i8)); // swz
11353 Ops.push_back(M0Val.getValue(0)); // Chain
11354 Ops.push_back(M0Val.getValue(1)); // Glue
11355
11356 auto *M = cast<MemSDNode>(Op);
11357 MachineMemOperand *LoadMMO = M->getMemOperand();
11358 // Don't set the offset value here because the pointer points to the base of
11359 // the buffer.
11360 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11361
11362 MachinePointerInfo StorePtrI = LoadPtrI;
11363 LoadPtrI.V = PoisonValue::get(
11364 PointerType::get(*DAG.getContext(), AMDGPUAS::BUFFER_RESOURCE));
11365 LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
11366 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
11367
11368 auto F = LoadMMO->getFlags() &
11369 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
11370 LoadMMO =
11371 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
11372 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11373
11374 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11375 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
11376 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11377
11378 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
11379 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11380
11381 return SDValue(Load, 0);
11382 }
11383 // Buffers are handled by LowerBufferFatPointers, and we're going to go
11384 // for "trust me" that the remaining cases are global pointers until
11385 // such time as we can put two mem operands on an intrinsic.
11386 case Intrinsic::amdgcn_load_to_lds:
11387 case Intrinsic::amdgcn_global_load_lds: {
11388 if (!Subtarget->hasVMemToLDSLoad())
11389 return SDValue();
11390
11391 unsigned Opc;
11392 unsigned Size = Op->getConstantOperandVal(4);
11393 switch (Size) {
11394 default:
11395 return SDValue();
11396 case 1:
11397 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11398 break;
11399 case 2:
11400 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11401 break;
11402 case 4:
11403 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11404 break;
11405 case 12:
11406 if (!Subtarget->hasLDSLoadB96_B128())
11407 return SDValue();
11408 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11409 break;
11410 case 16:
11411 if (!Subtarget->hasLDSLoadB96_B128())
11412 return SDValue();
11413 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11414 break;
11415 }
11416
11417 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11418
11419 SmallVector<SDValue, 6> Ops;
11420
11421 SDValue Addr = Op.getOperand(2); // Global ptr
11422 SDValue VOffset;
11423 // Try to split SAddr and VOffset. Global and LDS pointers share the same
11424 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
11425 if (Addr->isDivergent() && Addr->isAnyAdd()) {
11426 SDValue LHS = Addr.getOperand(0);
11427 SDValue RHS = Addr.getOperand(1);
11428
11429 if (LHS->isDivergent())
11430 std::swap(LHS, RHS);
11431
11432 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
11433 RHS.getOperand(0).getValueType() == MVT::i32) {
11434 // add (i64 sgpr), (zero_extend (i32 vgpr))
11435 Addr = LHS;
11436 VOffset = RHS.getOperand(0);
11437 }
11438 }
11439
11440 Ops.push_back(Addr);
11441 if (!Addr->isDivergent()) {
11442 Opc = AMDGPU::getGlobalSaddrOp(Opc);
11443 if (!VOffset)
11444 VOffset =
11445 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
11446 DAG.getTargetConstant(0, DL, MVT::i32)),
11447 0);
11448 Ops.push_back(VOffset);
11449 }
11450
11451 Ops.push_back(Op.getOperand(5)); // Offset
11452
11453 unsigned Aux = Op.getConstantOperandVal(6);
11454 Ops.push_back(DAG.getTargetConstant(Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
11455 MVT::i32)); // CPol
11456
11457 Ops.push_back(M0Val.getValue(0)); // Chain
11458 Ops.push_back(M0Val.getValue(1)); // Glue
11459
11460 auto *M = cast<MemSDNode>(Op);
11461 MachineMemOperand *LoadMMO = M->getMemOperand();
11462 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11463 LoadPtrI.Offset = Op->getConstantOperandVal(5);
11464 MachinePointerInfo StorePtrI = LoadPtrI;
11465 LoadPtrI.V = PoisonValue::get(
11466 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
11467 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
11468 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
11469 auto F = LoadMMO->getFlags() &
11470 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
11471 LoadMMO =
11472 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
11473 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11474 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11475 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
11476 LoadMMO->getAAInfo());
11477
11478 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11479 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11480
11481 return SDValue(Load, 0);
11482 }
11483 case Intrinsic::amdgcn_end_cf:
11484 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
11485 Op->getOperand(2), Chain),
11486 0);
11487 case Intrinsic::amdgcn_s_barrier_init:
11488 case Intrinsic::amdgcn_s_barrier_signal_var: {
11489 // these two intrinsics have two operands: barrier pointer and member count
11490 SDValue Chain = Op->getOperand(0);
11491 SmallVector<SDValue, 2> Ops;
11492 SDValue BarOp = Op->getOperand(2);
11493 SDValue CntOp = Op->getOperand(3);
11494 SDValue M0Val;
11495 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11496 ? AMDGPU::S_BARRIER_INIT_M0
11497 : AMDGPU::S_BARRIER_SIGNAL_M0;
11498 // extract the BarrierID from bits 4-9 of BarOp
11499 SDValue BarID;
11500 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11501 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11502 BarID =
11503 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
11504 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11505 0);
11506 // Member count should be put into M0[ShAmt:+6]
11507 // Barrier ID should be put into M0[5:0]
11508 M0Val =
11509 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
11510 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11511 0);
11512 constexpr unsigned ShAmt = 16;
11513 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
11514 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
11515
11516 M0Val = SDValue(
11517 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
11518
11519 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11520
11521 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11522 return SDValue(NewMI, 0);
11523 }
11524 case Intrinsic::amdgcn_s_wakeup_barrier: {
11525 if (!Subtarget->hasSWakeupBarrier())
11526 return SDValue();
11527 [[fallthrough]];
11528 }
11529 case Intrinsic::amdgcn_s_barrier_join: {
11530 // these two intrinsics have one operand: barrier pointer
11531 SDValue Chain = Op->getOperand(0);
11532 SmallVector<SDValue, 2> Ops;
11533 SDValue BarOp = Op->getOperand(2);
11534 unsigned Opc;
11535
11536 if (isa<ConstantSDNode>(BarOp)) {
11537 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
11538 switch (IntrinsicID) {
11539 default:
11540 return SDValue();
11541 case Intrinsic::amdgcn_s_barrier_join:
11542 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11543 break;
11544 case Intrinsic::amdgcn_s_wakeup_barrier:
11545 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
11546 break;
11547 }
11548 // extract the BarrierID from bits 4-9 of the immediate
11549 unsigned BarID = (BarVal >> 4) & 0x3F;
11550 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
11551 Ops.push_back(K);
11552 Ops.push_back(Chain);
11553 } else {
11554 switch (IntrinsicID) {
11555 default:
11556 return SDValue();
11557 case Intrinsic::amdgcn_s_barrier_join:
11558 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11559 break;
11560 case Intrinsic::amdgcn_s_wakeup_barrier:
11561 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
11562 break;
11563 }
11564 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
11565 SDValue M0Val;
11566 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11567 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11568 M0Val =
11569 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
11570 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11571 0);
11572 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11573 }
11574
11575 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11576 return SDValue(NewMI, 0);
11577 }
11578 case Intrinsic::amdgcn_s_prefetch_data: {
11579 // For non-global address space preserve the chain and remove the call.
11580 if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
11581 return Op.getOperand(0);
11582 return Op;
11583 }
11584 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11585 SDValue Ops[] = {
11586 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
11587 Op.getOperand(3), // offset
11588 Op.getOperand(4), // length
11589 };
11590
11591 MemSDNode *M = cast<MemSDNode>(Op);
11592 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
11593 Op->getVTList(), Ops, M->getMemoryVT(),
11594 M->getMemOperand());
11595 }
11596 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11597 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11598 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11599 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11600 SDValue Chain = Op->getOperand(0);
11601 SDValue Ptr = Op->getOperand(2);
11602 SDValue Val = Op->getOperand(3);
11603 return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
11604 Ptr, MII->getMemOperand());
11605 }
11606 default: {
11607 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11608 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
11609 return lowerImage(Op, ImageDimIntr, DAG, true);
11610
11611 return Op;
11612 }
11613 }
11614}
11615
11616// Return whether the operation has NoUnsignedWrap property.
11617static bool isNoUnsignedWrap(SDValue Addr) {
11618 return (Addr.getOpcode() == ISD::ADD &&
11619 Addr->getFlags().hasNoUnsignedWrap()) ||
11620 Addr->getOpcode() == ISD::OR;
11621}
11622
11623 bool SITargetLowering::shouldPreservePtrArith(const Function &F,
11624 EVT PtrVT) const {
11625 return PtrVT == MVT::i64;
11626}
11627
11628 bool SITargetLowering::canTransformPtrArithOutOfBounds(const Function &F,
11629 EVT PtrVT) const {
11630 return true;
11631}
11632
11633// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
11634// offset (the offset that is included in bounds checking and swizzling, to be
11635// split between the instruction's voffset and immoffset fields) and soffset
11636// (the offset that is excluded from bounds checking and swizzling, to go in
11637// the instruction's soffset field). This function takes the first kind of
11638// offset and figures out how to split it between voffset and immoffset.
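// For example, when SIInstrInfo::getMaxMUBUFImmOffset() returns 4095, a
// combined offset of 5000 is split below into 4096 for the voffset add and
// 904 for the instruction's immoffset field.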
11639std::pair<SDValue, SDValue>
11640SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
11641 SDLoc DL(Offset);
11642 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
11643 SDValue N0 = Offset;
11644 ConstantSDNode *C1 = nullptr;
11645
11646 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
11647 N0 = SDValue();
11648 else if (DAG.isBaseWithConstantOffset(N0)) {
11649 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
11650 // being added, so we can only safely match a 32-bit addition with no
11651 // unsigned overflow.
11652 bool CheckNUW = AMDGPU::isGFX1250(*Subtarget);
11653 if (!CheckNUW || isNoUnsignedWrap(N0)) {
11654 C1 = cast<ConstantSDNode>(N0.getOperand(1));
11655 N0 = N0.getOperand(0);
11656 }
11657 }
11658
11659 if (C1) {
11660 unsigned ImmOffset = C1->getZExtValue();
11661 // If the immediate value is too big for the immoffset field, put only bits
11662 // that would normally fit in the immoffset field. The remaining value that
11663 // is copied/added for the voffset field is a large power of 2, and it
11664 // stands more chance of being CSEd with the copy/add for another similar
11665 // load/store.
11666 // However, do not do that rounding down if the remaining (overflow) value
11667 // would be negative, as it appears to be illegal to have a negative offset
11668 // in the vgpr, even if adding the immediate offset makes it positive.
11669 unsigned Overflow = ImmOffset & ~MaxImm;
11670 ImmOffset -= Overflow;
11671 if ((int32_t)Overflow < 0) {
11672 Overflow += ImmOffset;
11673 ImmOffset = 0;
11674 }
11675 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
11676 if (Overflow) {
11677 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
11678 if (!N0)
11679 N0 = OverflowVal;
11680 else {
11681 SDValue Ops[] = {N0, OverflowVal};
11682 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
11683 }
11684 }
11685 }
11686 if (!N0)
11687 N0 = DAG.getConstant(0, DL, MVT::i32);
11688 if (!C1)
11689 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
11690 return {N0, SDValue(C1, 0)};
11691}
11692
11693// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
11694// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
11695// pointed to by Offsets.
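// If no split is possible, the whole combined offset goes into voffset and
// soffset is set to zero (SGPR_NULL on subtargets with a restricted soffset).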
11696void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
11697 SelectionDAG &DAG, SDValue *Offsets,
11698 Align Alignment) const {
11699 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11700 SDLoc DL(CombinedOffset);
11701 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
11702 uint32_t Imm = C->getZExtValue();
11703 uint32_t SOffset, ImmOffset;
11704 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11705 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
11706 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11707 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11708 return;
11709 }
11710 }
11711 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
11712 SDValue N0 = CombinedOffset.getOperand(0);
11713 SDValue N1 = CombinedOffset.getOperand(1);
11714 uint32_t SOffset, ImmOffset;
11715 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
11716 if (Offset >= 0 &&
11717 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
11718 Offsets[0] = N0;
11719 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11720 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11721 return;
11722 }
11723 }
11724
11725 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11726 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
11727 : DAG.getConstant(0, DL, MVT::i32);
11728
11729 Offsets[0] = CombinedOffset;
11730 Offsets[1] = SOffsetZero;
11731 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
11732}
11733
11734SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11735 SelectionDAG &DAG) const {
11736 if (!MaybePointer.getValueType().isScalarInteger())
11737 return MaybePointer;
11738
11739 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
11740 return Rsrc;
11741}
11742
11743// Wrap a global or flat pointer into a buffer intrinsic using the flags
11744// specified in the intrinsic.
11745SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
11746 SelectionDAG &DAG) const {
11747 SDLoc Loc(Op);
11748
11749 SDValue Pointer = Op->getOperand(1);
11750 SDValue Stride = Op->getOperand(2);
11751 SDValue NumRecords = Op->getOperand(3);
11752 SDValue Flags = Op->getOperand(4);
11753
11754 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
11755 SDValue Rsrc;
11756
11757 if (Subtarget->has45BitNumRecordsBufferResource()) {
11758 SDValue Zero = DAG.getConstant(0, Loc, MVT::i32);
11759 // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
11760 // num_records.
11761 SDValue ExtPointer = DAG.getAnyExtOrTrunc(Pointer, Loc, MVT::i64);
11762 SDValue NumRecordsLHS =
11763 DAG.getNode(ISD::SHL, Loc, MVT::i64, NumRecords,
11764 DAG.getShiftAmountConstant(57, MVT::i32, Loc));
11765 SDValue LowHalf =
11766 DAG.getNode(ISD::OR, Loc, MVT::i64, ExtPointer, NumRecordsLHS);
11767
11768 // Build the higher 64-bit value, which has the higher 38-bit num_records,
11769 // a 6-bit zero field (omitted), the 16-bit stride and scale, and the 4-bit flags.
11770 SDValue NumRecordsRHS =
11771 DAG.getNode(ISD::SRL, Loc, MVT::i64, NumRecords,
11772 DAG.getShiftAmountConstant(7, MVT::i32, Loc));
11773 SDValue ShiftedStride =
11774 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11775 DAG.getShiftAmountConstant(12, MVT::i32, Loc));
11776 SDValue ExtShiftedStrideVec =
11777 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedStride);
11778 SDValue ExtShiftedStride =
11779 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedStrideVec);
11780 SDValue ShiftedFlags =
11781 DAG.getNode(ISD::SHL, Loc, MVT::i32, Flags,
11782 DAG.getShiftAmountConstant(28, MVT::i32, Loc));
11783 SDValue ExtShiftedFlagsVec =
11784 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedFlags);
11785 SDValue ExtShiftedFlags =
11786 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedFlagsVec);
11787 SDValue CombinedFields =
11788 DAG.getNode(ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
11789 SDValue HighHalf =
11790 DAG.getNode(ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
11791
11792 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i64, LowHalf, HighHalf);
11793 } else {
11794 NumRecords = DAG.getAnyExtOrTrunc(NumRecords, Loc, MVT::i32);
11795 auto [LowHalf, HighHalf] =
11796 DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11797 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
11798 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
11799 SDValue ShiftedStride =
11800 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11801 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
11802 SDValue NewHighHalf =
11803 DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
11804
11805 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf, NewHighHalf,
11806 NumRecords, Flags);
11807 }
11808
11809 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
11810 return RsrcPtr;
11811}
11812
11813// Handle 8 bit and 16 bit buffer loads
11814SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
11815 EVT LoadVT, SDLoc DL,
11816 ArrayRef<SDValue> Ops,
11817 MachineMemOperand *MMO,
11818 bool IsTFE) const {
11819 EVT IntVT = LoadVT.changeTypeToInteger();
11820
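// With TFE the selected instruction writes an extra status dword, so the load
// is built with a v2i32 result and split below into the data value (element 0)
// and the status value (element 1).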
11821 if (IsTFE) {
11822 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
11823 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
11824 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
11825 MachineFunction &MF = DAG.getMachineFunction();
11826 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
11827 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
11828 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
11829 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11830 DAG.getConstant(1, DL, MVT::i32));
11831 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11832 DAG.getConstant(0, DL, MVT::i32));
11833 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
11834 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
11835 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
11836 }
11837
11838 unsigned Opc = LoadVT.getScalarType() == MVT::i8
11839 ? AMDGPUISD::BUFFER_LOAD_UBYTE
11840 : AMDGPUISD::BUFFER_LOAD_USHORT;
11841
11842 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
11843 SDValue BufferLoad =
11844 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
11845 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
11846 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
11847
11848 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
11849}
11850
11851// Handle 8 bit and 16 bit buffer stores
11852SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
11853 EVT VDataType, SDLoc DL,
11854 SDValue Ops[],
11855 MemSDNode *M) const {
11856 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11857 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
11858
11859 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
11860 Ops[1] = BufferStoreExt;
11861 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
11862 : AMDGPUISD::BUFFER_STORE_SHORT;
11863 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
11864 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
11865 M->getMemOperand());
11866}
11867
11868 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
11869 SDValue Op, const SDLoc &SL, EVT VT) {
11870 if (VT.bitsLT(Op.getValueType()))
11871 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
11872
11873 switch (ExtType) {
11874 case ISD::SEXTLOAD:
11875 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
11876 case ISD::ZEXTLOAD:
11877 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
11878 case ISD::EXTLOAD:
11879 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
11880 case ISD::NON_EXTLOAD:
11881 return Op;
11882 }
11883
11884 llvm_unreachable("invalid ext type");
11885}
11886
11887// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
11888// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
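// For example, a 4-byte-aligned, uniform, sign-extending i8 load from the
// constant address space is rebuilt here as a 32-bit load followed by a
// SIGN_EXTEND_INREG of the low 8 bits.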
11889SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
11890 DAGCombinerInfo &DCI) const {
11891 SelectionDAG &DAG = DCI.DAG;
11892 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
11893 return SDValue();
11894
11895 // FIXME: Constant loads should all be marked invariant.
11896 unsigned AS = Ld->getAddressSpace();
11897 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
11898 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
11899 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
11900 return SDValue();
11901
11902 // Don't do this early, since it may interfere with adjacent load merging for
11903 // illegal types. We can avoid losing alignment information for exotic types
11904 // pre-legalize.
11905 EVT MemVT = Ld->getMemoryVT();
11906 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
11907 MemVT.getSizeInBits() >= 32)
11908 return SDValue();
11909
11910 SDLoc SL(Ld);
11911
11912 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
11913 "unexpected vector extload");
11914
11915 // TODO: Drop only high part of range.
11916 SDValue Ptr = Ld->getBasePtr();
11917 SDValue NewLoad = DAG.getLoad(
11918 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
11919 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
11920 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
11921 nullptr); // Drop ranges
11922
11923 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
11924 if (MemVT.isFloatingPoint()) {
11926 "unexpected fp extload");
11927 TruncVT = MemVT.changeTypeToInteger();
11928 }
11929
11930 SDValue Cvt = NewLoad;
11931 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
11932 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
11933 DAG.getValueType(TruncVT));
11934 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
11935 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
11936 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
11937 } else {
11938 assert(Ld->getExtensionType() == ISD::EXTLOAD);
11939 }
11940
11941 EVT VT = Ld->getValueType(0);
11942 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
11943
11944 DCI.AddToWorklist(Cvt.getNode());
11945
11946 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
11947 // the appropriate extension from the 32-bit load.
11948 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
11949 DCI.AddToWorklist(Cvt.getNode());
11950
11951 // Handle conversion back to floating point if necessary.
11952 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
11953
11954 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
11955}
11956
11957 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
11958 const SIMachineFunctionInfo &Info) {
11959 // TODO: Should check if the address can definitely not access stack.
11960 if (Info.isEntryFunction())
11961 return Info.getUserSGPRInfo().hasFlatScratchInit();
11962 return true;
11963}
11964
11965SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
11966 SDLoc DL(Op);
11967 LoadSDNode *Load = cast<LoadSDNode>(Op);
11968 ISD::LoadExtType ExtType = Load->getExtensionType();
11969 EVT MemVT = Load->getMemoryVT();
11970 MachineMemOperand *MMO = Load->getMemOperand();
11971
11972 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
11973 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
11974 return SDValue();
11975
11976 // FIXME: Copied from PPC
11977 // First, load into 32 bits, then truncate to 1 bit.
11978
11979 SDValue Chain = Load->getChain();
11980 SDValue BasePtr = Load->getBasePtr();
11981
11982 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
11983
11984 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
11985 RealMemVT, MMO);
11986
11987 if (!MemVT.isVector()) {
11988 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
11989 NewLD.getValue(1)};
11990
11991 return DAG.getMergeValues(Ops, DL);
11992 }
11993
11994 SmallVector<SDValue, 3> Elts;
11995 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
11996 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
11997 DAG.getConstant(I, DL, MVT::i32));
11998
11999 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
12000 }
12001
12002 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
12003
12004 return DAG.getMergeValues(Ops, DL);
12005 }
12006
12007 if (!MemVT.isVector())
12008 return SDValue();
12009
12010 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
12011 "Custom lowering for non-i32 vectors hasn't been implemented.");
12012
12013 Align Alignment = Load->getAlign();
12014 unsigned AS = Load->getAddressSpace();
12015 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
12016 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
12017 return SplitVectorLoad(Op, DAG);
12018 }
12019
12020 MachineFunction &MF = DAG.getMachineFunction();
12021 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12022 // If there is a possibility that flat instruction access scratch memory
12023 // then we need to use the same legalization rules we use for private.
12024 if (AS == AMDGPUAS::FLAT_ADDRESS &&
12025 !Subtarget->hasMultiDwordFlatScratchAddressing())
12026 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
12027 ? AMDGPUAS::PRIVATE_ADDRESS
12028 : AMDGPUAS::GLOBAL_ADDRESS;
12029
12030 unsigned NumElements = MemVT.getVectorNumElements();
12031
12032 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
12033 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
12034 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
12035 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
12036 (Load->isInvariant() || isMemOpHasNoClobberedMemOperand(Load)))) {
12037 if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
12038 Alignment >= Align(4) && NumElements < 32) {
12039 if (MemVT.isPow2VectorType() ||
12040 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
12041 return SDValue();
12042 return WidenOrSplitVectorLoad(Op, DAG);
12043 }
12044 // Non-uniform loads will be selected to MUBUF instructions, so they
12045 // have the same legalization requirements as global and private
12046 // loads.
12047 //
12048 }
12049 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
12050 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
12051 AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
12052 if (NumElements > 4)
12053 return SplitVectorLoad(Op, DAG);
12054 // v3 loads not supported on SI.
12055 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12056 return WidenOrSplitVectorLoad(Op, DAG);
12057
12058 // v3 and v4 loads are supported for private and global memory.
12059 return SDValue();
12060 }
12061 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
12062 // Depending on the setting of the private_element_size field in the
12063 // resource descriptor, we can only make private accesses up to a certain
12064 // size.
12065 switch (Subtarget->getMaxPrivateElementSize()) {
12066 case 4: {
12067 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
12068 return DAG.getMergeValues({Op0, Op1}, DL);
12069 }
12070 case 8:
12071 if (NumElements > 2)
12072 return SplitVectorLoad(Op, DAG);
12073 return SDValue();
12074 case 16:
12075 // Same as global/flat
12076 if (NumElements > 4)
12077 return SplitVectorLoad(Op, DAG);
12078 // v3 loads not supported on SI.
12079 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12080 return WidenOrSplitVectorLoad(Op, DAG);
12081
12082 return SDValue();
12083 default:
12084 llvm_unreachable("unsupported private_element_size");
12085 }
12086 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
12087 unsigned Fast = 0;
12088 auto Flags = Load->getMemOperand()->getFlags();
12089 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
12090 Load->getAlign(), Flags, &Fast) &&
12091 Fast > 1)
12092 return SDValue();
12093
12094 if (MemVT.isVector())
12095 return SplitVectorLoad(Op, DAG);
12096 }
12097
12098 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
12099 MemVT, *Load->getMemOperand())) {
12100 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
12101 return DAG.getMergeValues({Op0, Op1}, DL);
12102 }
12103
12104 return SDValue();
12105}
12106
12107SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
12108 EVT VT = Op.getValueType();
12109 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
12110 VT.getSizeInBits() == 512)
12111 return splitTernaryVectorOp(Op, DAG);
12112
12113 assert(VT.getSizeInBits() == 64);
12114
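// Lower the 64-bit select by bitcasting both operands to v2i32 and selecting
// the low and high 32-bit halves separately with the same frozen condition.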
12115 SDLoc DL(Op);
12116 SDValue Cond = DAG.getFreeze(Op.getOperand(0));
12117
12118 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
12119 SDValue One = DAG.getConstant(1, DL, MVT::i32);
12120
12121 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
12122 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
12123
12124 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
12125 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
12126
12127 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
12128
12129 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
12130 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
12131
12132 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
12133
12134 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
12135 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
12136}
12137
12138// Catch division cases where we can use shortcuts with rcp and rsq
12139// instructions.
12140SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
12141 SelectionDAG &DAG) const {
12142 SDLoc SL(Op);
12143 SDValue LHS = Op.getOperand(0);
12144 SDValue RHS = Op.getOperand(1);
12145 EVT VT = Op.getValueType();
12146 const SDNodeFlags Flags = Op->getFlags();
12147
12148 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
12149
12150 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
12151 // Without !fpmath accuracy information, we can't do more because we don't
12152 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
12153 // f16 is always accurate enough
12154 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
12155 return SDValue();
12156
12157 if (CLHS->isExactlyValue(1.0)) {
12158 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
12159 // the CI documentation have a worst case error of 1 ulp.
12160 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
12161 // use it as long as we aren't trying to use denormals.
12162 //
12163 // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
12164
12165 // 1.0 / sqrt(x) -> rsq(x)
12166
12167 // XXX - Is afn sufficient to do this for f64? The maximum ULP
12168 // error seems really high at 2^29 ULP.
12169 // 1.0 / x -> rcp(x)
12170 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12171 }
12172
12173 // Same as for 1.0, but expand the sign out of the constant.
12174 if (CLHS->isExactlyValue(-1.0)) {
12175 // -1.0 / x -> rcp (fneg x)
12176 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
12177 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
12178 }
12179 }
12180
12181 // For f16 and bf16 require afn or arcp.
12182 // For f32 require afn.
12183 if (!AllowInaccurateRcp &&
12184 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
12185 return SDValue();
12186
12187 // Turn into multiply by the reciprocal.
12188 // x / y -> x * (1.0 / y)
12189 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12190 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
12191}
12192
12193SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
12194 SelectionDAG &DAG) const {
12195 SDLoc SL(Op);
12196 SDValue X = Op.getOperand(0);
12197 SDValue Y = Op.getOperand(1);
12198 EVT VT = Op.getValueType();
12199 const SDNodeFlags Flags = Op->getFlags();
12200
12201 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
12202 if (!AllowInaccurateDiv)
12203 return SDValue();
12204
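// Refine r = rcp(y) with two Newton-Raphson steps, r' = fma(1 - y*r, r, r),
// then apply one residual correction: result = fma(x - y*(x*r), r, x*r).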
12205 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
12206 SDValue One = DAG.getConstantFP(1.0, SL, VT);
12207
12208 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
12209 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12210
12211 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
12212 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12213 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
12214 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
12215 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
12216 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
12217}
12218
12219static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12220 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
12221 SDNodeFlags Flags) {
12222 if (GlueChain->getNumValues() <= 1) {
12223 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
12224 }
12225
12226 assert(GlueChain->getNumValues() == 3);
12227
12228 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12229 switch (Opcode) {
12230 default:
12231 llvm_unreachable("no chain equivalent for opcode");
12232 case ISD::FMUL:
12233 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12234 break;
12235 }
12236
12237 return DAG.getNode(Opcode, SL, VTList,
12238 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
12239 Flags);
12240}
12241
12242static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12243 EVT VT, SDValue A, SDValue B, SDValue C,
12244 SDValue GlueChain, SDNodeFlags Flags) {
12245 if (GlueChain->getNumValues() <= 1) {
12246 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
12247 }
12248
12249 assert(GlueChain->getNumValues() == 3);
12250
12251 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12252 switch (Opcode) {
12253 default:
12254 llvm_unreachable("no chain equivalent for opcode");
12255 case ISD::FMA:
12256 Opcode = AMDGPUISD::FMA_W_CHAIN;
12257 break;
12258 }
12259
12260 return DAG.getNode(Opcode, SL, VTList,
12261 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
12262 Flags);
12263}
12264
12265SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
12266 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12267 return FastLowered;
12268
12269 SDLoc SL(Op);
12270 EVT VT = Op.getValueType();
12271 SDValue LHS = Op.getOperand(0);
12272 SDValue RHS = Op.getOperand(1);
12273
12274 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
12275 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
12276
12277 if (VT == MVT::bf16) {
12278 SDValue ExtDiv =
12279 DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
12280 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
12281 DAG.getTargetConstant(0, SL, MVT::i32));
12282 }
12283
12284 assert(VT == MVT::f16);
12285
12286 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
12287 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
12288 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
12289 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
12290 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12291 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
12292 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12293 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
12294 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
12295 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
12296 // q16.u = opx(V_CVT_F16_F32, q32.u);
12297 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
12298
12299 // We will use ISD::FMA on targets that don't support ISD::FMAD.
12300 unsigned FMADOpCode =
12301 isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
12302 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
12303 SDValue Rcp =
12304 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
12305 SDValue Quot =
12306 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
12307 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12308 Op->getFlags());
12309 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
12310 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12311 Op->getFlags());
12312 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
12313 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
12314 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
12315 DAG.getConstant(0xff800000, SL, MVT::i32));
12316 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
12317 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
12318 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
12319 DAG.getTargetConstant(0, SL, MVT::i32));
12320 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
12321 Op->getFlags());
12322}
12323
12324// Faster 2.5 ULP division that does not support denormals.
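// If |RHS| exceeds 2^+96 the denominator is pre-scaled by 2^-32 before the rcp
// (whose result would otherwise flush as a denormal), and the quotient is then
// multiplied by the same 2^-32 factor, since x / d == (x * rcp(d * s)) * s.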
12325SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
12326 SDNodeFlags Flags = Op->getFlags();
12327 SDLoc SL(Op);
12328 SDValue LHS = Op.getOperand(1);
12329 SDValue RHS = Op.getOperand(2);
12330
12331 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
12332
12333 const APFloat K0Val(0x1p+96f);
12334 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
12335
12336 const APFloat K1Val(0x1p-32f);
12337 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
12338
12339 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12340
12341 EVT SetCCVT =
12342 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
12343
12344 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
12345
12346 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
12347
12348 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
12349
12350 // rcp does not support denormals.
12351 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
12352
12353 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
12354
12355 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
12356}
12357
12358// Returns immediate value for setting the F32 denorm mode when using the
12359// S_DENORM_MODE instruction.
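// The returned immediate packs the requested FP32 denormal mode in bits [1:0]
// and keeps the function's existing FP64/FP16 denormal mode in bits [3:2].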
12360 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
12361 const SIMachineFunctionInfo *Info,
12362 const GCNSubtarget *ST) {
12363 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
12364 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
12365 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
12366 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
12367}
12368
12369SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
12370 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12371 return FastLowered;
12372
12373 // The selection matcher assumes anything with a chain selecting to a
12374 // mayRaiseFPException machine instruction. Since we're introducing a chain
12375 // here, we need to explicitly report nofpexcept for the regular fdiv
12376 // lowering.
12377 SDNodeFlags Flags = Op->getFlags();
12378 Flags.setNoFPExcept(true);
12379
12380 SDLoc SL(Op);
12381 SDValue LHS = Op.getOperand(0);
12382 SDValue RHS = Op.getOperand(1);
12383
12384 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12385
12386 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
12387
12388 SDValue DenominatorScaled =
12389 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
12390 SDValue NumeratorScaled =
12391 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
12392
12393 // Denominator is scaled to not be denormal, so using rcp is ok.
12394 SDValue ApproxRcp =
12395 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
12396 SDValue NegDivScale0 =
12397 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
12398
12399 using namespace AMDGPU::Hwreg;
12400 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
12401 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
12402
12403 const MachineFunction &MF = DAG.getMachineFunction();
12404 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12405 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
12406
12407 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
12408 const bool HasDynamicDenormals =
12409 (DenormMode.Input == DenormalMode::Dynamic) ||
12410 (DenormMode.Output == DenormalMode::Dynamic);
12411
12412 SDValue SavedDenormMode;
12413
12414 if (!PreservesDenormals) {
12415 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
12416 // lowering. The chain dependence is insufficient, and we need glue. We do
12417 // not need the glue variants in a strictfp function.
12418
12419 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12420
12421 SDValue Glue = DAG.getEntryNode();
12422 if (HasDynamicDenormals) {
12423 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
12424 DAG.getVTList(MVT::i32, MVT::Glue),
12425 {BitField, Glue});
12426 SavedDenormMode = SDValue(GetReg, 0);
12427
12428 Glue = DAG.getMergeValues(
12429 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
12430 }
12431
12432 SDNode *EnableDenorm;
12433 if (Subtarget->hasDenormModeInst()) {
12434 const SDValue EnableDenormValue =
12435 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
12436
12437 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
12438 EnableDenormValue)
12439 .getNode();
12440 } else {
12441 const SDValue EnableDenormValue =
12442 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
12443 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12444 {EnableDenormValue, BitField, Glue});
12445 }
12446
12447 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
12448 SDValue(EnableDenorm, 1)};
12449
12450 NegDivScale0 = DAG.getMergeValues(Ops, SL);
12451 }
12452
12453 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
12454 ApproxRcp, One, NegDivScale0, Flags);
12455
12456 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
12457 ApproxRcp, Fma0, Flags);
12458
12459 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
12460 Fma1, Flags);
12461
12462 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
12463 NumeratorScaled, Mul, Flags);
12464
12465 SDValue Fma3 =
12466 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
12467
12468 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
12469 NumeratorScaled, Fma3, Flags);
12470
12471 if (!PreservesDenormals) {
12472 SDNode *DisableDenorm;
12473 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12474 const SDValue DisableDenormValue = getSPDenormModeValue(
12475 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
12476
12477 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12478 DisableDenorm =
12479 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
12480 Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
12481 .getNode();
12482 } else {
12483 assert(HasDynamicDenormals == (bool)SavedDenormMode);
12484 const SDValue DisableDenormValue =
12485 HasDynamicDenormals
12486 ? SavedDenormMode
12487 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
12488
12489 DisableDenorm = DAG.getMachineNode(
12490 AMDGPU::S_SETREG_B32, SL, MVT::Other,
12491 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
12492 }
12493
12494 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
12495 SDValue(DisableDenorm, 0), DAG.getRoot());
12496 DAG.setRoot(OutputChain);
12497 }
12498
12499 SDValue Scale = NumeratorScaled.getValue(1);
12500 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
12501 {Fma4, Fma1, Fma3, Scale}, Flags);
12502
12503 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
12504}
12505
12506SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
12507 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
12508 return FastLowered;
12509
12510 SDLoc SL(Op);
12511 SDValue X = Op.getOperand(0);
12512 SDValue Y = Op.getOperand(1);
12513
12514 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
12515
12516 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
12517
12518 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
12519
12520 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
12521
12522 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
12523
12524 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
12525
12526 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
12527
12528 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
12529
12530 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
12531
12532 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
12533 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
12534
12535 SDValue Fma4 =
12536 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
12537
12538 SDValue Scale;
12539
12540 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12541 // Work around a hardware bug on SI where the condition output from div_scale
12542 // is not usable.
12543
12544 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
12545
12546 // Figure out which scale to use for div_fmas.
12547 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
12548 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
12549 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
12550 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
12551
12552 SDValue NumHi =
12553 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
12554 SDValue DenHi =
12555 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
12556
12557 SDValue Scale0Hi =
12558 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
12559 SDValue Scale1Hi =
12560 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
12561
12562 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
12563 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
12564 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
12565 } else {
12566 Scale = DivScale1.getValue(1);
12567 }
12568
12569 SDValue Fmas =
12570 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
12571
12572 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
12573}
12574
12575SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
12576 EVT VT = Op.getValueType();
12577
12578 if (VT == MVT::f32)
12579 return LowerFDIV32(Op, DAG);
12580
12581 if (VT == MVT::f64)
12582 return LowerFDIV64(Op, DAG);
12583
12584 if (VT == MVT::f16 || VT == MVT::bf16)
12585 return LowerFDIV16(Op, DAG);
12586
12587 llvm_unreachable("Unexpected type for fdiv");
12588}
12589
12590SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
12591 SDLoc dl(Op);
12592 SDValue Val = Op.getOperand(0);
12593 EVT VT = Val.getValueType();
12594 EVT ResultExpVT = Op->getValueType(1);
12595 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12596
12597 SDValue Mant = DAG.getNode(
12598 ISD::INTRINSIC_WO_CHAIN, dl, VT,
12599 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
12600
12601 SDValue Exp = DAG.getNode(
12602 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
12603 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
12604
12605 if (Subtarget->hasFractBug()) {
12606 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
12607 SDValue Inf =
12608 DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT);
12609
12610 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
12611 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
12612 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
12613 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
12614 }
12615
12616 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
12617 return DAG.getMergeValues({Mant, CastExp}, dl);
12618}
12619
12620SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
12621 SDLoc DL(Op);
12622 StoreSDNode *Store = cast<StoreSDNode>(Op);
12623 EVT VT = Store->getMemoryVT();
12624
12625 if (VT == MVT::i1) {
12626 return DAG.getTruncStore(
12627 Store->getChain(), DL,
12628 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
12629 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
12630 }
12631
12632 assert(VT.isVector() &&
12633 Store->getValue().getValueType().getScalarType() == MVT::i32);
12634
12635 unsigned AS = Store->getAddressSpace();
12636 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
12637 Store->getAlign().value() < VT.getStoreSize() &&
12638 VT.getSizeInBits() > 32) {
12639 return SplitVectorStore(Op, DAG);
12640 }
12641
12642 MachineFunction &MF = DAG.getMachineFunction();
12643 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12644 // If there is a possibility that flat instruction access scratch memory
12645 // then we need to use the same legalization rules we use for private.
12646 if (AS == AMDGPUAS::FLAT_ADDRESS &&
12647 !Subtarget->hasMultiDwordFlatScratchAddressing())
12648 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
12649 ? AMDGPUAS::PRIVATE_ADDRESS
12650 : AMDGPUAS::GLOBAL_ADDRESS;
12651
12652 unsigned NumElements = VT.getVectorNumElements();
12653 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
12654 if (NumElements > 4)
12655 return SplitVectorStore(Op, DAG);
12656 // v3 stores not supported on SI.
12657 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12658 return SplitVectorStore(Op, DAG);
12659
12660 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
12661 VT, *Store->getMemOperand()))
12662 return expandUnalignedStore(Store, DAG);
12663
12664 return SDValue();
12665 }
12666 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
12667 switch (Subtarget->getMaxPrivateElementSize()) {
12668 case 4:
12669 return scalarizeVectorStore(Store, DAG);
12670 case 8:
12671 if (NumElements > 2)
12672 return SplitVectorStore(Op, DAG);
12673 return SDValue();
12674 case 16:
12675 if (NumElements > 4 ||
12676 (NumElements == 3 && !Subtarget->enableFlatScratch()))
12677 return SplitVectorStore(Op, DAG);
12678 return SDValue();
12679 default:
12680 llvm_unreachable("unsupported private_element_size");
12681 }
12682 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
12683 unsigned Fast = 0;
12684 auto Flags = Store->getMemOperand()->getFlags();
12685 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
12686 Store->getAlign(), Flags, &Fast) &&
12687 Fast > 1)
12688 return SDValue();
12689
12690 if (VT.isVector())
12691 return SplitVectorStore(Op, DAG);
12692
12693 return expandUnalignedStore(Store, DAG);
12694 }
12695
12696 // Probably an invalid store. If so we'll end up emitting a selection error.
12697 return SDValue();
12698}
12699
12700// Avoid the full correct expansion for f32 sqrt when promoting from f16.
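// The f16 operand is extended to f32, amdgcn_sqrt computes the f32 square
// root, which is precise enough once rounded back to f16.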
12701SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
12702 SDLoc SL(Op);
12703 assert(!Subtarget->has16BitInsts());
12704 SDNodeFlags Flags = Op->getFlags();
12705 SDValue Ext =
12706 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
12707
12708 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
12709 SDValue Sqrt =
12710 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
12711
12712 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
12713 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
12714}
12715
12716SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
12717 SDLoc DL(Op);
12718 SDNodeFlags Flags = Op->getFlags();
12719 MVT VT = Op.getValueType().getSimpleVT();
12720 const SDValue X = Op.getOperand(0);
12721
12722 if (allowApproxFunc(DAG, Flags)) {
12723 // Instruction is 1ulp but ignores denormals.
12724 return DAG.getNode(
12725 ISD::INTRINSIC_WO_CHAIN, DL, VT,
12726 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
12727 }
12728
12729 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
12730 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
12731
12732 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
12733
12734 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
12735
12736 SDValue SqrtX =
12737 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
12738
12739 SDValue SqrtS;
12740 if (needsDenormHandlingF32(DAG, X, Flags)) {
12741 SDValue SqrtID =
12742 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
12743 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
12744
12745 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
12746 SDValue SqrtSNextDownInt =
12747 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12748 DAG.getAllOnesConstant(DL, MVT::i32));
12749 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
12750
12751 SDValue NegSqrtSNextDown =
12752 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
12753
12754 SDValue SqrtVP =
12755 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
12756
12757 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12758 DAG.getConstant(1, DL, MVT::i32));
12759 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
12760
12761 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
12762 SDValue SqrtVS =
12763 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
12764
12765 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
12766 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
12767
12768 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
12769 Flags);
12770
12771 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
12772 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
12773 Flags);
12774 } else {
12775 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
12776
12777 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
12778
12779 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
12780 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
12781 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
12782
12783 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
12784 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
12785 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
12786
12787 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
12788 SDValue SqrtD =
12789 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
12790 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
12791 }
12792
12793 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
12794
12795 SDValue ScaledDown =
12796 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
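// Reviewer note (not part of the original source): the input was scaled up by
// 2^32 when it was below 2^-96, and since sqrt(x * 2^32) == 2^16 * sqrt(x),
// the result is compensated here by multiplying with 0x1.0p-16.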
12797
12798 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
12799 SDValue IsZeroOrInf =
12800 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12801 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12802
12803 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
12804}
12805
12806SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
12807 // For the f64 type, the SQRT and RSQ instructions don't have the required
12808 // precision, so we apply Goldschmidt's algorithm to improve the result:
12809 //
12810 // y0 = rsq(x)
12811 // g0 = x * y0
12812 // h0 = 0.5 * y0
12813 //
12814 // r0 = 0.5 - h0 * g0
12815 // g1 = g0 * r0 + g0
12816 // h1 = h0 * r0 + h0
12817 //
12818 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
12819 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
12820 // h2 = h1 * r1 + h1
12821 //
12822 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
12823 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
12824 //
12825 // sqrt(x) = g3
12826
12827 SDNodeFlags Flags = Op->getFlags();
12828
12829 SDLoc DL(Op);
12830
12831 SDValue X = Op.getOperand(0);
12832 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
12833
12834 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
12835
12836 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
12837
12838 // Scale up input if it is too small.
12839 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
12840 SDValue ScaleUp =
12841 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
12842 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
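// Reviewer note (not part of the original source): when Scaling is true the
// input is multiplied by 2^256 via ldexp, and sqrt(x * 2^256) == 2^128 * sqrt(x),
// which is why the final result is scaled back below with ldexp(..., -128).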
12843
12844 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
12845
12846 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
12847
12848 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
12849 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
12850
12851 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
12852 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
12853
12854 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
12855
12856 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
12857
12858 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
12859 SDValue SqrtD0 =
12860 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
12861
12862 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
12863
12864 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
12865 SDValue SqrtD1 =
12866 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
12867
12868 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
12869
12870 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
12871 SDValue ScaleDown =
12872 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
12873 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
12874
12875 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
12876 // with finite only or nsz because rsq(+/-0) = +/-inf
12877
12878 // TODO: Check for DAZ and expand to subnormals
12879 SDValue IsZeroOrInf =
12880 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12881 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12882
12883 // If x is +INF, +0, or -0, use its original value
12884 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
12885 Flags);
12886}
12887
12888SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
12889 SDLoc DL(Op);
12890 EVT VT = Op.getValueType();
12891 SDValue Arg = Op.getOperand(0);
12892 SDValue TrigVal;
12893
12894 // Propagate fast-math flags so that the multiply we introduce can be folded
12895 // if Arg is already the result of a multiply by constant.
12896 auto Flags = Op->getFlags();
12897
12898 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
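// Reviewer note (not part of the original source): the hardware SIN/COS units
// take their operand in revolutions rather than radians, hence the multiply by
// 1/(2*pi); on subtargets with a reduced trig input range, FRACT additionally
// wraps the scaled operand into [0, 1).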
12899
12900 if (Subtarget->hasTrigReducedRange()) {
12901 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12902 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
12903 } else {
12904 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12905 }
12906
12907 switch (Op.getOpcode()) {
12908 case ISD::FCOS:
12909 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
12910 case ISD::FSIN:
12911 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
12912 default:
12913 llvm_unreachable("Wrong trig opcode");
12914 }
12915}
12916
12917SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
12918 SelectionDAG &DAG) const {
12919 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
12920 assert(AtomicNode->isCompareAndSwap());
12921 unsigned AS = AtomicNode->getAddressSpace();
12922
12923 // No custom lowering required for local address space
12925 return Op;
12926
12927 // Non-local address space requires custom lowering for atomic compare
12928 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
12929 SDLoc DL(Op);
12930 SDValue ChainIn = Op.getOperand(0);
12931 SDValue Addr = Op.getOperand(1);
12932 SDValue Old = Op.getOperand(2);
12933 SDValue New = Op.getOperand(3);
12934 EVT VT = Op.getValueType();
12935 MVT SimpleVT = VT.getSimpleVT();
12936 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
12937
12938 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
12939 SDValue Ops[] = {ChainIn, Addr, NewOld};
12940
12941 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
12942 Op->getVTList(), Ops, VT,
12943 AtomicNode->getMemOperand());
12944}
12945
12946//===----------------------------------------------------------------------===//
12947// Custom DAG optimizations
12948//===----------------------------------------------------------------------===//
12949
12950SDValue
12951SITargetLowering::performUCharToFloatCombine(SDNode *N,
12952 DAGCombinerInfo &DCI) const {
12953 EVT VT = N->getValueType(0);
12954 EVT ScalarVT = VT.getScalarType();
12955 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
12956 return SDValue();
12957
12958 SelectionDAG &DAG = DCI.DAG;
12959 SDLoc DL(N);
12960
12961 SDValue Src = N->getOperand(0);
12962 EVT SrcVT = Src.getValueType();
12963
12964 // TODO: We could try to match extracting the higher bytes, which would be
12965 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
12966 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
12967 // about in practice.
12968 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
12969 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
12970 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
12971 DCI.AddToWorklist(Cvt.getNode());
12972
12973 // For the f16 case, fold to a cast to f32 and then cast back to f16.
12974 if (ScalarVT != MVT::f32) {
12975 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
12976 DAG.getTargetConstant(0, DL, MVT::i32));
12977 }
12978 return Cvt;
12979 }
12980 }
12981
12982 return SDValue();
12983}
12984
12985SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
12986 DAGCombinerInfo &DCI) const {
12987 SDValue MagnitudeOp = N->getOperand(0);
12988 SDValue SignOp = N->getOperand(1);
12989
12990 // The generic combine for fcopysign + fp cast is too conservative with
12991 // vectors, and also gets confused by the splitting we will perform here, so
12992 // peek through FP casts.
12993 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
12994 SignOp.getOpcode() == ISD::FP_ROUND)
12995 SignOp = SignOp.getOperand(0);
12996
12997 SelectionDAG &DAG = DCI.DAG;
12998 SDLoc DL(N);
12999 EVT SignVT = SignOp.getValueType();
13000
13001 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
13002 // lower half with a copy.
13003 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
13004 EVT MagVT = MagnitudeOp.getValueType();
13005
13006 unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
13007
13008 if (MagVT.getScalarType() == MVT::f64) {
13009 EVT F32VT = MagVT.isVector()
13010 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
13011 : MVT::v2f32;
13012
13013 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
13014
13016 for (unsigned I = 0; I != NumElts; ++I) {
13017 SDValue MagLo =
13018 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
13019 DAG.getConstant(2 * I, DL, MVT::i32));
13020 SDValue MagHi =
13021 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
13022 DAG.getConstant(2 * I + 1, DL, MVT::i32));
13023
13024 SDValue SignOpElt =
13025 MagVT.isVector()
13027 SignOp, DAG.getConstant(I, DL, MVT::i32))
13028 : SignOp;
13029
13030 SDValue HiOp =
13031 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
13032
13033 SDValue Vector =
13034 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
13035
13036 SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
13037 NewElts.push_back(NewElt);
13038 }
13039
13040 if (NewElts.size() == 1)
13041 return NewElts[0];
13042
13043 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
13044 }
13045
13046 if (SignVT.getScalarType() != MVT::f64)
13047 return SDValue();
13048
13049 // Reduce width of sign operand, we only need the highest bit.
13050 //
13051 // fcopysign f64:x, f64:y ->
13052 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
13053 // TODO: In some cases it might make sense to go all the way to f16.
13054
13055 EVT F32VT = MagVT.isVector()
13056 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
13057 : MVT::v2f32;
13058
13059 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
13060
13061 SmallVector<SDValue, 8> F32Signs;
13062 for (unsigned I = 0; I != NumElts; ++I) {
13063 // Take sign from odd elements of cast vector
13064 SDValue SignAsF32 =
13065 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
13066 DAG.getConstant(2 * I + 1, DL, MVT::i32));
13067 F32Signs.push_back(SignAsF32);
13068 }
13069
13070 SDValue NewSign =
13071 NumElts == 1
13072 ? F32Signs.back()
13073 : DAG.getNode(ISD::BUILD_VECTOR, DL,
13074 EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
13075 F32Signs);
13076
13077 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
13078 NewSign);
13079}
13080
13081// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
13082// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
13083// bits
13084
13085// This is a variant of
13086// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
13087//
13088// The normal DAG combiner will do this, but only if the add has one use since
13089// that would increase the number of instructions.
13090//
13091// This prevents us from seeing a constant offset that can be folded into a
13092// memory instruction's addressing mode. If we know the resulting add offset of
13093// a pointer can be folded into an addressing offset, we can replace the pointer
13094// operand with the add of new constant offset. This eliminates one of the uses,
13095// and may allow the remaining use to also be simplified.
13096//
13097SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
13098 EVT MemVT,
13099 DAGCombinerInfo &DCI) const {
13100 SDValue N0 = N->getOperand(0);
13101 SDValue N1 = N->getOperand(1);
13102
13103 // We only do this to handle cases where it's profitable when there are
13104 // multiple uses of the add, so defer to the standard combine.
13105 if ((!N0->isAnyAdd() && N0.getOpcode() != ISD::OR) || N0->hasOneUse())
13106 return SDValue();
13107
13108 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
13109 if (!CN1)
13110 return SDValue();
13111
13112 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
13113 if (!CAdd)
13114 return SDValue();
13115
13116 SelectionDAG &DAG = DCI.DAG;
13117
13118 if (N0->getOpcode() == ISD::OR &&
13119 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
13120 return SDValue();
13121
13122 // If the resulting offset is too large, we can't fold it into the
13123 // addressing mode offset.
13124 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
13125 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
13126
13127 AddrMode AM;
13128 AM.HasBaseReg = true;
13129 AM.BaseOffs = Offset.getSExtValue();
13130 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
13131 return SDValue();
13132
13133 SDLoc SL(N);
13134 EVT VT = N->getValueType(0);
13135
13136 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
13137 SDValue COffset = DAG.getConstant(Offset, SL, VT);
13138
13139 SDNodeFlags Flags;
13140 Flags.setNoUnsignedWrap(
13141 N->getFlags().hasNoUnsignedWrap() &&
13142 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
13143
13144 // Use ISD::ADD even if the original operation was ISD::PTRADD, since we can't
13145 // be sure that the new left operand is a proper base pointer.
13146 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
13147}
13148
13149 /// MemSDNode::getBasePtr() does not work for intrinsics, whose pointer operand
13150 /// index needs to be offset past the chain and intrinsic ID. Theoretically we
13151 /// would also need to check the specific intrinsic, but they all place the pointer operand first.
13152static unsigned getBasePtrIndex(const MemSDNode *N) {
13153 switch (N->getOpcode()) {
13154 case ISD::STORE:
13155 case ISD::ATOMIC_CMP_SWAP:
13156 case ISD::ATOMIC_STORE:
13157 return 2;
13158 default:
13159 return 1;
13160 }
13161}
13162
13163SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
13164 DAGCombinerInfo &DCI) const {
13165 SelectionDAG &DAG = DCI.DAG;
13166
13167 unsigned PtrIdx = getBasePtrIndex(N);
13168 SDValue Ptr = N->getOperand(PtrIdx);
13169
13170 // TODO: We could also do this for multiplies.
13171 if (Ptr.getOpcode() == ISD::SHL) {
13172 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
13173 N->getMemoryVT(), DCI);
13174 if (NewPtr) {
13175 SmallVector<SDValue, 8> NewOps(N->ops());
13176
13177 NewOps[PtrIdx] = NewPtr;
13178 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
13179 }
13180 }
13181
13182 return SDValue();
13183}
13184
13185static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
13186 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13187 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13188 (Opc == ISD::XOR && Val == 0);
13189}
13190
13191 // Break up a 64-bit bitwise operation with a constant into two 32-bit and/or/xor. This
13192// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
13193// integer combine opportunities since most 64-bit operations are decomposed
13194// this way. TODO: We won't want this for SALU especially if it is an inline
13195// immediate.
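// Illustrative example (not part of the original source):
//   (and i64:x, 0x00000000ffffffff)
// is split into 32-bit halves, where the low half (and x.lo, 0xffffffff)
// simplifies to x.lo and the high half (and x.hi, 0) simplifies to 0.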
13196SDValue SITargetLowering::splitBinaryBitConstantOp(
13197 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
13198 const ConstantSDNode *CRHS) const {
13199 uint64_t Val = CRHS->getZExtValue();
13200 uint32_t ValLo = Lo_32(Val);
13201 uint32_t ValHi = Hi_32(Val);
13202 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13203
13204 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
13205 bitOpWithConstantIsReducible(Opc, ValHi)) ||
13206 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
13207 // We have 64-bit scalar and/or/xor, but do not have vector forms.
13208 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
13209 !CRHS->user_begin()->isDivergent())
13210 return SDValue();
13211
13212 // If we need to materialize a 64-bit immediate, it will be split up later
13213 // anyway. Avoid creating the harder to understand 64-bit immediate
13214 // materialization.
13215 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
13216 }
13217
13218 return SDValue();
13219}
13220
13221 bool llvm::isBoolSGPR(SDValue V) {
13222 if (V.getValueType() != MVT::i1)
13223 return false;
13224 switch (V.getOpcode()) {
13225 default:
13226 break;
13227 case ISD::SETCC:
13228 case ISD::IS_FPCLASS:
13229 case AMDGPUISD::FP_CLASS:
13230 return true;
13231 case ISD::AND:
13232 case ISD::OR:
13233 case ISD::XOR:
13234 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
13235 case ISD::SADDO:
13236 case ISD::UADDO:
13237 case ISD::SSUBO:
13238 case ISD::USUBO:
13239 case ISD::SMULO:
13240 case ISD::UMULO:
13241 return V.getResNo() == 1;
13242 case ISD::INTRINSIC_WO_CHAIN: {
13243 unsigned IntrinsicID = V.getConstantOperandVal(0);
13244 switch (IntrinsicID) {
13245 case Intrinsic::amdgcn_is_shared:
13246 case Intrinsic::amdgcn_is_private:
13247 return true;
13248 default:
13249 return false;
13250 }
13251
13252 return false;
13253 }
13254 }
13255 return false;
13256}
13257
13258// If a constant has all zeroes or all ones within each byte return it.
13259// Otherwise return 0.
13260 static uint32_t getConstantPermuteMask(uint32_t C) {
13261 // 0xff for any zero byte in the mask
13262 uint32_t ZeroByteMask = 0;
13263 if (!(C & 0x000000ff))
13264 ZeroByteMask |= 0x000000ff;
13265 if (!(C & 0x0000ff00))
13266 ZeroByteMask |= 0x0000ff00;
13267 if (!(C & 0x00ff0000))
13268 ZeroByteMask |= 0x00ff0000;
13269 if (!(C & 0xff000000))
13270 ZeroByteMask |= 0xff000000;
13271 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
13272 if ((NonZeroByteMask & C) != NonZeroByteMask)
13273 return 0; // Partial bytes selected.
13274 return C;
13275}
13276
13277// Check if a node selects whole bytes from its operand 0 starting at a byte
13278 // boundary while masking the rest. Returns the select mask as used by v_perm_b32,
13279 // or ~0 if it did not succeed.
13280// Note byte select encoding:
13281// value 0-3 selects corresponding source byte;
13282// value 0xc selects zero;
13283// value 0xff selects 0xff.
13284 static uint32_t getPermuteMask(SDValue V) {
13285 assert(V.getValueSizeInBits() == 32);
13286
13287 if (V.getNumOperands() != 2)
13288 return ~0;
13289
13290 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
13291 if (!N1)
13292 return ~0;
13293
13294 uint32_t C = N1->getZExtValue();
13295
13296 switch (V.getOpcode()) {
13297 default:
13298 break;
13299 case ISD::AND:
13300 if (uint32_t ConstMask = getConstantPermuteMask(C))
13301 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13302 break;
13303
13304 case ISD::OR:
13305 if (uint32_t ConstMask = getConstantPermuteMask(C))
13306 return (0x03020100 & ~ConstMask) | ConstMask;
13307 break;
13308
13309 case ISD::SHL:
13310 if (C % 8)
13311 return ~0;
13312
13313 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
13314
13315 case ISD::SRL:
13316 if (C % 8)
13317 return ~0;
13318
13319 return uint32_t(0x0c0c0c0c03020100ull >> C);
13320 }
13321
13322 return ~0;
13323}
13324
13325SDValue SITargetLowering::performAndCombine(SDNode *N,
13326 DAGCombinerInfo &DCI) const {
13327 if (DCI.isBeforeLegalize())
13328 return SDValue();
13329
13330 SelectionDAG &DAG = DCI.DAG;
13331 EVT VT = N->getValueType(0);
13332 SDValue LHS = N->getOperand(0);
13333 SDValue RHS = N->getOperand(1);
13334
13335 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
13336 if (VT == MVT::i64 && CRHS) {
13337 if (SDValue Split =
13338 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
13339 return Split;
13340 }
13341
13342 if (CRHS && VT == MVT::i32) {
13343 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
13344 // nb = number of trailing zeroes in mask
13345 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
13346 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
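// Illustrative example (not part of the original source):
//   (and (srl x, 5), 0x7f8)   ; mask = 0x7f8, Bits = 8, nb = 3
// becomes
//   (shl (bfe x, 8, 8), 3)
// since the extracted field starts at bit 8, a byte boundary.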
13347 uint64_t Mask = CRHS->getZExtValue();
13348 unsigned Bits = llvm::popcount(Mask);
13349 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
13350 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
13351 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
13352 unsigned Shift = CShift->getZExtValue();
13353 unsigned NB = CRHS->getAPIntValue().countr_zero();
13354 unsigned Offset = NB + Shift;
13355 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
13356 SDLoc SL(N);
13357 SDValue BFE =
13358 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
13359 DAG.getConstant(Offset, SL, MVT::i32),
13360 DAG.getConstant(Bits, SL, MVT::i32));
13361 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
13362 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
13363 DAG.getValueType(NarrowVT));
13364 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
13365 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
13366 return Shl;
13367 }
13368 }
13369 }
13370
13371 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13372 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
13373 isa<ConstantSDNode>(LHS.getOperand(2))) {
13374 uint32_t Sel = getConstantPermuteMask(Mask);
13375 if (!Sel)
13376 return SDValue();
13377
13378 // Select 0xc for all zero bytes
13379 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13380 SDLoc DL(N);
13381 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13382 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13383 }
13384 }
13385
13386 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
13387 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
13388 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
13389 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13390 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
13391
13392 SDValue X = LHS.getOperand(0);
13393 SDValue Y = RHS.getOperand(0);
13394 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
13395 !isTypeLegal(X.getValueType()))
13396 return SDValue();
13397
13398 if (LCC == ISD::SETO) {
13399 if (X != LHS.getOperand(1))
13400 return SDValue();
13401
13402 if (RCC == ISD::SETUNE) {
13403 const ConstantFPSDNode *C1 =
13404 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
13405 if (!C1 || !C1->isInfinity() || C1->isNegative())
13406 return SDValue();
13407
13408 const uint32_t Mask = SIInstrFlags::N_NORMAL |
13409 SIInstrFlags::N_SUBNORMAL | SIInstrFlags::N_ZERO |
13410 SIInstrFlags::P_ZERO | SIInstrFlags::P_SUBNORMAL |
13411 SIInstrFlags::P_NORMAL;
13412
13413 static_assert(
13414 ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
13415 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
13416 0x3ff) == Mask,
13417 "mask not equal");
13418
13419 SDLoc DL(N);
13420 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
13421 DAG.getConstant(Mask, DL, MVT::i32));
13422 }
13423 }
13424 }
13425
13426 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
13427 std::swap(LHS, RHS);
13428
13429 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13430 RHS.hasOneUse()) {
13431 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13432 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
13433 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
13435 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13436 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
13437 (RHS.getOperand(0) == LHS.getOperand(0) &&
13438 LHS.getOperand(0) == LHS.getOperand(1))) {
13439 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
13440 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
13441 : Mask->getZExtValue() & OrdMask;
13442
13443 SDLoc DL(N);
13444 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
13445 DAG.getConstant(NewMask, DL, MVT::i32));
13446 }
13447 }
13448
13449 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
13450 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
13451 // and x, (sext cc from i1) => select cc, x, 0
13452 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
13453 std::swap(LHS, RHS);
13454 if (isBoolSGPR(RHS.getOperand(0)))
13455 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
13456 DAG.getConstant(0, SDLoc(N), MVT::i32));
13457 }
13458
13459 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13460 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13461 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13462 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13463 uint32_t LHSMask = getPermuteMask(LHS);
13464 uint32_t RHSMask = getPermuteMask(RHS);
13465 if (LHSMask != ~0u && RHSMask != ~0u) {
13466 // Canonicalize the expression in an attempt to have fewer unique masks
13467 // and therefore fewer registers used to hold the masks.
13468 if (LHSMask > RHSMask) {
13469 std::swap(LHSMask, RHSMask);
13470 std::swap(LHS, RHS);
13471 }
13472
13473 // Select 0xc for each lane used from a source operand. Zero bytes have 0xc
13474 // in the mask, 0xff bytes have 0xff, and actual lanes are in the 0-3 range.
13475 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13476 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13477
13478 // Check if we need to combine values from two sources within a byte.
13479 if (!(LHSUsedLanes & RHSUsedLanes) &&
13480 // If we select high and lower word keep it for SDWA.
13481 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
13482 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13483 // Each byte in each mask is either a selector value 0-3, or has higher
13484 // bits set: 0xff selects 0xff and 0x0c selects zero. If 0x0c appears in
13485 // either mask, the result byte must be 0x0c; otherwise the mask byte that
13486 // is not 0xff wins. ANDing both masks gives the correct result, except
13487 // that bytes containing 0x0c must be corrected back to exactly 0x0c.
13488 uint32_t Mask = LHSMask & RHSMask;
13489 for (unsigned I = 0; I < 32; I += 8) {
13490 uint32_t ByteSel = 0xff << I;
13491 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13492 Mask &= (0x0c << I) & 0xffffffff;
13493 }
13494
13495 // Add 4 to each active LHS lane. It will not affect any existing 0xff
13496 // or 0x0c.
13497 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
13498 SDLoc DL(N);
13499
13500 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13501 RHS.getOperand(0),
13502 DAG.getConstant(Sel, DL, MVT::i32));
13503 }
13504 }
13505 }
13506
13507 return SDValue();
13508}
13509
13510 // A key component of v_perm is a mapping between the byte positions of the src
13511 // operands and the byte positions of the dest. To provide such a mapping, we
13512 // need: 1. the node that provides byte x of the dest of the OR, and 2. the byte
13513 // of that node used to provide byte x. calculateByteProvider finds which node
13514 // provides a certain byte of the dest of the OR, and calculateSrcByte takes
13515 // that node and finds the ultimate src and byte position. For example, the
13516 // supported LoadCombine pattern for vector loads is as follows
13517// t1
13518// or
13519// / \
13520// t2 t3
13521// zext shl
13522// | | \
13523// t4 t5 16
13524// or anyext
13525// / \ |
13526// t6 t7 t8
13527// srl shl or
13528// / | / \ / \
13529// t9 t10 t11 t12 t13 t14
13530// trunc* 8 trunc* 8 and and
13531// | | / | | \
13532// t15 t16 t17 t18 t19 t20
13533// trunc* 255 srl -256
13534// | / \
13535// t15 t15 16
13536//
13537// *In this example, the truncs are from i32->i16
13538//
13539// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
13540// respectively. calculateSrcByte would find (given node) -> ultimate src &
13541// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
13542// After finding the mapping, we can combine the tree into vperm t15, t16,
13543// 0x05000407
13544
13545// Find the source and byte position from a node.
13546// \p DestByte is the byte position of the dest of the or that the src
13547// ultimately provides. \p SrcIndex is the byte of the src that maps to this
13548 // byte of the or's dest. \p Depth tracks how many recursive iterations we have
13549// performed.
13550static const std::optional<ByteProvider<SDValue>>
13551calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
13552 unsigned Depth = 0) {
13553 // We may need to recursively traverse a series of SRLs
13554 if (Depth >= 6)
13555 return std::nullopt;
13556
13557 if (Op.getValueSizeInBits() < 8)
13558 return std::nullopt;
13559
13560 if (Op.getValueType().isVector())
13561 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13562
13563 switch (Op->getOpcode()) {
13564 case ISD::TRUNCATE: {
13565 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13566 }
13567
13568 case ISD::SIGN_EXTEND:
13569 case ISD::ZERO_EXTEND:
13570 case ISD::SIGN_EXTEND_INREG: {
13571 SDValue NarrowOp = Op->getOperand(0);
13572 auto NarrowVT = NarrowOp.getValueType();
13573 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
13574 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13575 NarrowVT = VTSign->getVT();
13576 }
13577 if (!NarrowVT.isByteSized())
13578 return std::nullopt;
13579 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
13580
13581 if (SrcIndex >= NarrowByteWidth)
13582 return std::nullopt;
13583 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13584 }
13585
13586 case ISD::SRA:
13587 case ISD::SRL: {
13588 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13589 if (!ShiftOp)
13590 return std::nullopt;
13591
13592 uint64_t BitShift = ShiftOp->getZExtValue();
13593
13594 if (BitShift % 8 != 0)
13595 return std::nullopt;
13596
13597 SrcIndex += BitShift / 8;
13598
13599 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13600 }
13601
13602 default: {
13603 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13604 }
13605 }
13606 llvm_unreachable("fully handled switch");
13607}
13608
13609// For a byte position in the result of an Or, traverse the tree and find the
13610// node (and the byte of the node) which ultimately provides this {Or,
13611// BytePosition}. \p Op is the operand we are currently examining. \p Index is
13612// the byte position of the Op that corresponds with the originally requested
13613// byte of the Or \p Depth tracks how many recursive iterations we have
13614// performed. \p StartingIndex is the originally requested byte of the Or
13615static const std::optional<ByteProvider<SDValue>>
13616calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
13617 unsigned StartingIndex = 0) {
13618 // Finding Src tree of RHS of or typically requires at least 1 additional
13619 // depth
13620 if (Depth > 6)
13621 return std::nullopt;
13622
13623 unsigned BitWidth = Op.getScalarValueSizeInBits();
13624 if (BitWidth % 8 != 0)
13625 return std::nullopt;
13626 if (Index > BitWidth / 8 - 1)
13627 return std::nullopt;
13628
13629 bool IsVec = Op.getValueType().isVector();
13630 switch (Op.getOpcode()) {
13631 case ISD::OR: {
13632 if (IsVec)
13633 return std::nullopt;
13634
13635 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
13636 StartingIndex);
13637 if (!RHS)
13638 return std::nullopt;
13639 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13640 StartingIndex);
13641 if (!LHS)
13642 return std::nullopt;
13643 // A well formed Or will have two ByteProviders for each byte, one of which
13644 // is constant zero
13645 if (!LHS->isConstantZero() && !RHS->isConstantZero())
13646 return std::nullopt;
13647 if (!LHS || LHS->isConstantZero())
13648 return RHS;
13649 if (!RHS || RHS->isConstantZero())
13650 return LHS;
13651 return std::nullopt;
13652 }
13653
13654 case ISD::AND: {
13655 if (IsVec)
13656 return std::nullopt;
13657
13658 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13659 if (!BitMaskOp)
13660 return std::nullopt;
13661
13662 uint32_t BitMask = BitMaskOp->getZExtValue();
13663 // Bits we expect for our StartingIndex
13664 uint32_t IndexMask = 0xFF << (Index * 8);
13665
13666 if ((IndexMask & BitMask) != IndexMask) {
13667 // If the result of the and partially provides the byte, then it
13668 // is not well formed
13669 if (IndexMask & BitMask)
13670 return std::nullopt;
13671 return ByteProvider<SDValue>::getConstantZero();
13672 }
13673
13674 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
13675 }
13676
13677 case ISD::FSHR: {
13678 if (IsVec)
13679 return std::nullopt;
13680
13681 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
13682 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13683 if (!ShiftOp || Op.getValueType().isVector())
13684 return std::nullopt;
13685
13686 uint64_t BitsProvided = Op.getValueSizeInBits();
13687 if (BitsProvided % 8 != 0)
13688 return std::nullopt;
13689
13690 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13691 if (BitShift % 8)
13692 return std::nullopt;
13693
13694 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13695 uint64_t ByteShift = BitShift / 8;
13696
13697 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13698 uint64_t BytesProvided = BitsProvided / 8;
13699 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13700 NewIndex %= BytesProvided;
13701 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
13702 }
13703
13704 case ISD::SRA:
13705 case ISD::SRL: {
13706 if (IsVec)
13707 return std::nullopt;
13708
13709 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13710 if (!ShiftOp)
13711 return std::nullopt;
13712
13713 uint64_t BitShift = ShiftOp->getZExtValue();
13714 if (BitShift % 8)
13715 return std::nullopt;
13716
13717 auto BitsProvided = Op.getScalarValueSizeInBits();
13718 if (BitsProvided % 8 != 0)
13719 return std::nullopt;
13720
13721 uint64_t BytesProvided = BitsProvided / 8;
13722 uint64_t ByteShift = BitShift / 8;
13723 // The dest of the shift will have good bytes [0 : (BytesProvided - ByteShift)).
13724 // If the byte we are trying to provide (as tracked by Index) falls in this
13725 // range, then the SRL provides the byte. The byte of interest of the src of
13726 // the SRL is Index + ByteShift.
13727 return BytesProvided - ByteShift > Index
13728 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
13729 Index + ByteShift)
13730 : ByteProvider<SDValue>::getConstantZero();
13731 }
13732
13733 case ISD::SHL: {
13734 if (IsVec)
13735 return std::nullopt;
13736
13737 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13738 if (!ShiftOp)
13739 return std::nullopt;
13740
13741 uint64_t BitShift = ShiftOp->getZExtValue();
13742 if (BitShift % 8 != 0)
13743 return std::nullopt;
13744 uint64_t ByteShift = BitShift / 8;
13745
13746 // If we are shifting by an amount greater than (or equal to)
13747 // the index we are trying to provide, then it provides 0s. If not,
13748 // then these bytes are not definitively 0s, and the corresponding byte
13749 // of interest is Index - ByteShift of the src.
13750 return Index < ByteShift
13751 ? ByteProvider<SDValue>::getConstantZero()
13752 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
13753 Depth + 1, StartingIndex);
13754 }
13755 case ISD::ANY_EXTEND:
13756 case ISD::SIGN_EXTEND:
13757 case ISD::ZERO_EXTEND:
13758 case ISD::SIGN_EXTEND_INREG:
13759 case ISD::AssertZext:
13760 case ISD::AssertSext: {
13761 if (IsVec)
13762 return std::nullopt;
13763
13764 SDValue NarrowOp = Op->getOperand(0);
13765 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
13766 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
13767 Op->getOpcode() == ISD::AssertZext ||
13768 Op->getOpcode() == ISD::AssertSext) {
13769 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13770 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13771 }
13772 if (NarrowBitWidth % 8 != 0)
13773 return std::nullopt;
13774 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13775
13776 if (Index >= NarrowByteWidth)
13777 return Op.getOpcode() == ISD::ZERO_EXTEND
13778 ? std::optional<ByteProvider<SDValue>>(
13779 ByteProvider<SDValue>::getConstantZero())
13780 : std::nullopt;
13781 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
13782 }
13783
13784 case ISD::TRUNCATE: {
13785 if (IsVec)
13786 return std::nullopt;
13787
13788 uint64_t NarrowByteWidth = BitWidth / 8;
13789
13790 if (NarrowByteWidth >= Index) {
13791 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13792 StartingIndex);
13793 }
13794
13795 return std::nullopt;
13796 }
13797
13798 case ISD::CopyFromReg: {
13799 if (BitWidth / 8 > Index)
13800 return calculateSrcByte(Op, StartingIndex, Index);
13801
13802 return std::nullopt;
13803 }
13804
13805 case ISD::LOAD: {
13806 auto *L = cast<LoadSDNode>(Op.getNode());
13807
13808 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13809 if (NarrowBitWidth % 8 != 0)
13810 return std::nullopt;
13811 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13812
13813 // If the width of the load does not reach the byte we are trying to provide
13814 // and it is not a ZEXTLOAD, then the load does not provide the byte in
13815 // question.
13816 if (Index >= NarrowByteWidth) {
13817 return L->getExtensionType() == ISD::ZEXTLOAD
13818 ? std::optional<ByteProvider<SDValue>>(
13819 ByteProvider<SDValue>::getConstantZero())
13820 : std::nullopt;
13821 }
13822
13823 if (NarrowByteWidth > Index) {
13824 return calculateSrcByte(Op, StartingIndex, Index);
13825 }
13826
13827 return std::nullopt;
13828 }
13829
13830 case ISD::BSWAP: {
13831 if (IsVec)
13832 return std::nullopt;
13833
13834 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
13835 Depth + 1, StartingIndex);
13836 }
13837
13838 case ISD::EXTRACT_VECTOR_ELT: {
13839 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13840 if (!IdxOp)
13841 return std::nullopt;
13842 auto VecIdx = IdxOp->getZExtValue();
13843 auto ScalarSize = Op.getScalarValueSizeInBits();
13844 if (ScalarSize < 32)
13845 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13846 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
13847 StartingIndex, Index);
13848 }
13849
13850 case AMDGPUISD::PERM: {
13851 if (IsVec)
13852 return std::nullopt;
13853
13854 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13855 if (!PermMask)
13856 return std::nullopt;
13857
13858 auto IdxMask =
13859 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13860 if (IdxMask > 0x07 && IdxMask != 0x0c)
13861 return std::nullopt;
13862
13863 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13864 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13865
13866 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
13867 : std::optional<ByteProvider<SDValue>>(
13868 ByteProvider<SDValue>::getConstantZero());
13869 }
13870
13871 default: {
13872 return std::nullopt;
13873 }
13874 }
13875
13876 llvm_unreachable("fully handled switch");
13877}
13878
13879 // Returns true if the operand is a scalar value that was extended from 16 bits
13880static bool isExtendedFrom16Bits(SDValue &Operand) {
13881
13882 switch (Operand.getOpcode()) {
13883 case ISD::ANY_EXTEND:
13884 case ISD::SIGN_EXTEND:
13885 case ISD::ZERO_EXTEND: {
13886 auto OpVT = Operand.getOperand(0).getValueType();
13887 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
13888 }
13889 case ISD::LOAD: {
13890 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
13891 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
13892 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
13893 ExtType == ISD::EXTLOAD) {
13894 auto MemVT = L->getMemoryVT();
13895 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
13896 }
13897 return L->getMemoryVT().getSizeInBits() == 16;
13898 }
13899 default:
13900 return false;
13901 }
13902}
13903
13904 // Returns true if the mask selects consecutive bytes and the first byte
13905 // begins at an even (16-bit aligned) offset from byte 0.
13906static bool addresses16Bits(int Mask) {
13907 int Low8 = Mask & 0xff;
13908 int Hi8 = (Mask & 0xff00) >> 8;
13909
13910 assert(Low8 < 8 && Hi8 < 8);
13911 // Are the bytes contiguous in the order of increasing addresses.
13912 bool IsConsecutive = (Hi8 - Low8 == 1);
13913 // Is the first byte at location that is aligned for 16 bit instructions.
13914 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
13915 // In this case, we still need code to extract the 16 bit operand, so it
13916 // is better to use i8 v_perm
13917 bool Is16Aligned = !(Low8 % 2);
13918
13919 return IsConsecutive && Is16Aligned;
13920}
13921
13922// Do not lower into v_perm if the operands are actually 16 bit
13923// and the selected bits (based on PermMask) correspond with two
13924// easily addressable 16 bit operands.
13925 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
13926 SDValue &OtherOp) {
13927 int Low16 = PermMask & 0xffff;
13928 int Hi16 = (PermMask & 0xffff0000) >> 16;
13929
13930 auto TempOp = peekThroughBitcasts(Op);
13931 auto TempOtherOp = peekThroughBitcasts(OtherOp);
13932
13933 auto OpIs16Bit =
13934 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
13935 if (!OpIs16Bit)
13936 return true;
13937
13938 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13939 isExtendedFrom16Bits(TempOtherOp);
13940 if (!OtherOpIs16Bit)
13941 return true;
13942
13943 // Do we cleanly address both
13944 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
13945}
13946
13947 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
13948 unsigned DWordOffset) {
13949 SDValue Ret;
13950
13951 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
13952 // ByteProvider must be at least 8 bits
13953 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
13954
13955 if (TypeSize <= 32)
13956 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
13957
13958 if (Src.getValueType().isVector()) {
13959 auto ScalarTySize = Src.getScalarValueSizeInBits();
13960 auto ScalarTy = Src.getValueType().getScalarType();
13961 if (ScalarTySize == 32) {
13962 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
13963 DAG.getConstant(DWordOffset, SL, MVT::i32));
13964 }
13965 if (ScalarTySize > 32) {
13966 Ret = DAG.getNode(
13967 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
13968 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
13969 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
13970 if (ShiftVal)
13971 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
13972 DAG.getConstant(ShiftVal, SL, MVT::i32));
13973 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13974 }
13975
13976 assert(ScalarTySize < 32);
13977 auto NumElements = TypeSize / ScalarTySize;
13978 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
13979 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
13980 auto NumElementsIn32 = 32 / ScalarTySize;
13981 auto NumAvailElements = DWordOffset < Trunc32Elements
13982 ? NumElementsIn32
13983 : NumElements - NormalizedTrunc;
13984
13985 SmallVector<SDValue, 4> VecSrcs;
13986 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
13987 NumAvailElements);
13988
13989 Ret = DAG.getBuildVector(
13990 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
13991 VecSrcs);
13992 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13993 }
13994
13995 /// Scalar Type
13996 auto ShiftVal = 32 * DWordOffset;
13997 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
13998 DAG.getConstant(ShiftVal, SL, MVT::i32));
13999 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14000}
14001
14002 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
14003 SelectionDAG &DAG = DCI.DAG;
14004 [[maybe_unused]] EVT VT = N->getValueType(0);
14005 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
14006
14007 // VT is known to be MVT::i32, so we need to provide 4 bytes.
14008 assert(VT == MVT::i32);
14009 for (int i = 0; i < 4; i++) {
14010 // Find the ByteProvider that provides the ith byte of the result of OR
14011 std::optional<ByteProvider<SDValue>> P =
14012 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
14013 // TODO support constantZero
14014 if (!P || P->isConstantZero())
14015 return SDValue();
14016
14017 PermNodes.push_back(*P);
14018 }
14019 if (PermNodes.size() != 4)
14020 return SDValue();
14021
14022 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
14023 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
14024 uint64_t PermMask = 0x00000000;
14025 for (size_t i = 0; i < PermNodes.size(); i++) {
14026 auto PermOp = PermNodes[i];
14027 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
14028 // by sizeof(Src2) = 4
14029 int SrcByteAdjust = 4;
14030
14031 // If the Src uses a byte from a different DWORD, then it corresponds
14032 // with a different source.
14033 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
14034 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
14035 if (SecondSrc)
14036 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
14037 ((PermOp.SrcOffset / 4) != SecondSrc->second))
14038 return SDValue();
14039
14040 // Set the index of the second distinct Src node
14041 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
14042 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
14043 SrcByteAdjust = 0;
14044 }
14045 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
14047 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
14048 }
14049 SDLoc DL(N);
14050 SDValue Op = *PermNodes[FirstSrc.first].Src;
14051 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
14052 assert(Op.getValueSizeInBits() == 32);
14053
14054 // Check that we are not just extracting the bytes in order from an op
14055 if (!SecondSrc) {
14056 int Low16 = PermMask & 0xffff;
14057 int Hi16 = (PermMask & 0xffff0000) >> 16;
14058
14059 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
14060 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
14061
14062 // The perm op would really just produce Op. So combine into Op
14063 if (WellFormedLow && WellFormedHi)
14064 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
14065 }
14066
14067 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
14068
14069 if (SecondSrc) {
14070 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
14071 assert(OtherOp.getValueSizeInBits() == 32);
14072 }
14073
14074 // Check that we haven't just recreated the same FSHR node.
14075 if (N->getOpcode() == ISD::FSHR &&
14076 (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
14077 (N->getOperand(1) == Op || N->getOperand(1) == OtherOp))
14078 return SDValue();
14079
14080 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
14081
14082 assert(Op.getValueType().isByteSized() &&
14083 OtherOp.getValueType().isByteSized());
14084
14085 // If the ultimate src is less than 32 bits, then we will only be
14086 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
14087 // CalculateByteProvider would not have returned Op as source if we
14088 // used a byte that is outside its ValueType. Thus, we are free to
14089 // ANY_EXTEND as the extended bits are don't-cares.
14090 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
14091 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
14092
14093 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
14094 DAG.getConstant(PermMask, DL, MVT::i32));
14095 }
14096 return SDValue();
14097}
14098
14099SDValue SITargetLowering::performOrCombine(SDNode *N,
14100 DAGCombinerInfo &DCI) const {
14101 SelectionDAG &DAG = DCI.DAG;
14102 SDValue LHS = N->getOperand(0);
14103 SDValue RHS = N->getOperand(1);
14104
14105 EVT VT = N->getValueType(0);
14106 if (VT == MVT::i1) {
14107 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
14108 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14109 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
14110 SDValue Src = LHS.getOperand(0);
14111 if (Src != RHS.getOperand(0))
14112 return SDValue();
14113
14114 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
14115 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
14116 if (!CLHS || !CRHS)
14117 return SDValue();
14118
14119 // Only 10 bits are used.
14120 static const uint32_t MaxMask = 0x3ff;
14121
14122 uint32_t NewMask =
14123 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
14124 SDLoc DL(N);
14125 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
14126 DAG.getConstant(NewMask, DL, MVT::i32));
14127 }
14128
14129 return SDValue();
14130 }
14131
14132 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
14133 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
14134 LHS.getOpcode() == AMDGPUISD::PERM &&
14135 isa<ConstantSDNode>(LHS.getOperand(2))) {
14136 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
14137 if (!Sel)
14138 return SDValue();
14139
14140 Sel |= LHS.getConstantOperandVal(2);
14141 SDLoc DL(N);
14142 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14143 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
14144 }
14145
14146 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
14147 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14148 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
14149 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14150
14151 // If all the uses of an or need to extract the individual elements, do not
14152 // attempt to lower into v_perm
14153 auto usesCombinedOperand = [](SDNode *OrUse) {
14154 // If we have any non-vectorized use, then it is a candidate for v_perm
14155 if (OrUse->getOpcode() != ISD::BITCAST ||
14156 !OrUse->getValueType(0).isVector())
14157 return true;
14158
14159 // If we have any non-vectorized use, then it is a candidate for v_perm
14160 for (auto *VUser : OrUse->users()) {
14161 if (!VUser->getValueType(0).isVector())
14162 return true;
14163
14164 // If the use of a vector is a store, then combining via a v_perm
14165 // is beneficial.
14166 // TODO -- whitelist more uses
14167 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
14168 if (VUser->getOpcode() == VectorwiseOp)
14169 return true;
14170 }
14171 return false;
14172 };
14173
14174 if (!any_of(N->users(), usesCombinedOperand))
14175 return SDValue();
14176
14177 uint32_t LHSMask = getPermuteMask(LHS);
14178 uint32_t RHSMask = getPermuteMask(RHS);
14179
14180 if (LHSMask != ~0u && RHSMask != ~0u) {
14181 // Canonicalize the expression in an attempt to have fewer unique masks
14182 // and therefore fewer registers used to hold the masks.
14183 if (LHSMask > RHSMask) {
14184 std::swap(LHSMask, RHSMask);
14185 std::swap(LHS, RHS);
14186 }
14187
14188 // Select 0xc for each lane used from a source operand. Zero bytes have 0xc
14189 // in the mask, 0xff bytes have 0xff, and actual lanes are in the 0-3 range.
14190 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14191 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14192
14193 // Check if we need to combine values from two sources within a byte.
14194 if (!(LHSUsedLanes & RHSUsedLanes) &&
14195 // If we select high and lower word keep it for SDWA.
14196 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
14197 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14198 // Kill zero bytes selected by other mask. Zero value is 0xc.
14199 LHSMask &= ~RHSUsedLanes;
14200 RHSMask &= ~LHSUsedLanes;
14201 // Add 4 to each active LHS lane
14202 LHSMask |= LHSUsedLanes & 0x04040404;
14203 // Combine masks
14204 uint32_t Sel = LHSMask | RHSMask;
14205 SDLoc DL(N);
14206
14207 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14208 RHS.getOperand(0),
14209 DAG.getConstant(Sel, DL, MVT::i32));
14210 }
14211 }
14212 if (LHSMask == ~0u || RHSMask == ~0u) {
14213 if (SDValue Perm = matchPERM(N, DCI))
14214 return Perm;
14215 }
14216 }
14217
14218 // Detect identity v2i32 OR and replace with identity source node.
14219 // Specifically an Or that has operands constructed from the same source node
14220 // via extract_vector_elt and build_vector. I.E.
14221 // v2i32 or(
14222 // v2i32 build_vector(
14223 // i32 extract_elt(%IdentitySrc, 0),
14224 // i32 0
14225 // ),
14226 // v2i32 build_vector(
14227 // i32 0,
14228 // i32 extract_elt(%IdentitySrc, 1)
14229 // ) )
14230 // =>
14231 // v2i32 %IdentitySrc
14232
14233 if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
14234 RHS->getOpcode() == ISD::BUILD_VECTOR) {
14235
14236 ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
14237 ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
14238
14239 // Test for and normalise build vectors.
14240 if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
14241
14242 // Get the extract_vector_element operands.
14243 SDValue LEVE = LHS->getOperand(0);
14244 SDValue REVE = RHS->getOperand(1);
14245
14246 if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
14247 REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
14248 // Check that different elements from the same vector are
14249 // extracted.
14250 if (LEVE->getOperand(0) == REVE->getOperand(0) &&
14251 LEVE->getOperand(1) != REVE->getOperand(1)) {
14252 SDValue IdentitySrc = LEVE.getOperand(0);
14253 return IdentitySrc;
14254 }
14255 }
14256 }
14257 }
14258
14259 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14260 return SDValue();
14261
14262 // TODO: This could be a generic combine with a predicate for extracting the
14263 // high half of an integer being free.
14264
14265 // (or i64:x, (zero_extend i32:y)) ->
14266 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
14267 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
14268 RHS.getOpcode() != ISD::ZERO_EXTEND)
14269 std::swap(LHS, RHS);
14270
14271 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
14272 SDValue ExtSrc = RHS.getOperand(0);
14273 EVT SrcVT = ExtSrc.getValueType();
14274 if (SrcVT == MVT::i32) {
14275 SDLoc SL(N);
14276 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
14277 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
14278
14279 DCI.AddToWorklist(LowOr.getNode());
14280 DCI.AddToWorklist(HiBits.getNode());
14281
14282 SDValue Vec =
14283 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
14284 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
14285 }
14286 }
14287
14288 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
14289 if (CRHS) {
14290 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
14291 N->getOperand(0), CRHS))
14292 return Split;
14293 }
14294
14295 return SDValue();
14296}
14297
14298SDValue SITargetLowering::performXorCombine(SDNode *N,
14299 DAGCombinerInfo &DCI) const {
14300 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
14301 return RV;
14302
14303 SDValue LHS = N->getOperand(0);
14304 SDValue RHS = N->getOperand(1);
14305
14306 const ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
14307 SelectionDAG &DAG = DCI.DAG;
14308
14309 EVT VT = N->getValueType(0);
14310 if (CRHS && VT == MVT::i64) {
14311 if (SDValue Split =
14312 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
14313 return Split;
14314 }
14315
14316 // v2i32 (xor (vselect cc, x, y), K) ->
14317 // (v2i32 vselect cc, (xor x, K), (xor y, K)). This enables the xor to be
14318 // replaced with source modifiers when the select is lowered to CNDMASK.
14319 unsigned Opc = LHS.getOpcode();
14320 if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
14321 (Opc == ISD::SELECT && VT == MVT::i64)) &&
14322 CRHS && CRHS->getAPIntValue().isSignMask()) {
14323 SDValue CC = LHS->getOperand(0);
14324 SDValue TRUE = LHS->getOperand(1);
14325 SDValue FALSE = LHS->getOperand(2);
14326 SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS);
14327 SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS);
14328 SDValue XSelect =
14329 DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
14330 return XSelect;
14331 }
14332
14333 // Make sure to apply the 64-bit constant splitting fold before trying to fold
14334 // fneg-like xors into 64-bit select.
14335 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
14336 // This looks like an fneg, try to fold as a source modifier.
14337 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
14338 shouldFoldFNegIntoSrc(N, LHS)) {
14339 // xor (select c, a, b), 0x80000000 ->
14340 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
14341 SDLoc DL(N);
14342 SDValue CastLHS =
14343 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
14344 SDValue CastRHS =
14345 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
14346 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
14347 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
14348 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
14349 LHS->getOperand(0), FNegLHS, FNegRHS);
14350 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
14351 }
14352 }
14353
14354 return SDValue();
14355}
14356
14357SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
14358 DAGCombinerInfo &DCI) const {
14359 if (!Subtarget->has16BitInsts() ||
14360 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14361 return SDValue();
14362
14363 EVT VT = N->getValueType(0);
14364 if (VT != MVT::i32)
14365 return SDValue();
14366
14367 SDValue Src = N->getOperand(0);
14368 if (Src.getValueType() != MVT::i16)
14369 return SDValue();
14370
14371 return SDValue();
14372}
14373
14374SDValue
14375SITargetLowering::performSignExtendInRegCombine(SDNode *N,
14376 DAGCombinerInfo &DCI) const {
14377 SDValue Src = N->getOperand(0);
14378 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
14379
14380 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
14381 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
14382 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
14383 VTSign->getVT() == MVT::i8) ||
14384 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
14385 VTSign->getVT() == MVT::i16))) {
14386 assert(Subtarget->hasScalarSubwordLoads() &&
14387 "s_buffer_load_{u8, i8} are supported "
14388 "in GFX12 (or newer) architectures.");
14389 EVT VT = Src.getValueType();
14390 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
14391 ? AMDGPUISD::SBUFFER_LOAD_BYTE
14392 : AMDGPUISD::SBUFFER_LOAD_SHORT;
14393 SDLoc DL(N);
14394 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
14395 SDValue Ops[] = {
14396 Src.getOperand(0), // source register
14397 Src.getOperand(1), // offset
14398 Src.getOperand(2) // cachePolicy
14399 };
14400 auto *M = cast<MemSDNode>(Src);
14401 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
14402 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14403 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
14404 return LoadVal;
14405 }
14406 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
14407 VTSign->getVT() == MVT::i8) ||
14408 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
14409 VTSign->getVT() == MVT::i16)) &&
14410 Src.hasOneUse()) {
14411 auto *M = cast<MemSDNode>(Src);
14412 SDValue Ops[] = {Src.getOperand(0), // Chain
14413 Src.getOperand(1), // rsrc
14414 Src.getOperand(2), // vindex
14415 Src.getOperand(3), // voffset
14416 Src.getOperand(4), // soffset
14417 Src.getOperand(5), // offset
14418 Src.getOperand(6), Src.getOperand(7)};
14419 // replace with BUFFER_LOAD_BYTE/SHORT
14420 SDVTList ResList =
14421 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
14422 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
14423 ? AMDGPUISD::BUFFER_LOAD_BYTE
14424 : AMDGPUISD::BUFFER_LOAD_SHORT;
14425 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
14426 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14427 return DCI.DAG.getMergeValues(
14428 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
14429 }
14430 return SDValue();
14431}
14432
14433SDValue SITargetLowering::performClassCombine(SDNode *N,
14434 DAGCombinerInfo &DCI) const {
14435 SelectionDAG &DAG = DCI.DAG;
14436 SDValue Mask = N->getOperand(1);
14437
14438 // fp_class x, 0 -> false
14439 if (isNullConstant(Mask))
14440 return DAG.getConstant(0, SDLoc(N), MVT::i1);
14441
14442 if (N->getOperand(0).isUndef())
14443 return DAG.getUNDEF(MVT::i1);
14444
14445 return SDValue();
14446}
14447
14448SDValue SITargetLowering::performRcpCombine(SDNode *N,
14449 DAGCombinerInfo &DCI) const {
14450 EVT VT = N->getValueType(0);
14451 SDValue N0 = N->getOperand(0);
14452
14453 if (N0.isUndef()) {
14454 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
14455 SDLoc(N), VT);
14456 }
14457
14458 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
14459 N0.getOpcode() == ISD::SINT_TO_FP)) {
14460 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
14461 N->getFlags());
14462 }
14463
14464 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
14465 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
14466 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
14467 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
14468 N->getFlags());
14469 }
14470
14471 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
14472}
14473
14474 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
14475 unsigned MaxDepth) const {
14476 unsigned Opcode = Op.getOpcode();
14477 if (Opcode == ISD::FCANONICALIZE)
14478 return true;
14479
14480 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14481 const auto &F = CFP->getValueAPF();
14482 if (F.isNaN() && F.isSignaling())
14483 return false;
14484 if (!F.isDenormal())
14485 return true;
14486
14487 DenormalMode Mode =
14488 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
14489 return Mode == DenormalMode::getIEEE();
14490 }
14491
14492 // If source is a result of another standard FP operation it is already in
14493 // canonical form.
14494 if (MaxDepth == 0)
14495 return false;
14496
14497 switch (Opcode) {
14498 // These will flush denorms if required.
14499 case ISD::FADD:
14500 case ISD::FSUB:
14501 case ISD::FMUL:
14502 case ISD::FCEIL:
14503 case ISD::FFLOOR:
14504 case ISD::FMA:
14505 case ISD::FMAD:
14506 case ISD::FSQRT:
14507 case ISD::FDIV:
14508 case ISD::FREM:
14509 case ISD::FP_ROUND:
14510 case ISD::FP_EXTEND:
14511 case ISD::FP16_TO_FP:
14512 case ISD::FP_TO_FP16:
14513 case ISD::BF16_TO_FP:
14514 case ISD::FP_TO_BF16:
14515 case ISD::FLDEXP:
14516 case AMDGPUISD::FMUL_LEGACY:
14517 case AMDGPUISD::FMAD_FTZ:
14518 case AMDGPUISD::RCP:
14519 case AMDGPUISD::RSQ:
14520 case AMDGPUISD::RSQ_CLAMP:
14521 case AMDGPUISD::RCP_LEGACY:
14522 case AMDGPUISD::RCP_IFLAG:
14523 case AMDGPUISD::LOG:
14524 case AMDGPUISD::EXP:
14525 case AMDGPUISD::DIV_SCALE:
14526 case AMDGPUISD::DIV_FMAS:
14527 case AMDGPUISD::DIV_FIXUP:
14528 case AMDGPUISD::FRACT:
14529 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14530 case AMDGPUISD::CVT_F32_UBYTE0:
14531 case AMDGPUISD::CVT_F32_UBYTE1:
14532 case AMDGPUISD::CVT_F32_UBYTE2:
14533 case AMDGPUISD::CVT_F32_UBYTE3:
14534 case AMDGPUISD::FP_TO_FP16:
14535 case AMDGPUISD::SIN_HW:
14536 case AMDGPUISD::COS_HW:
14537 return true;
14538
14539 // It can/will be lowered or combined as a bit operation.
14540 // Need to check their input recursively to handle.
14541 case ISD::FNEG:
14542 case ISD::FABS:
14543 case ISD::FCOPYSIGN:
14544 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14545
14546 case ISD::AND:
14547 if (Op.getValueType() == MVT::i32) {
14548 // Be careful as we only know it is a bitcast floating point type. It
14549 // could be f32, v2f16, we have no way of knowing. Luckily the constant
14550 // value that we optimize for, which comes up in fp32 to bf16 conversions,
14551 // is valid to optimize for all types.
14552 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
14553 if (RHS->getZExtValue() == 0xffff0000) {
14554 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14555 }
14556 }
14557 }
14558 break;
14559
14560 case ISD::FSIN:
14561 case ISD::FCOS:
14562 case ISD::FSINCOS:
14563 return Op.getValueType().getScalarType() != MVT::f16;
14564
14565 case ISD::FMINNUM:
14566 case ISD::FMAXNUM:
14567 case ISD::FMINNUM_IEEE:
14568 case ISD::FMAXNUM_IEEE:
14569 case ISD::FMINIMUM:
14570 case ISD::FMAXIMUM:
14571 case ISD::FMINIMUMNUM:
14572 case ISD::FMAXIMUMNUM:
14573 case AMDGPUISD::CLAMP:
14574 case AMDGPUISD::FMED3:
14575 case AMDGPUISD::FMAX3:
14576 case AMDGPUISD::FMIN3:
14577 case AMDGPUISD::FMAXIMUM3:
14578 case AMDGPUISD::FMINIMUM3: {
14579 // FIXME: Shouldn't treat the generic operations differently based on these.
14580 // However, we aren't really required to flush the result from
14581 // minnum/maxnum.
14582
14583 // snans will be quieted, so we only need to worry about denormals.
14584 if (Subtarget->supportsMinMaxDenormModes() ||
14585 // FIXME: denormalsEnabledForType is broken for dynamic
14586 denormalsEnabledForType(DAG, Op.getValueType()))
14587 return true;
14588
14589 // Flushing may be required.
14590 // On pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
14591 // targets we need to check their inputs recursively.
14592
14593 // FIXME: Does this apply with clamp? It's implemented with max.
14594 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
14595 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
14596 return false;
14597 }
14598
14599 return true;
14600 }
14601 case ISD::SELECT: {
14602 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
14603 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
14604 }
14605 case ISD::BUILD_VECTOR: {
14606 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
14607 SDValue SrcOp = Op.getOperand(i);
14608 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
14609 return false;
14610 }
14611
14612 return true;
14613 }
14614 case ISD::EXTRACT_VECTOR_ELT:
14615 case ISD::EXTRACT_SUBVECTOR: {
14616 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14617 }
14618 case ISD::INSERT_VECTOR_ELT: {
14619 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
14620 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
14621 }
14622 case ISD::UNDEF:
14623 // Could be anything.
14624 return false;
14625
14626 case ISD::BITCAST:
14627 // TODO: This is incorrect as it loses track of the operand's type. We may
14628 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
14629 // same bits that are canonicalized in one type need not be in the other.
14630 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14631 case ISD::TRUNCATE: {
14632 // Hack round the mess we make when legalizing extract_vector_elt
14633 if (Op.getValueType() == MVT::i16) {
14634 SDValue TruncSrc = Op.getOperand(0);
14635 if (TruncSrc.getValueType() == MVT::i32 &&
14636 TruncSrc.getOpcode() == ISD::BITCAST &&
14637 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
14638 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
14639 }
14640 }
14641 return false;
14642 }
14643 case ISD::INTRINSIC_WO_CHAIN: {
14644 unsigned IntrinsicID = Op.getConstantOperandVal(0);
14645 // TODO: Handle more intrinsics
14646 switch (IntrinsicID) {
14647 case Intrinsic::amdgcn_cvt_pkrtz:
14648 case Intrinsic::amdgcn_cubeid:
14649 case Intrinsic::amdgcn_frexp_mant:
14650 case Intrinsic::amdgcn_fdot2:
14651 case Intrinsic::amdgcn_rcp:
14652 case Intrinsic::amdgcn_rsq:
14653 case Intrinsic::amdgcn_rsq_clamp:
14654 case Intrinsic::amdgcn_rcp_legacy:
14655 case Intrinsic::amdgcn_rsq_legacy:
14656 case Intrinsic::amdgcn_trig_preop:
14657 case Intrinsic::amdgcn_tanh:
14658 case Intrinsic::amdgcn_log:
14659 case Intrinsic::amdgcn_exp2:
14660 case Intrinsic::amdgcn_sqrt:
14661 return true;
14662 default:
14663 break;
14664 }
14665
14666 break;
14667 }
14668 default:
14669 break;
14670 }
14671
14672 // FIXME: denormalsEnabledForType is broken for dynamic
14673 return denormalsEnabledForType(DAG, Op.getValueType()) &&
14674 DAG.isKnownNeverSNaN(Op);
14675}
14676
14677 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
14678 unsigned MaxDepth) const {
14679 const MachineRegisterInfo &MRI = MF.getRegInfo();
14680 MachineInstr *MI = MRI.getVRegDef(Reg);
14681 unsigned Opcode = MI->getOpcode();
14682
14683 if (Opcode == AMDGPU::G_FCANONICALIZE)
14684 return true;
14685
14686 std::optional<FPValueAndVReg> FCR;
14687 // Constant splat (can be padded with undef) or scalar constant.
14688 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
14689 if (FCR->Value.isSignaling())
14690 return false;
14691 if (!FCR->Value.isDenormal())
14692 return true;
14693
14694 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
14695 return Mode == DenormalMode::getIEEE();
14696 }
14697
14698 if (MaxDepth == 0)
14699 return false;
14700
14701 switch (Opcode) {
14702 case AMDGPU::G_FADD:
14703 case AMDGPU::G_FSUB:
14704 case AMDGPU::G_FMUL:
14705 case AMDGPU::G_FCEIL:
14706 case AMDGPU::G_FFLOOR:
14707 case AMDGPU::G_FRINT:
14708 case AMDGPU::G_FNEARBYINT:
14709 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14710 case AMDGPU::G_INTRINSIC_TRUNC:
14711 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14712 case AMDGPU::G_FMA:
14713 case AMDGPU::G_FMAD:
14714 case AMDGPU::G_FSQRT:
14715 case AMDGPU::G_FDIV:
14716 case AMDGPU::G_FREM:
14717 case AMDGPU::G_FPOW:
14718 case AMDGPU::G_FPEXT:
14719 case AMDGPU::G_FLOG:
14720 case AMDGPU::G_FLOG2:
14721 case AMDGPU::G_FLOG10:
14722 case AMDGPU::G_FPTRUNC:
14723 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14724 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14725 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14726 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14727 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14728 return true;
14729 case AMDGPU::G_FNEG:
14730 case AMDGPU::G_FABS:
14731 case AMDGPU::G_FCOPYSIGN:
14732 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
14733 case AMDGPU::G_FMINNUM:
14734 case AMDGPU::G_FMAXNUM:
14735 case AMDGPU::G_FMINNUM_IEEE:
14736 case AMDGPU::G_FMAXNUM_IEEE:
14737 case AMDGPU::G_FMINIMUM:
14738 case AMDGPU::G_FMAXIMUM:
14739 case AMDGPU::G_FMINIMUMNUM:
14740 case AMDGPU::G_FMAXIMUMNUM: {
14741 if (Subtarget->supportsMinMaxDenormModes() ||
14742 // FIXME: denormalsEnabledForType is broken for dynamic
14743 denormalsEnabledForType(MRI.getType(Reg), MF))
14744 return true;
14745
14746 [[fallthrough]];
14747 }
14748 case AMDGPU::G_BUILD_VECTOR:
14749 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
14750 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
14751 return false;
14752 return true;
14753 case AMDGPU::G_INTRINSIC:
14754 case AMDGPU::G_INTRINSIC_CONVERGENT:
14755 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
14756 case Intrinsic::amdgcn_fmul_legacy:
14757 case Intrinsic::amdgcn_fmad_ftz:
14758 case Intrinsic::amdgcn_sqrt:
14759 case Intrinsic::amdgcn_fmed3:
14760 case Intrinsic::amdgcn_sin:
14761 case Intrinsic::amdgcn_cos:
14762 case Intrinsic::amdgcn_log:
14763 case Intrinsic::amdgcn_exp2:
14764 case Intrinsic::amdgcn_log_clamp:
14765 case Intrinsic::amdgcn_rcp:
14766 case Intrinsic::amdgcn_rcp_legacy:
14767 case Intrinsic::amdgcn_rsq:
14768 case Intrinsic::amdgcn_rsq_clamp:
14769 case Intrinsic::amdgcn_rsq_legacy:
14770 case Intrinsic::amdgcn_div_scale:
14771 case Intrinsic::amdgcn_div_fmas:
14772 case Intrinsic::amdgcn_div_fixup:
14773 case Intrinsic::amdgcn_fract:
14774 case Intrinsic::amdgcn_cvt_pkrtz:
14775 case Intrinsic::amdgcn_cubeid:
14776 case Intrinsic::amdgcn_cubema:
14777 case Intrinsic::amdgcn_cubesc:
14778 case Intrinsic::amdgcn_cubetc:
14779 case Intrinsic::amdgcn_frexp_mant:
14780 case Intrinsic::amdgcn_fdot2:
14781 case Intrinsic::amdgcn_trig_preop:
14782 case Intrinsic::amdgcn_tanh:
14783 return true;
14784 default:
14785 break;
14786 }
14787
14788 [[fallthrough]];
14789 default:
14790 return false;
14791 }
14792
14793 llvm_unreachable("invalid operation");
14794}
14795
14796// Constant fold canonicalize.
14797SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
14798 const SDLoc &SL, EVT VT,
14799 const APFloat &C) const {
14800 // Flush denormals to 0 if not enabled.
14801 if (C.isDenormal()) {
14802 DenormalMode Mode =
14803 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
14804 if (Mode == DenormalMode::getPreserveSign()) {
14805 return DAG.getConstantFP(
14806 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
14807 }
14808
14809 if (Mode != DenormalMode::getIEEE())
14810 return SDValue();
14811 }
14812
14813 if (C.isNaN()) {
14814 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
14815 if (C.isSignaling()) {
14816 // Quiet a signaling NaN.
14817 // FIXME: Is this supposed to preserve payload bits?
14818 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14819 }
14820
14821 // Make sure it is the canonical NaN bitpattern.
14822 //
14823 // TODO: Can we use -1 as the canonical NaN value since it's an inline
14824 // immediate?
14825 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
14826 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14827 }
14828
14829 // Already canonical.
14830 return DAG.getConstantFP(C, SL, VT);
14831}
14832
14833 static bool vectorEltWillFoldAway(SDValue Op) {
14834 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
14835}
14836
14837SDValue
14838SITargetLowering::performFCanonicalizeCombine(SDNode *N,
14839 DAGCombinerInfo &DCI) const {
14840 SelectionDAG &DAG = DCI.DAG;
14841 SDValue N0 = N->getOperand(0);
14842 EVT VT = N->getValueType(0);
14843
14844 // fcanonicalize undef -> qnan
14845 if (N0.isUndef()) {
14846 APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics());
14847 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
14848 }
14849
14850 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
14851 EVT VT = N->getValueType(0);
14852 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
14853 }
14854
14855 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
14856 // (fcanonicalize k)
14857 //
14858 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
14859
14860 // TODO: This could be better with wider vectors that will be split to v2f16,
14861 // and to consider uses since there aren't that many packed operations.
14862 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
14863 isTypeLegal(MVT::v2f16)) {
14864 SDLoc SL(N);
14865 SDValue NewElts[2];
14866 SDValue Lo = N0.getOperand(0);
14867 SDValue Hi = N0.getOperand(1);
14868 EVT EltVT = Lo.getValueType();
14869
14871 for (unsigned I = 0; I != 2; ++I) {
14872 SDValue Op = N0.getOperand(I);
14873 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14874 NewElts[I] =
14875 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14876 } else if (Op.isUndef()) {
14877 // Handled below based on what the other operand is.
14878 NewElts[I] = Op;
14879 } else {
14880 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
14881 }
14882 }
14883
14884 // If one half is undef, and one is constant, prefer a splat vector rather
14885 // than the normal qNaN. If it's a register, prefer 0.0 since that's
14886 // cheaper to use and may be free with a packed operation.
14887 if (NewElts[0].isUndef()) {
14888 if (isa<ConstantFPSDNode>(NewElts[1]))
14889 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
14890 ? NewElts[1]
14891 : DAG.getConstantFP(0.0f, SL, EltVT);
14892 }
14893
14894 if (NewElts[1].isUndef()) {
14895 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
14896 ? NewElts[0]
14897 : DAG.getConstantFP(0.0f, SL, EltVT);
14898 }
14899
14900 return DAG.getBuildVector(VT, SL, NewElts);
14901 }
14902 }
14903
14904 return SDValue();
14905}
14906
14907static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
14908 switch (Opc) {
14909 case ISD::FMAXNUM:
14910 case ISD::FMAXNUM_IEEE:
14911 case ISD::FMAXIMUMNUM:
14912 return AMDGPUISD::FMAX3;
14913 case ISD::FMAXIMUM:
14914 return AMDGPUISD::FMAXIMUM3;
14915 case ISD::SMAX:
14916 return AMDGPUISD::SMAX3;
14917 case ISD::UMAX:
14918 return AMDGPUISD::UMAX3;
14919 case ISD::FMINNUM:
14920 case ISD::FMINNUM_IEEE:
14921 case ISD::FMINIMUMNUM:
14922 return AMDGPUISD::FMIN3;
14923 case ISD::FMINIMUM:
14924 return AMDGPUISD::FMINIMUM3;
14925 case ISD::SMIN:
14926 return AMDGPUISD::SMIN3;
14927 case ISD::UMIN:
14928 return AMDGPUISD::UMIN3;
14929 default:
14930 llvm_unreachable("Not a min/max opcode");
14931 }
14932}
14933
14934SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
14935 const SDLoc &SL, SDValue Src,
14936 SDValue MinVal,
14937 SDValue MaxVal,
14938 bool Signed) const {
14939
14940 // med3 comes from
14941 // min(max(x, K0), K1), K0 < K1
14942 // max(min(x, K0), K1), K1 < K0
14943 //
14944 // "MinVal" and "MaxVal" respectively refer to the rhs of the
14945 // min/max op.
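  // Worked example: smin(smax(x, 2), 7) has K0 = 2 < K1 = 7 and clamps x to
  // [2, 7], which is exactly the median of {x, 2, 7}, i.e. SMED3(x, 2, 7).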
14946 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
14947 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
14948
14949 if (!MinK || !MaxK)
14950 return SDValue();
14951
14952 if (Signed) {
14953 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
14954 return SDValue();
14955 } else {
14956 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
14957 return SDValue();
14958 }
14959
14960 EVT VT = MinK->getValueType(0);
14961 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
14962 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
14963 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
14964
14965 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
14966 // not available, but this is unlikely to be profitable as constants
14967 // will often need to be materialized & extended, especially on
14968 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
14969 return SDValue();
14970}
14971
14972 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
14973 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
14974 return C;
14975
14976 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
14977 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
14978 return C;
14979 }
14980
14981 return nullptr;
14982}
14983
14984SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
14985 const SDLoc &SL, SDValue Op0,
14986 SDValue Op1) const {
14987 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
14988 if (!K1)
14989 return SDValue();
14990
14991 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
14992 if (!K0)
14993 return SDValue();
14994
14995 // Ordered >= (although NaN inputs should have folded away by now).
14996 if (K0->getValueAPF() > K1->getValueAPF())
14997 return SDValue();
14998
14999 // med3 with a nan input acts like
15000 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
15001 //
15002 // So with a signaling nan input, the result depends on whether the IEEE
15003 // mode bit is enabled or not.
15004 // ieee=1
15005 // s0 snan: yields s2
15006 // s1 snan: yields s2
15007 // s2 snan: qnan
15008
15009 // s0 qnan: min(s1, s2)
15010 // s1 qnan: min(s0, s2)
15011 // s2 qnan: min(s0, s1)
15012
15013 // ieee=0
15014 // s0 snan: min(s1, s2)
15015 // s1 snan: min(s0, s2)
15016 // s2 snan: qnan
15017
15018 // s0 qnan: min(s1, s2)
15019 // s1 qnan: min(s0, s2)
15020 // s2 qnan: min(s0, s1)
15021 const MachineFunction &MF = DAG.getMachineFunction();
15022 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15023
15024 // TODO: Check IEEE bit enabled. We can form fmed3 with IEEE=0 regardless of
15025 // whether the input is a signaling nan if op0 is fmaximum or fmaximumnum.
15026 // If op0 is fmaxnum_ieee, we can only form fmed3 when IEEE=1.
15027 EVT VT = Op0.getValueType();
15028 if (Info->getMode().DX10Clamp) {
15029 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
15030 // hardware fmed3 behavior converting to a min.
15031 // FIXME: Should this be allowing -0.0?
15032 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
15033 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
15034 }
15035
15036 // med3 for f16 is only available on gfx9+, and not available for v2f16.
15037 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
15038 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
15039 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
15040 // then give the other result, which is different from med3 with a NaN
15041 // input.
15042 SDValue Var = Op0.getOperand(0);
15043 if (!DAG.isKnownNeverSNaN(Var))
15044 return SDValue();
15045
15046 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15047
15048 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
15049 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
15050 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
15051 SDValue(K0, 0), SDValue(K1, 0));
15052 }
15053 }
15054
15055 return SDValue();
15056}
15057
15058/// \return true if the subtarget supports minimum3 and maximum3 with the given
15059/// base min/max opcode \p Opc for type \p VT.
15060static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
15061 EVT VT) {
15062 switch (Opc) {
15063 case ISD::FMINNUM:
15064 case ISD::FMAXNUM:
15065 case ISD::FMINNUM_IEEE:
15066 case ISD::FMAXNUM_IEEE:
15067 case ISD::FMINIMUMNUM:
15068 case ISD::FMAXIMUMNUM:
15069 case AMDGPUISD::FMIN_LEGACY:
15070 case AMDGPUISD::FMAX_LEGACY:
15071 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
15072 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
15073 case ISD::FMINIMUM:
15074 case ISD::FMAXIMUM:
15075 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
15076 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
15077 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
15078 case ISD::SMAX:
15079 case ISD::SMIN:
15080 case ISD::UMAX:
15081 case ISD::UMIN:
15082 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
15083 default:
15084 return false;
15085 }
15086
15087 llvm_unreachable("not a min/max opcode");
15088}
15089
15090SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
15091 DAGCombinerInfo &DCI) const {
15092 SelectionDAG &DAG = DCI.DAG;
15093
15094 EVT VT = N->getValueType(0);
15095 unsigned Opc = N->getOpcode();
15096 SDValue Op0 = N->getOperand(0);
15097 SDValue Op1 = N->getOperand(1);
15098
15099 // Only do this if the inner op has one use since this will just increase
15100 // register pressure for no benefit.
15101
15102 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
15103 // max(max(a, b), c) -> max3(a, b, c)
15104 // min(min(a, b), c) -> min3(a, b, c)
15105 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
15106 SDLoc DL(N);
15107 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
15108 Op0.getOperand(0), Op0.getOperand(1), Op1);
15109 }
15110
15111 // Try commuted.
15112 // max(a, max(b, c)) -> max3(a, b, c)
15113 // min(a, min(b, c)) -> min3(a, b, c)
15114 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
15115 SDLoc DL(N);
15116 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
15117 Op0, Op1.getOperand(0), Op1.getOperand(1));
15118 }
15119 }
15120
15121 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
15122 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
15123 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
15124 if (SDValue Med3 = performIntMed3ImmCombine(
15125 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
15126 return Med3;
15127 }
15128 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
15129 if (SDValue Med3 = performIntMed3ImmCombine(
15130 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
15131 return Med3;
15132 }
15133
15134 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
15135 if (SDValue Med3 = performIntMed3ImmCombine(
15136 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
15137 return Med3;
15138 }
15139 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
15140 if (SDValue Med3 = performIntMed3ImmCombine(
15141 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
15142 return Med3;
15143 }
15144
15145 // if !is_snan(x):
15146 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15147 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15148 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15149 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15150 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
15151 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
15152 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
15153 (Opc == AMDGPUISD::FMIN_LEGACY &&
15154 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
15155 (VT == MVT::f32 || VT == MVT::f64 ||
15156 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
15157 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
15158 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
15159 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
15160 Op0.hasOneUse()) {
15161 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
15162 return Res;
15163 }
15164
15165 // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
15166 // for some types, but at a higher cost since it's implemented with a 3
15167 // operand form.
15168 const SDNodeFlags Flags = N->getFlags();
15169 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
15170 !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
15171 unsigned NewOpc =
15172 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
15173 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
15174 }
15175
15176 return SDValue();
15177}
15178
15179 static bool isClampZeroToOne(SDValue A, SDValue B) {
15180 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
15181 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
15182 // FIXME: Should this be allowing -0.0?
15183 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
15184 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
15185 }
15186 }
15187
15188 return false;
15189}
15190
15191// FIXME: Should only worry about snans for version with chain.
15192SDValue SITargetLowering::performFMed3Combine(SDNode *N,
15193 DAGCombinerInfo &DCI) const {
15194 EVT VT = N->getValueType(0);
15195 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
15196 // NaNs. With a NaN input, the order of the operands may change the result.
15197
15198 SelectionDAG &DAG = DCI.DAG;
15199 SDLoc SL(N);
15200
15201 SDValue Src0 = N->getOperand(0);
15202 SDValue Src1 = N->getOperand(1);
15203 SDValue Src2 = N->getOperand(2);
15204
15205 if (isClampZeroToOne(Src0, Src1)) {
15206 // const_a, const_b, x -> clamp is safe in all cases including signaling
15207 // nans.
15208 // FIXME: Should this be allowing -0.0?
15209 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
15210 }
15211
15212 const MachineFunction &MF = DAG.getMachineFunction();
15213 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15214
15215 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
15216 // handling no dx10-clamp?
15217 if (Info->getMode().DX10Clamp) {
15218 // If NaNs are clamped to 0, we are free to reorder the inputs.
15219
15220 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15221 std::swap(Src0, Src1);
15222
15223 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
15224 std::swap(Src1, Src2);
15225
15226 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15227 std::swap(Src0, Src1);
15228
15229 if (isClampZeroToOne(Src1, Src2))
15230 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
15231 }
15232
15233 return SDValue();
15234}
15235
15236SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
15237 DAGCombinerInfo &DCI) const {
15238 SDValue Src0 = N->getOperand(0);
15239 SDValue Src1 = N->getOperand(1);
15240 if (Src0.isUndef() && Src1.isUndef())
15241 return DCI.DAG.getUNDEF(N->getValueType(0));
15242 return SDValue();
15243}
15244
15245// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
15246// expanded into a set of cmp/select instructions.
15247 bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
15248 unsigned NumElem,
15249 bool IsDivergentIdx,
15250 const GCNSubtarget *Subtarget) {
15251 if (UseDivergentRegisterIndexing)
15252 return false;
15253
15254 unsigned VecSize = EltSize * NumElem;
15255
15256 // Sub-dword vectors of size 2 dword or less have better implementation.
15257 if (VecSize <= 64 && EltSize < 32)
15258 return false;
15259
15260 // Always expand the rest of sub-dword instructions, otherwise it will be
15261 // lowered via memory.
15262 if (EltSize < 32)
15263 return true;
15264
15265 // Always do this if var-idx is divergent, otherwise it will become a loop.
15266 if (IsDivergentIdx)
15267 return true;
15268
15269 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
15270 unsigned NumInsts = NumElem /* Number of compares */ +
15271 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
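  // E.g. a v8i32 extract costs 8 compares + 8 cndmasks (NumInsts = 16), while
  // a v8i64 extract costs 8 compares + 16 cndmasks (NumInsts = 24).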
15272
15273 // On some architectures (GFX9) movrel is not available and it's better
15274 // to expand.
15275 if (Subtarget->useVGPRIndexMode())
15276 return NumInsts <= 16;
15277
15278 // If movrel is available, use it instead of expanding for vector of 8
15279 // elements.
15280 if (Subtarget->hasMovrel())
15281 return NumInsts <= 15;
15282
15283 return true;
15284}
15285
15286 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
15287 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
15288 if (isa<ConstantSDNode>(Idx))
15289 return false;
15290
15291 SDValue Vec = N->getOperand(0);
15292 EVT VecVT = Vec.getValueType();
15293 EVT EltVT = VecVT.getVectorElementType();
15294 unsigned EltSize = EltVT.getSizeInBits();
15295 unsigned NumElem = VecVT.getVectorNumElements();
15296
15297 return SITargetLowering::shouldExpandVectorDynExt(
15298 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
15299}
15300
15301SDValue
15302SITargetLowering::performExtractVectorEltCombine(SDNode *N,
15303 DAGCombinerInfo &DCI) const {
15304 SDValue Vec = N->getOperand(0);
15305 SelectionDAG &DAG = DCI.DAG;
15306
15307 EVT VecVT = Vec.getValueType();
15308 EVT VecEltVT = VecVT.getVectorElementType();
15309 EVT ResVT = N->getValueType(0);
15310
15311 unsigned VecSize = VecVT.getSizeInBits();
15312 unsigned VecEltSize = VecEltVT.getSizeInBits();
15313
15314 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
15315 allUsesHaveSourceMods(N)) {
15316 SDLoc SL(N);
15317 SDValue Idx = N->getOperand(1);
15318 SDValue Elt =
15319 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
15320 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
15321 }
15322
15323 // (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
15324 // -> (and (extract_vector_element {y0, y1}, index), 0x1f)
15325 // There are optimisations to transform 64-bit shifts into 32-bit shifts
15326 // depending on the shift operand. See e.g. performSraCombine().
15327 // This combine ensures that the optimisation is compatible with v2i32
15328 // legalised AND.
15329 if (VecVT == MVT::v2i32 && Vec->getOpcode() == ISD::AND &&
15330 Vec->getOperand(1)->getOpcode() == ISD::BUILD_VECTOR) {
15331
15332 auto *C = dyn_cast<ConstantSDNode>(Vec->getOperand(1)->getOperand(0));
15333 if (!C || C->getZExtValue() != 0x1f)
15334 return SDValue();
15335
15336 SDLoc SL(N);
15337 SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
15338 SDValue EVE = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
15339 Vec->getOperand(0), N->getOperand(1));
15340 SDValue A = DAG.getNode(ISD::AND, SL, MVT::i32, EVE, AndMask);
15341 DAG.ReplaceAllUsesWith(N, A.getNode());
15342 }
15343
15344 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
15345 // =>
15346 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
15347 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
15348 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
15349 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15350 SDLoc SL(N);
15351 SDValue Idx = N->getOperand(1);
15352 unsigned Opc = Vec.getOpcode();
15353
15354 switch (Opc) {
15355 default:
15356 break;
15357 // TODO: Support other binary operations.
15358 case ISD::FADD:
15359 case ISD::FSUB:
15360 case ISD::FMUL:
15361 case ISD::ADD:
15362 case ISD::UMIN:
15363 case ISD::UMAX:
15364 case ISD::SMIN:
15365 case ISD::SMAX:
15366 case ISD::FMAXNUM:
15367 case ISD::FMINNUM:
15368 case ISD::FMAXNUM_IEEE:
15369 case ISD::FMINNUM_IEEE:
15370 case ISD::FMAXIMUM:
15371 case ISD::FMINIMUM: {
15372 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15373 Vec.getOperand(0), Idx);
15374 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15375 Vec.getOperand(1), Idx);
15376
15377 DCI.AddToWorklist(Elt0.getNode());
15378 DCI.AddToWorklist(Elt1.getNode());
15379 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
15380 }
15381 }
15382 }
15383
15384 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
15385 if (shouldExpandVectorDynExt(N)) {
15386 SDLoc SL(N);
15387 SDValue Idx = N->getOperand(1);
15388 SDValue V;
15389 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15390 SDValue IC = DAG.getVectorIdxConstant(I, SL);
15391 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
15392 if (I == 0)
15393 V = Elt;
15394 else
15395 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
15396 }
15397 return V;
15398 }
15399
15400 if (!DCI.isBeforeLegalize())
15401 return SDValue();
15402
15403 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
15404 // elements. This exposes more load reduction opportunities by replacing
15405 // multiple small extract_vector_elements with a single 32-bit extract.
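  // E.g. extracting element 3 of a loaded v8i16 becomes, roughly: bitcast the
  // vector to v4i32, extract dword 1 (bit index 48 / 32), shift right by 16
  // (48 % 32), and truncate the low 16 bits back out.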
15406 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
15407 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
15408 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15409 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
15410
15411 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15412 unsigned EltIdx = BitIndex / 32;
15413 unsigned LeftoverBitIdx = BitIndex % 32;
15414 SDLoc SL(N);
15415
15416 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
15417 DCI.AddToWorklist(Cast.getNode());
15418
15419 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
15420 DAG.getConstant(EltIdx, SL, MVT::i32));
15421 DCI.AddToWorklist(Elt.getNode());
15422 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
15423 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
15424 DCI.AddToWorklist(Srl.getNode());
15425
15426 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
15427 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
15428 DCI.AddToWorklist(Trunc.getNode());
15429
15430 if (VecEltVT == ResVT) {
15431 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
15432 }
15433
15434 assert(ResVT.isScalarInteger());
15435 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
15436 }
15437
15438 return SDValue();
15439}
15440
15441SDValue
15442SITargetLowering::performInsertVectorEltCombine(SDNode *N,
15443 DAGCombinerInfo &DCI) const {
15444 SDValue Vec = N->getOperand(0);
15445 SDValue Idx = N->getOperand(2);
15446 EVT VecVT = Vec.getValueType();
15447 EVT EltVT = VecVT.getVectorElementType();
15448
15449 // INSERT_VECTOR_ELT (<n x e>, var-idx)
15450 // => BUILD_VECTOR n x select (e, const-idx)
15451 if (!shouldExpandVectorDynExt(N))
15452 return SDValue();
15453
15454 SelectionDAG &DAG = DCI.DAG;
15455 SDLoc SL(N);
15456 SDValue Ins = N->getOperand(1);
15457 EVT IdxVT = Idx.getValueType();
15458
15459 SmallVector<SDValue, 16> Ops;
15460 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15461 SDValue IC = DAG.getConstant(I, SL, IdxVT);
15462 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
15463 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
15464 Ops.push_back(V);
15465 }
15466
15467 return DAG.getBuildVector(VecVT, SL, Ops);
15468}
15469
15470/// Return the source of an fp_extend from f16 to f32, or a converted FP
15471/// constant.
15472 static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
15473 if (Src.getOpcode() == ISD::FP_EXTEND &&
15474 Src.getOperand(0).getValueType() == MVT::f16) {
15475 return Src.getOperand(0);
15476 }
15477
15478 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
15479 APFloat Val = CFP->getValueAPF();
15480 bool LosesInfo = true;
15481 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
15482 if (!LosesInfo)
15483 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
15484 }
15485
15486 return SDValue();
15487}
15488
15489SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
15490 DAGCombinerInfo &DCI) const {
15491 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15492 "combine only useful on gfx8");
15493
15494 SDValue TruncSrc = N->getOperand(0);
15495 EVT VT = N->getValueType(0);
15496 if (VT != MVT::f16)
15497 return SDValue();
15498
15499 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
15500 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
15501 return SDValue();
15502
15503 SelectionDAG &DAG = DCI.DAG;
15504 SDLoc SL(N);
15505
15506 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
15507 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
15508 // casting back.
15509
15510 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
15511 // fmin(fmax(a, b), fmax(fmin(a, b), c))
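  // This min/max network really computes the median of {a, b, c}: if c lies
  // between a and b, both operands of the final fmin reduce to c; if c is the
  // largest, the final fmin picks fmax(a, b); if c is the smallest, it picks
  // fmin(a, b). Either way the middle value is produced.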
15512 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
15513 if (!A)
15514 return SDValue();
15515
15516 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
15517 if (!B)
15518 return SDValue();
15519
15520 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
15521 if (!C)
15522 return SDValue();
15523
15524 // This changes signaling nan behavior. If an input is a signaling nan, it
15525 // would have been quieted by the fpext originally. We don't care because
15526 // these are unconstrained ops. If we needed to insert quieting canonicalizes
15527 // we would be worse off than just doing the promotion.
15528 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
15529 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
15530 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
15531 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
15532}
15533
15534unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
15535 const SDNode *N0,
15536 const SDNode *N1) const {
15537 EVT VT = N0->getValueType(0);
15538
15539 // Only do this if we are not trying to support denormals. v_mad_f32 does not
15540 // support denormals ever.
15541 if (((VT == MVT::f32 &&
15542 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
15543 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15544 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
15545 isOperationLegal(ISD::FMAD, VT))
15546 return ISD::FMAD;
15547
15548 const TargetOptions &Options = DAG.getTarget().Options;
15549 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
15550 (N0->getFlags().hasAllowContract() &&
15551 N1->getFlags().hasAllowContract())) &&
15552 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
15553 return ISD::FMA;
15554 }
15555
15556 return 0;
15557}
15558
15559// For a reassociatable opcode perform:
15560// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
15561SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
15562 SelectionDAG &DAG) const {
15563 EVT VT = N->getValueType(0);
15564 if (VT != MVT::i32 && VT != MVT::i64)
15565 return SDValue();
15566
15567 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
15568 return SDValue();
15569
15570 unsigned Opc = N->getOpcode();
15571 SDValue Op0 = N->getOperand(0);
15572 SDValue Op1 = N->getOperand(1);
15573
15574 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
15575 return SDValue();
15576
15577 if (Op0->isDivergent())
15578 std::swap(Op0, Op1);
15579
15580 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
15581 return SDValue();
15582
15583 SDValue Op2 = Op1.getOperand(1);
15584 Op1 = Op1.getOperand(0);
15585 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
15586 return SDValue();
15587
15588 if (Op1->isDivergent())
15589 std::swap(Op1, Op2);
15590
15591 SDLoc SL(N);
15592 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
15593 return DAG.getNode(Opc, SL, VT, Add1, Op2);
15594}
15595
15596static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
15597 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
15598 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
15599 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
15600 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
15601 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
15602}
15603
15604// Fold
15605// y = lshr i64 x, 32
15606// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
15607// with Const.hi == -1
15608// To
15609 // res = mad_u64_u32 y.lo, Const.lo, x.lo
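// Why this is legal: with Const = (-1 << 32) + Const.lo and y = x >> 32,
//   y * Const + x = y * Const.lo - (x.hi << 32) + x
//                 = x.hi * Const.lo + x.lo            (mod 2^64)
// which is exactly mad_u64_u32(x.hi, Const.lo, zext(x.lo)).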
15610 static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
15611 SDValue MulLHS, SDValue MulRHS,
15612 SDValue AddRHS) {
15613 if (MulRHS.getOpcode() == ISD::SRL)
15614 std::swap(MulLHS, MulRHS);
15615
15616 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
15617 return SDValue();
15618
15619 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
15620 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
15621 MulLHS.getOperand(0) != AddRHS)
15622 return SDValue();
15623
15624 auto *Const = dyn_cast<ConstantSDNode>(MulRHS);
15625 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
15626 return SDValue();
15627
15628 SDValue ConstMul =
15629 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
15630 return getMad64_32(DAG, SL, MVT::i64,
15631 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
15632 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
15633}
15634
15635// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
15636// multiplies, if any.
15637//
15638// Full 64-bit multiplies that feed into an addition are lowered here instead
15639// of using the generic expansion. The generic expansion ends up with
15640// a tree of ADD nodes that prevents us from using the "add" part of the
15641// MAD instruction. The expansion produced here results in a chain of ADDs
15642// instead of a tree.
15643SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
15644 DAGCombinerInfo &DCI) const {
15645 assert(N->isAnyAdd());
15646
15647 SelectionDAG &DAG = DCI.DAG;
15648 EVT VT = N->getValueType(0);
15649 SDLoc SL(N);
15650 SDValue LHS = N->getOperand(0);
15651 SDValue RHS = N->getOperand(1);
15652
15653 if (VT.isVector())
15654 return SDValue();
15655
15656 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
15657 // result in scalar registers for uniform values.
15658 if (!N->isDivergent() && Subtarget->hasSMulHi())
15659 return SDValue();
15660
15661 unsigned NumBits = VT.getScalarSizeInBits();
15662 if (NumBits <= 32 || NumBits > 64)
15663 return SDValue();
15664
15665 if (LHS.getOpcode() != ISD::MUL) {
15666 assert(RHS.getOpcode() == ISD::MUL);
15667 std::swap(LHS, RHS);
15668 }
15669
15670 // Avoid the fold if it would unduly increase the number of multiplies due to
15671 // multiple uses, except on hardware with full-rate multiply-add (which is
15672 // part of full-rate 64-bit ops).
15673 if (!Subtarget->hasFullRate64Ops()) {
15674 unsigned NumUsers = 0;
15675 for (SDNode *User : LHS->users()) {
15676 // There is a use that does not feed into addition, so the multiply can't
15677 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
15678 if (!User->isAnyAdd())
15679 return SDValue();
15680
15681 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
15682 // MUL + 3xADD + 3xADDC over 3xMAD.
15683 ++NumUsers;
15684 if (NumUsers >= 3)
15685 return SDValue();
15686 }
15687 }
15688
15689 SDValue MulLHS = LHS.getOperand(0);
15690 SDValue MulRHS = LHS.getOperand(1);
15691 SDValue AddRHS = RHS;
15692
15693 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
15694 return FoldedMAD;
15695
15696 // Always check whether operands are small unsigned values, since that
15697 // knowledge is useful in more cases. Check for small signed values only if
15698 // doing so can unlock a shorter code sequence.
15699 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
15700 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
15701
15702 bool MulSignedLo = false;
15703 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
15704 MulSignedLo =
15705 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
15706 }
15707
15708 // The operands and final result all have the same number of bits. If
15709 // operands need to be extended, they can be extended with garbage. The
15710 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
15711 // truncated away in the end.
15712 if (VT != MVT::i64) {
15713 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
15714 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
15715 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
15716 }
15717
15718 // The basic code generated is conceptually straightforward. Pseudo code:
15719 //
15720 // accum = mad_64_32 lhs.lo, rhs.lo, accum
15721 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
15722 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
15723 //
15724 // The second and third lines are optional, depending on whether the factors
15725 // are {sign,zero}-extended or not.
15726 //
15727 // The actual DAG is noisier than the pseudo code, but only due to
15728 // instructions that disassemble values into low and high parts, and
15729 // assemble the final result.
15730 SDValue One = DAG.getConstant(1, SL, MVT::i32);
15731
15732 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
15733 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
15734 SDValue Accum =
15735 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
15736
15737 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
15738 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
15739
15740 if (!MulLHSUnsigned32) {
15741 auto MulLHSHi =
15742 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
15743 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
15744 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15745 }
15746
15747 if (!MulRHSUnsigned32) {
15748 auto MulRHSHi =
15749 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
15750 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
15751 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15752 }
15753
15754 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
15755 Accum = DAG.getBitcast(MVT::i64, Accum);
15756 }
15757
15758 if (VT != MVT::i64)
15759 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
15760 return Accum;
15761}
15762
15763SDValue
15764SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
15765 DAGCombinerInfo &DCI) const {
15766 SDValue RHS = N->getOperand(1);
15767 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
15768 if (!CRHS)
15769 return SDValue();
15770
15771 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
15772 // common.
15773 uint64_t Val = CRHS->getZExtValue();
15774 if (countr_zero(Val) >= 32) {
15775 SelectionDAG &DAG = DCI.DAG;
15776 SDLoc SL(N);
15777 SDValue LHS = N->getOperand(0);
15778
15779 // Avoid carry machinery if we know the low half of the add does not
15780 // contribute to the final result.
15781 //
15782 // add i64:x, K if computeTrailingZeros(K) >= 32
15783 // => build_pair (add x.hi, K.hi), x.lo
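  // E.g. for K = 0x0000000100000000 the low dword of K is zero, so no carry
  // (or borrow) out of the low half is possible and only the high dword of
  // the result differs from x.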
15784
15785 // Breaking the 64-bit add here with this strange constant is unlikely
15786 // to interfere with addressing mode patterns.
15787
15788 SDValue Hi = getHiHalf64(LHS, DAG);
15789 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
15790 unsigned Opcode = N->getOpcode();
15791 if (Opcode == ISD::PTRADD)
15792 Opcode = ISD::ADD;
15793 SDValue AddHi =
15794 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
15795
15796 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
15797 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
15798 }
15799
15800 return SDValue();
15801}
15802
15803 // Collect the ultimate src of each of the mul node's operands, and confirm
15804 // each operand is at most 8 bits.
15805static std::optional<ByteProvider<SDValue>>
15806handleMulOperand(const SDValue &MulOperand) {
15807 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
15808 if (!Byte0 || Byte0->isConstantZero()) {
15809 return std::nullopt;
15810 }
15811 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
15812 if (Byte1 && !Byte1->isConstantZero()) {
15813 return std::nullopt;
15814 }
15815 return Byte0;
15816}
15817
15818static unsigned addPermMasks(unsigned First, unsigned Second) {
15819 unsigned FirstCs = First & 0x0c0c0c0c;
15820 unsigned SecondCs = Second & 0x0c0c0c0c;
15821 unsigned FirstNoCs = First & ~0x0c0c0c0c;
15822 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
15823
15824 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
15825 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
15826 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
15827 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
15828
15829 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
15830}
15831
15832struct DotSrc {
15833 SDValue SrcOp;
15834 int64_t PermMask;
15835 int64_t DWordOffset;
15836};
15837
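// Roughly: each call places one byte of the dot-product chain. At step N the
// selector for the matched dword is written into mask byte (3 - N) and every
// other mask byte is 0x0c (read as zero); if the source dword already appears
// in Src0s/Src1s, the new byte is merged into its existing mask via
// addPermMasks, otherwise a new entry is appended.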
15838 static void placeSources(ByteProvider<SDValue> &Src0,
15839 ByteProvider<SDValue> &Src1,
15840 SmallVectorImpl<DotSrc> &Src0s,
15841 SmallVectorImpl<DotSrc> &Src1s, int Step) {
15842
15843 assert(Src0.Src.has_value() && Src1.Src.has_value());
15844 // Src0s and Src1s are empty, just place arbitrarily.
15845 if (Step == 0) {
15846 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
15847 Src0.SrcOffset / 4});
15848 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
15849 Src1.SrcOffset / 4});
15850 return;
15851 }
15852
15853 for (int BPI = 0; BPI < 2; BPI++) {
15854 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
15855 if (BPI == 1) {
15856 BPP = {Src1, Src0};
15857 }
15858 unsigned ZeroMask = 0x0c0c0c0c;
15859 unsigned FMask = 0xFF << (8 * (3 - Step));
15860
15861 unsigned FirstMask =
15862 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15863 unsigned SecondMask =
15864 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15865 // Attempt to find the Src vector which contains our SDValue; if found, add
15866 // our perm mask to the existing one. If we are unable to find a match for
15867 // the first SDValue, attempt to find a match for the second.
15868 int FirstGroup = -1;
15869 for (int I = 0; I < 2; I++) {
15870 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
15871 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
15872 return IterElt.SrcOp == *BPP.first.Src &&
15873 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15874 };
15875
15876 auto *Match = llvm::find_if(Srcs, MatchesFirst);
15877 if (Match != Srcs.end()) {
15878 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
15879 FirstGroup = I;
15880 break;
15881 }
15882 }
15883 if (FirstGroup != -1) {
15884 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
15885 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
15886 return IterElt.SrcOp == *BPP.second.Src &&
15887 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15888 };
15889 auto *Match = llvm::find_if(Srcs, MatchesSecond);
15890 if (Match != Srcs.end()) {
15891 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
15892 } else
15893 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15894 return;
15895 }
15896 }
15897
15898 // If we have made it here, then we could not find a match in Src0s or Src1s
15899 // for either Src0 or Src1, so just place them arbitrarily.
15900
15901 unsigned ZeroMask = 0x0c0c0c0c;
15902 unsigned FMask = 0xFF << (8 * (3 - Step));
15903
15904 Src0s.push_back(
15905 {*Src0.Src,
15906 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15907 Src0.SrcOffset / 4});
15908 Src1s.push_back(
15909 {*Src1.Src,
15910 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15911 Src1.SrcOffset / 4});
15912}
15913
15914 static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
15915 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
15916 bool IsAny) {
15917
15918 // If we just have one source, just permute it accordingly.
15919 if (Srcs.size() == 1) {
15920 auto *Elt = Srcs.begin();
15921 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
15922
15923 // v_perm will produce the original value
15924 if (Elt->PermMask == 0x3020100)
15925 return EltOp;
15926
15927 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15928 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
15929 }
15930
15931 auto *FirstElt = Srcs.begin();
15932 auto *SecondElt = std::next(FirstElt);
15933
15935
15936 // If we have multiple sources in the chain, combine them via perms (using
15937 // calculated perm mask) and Ors.
15938 while (true) {
15939 auto FirstMask = FirstElt->PermMask;
15940 auto SecondMask = SecondElt->PermMask;
15941
15942 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15943 unsigned FirstPlusFour = FirstMask | 0x04040404;
15944 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
15945 // original 0x0C.
15946 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
15947
15948 auto PermMask = addPermMasks(FirstMask, SecondMask);
15949 auto FirstVal =
15950 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15951 auto SecondVal =
15952 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
15953
15954 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
15955 SecondVal,
15956 DAG.getConstant(PermMask, SL, MVT::i32)));
15957
15958 FirstElt = std::next(SecondElt);
15959 if (FirstElt == Srcs.end())
15960 break;
15961
15962 SecondElt = std::next(FirstElt);
15963 // If we only have a FirstElt, then just combine that into the cumulative
15964 // source node.
15965 if (SecondElt == Srcs.end()) {
15966 auto EltOp =
15967 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15968
15969 Perms.push_back(
15970 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15971 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
15972 break;
15973 }
15974 }
15975
15976 assert(Perms.size() == 1 || Perms.size() == 2);
15977 return Perms.size() == 2
15978 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
15979 : Perms[0];
15980}
15981
15982static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
15983 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15984 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15985 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15986 EntryMask += ZeroMask;
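// Worked example (illustration only): with ChainLength == 2, a full-chain
// mask of 0x05040100 becomes 0x05040100 >> 16 == 0x0504, and adding
// 0x0c0c0000 yields 0x0c0c0504, i.e. the two used byte selectors stay in the
// low bytes and the upper selector bytes are forced to zero-fill (0x0c).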
15987 }
15988}
15989
15990static bool isMul(const SDValue Op) {
15991 auto Opcode = Op.getOpcode();
15992
15993 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
15994 Opcode == AMDGPUISD::MUL_I24);
15995}
15996
15997static std::optional<bool>
15998checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
15999 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
16000 const SDValue &S1Op, const SelectionDAG &DAG) {
16001 // If both ops are i8s (pre legalize-dag), then the signedness semantics
16002 // of the dot4 are irrelevant.
16003 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
16004 return false;
16005
16006 auto Known0 = DAG.computeKnownBits(S0Op, 0);
16007 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
16008 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
16009 auto Known1 = DAG.computeKnownBits(S1Op, 0);
16010 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
16011 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
16012
16013 assert(!(S0IsUnsigned && S0IsSigned));
16014 assert(!(S1IsUnsigned && S1IsSigned));
16015
16016 // There are 9 possible permutations of
16017 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
16018
16019 // In two permutations, the sign bits are known to be the same for both Ops,
16020 // so simply return Signed / Unsigned corresponding to the MSB
16021
16022 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
16023 return S0IsSigned;
16024
16025 // In another two permutations, the sign bits are known to be opposite. In
16026 // this case return std::nullopt to indicate a bad match.
16027
16028 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
16029 return std::nullopt;
16030
16031 // In the remaining five permutations, we don't know the value of the sign
16032 // bit for at least one Op. Since we have a valid ByteProvider, we know that
16033 // the upper bits must be extension bits. Thus, the only ways for the sign
16034 // bit to be unknown are if it was sign extended from an unknown value or if
16035 // it was any extended. In either case, it is correct to use the signed
16036 // version of the signedness semantics of dot4.
16037
16038 // In two such permutations, we know the sign bit is set for
16039 // one op and unknown for the other. It is okay to use the signed version
16040 // of dot4.
16041 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
16042 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
16043 return true;
16044
16045 // In one such permutation, we don't know either of the sign bits. It is okay
16046 // to use the signed version of dot4.
16047 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
16048 return true;
16049
16050 // In two such permutations, we know the sign bit is unset for
16051 // one op and unknown for the other. Return std::nullopt to indicate a
16052 // bad match.
16053 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
16054 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
16055 return std::nullopt;
16056
16057 llvm_unreachable("Fully covered condition");
16058}
16059
16060SDValue SITargetLowering::performAddCombine(SDNode *N,
16061 DAGCombinerInfo &DCI) const {
16062 SelectionDAG &DAG = DCI.DAG;
16063 EVT VT = N->getValueType(0);
16064 SDLoc SL(N);
16065 SDValue LHS = N->getOperand(0);
16066 SDValue RHS = N->getOperand(1);
16067
16068 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
16069 if (Subtarget->hasMad64_32()) {
16070 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16071 return Folded;
16072 }
16073 }
16074
16075 if (SDValue V = reassociateScalarOps(N, DAG)) {
16076 return V;
16077 }
16078
16079 if (VT == MVT::i64) {
16080 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16081 return Folded;
16082 }
16083
16084 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
16085 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
16086 SDValue TempNode(N, 0);
16087 std::optional<bool> IsSigned;
16088 SmallVector<DotSrc, 4> Src0s;
16089 SmallVector<DotSrc, 4> Src1s;
16090 SmallVector<SDValue, 4> Src2s;
16091
16092 // Match the v_dot4 tree, while collecting src nodes.
16093 int ChainLength = 0;
16094 for (int I = 0; I < 4; I++) {
16095 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
16096 if (MulIdx == -1)
16097 break;
16098 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
16099 if (!Src0)
16100 break;
16101 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
16102 if (!Src1)
16103 break;
16104
16105 auto IterIsSigned = checkDot4MulSignedness(
16106 TempNode->getOperand(MulIdx), *Src0, *Src1,
16107 TempNode->getOperand(MulIdx)->getOperand(0),
16108 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
16109 if (!IterIsSigned)
16110 break;
16111 if (!IsSigned)
16112 IsSigned = *IterIsSigned;
16113 if (*IterIsSigned != *IsSigned)
16114 break;
16115 placeSources(*Src0, *Src1, Src0s, Src1s, I);
16116 auto AddIdx = 1 - MulIdx;
16117 // Allow the special case where add (add (mul24, 0), mul24) becomes
16118 // add (mul24, mul24).
16119 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
16120 Src2s.push_back(TempNode->getOperand(AddIdx));
16121 auto Src0 =
16122 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
16123 if (!Src0)
16124 break;
16125 auto Src1 =
16126 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
16127 if (!Src1)
16128 break;
16129 auto IterIsSigned = checkDot4MulSignedness(
16130 TempNode->getOperand(AddIdx), *Src0, *Src1,
16131 TempNode->getOperand(AddIdx)->getOperand(0),
16132 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
16133 if (!IterIsSigned)
16134 break;
16135 assert(IsSigned);
16136 if (*IterIsSigned != *IsSigned)
16137 break;
16138 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
16139 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
16140 ChainLength = I + 2;
16141 break;
16142 }
16143
16144 TempNode = TempNode->getOperand(AddIdx);
16145 Src2s.push_back(TempNode);
16146 ChainLength = I + 1;
16147 if (TempNode->getNumOperands() < 2)
16148 break;
16149 LHS = TempNode->getOperand(0);
16150 RHS = TempNode->getOperand(1);
16151 }
16152
16153 if (ChainLength < 2)
16154 return SDValue();
16155
16156 // Masks were constructed with the assumption that we would find a chain of
16157 // length 4. If not, then we need to zero out the upper bytes (via a perm
16158 // selector of 0x0c) so they do not affect the dot calculation.
16159 if (ChainLength < 4) {
16160 fixMasks(Src0s, ChainLength);
16161 fixMasks(Src1s, ChainLength);
16162 }
16163
16164 SDValue Src0, Src1;
16165
16166 // If we are just using a single source for both, and have permuted the
16167 // bytes consistently, we can just use the sources without permuting
16168 // (commutation).
16169 bool UseOriginalSrc = false;
16170 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
16171 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
16172 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
16173 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
16174 SmallVector<unsigned, 4> SrcBytes;
16175 auto Src0Mask = Src0s.begin()->PermMask;
16176 SrcBytes.push_back(Src0Mask & 0xFF000000);
16177 bool UniqueEntries = true;
16178 for (auto I = 1; I < 4; I++) {
16179 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
16180
16181 if (is_contained(SrcBytes, NextByte)) {
16182 UniqueEntries = false;
16183 break;
16184 }
16185 SrcBytes.push_back(NextByte);
16186 }
16187
16188 if (UniqueEntries) {
16189 UseOriginalSrc = true;
16190
16191 auto *FirstElt = Src0s.begin();
16192 auto FirstEltOp =
16193 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
16194
16195 auto *SecondElt = Src1s.begin();
16196 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
16197 SecondElt->DWordOffset);
16198
16199 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
16200 MVT::getIntegerVT(32));
16201 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
16202 MVT::getIntegerVT(32));
16203 }
16204 }
16205
16206 if (!UseOriginalSrc) {
16207 Src0 = resolveSources(DAG, SL, Src0s, false, true);
16208 Src1 = resolveSources(DAG, SL, Src1s, false, true);
16209 }
16210
16211 assert(IsSigned);
16212 SDValue Src2 =
16213 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16214
16215 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
16216 : Intrinsic::amdgcn_udot4,
16217 SL, MVT::i64);
16218
16219 assert(!VT.isVector());
16220 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
16221 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
16222
16223 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
16224 }
16225
16226 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16227 return SDValue();
16228
16229 // add x, zext (setcc) => uaddo_carry x, 0, setcc
16230 // add x, sext (setcc) => usubo_carry x, 0, setcc
16231 unsigned Opc = LHS.getOpcode();
16232 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
16233 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
16234 std::swap(RHS, LHS);
16235
16235
16236 Opc = RHS.getOpcode();
16237 switch (Opc) {
16238 default:
16239 break;
16240 case ISD::ZERO_EXTEND:
16241 case ISD::SIGN_EXTEND:
16242 case ISD::ANY_EXTEND: {
16243 auto Cond = RHS.getOperand(0);
16244 // If this won't be a real VOPC output, we would still need to insert an
16245 // extra instruction anyway.
16246 if (!isBoolSGPR(Cond))
16247 break;
16248 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16249 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16250 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
16251 return DAG.getNode(Opc, SL, VTList, Args);
16252 }
16253 case ISD::UADDO_CARRY: {
16254 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
16255 if (!isNullConstant(RHS.getOperand(1)))
16256 break;
16257 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
16258 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
16259 }
16260 }
16261 return SDValue();
16262}
16263
16264SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
16265 DAGCombinerInfo &DCI) const {
16266 SelectionDAG &DAG = DCI.DAG;
16267 SDLoc DL(N);
16268 EVT VT = N->getValueType(0);
16269 SDValue N0 = N->getOperand(0);
16270 SDValue N1 = N->getOperand(1);
16271
16272 // The following folds transform PTRADDs into regular arithmetic in cases
16273 // where the PTRADD wouldn't be folded as an immediate offset into memory
16274 // instructions anyway. They are target-specific in that other targets might
16275 // prefer to not lose information about the pointer arithmetic.
16276
16277 // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
16278 // Adapted from DAGCombiner::visitADDLikeCommutative.
16279 SDValue V, K;
16280 if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
16281 SDNodeFlags ShlFlags = N1->getFlags();
16282 // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
16283 // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
16284 // preserved.
16285 SDNodeFlags NewShlFlags =
16286 ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
16287 ? SDNodeFlags::NoSignedWrap
16288 : SDNodeFlags();
16289 SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
16290 DCI.AddToWorklist(Inner.getNode());
16291 return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
16292 }
16293
16294 // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
16295 // performAddCombine.
16296 if (N1.getOpcode() == ISD::MUL) {
16297 if (Subtarget->hasMad64_32()) {
16298 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16299 return Folded;
16300 }
16301 }
16302
16303 // If the 32 low bits of the constant are all zero, there is nothing to fold
16304 // into an immediate offset, so it's better to eliminate the unnecessary
16305 // addition for the lower 32 bits than to preserve the PTRADD.
16306 // Analogous to a fold in performAddCombine.
16307 if (VT == MVT::i64) {
16308 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16309 return Folded;
16310 }
16311
16312 if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
16313 return SDValue();
16314
16315 SDValue X = N0;
16316 SDValue Y = N1.getOperand(0);
16317 SDValue Z = N1.getOperand(1);
16318 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
16319 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
16320
16321 if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
16322 Y->isDivergent() != Z->isDivergent()) {
16323 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
16324 // y are uniform and z isn't.
16325 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
16326 // z are uniform and y isn't.
16327 // The goal is to push uniform operands up in the computation, so that they
16328 // can be handled with scalar operations. We can't use reassociateScalarOps
16329 // for this since it requires two identical commutative operations to
16330 // reassociate.
16331 if (Y->isDivergent())
16332 std::swap(Y, Z);
16333 // If both additions in the original were NUW, reassociation preserves that.
16334 SDNodeFlags ReassocFlags =
16335 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
16336 SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
16337 DCI.AddToWorklist(UniformInner.getNode());
16338 return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
16339 }
16340
16341 return SDValue();
16342}
16343
16344SDValue SITargetLowering::performSubCombine(SDNode *N,
16345 DAGCombinerInfo &DCI) const {
16346 SelectionDAG &DAG = DCI.DAG;
16347 EVT VT = N->getValueType(0);
16348
16349 if (VT == MVT::i64) {
16350 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16351 return Folded;
16352 }
16353
16354 if (VT != MVT::i32)
16355 return SDValue();
16356
16357 SDLoc SL(N);
16358 SDValue LHS = N->getOperand(0);
16359 SDValue RHS = N->getOperand(1);
16360
16361 // sub x, zext (setcc) => usubo_carry x, 0, setcc
16362 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
16363 unsigned Opc = RHS.getOpcode();
16364 switch (Opc) {
16365 default:
16366 break;
16367 case ISD::ZERO_EXTEND:
16368 case ISD::SIGN_EXTEND:
16369 case ISD::ANY_EXTEND: {
16370 auto Cond = RHS.getOperand(0);
16371 // If this won't be a real VOPC output, we would still need to insert an
16372 // extra instruction anyway.
16373 if (!isBoolSGPR(Cond))
16374 break;
16375 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16376 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16377 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
16378 return DAG.getNode(Opc, SL, VTList, Args);
16379 }
16380 }
16381
16382 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
16383 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
16384 if (!isNullConstant(LHS.getOperand(1)))
16385 return SDValue();
16386 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
16387 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
16388 }
16389 return SDValue();
16390}
16391
16392SDValue
16393SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
16394 DAGCombinerInfo &DCI) const {
16395
16396 if (N->getValueType(0) != MVT::i32)
16397 return SDValue();
16398
16399 if (!isNullConstant(N->getOperand(1)))
16400 return SDValue();
16401
16402 SelectionDAG &DAG = DCI.DAG;
16403 SDValue LHS = N->getOperand(0);
16404
16405 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
16406 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
16407 unsigned LHSOpc = LHS.getOpcode();
16408 unsigned Opc = N->getOpcode();
16409 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
16410 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
16411 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
16412 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
16413 }
16414 return SDValue();
16415}
16416
16417SDValue SITargetLowering::performFAddCombine(SDNode *N,
16418 DAGCombinerInfo &DCI) const {
16419 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16420 return SDValue();
16421
16422 SelectionDAG &DAG = DCI.DAG;
16423 EVT VT = N->getValueType(0);
16424
16425 SDLoc SL(N);
16426 SDValue LHS = N->getOperand(0);
16427 SDValue RHS = N->getOperand(1);
16428
16429 // These should really be instruction patterns, but writing patterns with
16430 // source modifiers is a pain.
16431
16432 // fadd (fadd (a, a), b) -> mad 2.0, a, b
16433 if (LHS.getOpcode() == ISD::FADD) {
16434 SDValue A = LHS.getOperand(0);
16435 if (A == LHS.getOperand(1)) {
16436 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16437 if (FusedOp != 0) {
16438 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16439 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
16440 }
16441 }
16442 }
16443
16444 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
16445 if (RHS.getOpcode() == ISD::FADD) {
16446 SDValue A = RHS.getOperand(0);
16447 if (A == RHS.getOperand(1)) {
16448 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16449 if (FusedOp != 0) {
16450 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16451 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
16452 }
16453 }
16454 }
16455
16456 return SDValue();
16457}
16458
16459SDValue SITargetLowering::performFSubCombine(SDNode *N,
16460 DAGCombinerInfo &DCI) const {
16461 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16462 return SDValue();
16463
16464 SelectionDAG &DAG = DCI.DAG;
16465 SDLoc SL(N);
16466 EVT VT = N->getValueType(0);
16467 assert(!VT.isVector());
16468
16469 // Try to get the fneg to fold into the source modifier. This undoes generic
16470 // DAG combines and folds them into the mad.
16471 //
16472 // Only do this if we are not trying to support denormals. v_mad_f32 does
16473 // not support denormals ever.
16474 SDValue LHS = N->getOperand(0);
16475 SDValue RHS = N->getOperand(1);
16476 if (LHS.getOpcode() == ISD::FADD) {
16477 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
16478 SDValue A = LHS.getOperand(0);
16479 if (A == LHS.getOperand(1)) {
16480 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16481 if (FusedOp != 0) {
16482 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16483 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
16484
16485 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
16486 }
16487 }
16488 }
16489
16490 if (RHS.getOpcode() == ISD::FADD) {
16491 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
16492
16493 SDValue A = RHS.getOperand(0);
16494 if (A == RHS.getOperand(1)) {
16495 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16496 if (FusedOp != 0) {
16497 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
16498 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
16499 }
16500 }
16501 }
16502
16503 return SDValue();
16504}
16505
16506SDValue SITargetLowering::performFDivCombine(SDNode *N,
16507 DAGCombinerInfo &DCI) const {
16508 SelectionDAG &DAG = DCI.DAG;
16509 SDLoc SL(N);
16510 EVT VT = N->getValueType(0);
16511
16512 // fsqrt legality correlates to rsq availability.
16513 if ((VT != MVT::f16 && VT != MVT::bf16) || !isOperationLegal(ISD::FSQRT, VT))
16514 return SDValue();
16515
16516 SDValue LHS = N->getOperand(0);
16517 SDValue RHS = N->getOperand(1);
16518
16519 SDNodeFlags Flags = N->getFlags();
16520 SDNodeFlags RHSFlags = RHS->getFlags();
16521 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
16522 !RHS->hasOneUse())
16523 return SDValue();
16524
16525 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
16526 bool IsNegative = false;
16527 if (CLHS->isExactlyValue(1.0) ||
16528 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16529 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
16530 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
16531 if (RHS.getOpcode() == ISD::FSQRT) {
16532 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
16533 SDValue Rsq =
16534 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
16535 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
16536 }
16537 }
16538 }
16539
16540 return SDValue();
16541}
16542
16543SDValue SITargetLowering::performFMulCombine(SDNode *N,
16544 DAGCombinerInfo &DCI) const {
16545 SelectionDAG &DAG = DCI.DAG;
16546 EVT VT = N->getValueType(0);
16547 EVT ScalarVT = VT.getScalarType();
16548 EVT IntVT = VT.changeElementType(MVT::i32);
16549
16550 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
16551 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16552 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
16553 return SDValue();
16554 }
16555
16556 SDValue LHS = N->getOperand(0);
16557 SDValue RHS = N->getOperand(1);
16558
16559 // It is cheaper to materialize i32 inline constants than to materialize
16560 // f16 or f64 (or even non-inline f32) values; this is possible via ldexp,
16561 // as shown below:
16562 //
16563 // Given : A = 2^a & B = 2^b ; where a and b are integers.
16564 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
16565 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
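// For instance (illustrative values only): fmul x, (select y, 8.0, 0.5)
// becomes ldexp(x, (select i32 y, 3, -1)), since 8.0 == 2^3 and 0.5 == 2^-1.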
16566 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16567 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
16568 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
16569 if (!TrueNode)
16570 return SDValue();
16571 const ConstantFPSDNode *FalseNode =
16572 isConstOrConstSplatFP(RHS.getOperand(2));
16573 if (!FalseNode)
16574 return SDValue();
16575
16576 if (TrueNode->isNegative() != FalseNode->isNegative())
16577 return SDValue();
16578
16579 // For f32, only non-inline constants should be transformed.
16580 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16581 if (ScalarVT == MVT::f32 &&
16582 TII->isInlineConstant(TrueNode->getValueAPF()) &&
16583 TII->isInlineConstant(FalseNode->getValueAPF()))
16584 return SDValue();
16585
16586 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
16587 if (TrueNodeExpVal == INT_MIN)
16588 return SDValue();
16589 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
16590 if (FalseNodeExpVal == INT_MIN)
16591 return SDValue();
16592
16593 SDLoc SL(N);
16594 SDValue SelectNode =
16595 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
16596 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
16597 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
16598
16599 LHS = TrueNode->isNegative()
16600 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
16601 : LHS;
16602
16603 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
16604 }
16605
16606 return SDValue();
16607}
16608
16609SDValue SITargetLowering::performFMACombine(SDNode *N,
16610 DAGCombinerInfo &DCI) const {
16611 SelectionDAG &DAG = DCI.DAG;
16612 EVT VT = N->getValueType(0);
16613 SDLoc SL(N);
16614
16615 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16616 return SDValue();
16617
16618 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
16619 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
16620 SDValue Op1 = N->getOperand(0);
16621 SDValue Op2 = N->getOperand(1);
16622 SDValue FMA = N->getOperand(2);
16623
16624 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
16625 Op2.getOpcode() != ISD::FP_EXTEND)
16626 return SDValue();
16627
16628 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
16629 // regardless of the denorm mode setting. Therefore,
16630 // fp-contract is sufficient to allow generating fdot2.
16631 const TargetOptions &Options = DAG.getTarget().Options;
16632 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
16633 (N->getFlags().hasAllowContract() &&
16634 FMA->getFlags().hasAllowContract())) {
16635 Op1 = Op1.getOperand(0);
16636 Op2 = Op2.getOperand(0);
16637 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16638 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16639 return SDValue();
16640
16641 SDValue Vec1 = Op1.getOperand(0);
16642 SDValue Idx1 = Op1.getOperand(1);
16643 SDValue Vec2 = Op2.getOperand(0);
16644
16645 SDValue FMAOp1 = FMA.getOperand(0);
16646 SDValue FMAOp2 = FMA.getOperand(1);
16647 SDValue FMAAcc = FMA.getOperand(2);
16648
16649 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
16650 FMAOp2.getOpcode() != ISD::FP_EXTEND)
16651 return SDValue();
16652
16653 FMAOp1 = FMAOp1.getOperand(0);
16654 FMAOp2 = FMAOp2.getOperand(0);
16655 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16656 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16657 return SDValue();
16658
16659 SDValue Vec3 = FMAOp1.getOperand(0);
16660 SDValue Vec4 = FMAOp2.getOperand(0);
16661 SDValue Idx2 = FMAOp1.getOperand(1);
16662
16663 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
16664 // Idx1 and Idx2 cannot be the same.
16665 Idx1 == Idx2)
16666 return SDValue();
16667
16668 if (Vec1 == Vec2 || Vec3 == Vec4)
16669 return SDValue();
16670
16671 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
16672 return SDValue();
16673
16674 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16675 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
16676 DAG.getTargetConstant(0, SL, MVT::i1));
16677 }
16678 }
16679 return SDValue();
16680}
16681
16682SDValue SITargetLowering::performSetCCCombine(SDNode *N,
16683 DAGCombinerInfo &DCI) const {
16684 SelectionDAG &DAG = DCI.DAG;
16685 SDLoc SL(N);
16686
16687 SDValue LHS = N->getOperand(0);
16688 SDValue RHS = N->getOperand(1);
16689 EVT VT = LHS.getValueType();
16690 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
16691
16692 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
16693 if (!CRHS) {
16694 CRHS = dyn_cast<ConstantSDNode>(LHS);
16695 if (CRHS) {
16696 std::swap(LHS, RHS);
16697 CC = getSetCCSwappedOperands(CC);
16698 }
16699 }
16700
16701 if (CRHS) {
16702 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
16703 isBoolSGPR(LHS.getOperand(0))) {
16704 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
16705 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
16706 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
16707 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
16708 if ((CRHS->isAllOnes() &&
16709 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
16710 (CRHS->isZero() &&
16711 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
16712 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16713 DAG.getAllOnesConstant(SL, MVT::i1));
16714 if ((CRHS->isAllOnes() &&
16715 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
16716 (CRHS->isZero() &&
16717 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
16718 return LHS.getOperand(0);
16719 }
16720
16721 const APInt &CRHSVal = CRHS->getAPIntValue();
16722 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
16723 LHS.getOpcode() == ISD::SELECT &&
16724 isa<ConstantSDNode>(LHS.getOperand(1)) &&
16725 isa<ConstantSDNode>(LHS.getOperand(2)) &&
16726 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
16727 isBoolSGPR(LHS.getOperand(0))) {
16728 // Given CT != FT:
16729 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
16730 // setcc (select cc, CT, CF), CF, ne => cc
16731 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
16732 // setcc (select cc, CT, CF), CT, eq => cc
16733 const APInt &CT = LHS.getConstantOperandAPInt(1);
16734 const APInt &CF = LHS.getConstantOperandAPInt(2);
16735
16736 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
16737 (CT == CRHSVal && CC == ISD::SETNE))
16738 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16739 DAG.getAllOnesConstant(SL, MVT::i1));
16740 if ((CF == CRHSVal && CC == ISD::SETNE) ||
16741 (CT == CRHSVal && CC == ISD::SETEQ))
16742 return LHS.getOperand(0);
16743 }
16744 }
16745
16746 // Eliminate setcc by using carryout from add/sub instruction
16747
16748 // LHS = ADD i64 RHS, Z LHSlo = UADDO i32 RHSlo, Zlo
16749 // setcc LHS ult RHS -> LHSHi = UADDO_CARRY i32 RHShi, Zhi
16750 // similarly for subtraction
16751
16752 // LHS = ADD i64 Y, 1 LHSlo = UADDO i32 Ylo, 1
16753 // setcc LHS eq 0 -> LHSHi = UADDO_CARRY i32 Yhi, 0
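// (For the eq-0 form: Y + 1 == 0 only when Y == -1, which is exactly when
// the high-half UADDO_CARRY produces a carry-out, so that carry-out can
// stand in for the setcc result.)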
16754
16755 if (VT == MVT::i64 && ((CC == ISD::SETULT &&
16756 sd_match(LHS, m_Add(m_Specific(RHS), m_Value()))) ||
16757 (CC == ISD::SETUGT &&
16758 sd_match(LHS, m_Sub(m_Specific(RHS), m_Value()))) ||
16759 (CC == ISD::SETEQ && CRHS && CRHS->isZero() &&
16760 sd_match(LHS, m_Add(m_Value(), m_One()))))) {
16761 bool IsAdd = LHS.getOpcode() == ISD::ADD;
16762
16763 SDValue Op0 = LHS.getOperand(0);
16764 SDValue Op1 = LHS.getOperand(1);
16765
16766 SDValue Op0Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op0);
16767 SDValue Op1Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op1);
16768
16769 SDValue Op0Hi = getHiHalf64(Op0, DAG);
16770 SDValue Op1Hi = getHiHalf64(Op1, DAG);
16771
16772 SDValue NodeLo =
16773 DAG.getNode(IsAdd ? ISD::UADDO : ISD::USUBO, SL,
16774 DAG.getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
16775
16776 SDValue CarryInHi = NodeLo.getValue(1);
16777 SDValue NodeHi = DAG.getNode(IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY,
16778 SL, DAG.getVTList(MVT::i32, MVT::i1),
16779 {Op0Hi, Op1Hi, CarryInHi});
16780
16781 SDValue ResultLo = NodeLo.getValue(0);
16782 SDValue ResultHi = NodeHi.getValue(0);
16783
16784 SDValue JoinedResult =
16785 DAG.getBuildVector(MVT::v2i32, SL, {ResultLo, ResultHi});
16786
16787 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, JoinedResult);
16788 SDValue Overflow = NodeHi.getValue(1);
16789 DCI.CombineTo(LHS.getNode(), Result);
16790 return Overflow;
16791 }
16792
16793 if (VT != MVT::f32 && VT != MVT::f64 &&
16794 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16795 return SDValue();
16796
16797 // Match isinf/isfinite pattern
16798 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
16799 // (fcmp one (fabs x), inf) -> (fp_class x,
16800 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
16801 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
16802 LHS.getOpcode() == ISD::FABS) {
16803 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
16804 if (!CRHS)
16805 return SDValue();
16806
16807 const APFloat &APF = CRHS->getValueAPF();
16808 if (APF.isInfinity() && !APF.isNegative()) {
16809 const unsigned IsInfMask =
16810 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
16811 const unsigned IsFiniteMask =
16812 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
16813 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
16814 SIInstrFlags::P_SUBNORMAL;
16815 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
16816 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
16817 DAG.getConstant(Mask, SL, MVT::i32));
16818 }
16819 }
16820
16821 return SDValue();
16822}
16823
16824SDValue
16825SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
16826 DAGCombinerInfo &DCI) const {
16827 SelectionDAG &DAG = DCI.DAG;
16828 SDLoc SL(N);
16829 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
16830
16831 SDValue Src = N->getOperand(0);
16832 SDValue Shift = N->getOperand(0);
16833
16834 // TODO: Extend type shouldn't matter (assuming legal types).
16835 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
16836 Shift = Shift.getOperand(0);
16837
16838 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
16839 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
16840 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
16841 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
16842 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
16843 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
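// E.g. for cvt_f32_ubyte1 (srl x, 16): Offset == 1, so ShiftOffset becomes
// 8 + 16 == 24 and the node is rewritten to cvt_f32_ubyte3 x (24 / 8 == 3).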
16844 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
16845 SDValue Shifted = DAG.getZExtOrTrunc(
16846 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
16847
16848 unsigned ShiftOffset = 8 * Offset;
16849 if (Shift.getOpcode() == ISD::SHL)
16850 ShiftOffset -= C->getZExtValue();
16851 else
16852 ShiftOffset += C->getZExtValue();
16853
16854 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16855 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
16856 MVT::f32, Shifted);
16857 }
16858 }
16859 }
16860
16861 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16862 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
16863 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
16864 // We simplified Src. If this node is not dead, visit it again so it is
16865 // folded properly.
16866 if (N->getOpcode() != ISD::DELETED_NODE)
16867 DCI.AddToWorklist(N);
16868 return SDValue(N, 0);
16869 }
16870
16871 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
16872 if (SDValue DemandedSrc =
16873 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
16874 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
16875
16876 return SDValue();
16877}
16878
16879SDValue SITargetLowering::performClampCombine(SDNode *N,
16880 DAGCombinerInfo &DCI) const {
16881 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
16882 if (!CSrc)
16883 return SDValue();
16884
16885 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16886 const APFloat &F = CSrc->getValueAPF();
16887 APFloat Zero = APFloat::getZero(F.getSemantics());
16888 if (F < Zero ||
16889 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16890 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
16891 }
16892
16893 APFloat One(F.getSemantics(), "1.0");
16894 if (F > One)
16895 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
16896
16897 return SDValue(CSrc, 0);
16898}
16899
16900SDValue SITargetLowering::performSelectCombine(SDNode *N,
16901 DAGCombinerInfo &DCI) const {
16902
16903 // Try to fold CMP + SELECT patterns with shared constants (both FP and
16904 // integer).
16905 // Detect when CMP and SELECT use the same constant and fold them to avoid
16906 // loading the constant twice. Specifically handles patterns like:
16907 // %cmp = icmp eq i32 %val, 4242
16908 // %sel = select i1 %cmp, i32 4242, i32 %other
16909 // It can be optimized to reuse %val instead of 4242 in select.
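// After the fold, the select reads (illustration):
// %sel = select i1 %cmp, i32 %val, i32 %other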
16910 SDValue Cond = N->getOperand(0);
16911 SDValue TrueVal = N->getOperand(1);
16912 SDValue FalseVal = N->getOperand(2);
16913
16914 // Check if condition is a comparison.
16915 if (Cond.getOpcode() != ISD::SETCC)
16916 return SDValue();
16917
16918 SDValue LHS = Cond.getOperand(0);
16919 SDValue RHS = Cond.getOperand(1);
16920 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
16921
16922 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
16923 bool isInteger = LHS.getValueType().isInteger();
16924
16925 // Handle simple floating-point and integer types only.
16926 if (!isFloatingPoint && !isInteger)
16927 return SDValue();
16928
16929 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
16930 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
16931 if (!isEquality && !isNonEquality)
16932 return SDValue();
16933
16934 SDValue ArgVal, ConstVal;
16935 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
16936 (isInteger && isa<ConstantSDNode>(RHS))) {
16937 ConstVal = RHS;
16938 ArgVal = LHS;
16939 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
16940 (isInteger && isa<ConstantSDNode>(LHS))) {
16941 ConstVal = LHS;
16942 ArgVal = RHS;
16943 } else {
16944 return SDValue();
16945 }
16946
16947 // Skip optimization for inlinable immediates.
16948 if (isFloatingPoint) {
16949 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
16950 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
16951 return SDValue();
16952 } else {
16953 if (AMDGPU::isInlinableIntLiteral(
16954 cast<ConstantSDNode>(ConstVal)->getSExtValue()))
16955 return SDValue();
16956 }
16957
16958 // For equality and non-equality comparisons, patterns:
16959 // select (setcc x, const), const, y -> select (setcc x, const), x, y
16960 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
16961 if (!(isEquality && TrueVal == ConstVal) &&
16962 !(isNonEquality && FalseVal == ConstVal))
16963 return SDValue();
16964
16965 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
16966 SDValue SelectRHS =
16967 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
16968 return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
16969 SelectLHS, SelectRHS);
16970}
16971
16972SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
16973 DAGCombinerInfo &DCI) const {
16974 switch (N->getOpcode()) {
16975 case ISD::ADD:
16976 case ISD::SUB:
16977 case ISD::SHL:
16978 case ISD::SRL:
16979 case ISD::SRA:
16980 case ISD::AND:
16981 case ISD::OR:
16982 case ISD::XOR:
16983 case ISD::MUL:
16984 case ISD::SETCC:
16985 case ISD::SELECT:
16986 case ISD::SMIN:
16987 case ISD::SMAX:
16988 case ISD::UMIN:
16989 case ISD::UMAX:
16990 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
16991 return Res;
16992 break;
16993 default:
16994 break;
16995 }
16996
16997 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
16998 return SDValue();
16999
17000 switch (N->getOpcode()) {
17001 case ISD::ADD:
17002 return performAddCombine(N, DCI);
17003 case ISD::PTRADD:
17004 return performPtrAddCombine(N, DCI);
17005 case ISD::SUB:
17006 return performSubCombine(N, DCI);
17007 case ISD::UADDO_CARRY:
17008 case ISD::USUBO_CARRY:
17009 return performAddCarrySubCarryCombine(N, DCI);
17010 case ISD::FADD:
17011 return performFAddCombine(N, DCI);
17012 case ISD::FSUB:
17013 return performFSubCombine(N, DCI);
17014 case ISD::FDIV:
17015 return performFDivCombine(N, DCI);
17016 case ISD::FMUL:
17017 return performFMulCombine(N, DCI);
17018 case ISD::SETCC:
17019 return performSetCCCombine(N, DCI);
17020 case ISD::SELECT:
17021 if (auto Res = performSelectCombine(N, DCI))
17022 return Res;
17023 break;
17024 case ISD::FMAXNUM:
17025 case ISD::FMINNUM:
17026 case ISD::FMAXNUM_IEEE:
17027 case ISD::FMINNUM_IEEE:
17028 case ISD::FMAXIMUM:
17029 case ISD::FMINIMUM:
17030 case ISD::FMAXIMUMNUM:
17031 case ISD::FMINIMUMNUM:
17032 case ISD::SMAX:
17033 case ISD::SMIN:
17034 case ISD::UMAX:
17035 case ISD::UMIN:
17036 case AMDGPUISD::FMIN_LEGACY:
17037 case AMDGPUISD::FMAX_LEGACY:
17038 return performMinMaxCombine(N, DCI);
17039 case ISD::FMA:
17040 return performFMACombine(N, DCI);
17041 case ISD::AND:
17042 return performAndCombine(N, DCI);
17043 case ISD::OR:
17044 return performOrCombine(N, DCI);
17045 case ISD::FSHR: {
17046 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17047 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
17048 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
17049 return matchPERM(N, DCI);
17050 }
17051 break;
17052 }
17053 case ISD::XOR:
17054 return performXorCombine(N, DCI);
17055 case ISD::ZERO_EXTEND:
17056 return performZeroExtendCombine(N, DCI);
17057 case ISD::SIGN_EXTEND_INREG:
17058 return performSignExtendInRegCombine(N, DCI);
17059 case AMDGPUISD::FP_CLASS:
17060 return performClassCombine(N, DCI);
17061 case ISD::FCANONICALIZE:
17062 return performFCanonicalizeCombine(N, DCI);
17063 case AMDGPUISD::RCP:
17064 return performRcpCombine(N, DCI);
17065 case ISD::FLDEXP:
17066 case AMDGPUISD::FRACT:
17067 case AMDGPUISD::RSQ:
17068 case AMDGPUISD::RCP_LEGACY:
17069 case AMDGPUISD::RCP_IFLAG:
17070 case AMDGPUISD::RSQ_CLAMP: {
17071 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
17072 SDValue Src = N->getOperand(0);
17073 if (Src.isUndef())
17074 return Src;
17075 break;
17076 }
17077 case ISD::SINT_TO_FP:
17078 case ISD::UINT_TO_FP:
17079 return performUCharToFloatCombine(N, DCI);
17080 case ISD::FCOPYSIGN:
17081 return performFCopySignCombine(N, DCI);
17082 case AMDGPUISD::CVT_F32_UBYTE0:
17083 case AMDGPUISD::CVT_F32_UBYTE1:
17084 case AMDGPUISD::CVT_F32_UBYTE2:
17085 case AMDGPUISD::CVT_F32_UBYTE3:
17086 return performCvtF32UByteNCombine(N, DCI);
17087 case AMDGPUISD::FMED3:
17088 return performFMed3Combine(N, DCI);
17089 case AMDGPUISD::CVT_PKRTZ_F16_F32:
17090 return performCvtPkRTZCombine(N, DCI);
17091 case AMDGPUISD::CLAMP:
17092 return performClampCombine(N, DCI);
17093 case ISD::SCALAR_TO_VECTOR: {
17094 SelectionDAG &DAG = DCI.DAG;
17095 EVT VT = N->getValueType(0);
17096
17097 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
17098 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
17099 SDLoc SL(N);
17100 SDValue Src = N->getOperand(0);
17101 EVT EltVT = Src.getValueType();
17102 if (EltVT != MVT::i16)
17103 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
17104
17105 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
17106 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
17107 }
17108
17109 break;
17110 }
17111 case ISD::EXTRACT_VECTOR_ELT:
17112 return performExtractVectorEltCombine(N, DCI);
17113 case ISD::INSERT_VECTOR_ELT:
17114 return performInsertVectorEltCombine(N, DCI);
17115 case ISD::FP_ROUND:
17116 return performFPRoundCombine(N, DCI);
17117 case ISD::LOAD: {
17118 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
17119 return Widened;
17120 [[fallthrough]];
17121 }
17122 default: {
17123 if (!DCI.isBeforeLegalize()) {
17124 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
17125 return performMemSDNodeCombine(MemNode, DCI);
17126 }
17127
17128 break;
17129 }
17130 }
17131
17132 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
17133}
17134
17135/// Helper function for adjustWritemask
17136static unsigned SubIdx2Lane(unsigned Idx) {
17137 switch (Idx) {
17138 default:
17139 return ~0u;
17140 case AMDGPU::sub0:
17141 return 0;
17142 case AMDGPU::sub1:
17143 return 1;
17144 case AMDGPU::sub2:
17145 return 2;
17146 case AMDGPU::sub3:
17147 return 3;
17148 case AMDGPU::sub4:
17149 return 4; // Possible with TFE/LWE
17150 }
17151}
17152
17153/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
17154SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
17155 SelectionDAG &DAG) const {
17156 unsigned Opcode = Node->getMachineOpcode();
17157
17158 // Subtract 1 because the vdata output is not a MachineSDNode operand.
17159 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
17160 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
17161 return Node; // not implemented for D16
17162
17163 SDNode *Users[5] = {nullptr};
17164 unsigned Lane = 0;
17165 unsigned DmaskIdx =
17166 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
17167 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
17168 unsigned NewDmask = 0;
17169 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
17170 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
17171 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
17172 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
17173 unsigned TFCLane = 0;
17174 bool HasChain = Node->getNumValues() > 1;
17175
17176 if (OldDmask == 0) {
17177 // These are folded out, but on the chance it happens don't assert.
17178 return Node;
17179 }
17180
17181 unsigned OldBitsSet = llvm::popcount(OldDmask);
17182 // Work out which is the TFE/LWE lane if that is enabled.
17183 if (UsesTFC) {
17184 TFCLane = OldBitsSet;
17185 }
17186
17187 // Try to figure out the used register components
17188 for (SDUse &Use : Node->uses()) {
17189
17190 // Don't look at users of the chain.
17191 if (Use.getResNo() != 0)
17192 continue;
17193
17194 SDNode *User = Use.getUser();
17195
17196 // Abort if we can't understand the usage
17197 if (!User->isMachineOpcode() ||
17198 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17199 return Node;
17200
17201 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
17202 // Note that subregs are packed, i.e. Lane==0 is the first bit set
17203 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
17204 // set, etc.
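// For example (illustration only), with OldDmask == 0b1010 (Y and W
// enabled): Lane 0 maps to component 1 (Y) and Lane 1 maps to component 3
// (W) in the Comp loop below.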
17205 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
17206 if (Lane == ~0u)
17207 return Node;
17208
17209 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
17210 if (UsesTFC && Lane == TFCLane) {
17211 Users[Lane] = User;
17212 } else {
17213 // Set which texture component corresponds to the lane.
17214 unsigned Comp;
17215 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17216 Comp = llvm::countr_zero(Dmask);
17217 Dmask &= ~(1 << Comp);
17218 }
17219
17220 // Abort if we have more than one user per component.
17221 if (Users[Lane])
17222 return Node;
17223
17224 Users[Lane] = User;
17225 NewDmask |= 1 << Comp;
17226 }
17227 }
17228
17229 // Don't allow 0 dmask, as hardware assumes one channel enabled.
17230 bool NoChannels = !NewDmask;
17231 if (NoChannels) {
17232 if (!UsesTFC) {
17233 // No uses of the result and not using TFC. Then do nothing.
17234 return Node;
17235 }
17236 // If the original dmask has one channel - then nothing to do
17237 if (OldBitsSet == 1)
17238 return Node;
17239 // Use an arbitrary dmask - required for the instruction to work
17240 NewDmask = 1;
17241 }
17242 // Abort if there's no change
17243 if (NewDmask == OldDmask)
17244 return Node;
17245
17246 unsigned BitsSet = llvm::popcount(NewDmask);
17247
17248 // Check for TFE or LWE - increase the number of channels by one to account
17249 // for the extra return value
17250 // This will need adjustment for D16 if this is also included in
17251 // adjustWriteMask (this function), but at present D16 is excluded.
17252 unsigned NewChannels = BitsSet + UsesTFC;
17253
17254 int NewOpcode =
17255 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
17256 assert(NewOpcode != -1 &&
17257 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17258 "failed to find equivalent MIMG op");
17259
17260 // Adjust the writemask in the node
17261 SmallVector<SDValue, 12> Ops;
17262 llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
17263 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
17264 llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));
17265
17266 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17267
17268 MVT ResultVT = NewChannels == 1
17269 ? SVT
17270 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
17271 : NewChannels == 5 ? 8
17272 : NewChannels);
17273 SDVTList NewVTList =
17274 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
17275
17276 MachineSDNode *NewNode =
17277 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
17278
17279 if (HasChain) {
17280 // Update chain.
17281 DAG.setNodeMemRefs(NewNode, Node->memoperands());
17282 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
17283 }
17284
17285 if (NewChannels == 1) {
17286 assert(Node->hasNUsesOfValue(1, 0));
17287 SDNode *Copy =
17288 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
17289 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
17290 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
17291 return nullptr;
17292 }
17293
17294 // Update the users of the node with the new indices
17295 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17296 SDNode *User = Users[i];
17297 if (!User) {
17298 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
17299 // Users[0] is still nullptr because channel 0 doesn't really have a use.
17300 if (i || !NoChannels)
17301 continue;
17302 } else {
17303 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
17304 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
17305 if (NewUser != User) {
17306 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
17307 DAG.RemoveDeadNode(User);
17308 }
17309 }
17310
17311 switch (Idx) {
17312 default:
17313 break;
17314 case AMDGPU::sub0:
17315 Idx = AMDGPU::sub1;
17316 break;
17317 case AMDGPU::sub1:
17318 Idx = AMDGPU::sub2;
17319 break;
17320 case AMDGPU::sub2:
17321 Idx = AMDGPU::sub3;
17322 break;
17323 case AMDGPU::sub3:
17324 Idx = AMDGPU::sub4;
17325 break;
17326 }
17327 }
17328
17329 DAG.RemoveDeadNode(Node);
17330 return nullptr;
17331}
17332
17333static bool isFrameIndexOp(SDValue Op) {
17334 if (Op.getOpcode() == ISD::AssertZext)
17335 Op = Op.getOperand(0);
17336
17337 return isa<FrameIndexSDNode>(Op);
17338}
17339
17340/// Legalize target independent instructions (e.g. INSERT_SUBREG)
17341/// with frame index operands.
17342 /// LLVM assumes that inputs to these instructions are registers.
17343SDNode *
17344SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
17345 SelectionDAG &DAG) const {
17346 if (Node->getOpcode() == ISD::CopyToReg) {
17347 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
17348 SDValue SrcVal = Node->getOperand(2);
17349
17350 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
17351 // to try understanding copies to physical registers.
17352 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
17353 SDLoc SL(Node);
17354 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17355 SDValue VReg = DAG.getRegister(
17356 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17357
17358 SDNode *Glued = Node->getGluedNode();
17359 SDValue ToVReg = DAG.getCopyToReg(
17360 Node->getOperand(0), SL, VReg, SrcVal,
17361 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
17362 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
17363 VReg, ToVReg.getValue(1));
17364 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
17365 DAG.RemoveDeadNode(Node);
17366 return ToResultReg.getNode();
17367 }
17368 }
17369
17370 SmallVector<SDValue, 8> Ops;
17371 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17372 if (!isFrameIndexOp(Node->getOperand(i))) {
17373 Ops.push_back(Node->getOperand(i));
17374 continue;
17375 }
17376
17377 SDLoc DL(Node);
17378 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
17379 Node->getOperand(i).getValueType(),
17380 Node->getOperand(i)),
17381 0));
17382 }
17383
17384 return DAG.UpdateNodeOperands(Node, Ops);
17385}
17386
17387/// Fold the instructions after selecting them.
17388/// Returns null if users were already updated.
17389SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
17390 SelectionDAG &DAG) const {
17391 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17392 unsigned Opcode = Node->getMachineOpcode();
17393
17394 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17395 !TII->isGather4(Opcode) &&
17396 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
17397 return adjustWritemask(Node, DAG);
17398 }
17399
17400 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17401 legalizeTargetIndependentNode(Node, DAG);
17402 return Node;
17403 }
17404
17405 switch (Opcode) {
17406 case AMDGPU::V_DIV_SCALE_F32_e64:
17407 case AMDGPU::V_DIV_SCALE_F64_e64: {
17408 // Satisfy the operand register constraint when one of the inputs is
17409 // undefined. Ordinarily each undef value will have its own implicit_def of
17410 // a vreg, so force these to use a single register.
17411 SDValue Src0 = Node->getOperand(1);
17412 SDValue Src1 = Node->getOperand(3);
17413 SDValue Src2 = Node->getOperand(5);
17414
17415 if ((Src0.isMachineOpcode() &&
17416 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
17417 (Src0 == Src1 || Src0 == Src2))
17418 break;
17419
17420 MVT VT = Src0.getValueType().getSimpleVT();
17421 const TargetRegisterClass *RC =
17422 getRegClassFor(VT, Src0.getNode()->isDivergent());
17423
17424 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17425 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
17426
17427 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
17428 Src0, SDValue());
17429
17430 // src0 must be the same register as src1 or src2, even if the value is
17431 // undefined, so make sure we don't violate this constraint.
17432 if (Src0.isMachineOpcode() &&
17433 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
17434 if (Src1.isMachineOpcode() &&
17435 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17436 Src0 = Src1;
17437 else if (Src2.isMachineOpcode() &&
17438 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17439 Src0 = Src2;
17440 else {
17441 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
17442 Src0 = UndefReg;
17443 Src1 = UndefReg;
17444 }
17445 } else
17446 break;
17447
17448 SmallVector<SDValue, 9> Ops(Node->ops());
17449 Ops[1] = Src0;
17450 Ops[3] = Src1;
17451 Ops[5] = Src2;
17452 Ops.push_back(ImpDef.getValue(1));
17453 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
17454 }
17455 default:
17456 break;
17457 }
17458
17459 return Node;
17460}
17461
17462// Any MIMG instructions that use tfe or lwe require an initialization of the
17463// result register that will be written in the case of a memory access failure.
17464// The required code is also added to tie this init code to the result of the
17465// img instruction.
17466void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
17467 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17468 const SIRegisterInfo &TRI = TII->getRegisterInfo();
17469 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
17470 MachineBasicBlock &MBB = *MI.getParent();
17471
17472 int DstIdx =
17473 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
17474 unsigned InitIdx = 0;
17475
17476 if (TII->isImage(MI)) {
17477 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
17478 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
17479 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
17480
17481 if (!TFE && !LWE) // intersect_ray
17482 return;
17483
17484 unsigned TFEVal = TFE ? TFE->getImm() : 0;
17485 unsigned LWEVal = LWE ? LWE->getImm() : 0;
17486 unsigned D16Val = D16 ? D16->getImm() : 0;
17487
17488 if (!TFEVal && !LWEVal)
17489 return;
17490
17491 // At least one of TFE or LWE are non-zero
17492 // We have to insert a suitable initialization of the result value and
17493 // tie this to the dest of the image instruction.
17494
17495 // Calculate which dword we have to initialize to 0.
17496 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
17497
17498 // check that dmask operand is found.
17499 assert(MO_Dmask && "Expected dmask operand in instruction");
17500
17501 unsigned dmask = MO_Dmask->getImm();
17502 // Determine the number of active lanes taking into account the
17503 // Gather4 special case
17504 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
17505
17506 bool Packed = !Subtarget->hasUnpackedD16VMem();
17507
17508 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
17509
17510 // Abandon attempt if the dst size isn't large enough
17511 // - this is in fact an error but this is picked up elsewhere and
17512 // reported correctly.
17513 const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
17514
17515 uint32_t DstSize = TRI.getRegSizeInBits(*DstRC) / 32;
17516 if (DstSize < InitIdx)
17517 return;
17518 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
17519 const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
17520 InitIdx = TRI.getRegSizeInBits(*DstRC) / 32;
17521 } else {
17522 return;
17523 }
17524
17525 const DebugLoc &DL = MI.getDebugLoc();
17526
17527 // Create a register for the initialization value.
17528 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
17529 unsigned NewDst = 0; // Final initialized value will be in here
17530
17531 // If PRTStrictNull feature is enabled (the default) then initialize
17532 // all the result registers to 0, otherwise just the error indication
17533 // register (VGPRn+1)
17534 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17535 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
17536
17537 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
17538 for (; SizeLeft; SizeLeft--, CurrIdx++) {
17539 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
17540 // Initialize dword
17541 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
17542 // clang-format off
17543 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
17544 .addImm(0);
17545 // clang-format on
17546 // Insert into the super-reg
17547 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
17548 .addReg(PrevDst)
17549 .addReg(SubReg)
17550 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
17551
17552 PrevDst = NewDst;
17553 }
17554
17555 // Add as an implicit operand
17556 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
17557
17558 // Tie the just added implicit operand to the dst
17559 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
17560}
17561
17562/// Assign the register class depending on the number of
17563/// bits set in the writemask
17564void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
17565 SDNode *Node) const {
17566 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17567
17568 MachineFunction *MF = MI.getMF();
17569 MachineRegisterInfo &MRI = MF->getRegInfo();
17570
17571 if (TII->isVOP3(MI.getOpcode())) {
17572 // Make sure constant bus requirements are respected.
17573 TII->legalizeOperandsVOP3(MRI, MI);
17574
17575 if (TII->isMAI(MI)) {
17576 // The ordinary src0, src1, src2 were legalized above.
17577 //
17578 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
17579 // as a separate instruction.
17580 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17581 AMDGPU::OpName::scale_src0);
17582 if (Src0Idx != -1) {
17583 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17584 AMDGPU::OpName::scale_src1);
17585 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17586 TII->usesConstantBus(MRI, MI, Src1Idx))
17587 TII->legalizeOpWithMove(MI, Src1Idx);
17588 }
17589 }
17590
17591 return;
17592 }
17593
17594 if (TII->isImage(MI))
17595 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
17596}
17597
17598static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
17599 uint64_t Val) {
17600 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
17601 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
17602}
17603
17604MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
17605 const SDLoc &DL,
17606 SDValue Ptr) const {
17607 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17608
17609 // Build the half of the subregister with the constants before building the
17610 // full 128-bit register. If we are building multiple resource descriptors,
17611 // this will allow CSEing of the 2-component register.
17612 const SDValue Ops0[] = {
17613 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
17614 buildSMovImm32(DAG, DL, 0),
17615 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17616 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
17617 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
17618
17619 SDValue SubRegHi = SDValue(
17620 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
17621
17622 // Combine the constants and the pointer.
17623 const SDValue Ops1[] = {
17624 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
17625 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
17626 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
17627
17628 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
17629}
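// Dword layout of the 128-bit descriptor built above (low to high), shown for
// illustration:
//   dwords 0-1: the 64-bit base pointer (sub0_sub1)
//   dword 2:    0
//   dword 3:    TII->getDefaultRsrcDataFormat() >> 32
// Keeping the constant half in its own REG_SEQUENCE is what allows it to be
// CSE'd between multiple descriptors.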
17630
17631/// Return a resource descriptor with the 'Add TID' bit enabled
17632/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
17633/// of the resource descriptor) to create an offset, which is added to
17634/// the resource pointer.
17636 SDValue Ptr, uint32_t RsrcDword1,
17637 uint64_t RsrcDword2And3) const {
17638 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
17639 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
17640 if (RsrcDword1) {
17641 PtrHi =
17642 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
17643 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
17644 0);
17645 }
17646
17647 SDValue DataLo =
17648 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
17649 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
17650
17651 const SDValue Ops[] = {
17652 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
17653 PtrLo,
17654 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17655 PtrHi,
17656 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
17657 DataLo,
17658 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
17659 DataHi,
17660 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
17661
17662 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
17663}
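// For illustration, the descriptor assembled above has the dword layout:
//   dword 0: low half of Ptr
//   dword 1: high half of Ptr, OR'd with RsrcDword1 (e.g. the stride used by
//            the 'Add TID' addressing)
//   dword 2: RsrcDword2And3 & 0xffffffff
//   dword 3: RsrcDword2And3 >> 32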
17664
17665//===----------------------------------------------------------------------===//
17666// SI Inline Assembly Support
17667//===----------------------------------------------------------------------===//
17668
17669std::pair<unsigned, const TargetRegisterClass *>
17671 StringRef Constraint,
17672 MVT VT) const {
17673 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
17674
17675 const TargetRegisterClass *RC = nullptr;
17676 if (Constraint.size() == 1) {
17677 // Check if we cannot determine the bit size of the given value type. This
17678 // can happen, for example, when we have an empty struct
17679 // (size 0): `call void asm "", "v"({} poison)`.
17680 if (VT == MVT::Other)
17681 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17682 const unsigned BitWidth = VT.getSizeInBits();
17683 switch (Constraint[0]) {
17684 default:
17685 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17686 case 's':
17687 case 'r':
17688 switch (BitWidth) {
17689 case 16:
17690 RC = &AMDGPU::SReg_32RegClass;
17691 break;
17692 case 64:
17693 RC = &AMDGPU::SGPR_64RegClass;
17694 break;
17695 default:
17697 if (!RC)
17698 return std::pair(0U, nullptr);
17699 break;
17700 }
17701 break;
17702 case 'v':
17703 switch (BitWidth) {
17704 case 1:
17705 return std::pair(0U, nullptr);
17706 case 16:
17707 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17708 : &AMDGPU::VGPR_32_Lo256RegClass;
17709 break;
17710 default:
17711 RC = Subtarget->has1024AddressableVGPRs()
17712 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17713 : TRI->getVGPRClassForBitWidth(BitWidth);
17714 if (!RC)
17715 return std::pair(0U, nullptr);
17716 break;
17717 }
17718 break;
17719 case 'a':
17720 if (!Subtarget->hasMAIInsts())
17721 break;
17722 switch (BitWidth) {
17723 case 1:
17724 return std::pair(0U, nullptr);
17725 case 16:
17726 RC = &AMDGPU::AGPR_32RegClass;
17727 break;
17728 default:
17729 RC = TRI->getAGPRClassForBitWidth(BitWidth);
17730 if (!RC)
17731 return std::pair(0U, nullptr);
17732 break;
17733 }
17734 break;
17735 }
17736 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17737 const unsigned BitWidth = VT.getSizeInBits();
17738 switch (BitWidth) {
17739 case 16:
17740 RC = &AMDGPU::AV_32RegClass;
17741 break;
17742 default:
17743 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17744 if (!RC)
17745 return std::pair(0U, nullptr);
17746 break;
17747 }
17748 }
17749
17750 // We actually support i128, i16 and f16 as inline parameters
17751 // even if they are not reported as legal
17752 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
17753 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
17754 return std::pair(0U, RC);
17755
17756 auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
17757 if (Kind != '\0') {
17758 if (Kind == 'v') {
17759 RC = &AMDGPU::VGPR_32_Lo256RegClass;
17760 } else if (Kind == 's') {
17761 RC = &AMDGPU::SGPR_32RegClass;
17762 } else if (Kind == 'a') {
17763 RC = &AMDGPU::AGPR_32RegClass;
17764 }
17765
17766 if (RC) {
17767 if (NumRegs > 1) {
17768 if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
17769 return std::pair(0U, nullptr);
17770
17771 uint32_t Width = NumRegs * 32;
17772 // Prohibit constraints for register ranges with a width that does not
17773 // match the required type.
17774 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
17775 return std::pair(0U, nullptr);
17776
17777 MCRegister Reg = RC->getRegister(Idx);
17779 RC = TRI->getVGPRClassForBitWidth(Width);
17780 else if (SIRegisterInfo::isSGPRClass(RC))
17781 RC = TRI->getSGPRClassForBitWidth(Width);
17782 else if (SIRegisterInfo::isAGPRClass(RC))
17783 RC = TRI->getAGPRClassForBitWidth(Width);
17784 if (RC) {
17785 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17786 if (!Reg) {
17787 // The register class does not contain the requested register,
17788 // e.g., because it is an SGPR pair that would violate alignment
17789 // requirements.
17790 return std::pair(0U, nullptr);
17791 }
17792 return std::pair(Reg, RC);
17793 }
17794 }
17795
17796 // Check for lossy scalar/vector conversions.
17797 if (VT.isVector() && VT.getSizeInBits() != 32)
17798 return std::pair(0U, nullptr);
17799 if (Idx < RC->getNumRegs())
17800 return std::pair(RC->getRegister(Idx), RC);
17801 return std::pair(0U, nullptr);
17802 }
17803 }
17804
17805 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17806 if (Ret.first)
17807 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
17808
17809 return Ret;
17810}
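// Rough usage sketches for the constraints handled above (illustrative inline
// asm, not taken from a test; the operand names are made up):
//   asm("v_mov_b32 %0, %1" : "=v"(f) : "v"(g));   // 32-bit VGPR operands
//   asm("s_mov_b32 %0, %1" : "=s"(i) : "s"(j));   // 32-bit SGPR operands
//   asm("" :: "{v[0:1]}"(v2));  // physical register range; the operand must
//                               // be 64 bits wide to match the two VGPRs
// A range whose width does not match the operand type, or a register pair
// that violates alignment requirements, is rejected above with
// std::pair(0U, nullptr).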
17811
17812static bool isImmConstraint(StringRef Constraint) {
17813 if (Constraint.size() == 1) {
17814 switch (Constraint[0]) {
17815 default:
17816 break;
17817 case 'I':
17818 case 'J':
17819 case 'A':
17820 case 'B':
17821 case 'C':
17822 return true;
17823 }
17824 } else if (Constraint == "DA" || Constraint == "DB") {
17825 return true;
17826 }
17827 return false;
17828}
17829
17832 if (Constraint.size() == 1) {
17833 switch (Constraint[0]) {
17834 default:
17835 break;
17836 case 's':
17837 case 'v':
17838 case 'a':
17839 return C_RegisterClass;
17840 }
17841 } else if (Constraint.size() == 2) {
17842 if (Constraint == "VA")
17843 return C_RegisterClass;
17844 }
17845 if (isImmConstraint(Constraint)) {
17846 return C_Other;
17847 }
17848 return TargetLowering::getConstraintType(Constraint);
17849}
17850
17851static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
17853 Val = Val & maskTrailingOnes<uint64_t>(Size);
17854 }
17855 return Val;
17856}
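// For example, clearUnusedBits(0xFFFFFFFF00001234, /*Size=*/16) yields 0x1234:
// only the low Size bits of the immediate are meaningful to the constraint
// checks below.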
17857
17859 StringRef Constraint,
17860 std::vector<SDValue> &Ops,
17861 SelectionDAG &DAG) const {
17862 if (isImmConstraint(Constraint)) {
17863 uint64_t Val;
17864 if (getAsmOperandConstVal(Op, Val) &&
17865 checkAsmConstraintVal(Op, Constraint, Val)) {
17866 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
17867 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
17868 }
17869 } else {
17871 }
17872}
17873
17875 unsigned Size = Op.getScalarValueSizeInBits();
17876 if (Size > 64)
17877 return false;
17878
17879 if (Size == 16 && !Subtarget->has16BitInsts())
17880 return false;
17881
17883 Val = C->getSExtValue();
17884 return true;
17885 }
17887 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17888 return true;
17889 }
17891 if (Size != 16 || Op.getNumOperands() != 2)
17892 return false;
17893 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17894 return false;
17895 if (ConstantSDNode *C = V->getConstantSplatNode()) {
17896 Val = C->getSExtValue();
17897 return true;
17898 }
17899 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
17900 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17901 return true;
17902 }
17903 }
17904
17905 return false;
17906}
17907
17909 uint64_t Val) const {
17910 if (Constraint.size() == 1) {
17911 switch (Constraint[0]) {
17912 case 'I':
17914 case 'J':
17915 return isInt<16>(Val);
17916 case 'A':
17917 return checkAsmConstraintValA(Op, Val);
17918 case 'B':
17919 return isInt<32>(Val);
17920 case 'C':
17921 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
17923 default:
17924 break;
17925 }
17926 } else if (Constraint.size() == 2) {
17927 if (Constraint == "DA") {
17928 int64_t HiBits = static_cast<int32_t>(Val >> 32);
17929 int64_t LoBits = static_cast<int32_t>(Val);
17930 return checkAsmConstraintValA(Op, HiBits, 32) &&
17931 checkAsmConstraintValA(Op, LoBits, 32);
17932 }
17933 if (Constraint == "DB") {
17934 return true;
17935 }
17936 }
17937 llvm_unreachable("Invalid asm constraint");
17938}
17939
17941 unsigned MaxSize) const {
17942 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
17943 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
17944 if (Size == 16) {
17945 MVT VT = Op.getSimpleValueType();
17946 switch (VT.SimpleTy) {
17947 default:
17948 return false;
17949 case MVT::i16:
17950 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
17951 case MVT::f16:
17952 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
17953 case MVT::bf16:
17954 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
17955 case MVT::v2i16:
17956 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
17957 case MVT::v2f16:
17958 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
17959 case MVT::v2bf16:
17960 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
17961 }
17962 }
17963 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
17964 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
17965 return true;
17966 return false;
17967}
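// As a rough guide, the 32-bit inline immediates accepted here are the
// integers in [-16, 64] plus a small set of FP constants (0.0, +/-0.5,
// +/-1.0, +/-2.0, +/-4.0, and 1/(2*pi) when the subtarget has the inv2pi
// encoding); everything else fails the 'A' constraint check.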
17968
17969static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
17970 switch (UnalignedClassID) {
17971 case AMDGPU::VReg_64RegClassID:
17972 return AMDGPU::VReg_64_Align2RegClassID;
17973 case AMDGPU::VReg_96RegClassID:
17974 return AMDGPU::VReg_96_Align2RegClassID;
17975 case AMDGPU::VReg_128RegClassID:
17976 return AMDGPU::VReg_128_Align2RegClassID;
17977 case AMDGPU::VReg_160RegClassID:
17978 return AMDGPU::VReg_160_Align2RegClassID;
17979 case AMDGPU::VReg_192RegClassID:
17980 return AMDGPU::VReg_192_Align2RegClassID;
17981 case AMDGPU::VReg_224RegClassID:
17982 return AMDGPU::VReg_224_Align2RegClassID;
17983 case AMDGPU::VReg_256RegClassID:
17984 return AMDGPU::VReg_256_Align2RegClassID;
17985 case AMDGPU::VReg_288RegClassID:
17986 return AMDGPU::VReg_288_Align2RegClassID;
17987 case AMDGPU::VReg_320RegClassID:
17988 return AMDGPU::VReg_320_Align2RegClassID;
17989 case AMDGPU::VReg_352RegClassID:
17990 return AMDGPU::VReg_352_Align2RegClassID;
17991 case AMDGPU::VReg_384RegClassID:
17992 return AMDGPU::VReg_384_Align2RegClassID;
17993 case AMDGPU::VReg_512RegClassID:
17994 return AMDGPU::VReg_512_Align2RegClassID;
17995 case AMDGPU::VReg_1024RegClassID:
17996 return AMDGPU::VReg_1024_Align2RegClassID;
17997 case AMDGPU::AReg_64RegClassID:
17998 return AMDGPU::AReg_64_Align2RegClassID;
17999 case AMDGPU::AReg_96RegClassID:
18000 return AMDGPU::AReg_96_Align2RegClassID;
18001 case AMDGPU::AReg_128RegClassID:
18002 return AMDGPU::AReg_128_Align2RegClassID;
18003 case AMDGPU::AReg_160RegClassID:
18004 return AMDGPU::AReg_160_Align2RegClassID;
18005 case AMDGPU::AReg_192RegClassID:
18006 return AMDGPU::AReg_192_Align2RegClassID;
18007 case AMDGPU::AReg_256RegClassID:
18008 return AMDGPU::AReg_256_Align2RegClassID;
18009 case AMDGPU::AReg_512RegClassID:
18010 return AMDGPU::AReg_512_Align2RegClassID;
18011 case AMDGPU::AReg_1024RegClassID:
18012 return AMDGPU::AReg_1024_Align2RegClassID;
18013 default:
18014 return -1;
18015 }
18016}
18017
18018// Figure out which registers should be reserved for stack access. Only after
18019// the function is legalized do we know all of the non-spill stack objects or if
18020// calls are present.
18024 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
18025 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18026 const SIInstrInfo *TII = ST.getInstrInfo();
18027
18028 if (Info->isEntryFunction()) {
18029 // Callable functions have fixed registers used for stack access.
18031 }
18032
18033 // TODO: Move this logic to getReservedRegs()
18034 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
18035 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
18036 Register SReg = ST.isWave32()
18037 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
18038 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
18039 &AMDGPU::SGPR_64RegClass);
18040 Info->setSGPRForEXECCopy(SReg);
18041
18042 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
18043 Info->getStackPtrOffsetReg()));
18044 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
18045 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
18046
18047 // We need to worry about replacing the default register with itself in case
18048 // of MIR testcases missing the MFI.
18049 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
18050 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
18051
18052 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
18053 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
18054
18055 Info->limitOccupancy(MF);
18056
18057 if (ST.isWave32() && !MF.empty()) {
18058 for (auto &MBB : MF) {
18059 for (auto &MI : MBB) {
18060 TII->fixImplicitOperands(MI);
18061 }
18062 }
18063 }
18064
18065 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
18066 // classes if required. Ideally the register class constraints would differ
18067 // per-subtarget, but there's no easy way to achieve that right now. This is
18068 // not a problem for VGPRs because the correctly aligned VGPR class is implied
18069 // from using them as the register class for legal types.
18070 if (ST.needsAlignedVGPRs()) {
18071 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
18072 const Register Reg = Register::index2VirtReg(I);
18073 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
18074 if (!RC)
18075 continue;
18076 int NewClassID = getAlignedAGPRClassID(RC->getID());
18077 if (NewClassID != -1)
18078 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
18079 }
18080 }
18081
18083}
18084
18086 KnownBits &Known,
18087 const APInt &DemandedElts,
18088 const SelectionDAG &DAG,
18089 unsigned Depth) const {
18090 Known.resetAll();
18091 unsigned Opc = Op.getOpcode();
18092 switch (Opc) {
18094 unsigned IID = Op.getConstantOperandVal(0);
18095 switch (IID) {
18096 case Intrinsic::amdgcn_mbcnt_lo:
18097 case Intrinsic::amdgcn_mbcnt_hi: {
18098 const GCNSubtarget &ST =
18100 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18101 // most 31 + src1.
18102 Known.Zero.setBitsFrom(
18103 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
18104 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
18105 Known = KnownBits::add(Known, Known2);
18106 return;
18107 }
18108 }
18109 break;
18110 }
18111 }
18113 Op, Known, DemandedElts, DAG, Depth);
18114}
18115
18117 const int FI, KnownBits &Known, const MachineFunction &MF) const {
18119
18120 // Set the high bits to zero based on the maximum allowed scratch size per
18121 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
18122 // calculation won't overflow, so assume the sign bit is never set.
18123 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
18124}
18125
18127 GISelValueTracking &VT, KnownBits &Known,
18128 unsigned Dim) {
18129 unsigned MaxValue =
18130 ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
18131 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
18132}
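// For example, if the maximum workitem ID in the queried dimension is 1023,
// countl_zero(1023) == 22, so the top 22 bits of the i32 result are known to
// be zero.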
18133
18135 KnownBits &Known, const APInt &DemandedElts,
18136 unsigned BFEWidth, bool SExt, unsigned Depth) {
18138 const MachineOperand &Src1 = MI.getOperand(2);
18139
18140 unsigned Src1Cst = 0;
18141 if (Src1.isImm()) {
18142 Src1Cst = Src1.getImm();
18143 } else if (Src1.isReg()) {
18144 auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
18145 if (!Cst)
18146 return;
18147 Src1Cst = Cst->Value.getZExtValue();
18148 } else {
18149 return;
18150 }
18151
18152 // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
18153 // Width is always [22:16].
18154 const unsigned Offset =
18155 Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
18156 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
18157
18158 if (Width >= BFEWidth) // Ill-formed.
18159 return;
18160
18161 VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
18162 Depth + 1);
18163
18164 Known = Known.extractBits(Width, Offset);
18165
18166 if (SExt)
18167 Known = Known.sext(BFEWidth);
18168 else
18169 Known = Known.zext(BFEWidth);
18170}
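// Worked example: for S_BFE_U32 with Src1Cst = 0x0005000C the offset is 0xC
// (bits [4:0]) and the width is 5 (bits [22:16]), so the known bits of src0
// are narrowed to bits [16:12] and zero-extended back to 32 bits
// (sign-extended instead for the S_BFE_I32 form).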
18171
18173 GISelValueTracking &VT, Register R, KnownBits &Known,
18174 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
18175 unsigned Depth) const {
18176 Known.resetAll();
18177 const MachineInstr *MI = MRI.getVRegDef(R);
18178 switch (MI->getOpcode()) {
18179 case AMDGPU::S_BFE_I32:
18180 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18181 /*SExt=*/true, Depth);
18182 case AMDGPU::S_BFE_U32:
18183 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18184 /*SExt=*/false, Depth);
18185 case AMDGPU::S_BFE_I64:
18186 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18187 /*SExt=*/true, Depth);
18188 case AMDGPU::S_BFE_U64:
18189 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18190 /*SExt=*/false, Depth);
18191 case AMDGPU::G_INTRINSIC:
18192 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18193 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
18194 switch (IID) {
18195 case Intrinsic::amdgcn_workitem_id_x:
18196 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
18197 break;
18198 case Intrinsic::amdgcn_workitem_id_y:
18199 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
18200 break;
18201 case Intrinsic::amdgcn_workitem_id_z:
18202 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
18203 break;
18204 case Intrinsic::amdgcn_mbcnt_lo:
18205 case Intrinsic::amdgcn_mbcnt_hi: {
18206 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18207 // most 31 + src1.
18208 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
18209 ? getSubtarget()->getWavefrontSizeLog2()
18210 : 5);
18211 KnownBits Known2;
18212 VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
18213 Depth + 1);
18214 Known = KnownBits::add(Known, Known2);
18215 break;
18216 }
18217 case Intrinsic::amdgcn_groupstaticsize: {
18218 // We can report everything over the maximum size as 0. We can't report
18219 // based on the actual size because we don't know if it's accurate or not
18220 // at any given point.
18221 Known.Zero.setHighBits(
18222 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
18223 break;
18224 }
18225 }
18226 break;
18227 }
18228 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18229 Known.Zero.setHighBits(24);
18230 break;
18231 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18232 Known.Zero.setHighBits(16);
18233 break;
18234 case AMDGPU::G_AMDGPU_SMED3:
18235 case AMDGPU::G_AMDGPU_UMED3: {
18236 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18237
18238 KnownBits Known2;
18239 VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
18240 if (Known2.isUnknown())
18241 break;
18242
18243 KnownBits Known1;
18244 VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
18245 if (Known1.isUnknown())
18246 break;
18247
18248 KnownBits Known0;
18249 VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
18250 if (Known0.isUnknown())
18251 break;
18252
18253 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
18254 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
18255 Known.One = Known0.One & Known1.One & Known2.One;
18256 break;
18257 }
18258 }
18259}
18260
18263 unsigned Depth) const {
18264 const MachineInstr *MI = MRI.getVRegDef(R);
18265 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
18266 // FIXME: Can this move to generic code? What about the case where the call
18267 // site specifies a lower alignment?
18268 Intrinsic::ID IID = GI->getIntrinsicID();
18270 AttributeList Attrs =
18271 Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
18272 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18273 return *RetAlign;
18274 }
18275 return Align(1);
18276}
18277
18280 const Align CacheLineAlign = Align(64);
18281
18282 // Pre-GFX10 targets did not benefit from loop alignment
18283 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
18284 getSubtarget()->hasInstFwdPrefetchBug())
18285 return PrefAlign;
18286
18287 // On GFX10 the I$ consists of 4 x 64-byte cache lines.
18288 // By default the prefetcher keeps one cache line behind and reads two ahead.
18289 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
18290 // behind and one ahead.
18291 // Therefore we can benefit from aligning loop headers if the loop fits in
18292 // 192 bytes. If the loop fits in 64 bytes it always spans no more than two
18293 // cache lines and does not need alignment.
18294 // Else, if the loop is at most 128 bytes, we do not need to modify prefetch;
18295 // else, if the loop is at most 192 bytes, we need two lines behind.
18296
18298 const MachineBasicBlock *Header = ML->getHeader();
18299 if (Header->getAlignment() != PrefAlign)
18300 return Header->getAlignment(); // Already processed.
18301
18302 unsigned LoopSize = 0;
18303 for (const MachineBasicBlock *MBB : ML->blocks()) {
18304 // If an inner loop block is aligned, assume on average half of the
18305 // alignment size is added as nops.
18306 if (MBB != Header)
18307 LoopSize += MBB->getAlignment().value() / 2;
18308
18309 for (const MachineInstr &MI : *MBB) {
18310 LoopSize += TII->getInstSizeInBytes(MI);
18311 if (LoopSize > 192)
18312 return PrefAlign;
18313 }
18314 }
18315
18316 if (LoopSize <= 64)
18317 return PrefAlign;
18318
18319 if (LoopSize <= 128)
18320 return CacheLineAlign;
18321
18322 // If any of the parent loops is surrounded by prefetch instructions, do not
18323 // insert new ones for the inner loop, which would reset the parent's settings.
18324 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
18325 if (MachineBasicBlock *Exit = P->getExitBlock()) {
18326 auto I = Exit->getFirstNonDebugInstr();
18327 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18328 return CacheLineAlign;
18329 }
18330 }
18331
18332 MachineBasicBlock *Pre = ML->getLoopPreheader();
18333 MachineBasicBlock *Exit = ML->getExitBlock();
18334
18335 if (Pre && Exit) {
18336 auto PreTerm = Pre->getFirstTerminator();
18337 if (PreTerm == Pre->begin() ||
18338 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18339 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18340 .addImm(1); // prefetch 2 lines behind PC
18341
18342 auto ExitHead = Exit->getFirstNonDebugInstr();
18343 if (ExitHead == Exit->end() ||
18344 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18345 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18346 .addImm(2); // prefetch 1 line behind PC
18347 }
18348
18349 return CacheLineAlign;
18350}
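// Worked example of the policy above: a 100-byte loop exceeds 64 bytes but
// fits in 128, so its header is aligned to the 64-byte cache line and the
// prefetch mode is left alone. A 160-byte loop is additionally bracketed with
// S_INST_PREFETCH 1 in the preheader (two lines behind PC) and
// S_INST_PREFETCH 2 after the exit block (back to the default of one line
// behind).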
18351
18352[[maybe_unused]]
18353static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
18354 assert(N->getOpcode() == ISD::CopyFromReg);
18355 do {
18356 // Follow the chain until we find an INLINEASM node.
18357 N = N->getOperand(0).getNode();
18358 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
18359 return true;
18360 } while (N->getOpcode() == ISD::CopyFromReg);
18361 return false;
18362}
18363
18366 UniformityInfo *UA) const {
18367 switch (N->getOpcode()) {
18368 case ISD::CopyFromReg: {
18369 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
18370 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
18371 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18372 Register Reg = R->getReg();
18373
18374 // FIXME: Why does this need to consider isLiveIn?
18375 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18376 return !TRI->isSGPRReg(MRI, Reg);
18377
18378 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
18379 return UA->isDivergent(V);
18380
18382 return !TRI->isSGPRReg(MRI, Reg);
18383 }
18384 case ISD::LOAD: {
18385 const LoadSDNode *L = cast<LoadSDNode>(N);
18386 unsigned AS = L->getAddressSpace();
18387 // A flat load may access private memory.
18389 }
18390 case ISD::CALLSEQ_END:
18391 return true;
18393 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
18395 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
18396 case AMDGPUISD::ATOMIC_CMP_SWAP:
18397 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
18398 case AMDGPUISD::BUFFER_ATOMIC_ADD:
18399 case AMDGPUISD::BUFFER_ATOMIC_SUB:
18400 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
18401 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
18402 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
18403 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
18404 case AMDGPUISD::BUFFER_ATOMIC_AND:
18405 case AMDGPUISD::BUFFER_ATOMIC_OR:
18406 case AMDGPUISD::BUFFER_ATOMIC_XOR:
18407 case AMDGPUISD::BUFFER_ATOMIC_INC:
18408 case AMDGPUISD::BUFFER_ATOMIC_DEC:
18409 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
18410 case AMDGPUISD::BUFFER_ATOMIC_FADD:
18411 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
18412 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
18413 // Target-specific read-modify-write atomics are sources of divergence.
18414 return true;
18415 default:
18416 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
18417 // Generic read-modify-write atomics are sources of divergence.
18418 return A->readMem() && A->writeMem();
18419 }
18420 return false;
18421 }
18422}
18423
18425 EVT VT) const {
18426 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
18427 case MVT::f32:
18429 case MVT::f64:
18430 case MVT::f16:
18432 default:
18433 return false;
18434 }
18435}
18436
18438 LLT Ty, const MachineFunction &MF) const {
18439 switch (Ty.getScalarSizeInBits()) {
18440 case 32:
18441 return !denormalModeIsFlushAllF32(MF);
18442 case 64:
18443 case 16:
18444 return !denormalModeIsFlushAllF64F16(MF);
18445 default:
18446 return false;
18447 }
18448}
18449
18451 const APInt &DemandedElts,
18452 const SelectionDAG &DAG,
18453 bool SNaN,
18454 unsigned Depth) const {
18455 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
18456 const MachineFunction &MF = DAG.getMachineFunction();
18458
18459 if (Info->getMode().DX10Clamp)
18460 return true; // Clamped to 0.
18461 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
18462 }
18463
18465 DAG, SNaN, Depth);
18466}
18467
18468 // On older subtargets, global FP atomic instructions have a hardcoded FP mode
18469 // and do not support FP32 denormals; only v2f16/f64 denormals are supported.
18471 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
18472 return true;
18473
18474 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
18475 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
18476 if (DenormMode == DenormalMode::getPreserveSign())
18477 return true;
18478
18479 // TODO: Remove this.
18480 return RMW->getFunction()
18481 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
18482 .getValueAsBool();
18483}
18484
18486 LLVMContext &Ctx = RMW->getContext();
18487 StringRef MemScope =
18488 Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");
18489
18490 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
18491 << "Hardware instruction generated for atomic "
18492 << RMW->getOperationName(RMW->getOperation())
18493 << " operation at memory scope " << MemScope;
18494}
18495
18496static bool isV2F16OrV2BF16(Type *Ty) {
18497 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
18498 Type *EltTy = VT->getElementType();
18499 return VT->getNumElements() == 2 &&
18500 (EltTy->isHalfTy() || EltTy->isBFloatTy());
18501 }
18502
18503 return false;
18504}
18505
18506static bool isV2F16(Type *Ty) {
18508 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
18509}
18510
18511static bool isV2BF16(Type *Ty) {
18513 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
18514}
18515
18516/// \return true if atomicrmw integer ops work for the type.
18517static bool isAtomicRMWLegalIntTy(Type *Ty) {
18518 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
18519 unsigned BW = IT->getBitWidth();
18520 return BW == 32 || BW == 64;
18521 }
18522
18523 return false;
18524}
18525
18526/// \return true if this atomicrmw xchg type can be selected.
18527static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
18528 Type *Ty = RMW->getType();
18529 if (isAtomicRMWLegalIntTy(Ty))
18530 return true;
18531
18532 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
18533 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
18534 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18535 return BW == 32 || BW == 64;
18536 }
18537
18538 if (Ty->isFloatTy() || Ty->isDoubleTy())
18539 return true;
18540
18542 return VT->getNumElements() == 2 &&
18543 VT->getElementType()->getPrimitiveSizeInBits() == 16;
18544 }
18545
18546 return false;
18547}
18548
18549/// \returns true if it's valid to emit a native instruction for \p RMW, based
18550/// on the properties of the target memory.
18551static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
18552 const AtomicRMWInst *RMW,
18553 bool HasSystemScope) {
18554 // The remote/fine-grained access logic is different from the integer
18555 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
18556 // fine-grained access does not work, even for a device local allocation.
18557 //
18558 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
18559 // allocations work.
18560 if (HasSystemScope) {
18562 RMW->hasMetadata("amdgpu.no.remote.memory"))
18563 return true;
18564 if (Subtarget.hasEmulatedSystemScopeAtomics())
18565 return true;
18567 return true;
18568
18569 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18570}
18571
18572/// \return Action to perform on AtomicRMWInsts for integer operations.
18579
18580/// Return if a flat address space atomicrmw can access private memory.
18582 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18583 return !MD ||
18585}
18586
18594
18597 unsigned AS = RMW->getPointerAddressSpace();
18598 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
18600
18601 // 64-bit flat atomics that dynamically reside in private memory will silently
18602 // be dropped.
18603 //
18604 // Note that we will emit a new copy of the original atomic in the expansion,
18605 // which will be incrementally relegalized.
18606 const DataLayout &DL = RMW->getFunction()->getDataLayout();
18607 if (AS == AMDGPUAS::FLAT_ADDRESS &&
18608 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18611
18612 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
18614 ORE.emit([=]() {
18615 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
18616 });
18617 return Kind;
18618 };
18619
18620 auto SSID = RMW->getSyncScopeID();
18621 bool HasSystemScope =
18622 SSID == SyncScope::System ||
18623 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
18624
18625 auto Op = RMW->getOperation();
18626 switch (Op) {
18628 // PCIe supports add and xchg for system atomics.
18629 return isAtomicRMWLegalXChgTy(RMW)
18632 case AtomicRMWInst::Add:
18633 // PCIe supports add and xchg for system atomics.
18635 case AtomicRMWInst::Sub:
18636 case AtomicRMWInst::And:
18637 case AtomicRMWInst::Or:
18638 case AtomicRMWInst::Xor:
18639 case AtomicRMWInst::Max:
18640 case AtomicRMWInst::Min:
18647 if (Op == AtomicRMWInst::USubCond && !Subtarget->hasCondSubInsts())
18649 if (Op == AtomicRMWInst::USubSat && !Subtarget->hasSubClampInsts())
18652 auto *IT = dyn_cast<IntegerType>(RMW->getType());
18653 if (!IT || IT->getBitWidth() != 32)
18655 }
18656
18659 if (Subtarget->hasEmulatedSystemScopeAtomics())
18661
18662 // On most subtargets, for atomicrmw operations other than add/xchg,
18663 // whether or not the instructions will behave correctly depends on where
18664 // the address physically resides and what interconnect is used in the
18665 // system configuration. On some targets the instruction will be a no-op,
18666 // and in others synchronization will only occur at degraded device scope.
18667 //
18668 // If the allocation is known local to the device, the instructions should
18669 // work correctly.
18670 if (RMW->hasMetadata("amdgpu.no.remote.memory"))
18672
18673 // If fine-grained remote memory works at device scope, we don't need to
18674 // do anything.
18675 if (!HasSystemScope &&
18676 Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18678
18679 // If we are targeting a remote allocated address, it depends what kind of
18680 // allocation the address belongs to.
18681 //
18682 // If the allocation is fine-grained (in host memory, or in PCIe peer
18683 // device memory), the operation will fail depending on the target.
18684 //
18685 // Note fine-grained host memory access does work on APUs or if XGMI is
18686 // used, but we do not know if we are targeting an APU or the system
18687 // configuration from the ISA version/target-cpu.
18688 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
18690
18693 // Atomic sub/or/xor do not work over PCI express, but atomic add
18694 // does. InstCombine transforms these with 0 to or, so undo that.
18695 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
18696 ConstVal && ConstVal->isNullValue())
18698 }
18699
18700 // If the allocation could be in remote, fine-grained memory, the rmw
18701 // instructions may fail. cmpxchg should work, so emit that. On some
18702 // system configurations, PCIe atomics aren't supported so cmpxchg won't
18703 // even work, so you're out of luck anyway.
18704
18705 // In summary:
18706 //
18707 // Cases that may fail:
18708 // - fine-grained pinned host memory
18709 // - fine-grained migratable host memory
18710 // - fine-grained PCIe peer device
18711 //
18712 // Cases that should work, but may be treated overly conservatively.
18713 // - fine-grained host memory on an APU
18714 // - fine-grained XGMI peer device
18716 }
18717
18719 }
18720 case AtomicRMWInst::FAdd: {
18721 Type *Ty = RMW->getType();
18722
18723 // TODO: Handle REGION_ADDRESS
18724 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18725 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
18726 // is fixed to round-to-nearest-even.
18727 //
18728 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
18729 // round-to-nearest-even.
18730 //
18731 // We ignore the rounding mode problem, even in strictfp. The C++ standard
18732 // suggests it is OK if the floating-point mode may not match the calling
18733 // thread.
18734 if (Ty->isFloatTy()) {
18735 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
18737 }
18738
18739 if (Ty->isDoubleTy()) {
18740 // Ignores denormal mode, but we don't consider flushing mandatory.
18741 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
18743 }
18744
18745 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18747
18749 }
18750
18751 // LDS atomics respect the denormal mode from the mode register.
18752 //
18753 // Traditionally f32 global/buffer memory atomics would unconditionally
18754 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
18755 // flush.
18756 //
18757 // On targets with flat atomic fadd, denormals would flush depending on
18758 // whether the target address resides in LDS or global memory. We consider
18759 // this flat-maybe-flush as will-flush.
18760 if (Ty->isFloatTy() &&
18761 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18764
18765 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
18766 // safe. The message phrasing also should be better.
18767 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18768 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18769 // gfx942, gfx12
18770 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18771 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18772 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
18773 // gfx90a, gfx942, gfx12
18774 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18775 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18776
18777 // gfx942, gfx12
18778 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18779 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18780 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
18781 // gfx90a, gfx942, gfx12
18782 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18783 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18784
18785 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
18786 // buffer. gfx12 does have the buffer version.
18787 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18788 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18789 }
18790
18791 // global and flat atomic fadd f64: gfx90a, gfx942.
18792 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18793 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18794
18795 if (AS != AMDGPUAS::FLAT_ADDRESS) {
18796 if (Ty->isFloatTy()) {
18797 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
18798 // gfx11+.
18799 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18800 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18801 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
18802 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18803 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18804 } else {
18805 // gfx908
18806 if (RMW->use_empty() &&
18807 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18808 isV2F16(Ty))
18809 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18810 }
18811 }
18812
18813 // flat atomic fadd f32: gfx942, gfx11+.
18814 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
18815 if (Subtarget->hasFlatAtomicFaddF32Inst())
18816 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18817
18818 // If it is in the flat address space and the type is float, we will try to
18819 // expand it if the target supports global and LDS atomic fadd. The
18820 // reason we need this is that, in the expansion, we emit an address-space
18821 // check: if the address is in the global address space, we emit the global
18822 // atomic fadd; if it is in the shared address space, we emit the LDS atomic
18823 // fadd.
18824 if (Subtarget->hasLDSFPAtomicAddF32()) {
18825 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18827 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18829 }
18830 }
18831 }
18832
18834 }
18836 case AtomicRMWInst::FMax: {
18837 Type *Ty = RMW->getType();
18838
18839 // LDS float and double fmin/fmax were always supported.
18840 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18841 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
18843 }
18844
18845 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18846 // For flat and global cases:
18847 // float, double in gfx7. Manual claims denormal support.
18848 // Removed in gfx8.
18849 // float, double restored in gfx10.
18850 // double removed again in gfx11, so only f32 for gfx11/gfx12.
18851 //
18852 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
18853 // no f32.
18854 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18855 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18856 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18857 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18858 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18859 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
18861 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18862 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18863 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18864 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18865 }
18866 }
18867
18869 }
18872 default:
18874 }
18875
18876 llvm_unreachable("covered atomicrmw op switch");
18877}
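// Illustrative IR that takes the native path on subtargets with the matching
// instructions, assuming the frontend/runtime can stand behind the promise the
// metadata makes (here !0 would be the empty node !{}):
//   %old = atomicrmw fadd ptr addrspace(1) %ptr, float %val
//            syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
// Without such metadata, and without another condition that makes the access
// known-safe, the operation is typically expanded to a cmpxchg loop instead.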
18878
18885
18892
18895 unsigned AddrSpace = CmpX->getPointerAddressSpace();
18896 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
18898
18899 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
18901
18902 const DataLayout &DL = CmpX->getDataLayout();
18903
18904 Type *ValTy = CmpX->getNewValOperand()->getType();
18905
18906 // If a 64-bit flat atomic may alias private, we need to avoid using the
18907 // atomic in the private case.
18908 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
18910}
18911
18912const TargetRegisterClass *
18913SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
18915 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18916 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18917 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18918 : &AMDGPU::SReg_32RegClass;
18919 if (!TRI->isSGPRClass(RC) && !isDivergent)
18920 return TRI->getEquivalentSGPRClass(RC);
18921 if (TRI->isSGPRClass(RC) && isDivergent) {
18922 if (Subtarget->hasGFX90AInsts())
18923 return TRI->getEquivalentAVClass(RC);
18924 return TRI->getEquivalentVGPRClass(RC);
18925 }
18926
18927 return RC;
18928}
18929
18930// FIXME: This is a workaround for DivergenceAnalysis not understanding always
18931// uniform values (as produced by the mask results of control flow intrinsics)
18932// used outside of divergent blocks. The phi users need to also be treated as
18933// always uniform.
18934//
18935// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
18936static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
18937 unsigned WaveSize) {
18938 // FIXME: We assume we never cast the mask results of a control flow
18939 // intrinsic.
18940 // As a compile-time hack, exit early if the type won't be consistent.
18941 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
18942 if (!IT || IT->getBitWidth() != WaveSize)
18943 return false;
18944
18945 if (!isa<Instruction>(V))
18946 return false;
18947 if (!Visited.insert(V).second)
18948 return false;
18949 bool Result = false;
18950 for (const auto *U : V->users()) {
18952 if (V == U->getOperand(1)) {
18953 switch (Intrinsic->getIntrinsicID()) {
18954 default:
18955 Result = false;
18956 break;
18957 case Intrinsic::amdgcn_if_break:
18958 case Intrinsic::amdgcn_if:
18959 case Intrinsic::amdgcn_else:
18960 Result = true;
18961 break;
18962 }
18963 }
18964 if (V == U->getOperand(0)) {
18965 switch (Intrinsic->getIntrinsicID()) {
18966 default:
18967 Result = false;
18968 break;
18969 case Intrinsic::amdgcn_end_cf:
18970 case Intrinsic::amdgcn_loop:
18971 Result = true;
18972 break;
18973 }
18974 }
18975 } else {
18976 Result = hasCFUser(U, Visited, WaveSize);
18977 }
18978 if (Result)
18979 break;
18980 }
18981 return Result;
18982}
18983
18985 const Value *V) const {
18986 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
18987 if (CI->isInlineAsm()) {
18988 // FIXME: This cannot give a correct answer. This should only trigger in
18989 // the case where inline asm returns mixed SGPR and VGPR results, used
18990 // outside the defining block. We don't have a specific result to
18991 // consider, so this assumes if any value is SGPR, the overall register
18992 // also needs to be SGPR.
18993 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
18995 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
18996 for (auto &TC : TargetConstraints) {
18997 if (TC.Type == InlineAsm::isOutput) {
18999 const TargetRegisterClass *RC =
19000 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
19001 TC.ConstraintVT)
19002 .second;
19003 if (RC && SIRI->isSGPRClass(RC))
19004 return true;
19005 }
19006 }
19007 }
19008 }
19010 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
19011}
19012
19014 for (SDUse &Use : N->uses()) {
19016 if (getBasePtrIndex(M) == Use.getOperandNo())
19017 return true;
19018 }
19019 }
19020 return false;
19021}
19022
19024 SDValue N1) const {
19025 if (!N0.hasOneUse())
19026 return false;
19027 // Take the opportunity to keep N0 uniform
19028 if (N0->isDivergent() || !N1->isDivergent())
19029 return true;
19030 // Check if we have a good chance to form the memory access pattern with the
19031 // base and offset
19032 return (DAG.isBaseWithConstantOffset(N0) &&
19034}
19035
19037 Register N0, Register N1) const {
19038 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
19039}
19040
19043 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
19045 if (I.getMetadata("amdgpu.noclobber"))
19046 Flags |= MONoClobber;
19047 if (I.getMetadata("amdgpu.last.use"))
19048 Flags |= MOLastUse;
19049 return Flags;
19050}
19051
19053 Instruction *AI) const {
19054 // Given: atomicrmw fadd ptr %addr, float %val ordering
19055 //
19056 // With this expansion we produce the following code:
19057 // [...]
19058 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
19059 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
19060 //
19061 // atomicrmw.shared:
19062 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
19063 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
19064 // float %val ordering
19065 // br label %atomicrmw.phi
19066 //
19067 // atomicrmw.check.private:
19068 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
19069 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
19070 //
19071 // atomicrmw.private:
19072 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
19073 // %loaded.private = load float, ptr addrspace(5) %cast.private
19074 // %val.new = fadd float %loaded.private, %val
19075 // store float %val.new, ptr addrspace(5) %cast.private
19076 // br label %atomicrmw.phi
19077 //
19078 // atomicrmw.global:
19079 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
19080 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
19081 // float %val ordering
19082 // br label %atomicrmw.phi
19083 //
19084 // atomicrmw.phi:
19085 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
19086 // [ %loaded.private, %atomicrmw.private ],
19087 // [ %loaded.global, %atomicrmw.global ]
19088 // br label %atomicrmw.end
19089 //
19090 // atomicrmw.end:
19091 // [...]
19092 //
19093 //
19094 // For 64-bit atomics which may reside in private memory, we perform a simpler
19095 // version that only inserts the private check, and uses the flat operation.
19096
19097 IRBuilder<> Builder(AI);
19098 LLVMContext &Ctx = Builder.getContext();
19099
19100 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
19101 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
19103 Value *Addr = AI->getOperand(PtrOpIdx);
19104
19105 /// TODO: Only need to check private, then emit flat-known-not private (no
19106 /// need for shared block, or cast to global).
19108
19109 Align Alignment;
19110 if (RMW)
19111 Alignment = RMW->getAlign();
19112 else if (CX)
19113 Alignment = CX->getAlign();
19114 else
19115 llvm_unreachable("unhandled atomic operation");
19116
19117 // FullFlatEmulation is true if we need to issue the private, shared, and
19118 // global cases.
19119 //
19120 // If this is false, we are only dealing with the flat-targeting-private case,
19121 // where we only insert a check for private and still use the flat instruction
19122 // for global and shared.
19123
19124 bool FullFlatEmulation =
19125 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
19126 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
19127 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
19128 RMW->getType()->isDoubleTy()));
19129
19130 // If the return value isn't used, do not introduce a false use in the phi.
19131 bool ReturnValueIsUsed = !AI->use_empty();
19132
19133 BasicBlock *BB = Builder.GetInsertBlock();
19134 Function *F = BB->getParent();
19135 BasicBlock *ExitBB =
19136 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
19137 BasicBlock *SharedBB = nullptr;
19138
19139 BasicBlock *CheckPrivateBB = BB;
19140 if (FullFlatEmulation) {
19141 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
19142 CheckPrivateBB =
19143 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
19144 }
19145
19146 BasicBlock *PrivateBB =
19147 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
19148 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
19149 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
19150
19151 std::prev(BB->end())->eraseFromParent();
19152 Builder.SetInsertPoint(BB);
19153
19154 Value *LoadedShared = nullptr;
19155 if (FullFlatEmulation) {
19156 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
19157 {Addr}, nullptr, "is.shared");
19158 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
19159 Builder.SetInsertPoint(SharedBB);
19160 Value *CastToLocal = Builder.CreateAddrSpaceCast(
19162
19163 Instruction *Clone = AI->clone();
19164 Clone->insertInto(SharedBB, SharedBB->end());
19165 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
19166 LoadedShared = Clone;
19167
19168 Builder.CreateBr(PhiBB);
19169 Builder.SetInsertPoint(CheckPrivateBB);
19170 }
19171
19172 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19173 {Addr}, nullptr, "is.private");
19174 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19175
19176 Builder.SetInsertPoint(PrivateBB);
19177
19178 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19180
19181 Value *LoadedPrivate;
19182 if (RMW) {
19183 LoadedPrivate = Builder.CreateAlignedLoad(
19184 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
19185
19186 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
19187 LoadedPrivate, RMW->getValOperand());
19188
19189 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19190 } else {
19191 auto [ResultLoad, Equal] =
19192 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
19193 CX->getNewValOperand(), CX->getAlign());
19194
19195 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
19196 ResultLoad, 0);
19197 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19198 }
19199
19200 Builder.CreateBr(PhiBB);
19201
19202 Builder.SetInsertPoint(GlobalBB);
19203
19204 // Continue using a flat instruction if we only emitted the check for private.
19205 Instruction *LoadedGlobal = AI;
19206 if (FullFlatEmulation) {
19207 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19209 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
19210 }
19211
19212 AI->removeFromParent();
19213 AI->insertInto(GlobalBB, GlobalBB->end());
19214
19215 // The new atomicrmw may go through another round of legalization later.
19216 if (!FullFlatEmulation) {
19217 // We inserted the runtime check already, make sure we do not try to
19218 // re-expand this.
19219 // TODO: Should union with any existing metadata.
19220 MDBuilder MDB(F->getContext());
19221 MDNode *RangeNotPrivate =
19224 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19225 RangeNotPrivate);
19226 }
19227
19228 Builder.CreateBr(PhiBB);
19229
19230 Builder.SetInsertPoint(PhiBB);
19231
19232 if (ReturnValueIsUsed) {
19233 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
19234 AI->replaceAllUsesWith(Loaded);
19235 if (FullFlatEmulation)
19236 Loaded->addIncoming(LoadedShared, SharedBB);
19237 Loaded->addIncoming(LoadedPrivate, PrivateBB);
19238 Loaded->addIncoming(LoadedGlobal, GlobalBB);
19239 Loaded->takeName(AI);
19240 }
19241
19242 Builder.CreateBr(ExitBB);
19243}
19244
19246 unsigned PtrOpIdx) {
19247 Value *PtrOp = I->getOperand(PtrOpIdx);
19250
19251 Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
19252 Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
19253 I->getIterator());
19254 I->setOperand(PtrOpIdx, ASCast);
19255}
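// For example, an atomic access to private memory such as
//   %v = load atomic i32, ptr addrspace(5) %p seq_cst, align 4
// is rewritten by the helper above into
//   %scratch.ascast = addrspacecast ptr addrspace(5) %p to ptr
//   %v = load atomic i32, ptr %scratch.ascast seq_cst, align 4
// so that the flat form of the instruction can be selected.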
19256
19259
19262
19265 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
19266 ConstVal && ConstVal->isNullValue()) {
19267 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
19269
19270 // We may still need the private-alias-flat handling below.
19271
19272 // TODO: Skip this for cases where we cannot access remote memory.
19273 }
19274 }
19275
19276 // The non-flat expansions should only perform the de-canonicalization of
19277 // identity values.
19279 return;
19280
19282}
19283
19290
19294
19296 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19297}
19298
19300 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19301 return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
19302
19304 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19305}
19306
19307LoadInst *
19309 IRBuilder<> Builder(AI);
19310 auto Order = AI->getOrdering();
19311
19312 // The optimization removes the store aspect of the atomicrmw. Therefore, the
19313 // cache must be flushed if the atomic ordering had release semantics. The
19314 // flush does not strictly require a fence, but a release fence happens to
19315 // perform it. Avoid replacing an atomicrmw that has release semantics.
19316 if (isReleaseOrStronger(Order))
19317 return nullptr;
19318
19319 LoadInst *LI = Builder.CreateAlignedLoad(
19320 AI->getType(), AI->getPointerOperand(), AI->getAlign());
19321 LI->setAtomic(Order, AI->getSyncScopeID());
19322 LI->copyMetadata(*AI);
19323 LI->takeName(AI);
19324 AI->replaceAllUsesWith(LI);
19325 AI->eraseFromParent();
19326 return LI;
19327}
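// For example (assuming the ordering is weaker than release):
//   %old = atomicrmw or ptr %p, i32 0 acquire
// becomes
//   %old = load atomic i32, ptr %p acquire, align 4
// preserving the sync scope, metadata, and name of the original operation.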
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool isCopyFromRegOfInlineAsm(const SDNode *N)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isFloatingPointWaveReduceOperation(unsigned Opc)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
The Input class is used to parse a yaml document into in-memory structs and vectors.
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1102
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:6053
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1479
bool isNegative() const
Definition APFloat.h:1431
bool isNormal() const
Definition APFloat.h:1435
APInt bitcastToAPInt() const
Definition APFloat.h:1335
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1120
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1080
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1061
bool isInfinity() const
Definition APFloat.h:1428
Class for arbitrary precision integers.
Definition APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1392
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1386
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:467
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1640
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
Definition APInt.h:367
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1238
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1222
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition Function.cpp:339
const Function * getParent() const
Definition Argument.h:44
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:472
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_NE
not equal
Definition InstrTypes.h:698
bool isSigned() const
Definition InstrTypes.h:930
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:770
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:219
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isBigEndian() const
Definition DataLayout.h:215
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
iterator_range< arg_iterator > args()
Definition Function.h:890
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:765
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition Function.cpp:806
Argument * getArg(unsigned i) const
Definition Function.h:884
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMadF16() const
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasPrivateSegmentBuffer() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96
Metadata node.
Definition Metadata.h:1078
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1442
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition ModRef.h:226
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:220
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition ModRef.h:223
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition Register.h:72
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if be combined with to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform a atomic store using a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform a atomic load using a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, ...
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallBase &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion using a target-specific method.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns true if it is reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
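A minimal sketch of how the SetCC helper above is typically used, assuming a SelectionDAG and an SDLoc are already available; the emitIsZero helper is illustrative and not part of this file:

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Illustrative: build an i1 that is true when V == 0.
static SDValue emitIsZero(SelectionDAG &DAG, const SDLoc &DL, SDValue V) {
  SDValue Zero = DAG.getConstant(0, DL, V.getValueType());
  return DAG.getSetCC(DL, MVT::i1, V, Zero, ISD::SETEQ);
}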
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns the sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node that starts a new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
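getSelectCC folds the compare and the select into a single SELECT_CC node; a hedged sketch (the smax helper is an assumed example, not taken from this file):

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Illustrative: signed integer max as select_cc(A > B ? A : B).
static SDValue emitSMax(SelectionDAG &DAG, const SDLoc &DL, SDValue A, SDValue B) {
  return DAG.getSelectCC(DL, A, B, A, B, ISD::SETGT);
}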
MachineFunctionAnalysisManager * getMFAM()
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
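A short sketch of the splat helper, assuming the usual SelectionDAG context; the vector type chosen here is only an example:

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Illustrative: a v4f32 BUILD_VECTOR with every lane equal to 1.0.
static SDValue emitSplatOne(SelectionDAG &DAG, const SDLoc &DL) {
  SDValue One = DAG.getConstantFP(1.0, DL, MVT::f32);
  return DAG.getSplatBuildVector(MVT::v4f32, DL, One);
}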
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
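The bool in the pair returned by insert reports whether the element was newly added, which is the usual way to de-duplicate during a walk. A minimal sketch (the markVisited helper is hypothetical):

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/Value.h"
using namespace llvm;

// Returns true the first time V is seen, false on repeats.
static bool markVisited(SmallPtrSet<const Value *, 8> &Visited, const Value *V) {
  return Visited.insert(V).second; // .second == "was actually inserted"
}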
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
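StringSwitch chains Case calls and ends with a Default; an illustrative sketch (the constraint letters and codes below are placeholders, not the mapping used in this file):

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
using namespace llvm;

// Hypothetical mapping from a constraint string to a small code.
static unsigned classifyConstraint(StringRef C) {
  return StringSwitch<unsigned>(C)
      .Case("s", 1)  // placeholder: scalar-register-like constraint
      .Case("v", 2)  // placeholder: vector-register-like constraint
      .Default(0);   // unknown
}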
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparisons with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:422
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition Type.h:258
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:106
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
LLVM_ABI void set(Value *Val)
Definition Value.h:905
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
const Use & getOperandUse(unsigned i) const
Definition User.h:245
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
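replaceAllUsesWith rewires every use of the old value in one call; the common rewrite pattern is sketched below (pairing it with eraseFromParent is an assumption of the example, not something stated here):

#include "llvm/IR/Instruction.h"
using namespace llvm;

// Replace Old everywhere with New, then delete the now-dead instruction.
static void replaceAndErase(Instruction *Old, Value *New) {
  Old->replaceAllUsesWith(New);
  Old->eraseFromParent();
}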
iterator_range< user_iterator > users()
Definition Value.h:426
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1099
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:396
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isZero() const
Definition TypeSize.h:153
self_iterator getIterator()
Definition ilist_node.h:123
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
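The lowering code switches over these enumerators when deciding what an access is allowed to do; a hedged sketch, assuming the AMDGPUAS names resolve as listed above (the predicate itself is illustrative):

#include "AMDGPU.h" // assumed to pull in the AMDGPUAS enumerators listed above

// Illustrative predicate: address spaces reachable through a flat pointer.
static bool isFlatLikeAS(unsigned AS) {
  return AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS ||
         AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::LOCAL_ADDRESS;
}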
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid character code or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:807
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:780
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:593
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:771
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:841
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ GlobalAddress
Definition ISDOpcodes.h:88
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:868
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:577
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:744
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:832
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:662
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:779
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:534
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:541
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:784
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined; 0 Round to 0; 1 Round to nearest, ties to even; 2 Round to ...
Definition ISDOpcodes.h:958
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:701
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:762
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:642
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:607
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:569
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:838
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:799
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:876
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:724
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:793
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:914
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:736
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:558
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:947
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:844
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:821
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:527
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:549
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
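A one-line worked example of the swap, assuming the usual ISD headers: exchanging the operands of a signed less-than yields a signed greater-than.

#include "llvm/CodeGen/ISDOpcodes.h"
using namespace llvm;

// (X < Y) is the same predicate as (Y > X) once the operands are exchanged.
static ISD::CondCode swappedLT() {
  return ISD::getSetCCSwappedOperands(ISD::SETLT); // yields ISD::SETGT
}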
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
Offsets
Offsets in bytes from the start of the input buffer.
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
@ BRCOND
X86 conditional branches.
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition Analysis.cpp:241
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
OuterAnalysisManagerProxy< ModuleAnalysisManager, MachineFunction > ModuleAnalysisManagerMachineFunctionProxy
Provide the ModuleAnalysisManager to Function proxy.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:839
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Done
Definition Threading.h:60
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition MathExtras.h:223
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:303
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2148
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is Skew mod Align.
Definition MathExtras.h:546
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
Definition ModRef.h:301
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:202
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:273
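Worked values for the shifted-mask predicate; only a single contiguous run of ones qualifies:

#include "llvm/Support/MathExtras.h"
#include <cassert>
using namespace llvm;

static void shiftedMaskExamples() {
  assert(isShiftedMask_64(0x0000000000000FF0ULL));  // one contiguous run of ones
  assert(!isShiftedMask_64(0x0000000000000F0FULL)); // two separate runs
  assert(!isShiftedMask_64(0));                     // the empty sequence is rejected
}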
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant, stopping at the first 1.
Definition bit.h:236
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition Analysis.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
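Hi_32/Lo_32 split a 64-bit value into the two 32-bit halves that 32-bit data paths consume; a small worked example:

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
using namespace llvm;

static void splitImm() {
  uint64_t Imm = 0x123456789ABCDEF0ULL;
  uint32_t Lo = Lo_32(Imm); // 0x9ABCDEF0
  uint32_t Hi = Hi_32(Imm); // 0x12345678
  assert(((uint64_t)Hi << 32 | Lo) == Imm); // the halves reassemble the original
}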
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
unsigned getUndefRegState(bool B)
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
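Two worked values for alignTo; sizes are rounded up to the next multiple of the alignment:

#include "llvm/Support/Alignment.h"
#include <cassert>
using namespace llvm;

static void alignExamples() {
  assert(alignTo(13, Align(8)) == 16); // 13 rounded up to a multiple of 8
  assert(alignTo(16, Align(8)) == 16); // already-aligned sizes are unchanged
}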
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition MathExtras.h:232
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:434
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1770
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1909
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition SCCPSolver.h:42
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has fewer bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type, which is changed to EltVT.
Definition ValueTypes.h:113
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
bool isPow2VectorType() const
Returns true if the vector's number of elements is a power of 2.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type, which is changed to EltVT.
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
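A short, self-contained EVT sketch: build v4i32 from its pieces and query a few of the properties documented above. The LLVMContext is assumed to be supplied by the caller.
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

static void evtExample(llvm::LLVMContext &Ctx) {
  llvm::EVT I32 = llvm::EVT::getIntegerVT(Ctx, 32);
  llvm::EVT V4I32 = llvm::EVT::getVectorVT(Ctx, I32, 4);
  assert(V4I32.isVector() && V4I32.isInteger() && !V4I32.isFloatingPoint());
  assert(V4I32.getVectorNumElements() == 4);
  assert(V4I32.getVectorElementType() == I32);
  assert(V4I32.getSizeInBits().getFixedValue() == 128);
  assert(V4I32.getStoreSize().getFixedValue() == 16);
}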
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument or incoming (from the perspective of the caller) return value virtual register.
MVT VT
Legalized type of this argument part.
unsigned getOrigArgIndex() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing (from the perspective of the caller) return value virtual register.
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:172
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:225
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:180
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute the known bits resulting from the addition of LHS and RHS.
Definition KnownBits.h:347
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:248
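A small KnownBits walk-through using made-up bit patterns: mark the high nibble of an 8-bit value as known zero, zero-extend, and observe the leading zeros.
#include "llvm/Support/KnownBits.h"
#include <cassert>

static void knownBitsExample() {
  llvm::KnownBits K(8);   // 8-bit value, nothing known yet
  assert(K.isUnknown());
  K.Zero.setBits(4, 8);   // the high nibble is now known to be zero
  llvm::KnownBits K16 = K.zext(16);
  // Zero extension adds eight more known-zero high bits: 4 + 8 = 12.
  assert(K16.countMinLeadingZeros() == 12);
}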
This class contains a discriminated union of information about pointers in memory operands, relating them back to LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
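A hedged sketch of the MachinePointerInfo helpers above: describe an access to a fixed stack slot at a small offset. MF and FI are assumed to come from the surrounding lowering code; the 4-byte offset is illustrative.
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
using namespace llvm;

static MachinePointerInfo fixedSlotPlusFour(MachineFunction &MF, int FI) {
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  return PtrInfo.getWithOffset(4); // same slot, 4 bytes in
}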
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
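An illustrative query of the SDNodeFlags accessors above; N is assumed to be a node obtained during combining, and the helper name is made up.
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// True if the node carries both no-unsigned-wrap and no-signed-wrap flags.
static bool hasBothWrapFlags(const SDNode *N) {
  SDNodeFlags Flags = N->getFlags();
  return Flags.hasNoUnsignedWrap() && Flags.hasNoSignedWrap();
}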
This represents a list of ValueTypes that has been interned by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
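A minimal sketch of iterating the argument lists carried by a TargetLowering::CallLoweringInfo; countByValOuts is a made-up helper, and CLI is assumed to be the object handed to LowerCall.
#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

// Count the outgoing arguments that are passed by value.
static unsigned countByValOuts(const TargetLowering::CallLoweringInfo &CLI) {
  unsigned Count = 0;
  for (const ISD::OutputArg &Out : CLI.Outs)
    if (Out.Flags.isByVal())
      ++Count;
  return Count;
}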